<h1> **Gaussian Mixture Model on Cars Dataset** </h1>

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
from matplotlib import colors
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
%matplotlib inline

In [None]:
## Load the cars dataset and replace NA fields with zeroes
data = pd.read_csv('cars.csv').fillna(0)
## Turn the string brand into a categorical variable and keep its integer codes as a feature
brand_as_category = pd.Categorical(data.brand)
data.brand = brand_as_category
data['brand_categorical_variable'] = brand_as_category.codes


In [None]:
## Columns used for clustering (the brand enters through its integer category code)
features_to_be_included = ['mpg', 'cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60', 'year', 'brand_categorical_variable']
data = data[features_to_be_included]

## The K-means cells further down read their input through `kmeans_data` and
## `features_to_be_included_4_kmeans`, but no cell defines them before their first use, so a
## fresh "Restart & Run All" stops with a NameError. Define them here.
## NOTE(review): assumed to be the same frame and feature order as above -- the positional
## lookups in the histogram cells ([1] = cylinders ... [5] = time-to-60) match this order.
## Confirm that 'year' and the brand code were meant to take part in K-means.
features_to_be_included_4_kmeans = list(features_to_be_included)
## Independent copy, so later rebinding/editing of `data` cannot affect the K-means input
kmeans_data = data.copy()

In [None]:
## Code adapted from the links in the professor's reading assignment
from matplotlib.patches import Ellipse

def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw 1-, 2- and 3-sigma ellipses for a 2-D Gaussian.

    Parameters
    ----------
    position : pair of floats
        Centre of the ellipses; passed to ``Ellipse`` unchanged.
    covariance : array-like
        Either a full ``(2, 2)`` covariance matrix, a length-2 vector of
        per-axis variances, or a single scalar variance shared by both axes.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to ``plt.gca()``.
    **kwargs
        Forwarded to every ``Ellipse`` patch (e.g. ``alpha``).

    Raises
    ------
    ValueError
        If ``covariance`` is neither ``(2, 2)`` nor broadcastable to length 2.
    """
    ax = ax if ax is not None else plt.gca()
    covariance = np.asarray(covariance)

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        # The singular values give the variance along each principal axis and
        # the first left-singular vector gives the rotation of the ellipse.
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        # Axis-aligned case: a scalar is broadcast to both axes, a length-2
        # vector is used as-is; any other shape is rejected by broadcast_to.
        angle = 0
        width, height = np.broadcast_to(2 * np.sqrt(covariance), (2,))

    # Draw the Ellipse at 1, 2 and 3 times the base width/height
    for nsig in range(1, 4):
        # `angle` must be passed by keyword: recent matplotlib releases no
        # longer accept it as the fourth positional argument.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
        
def plot_gmm(gmm, X, label=True, ax=None, dims=(0, 4)):
    """Fit ``gmm`` on ``X`` and plot two of its dimensions with component ellipses.

    Parameters
    ----------
    gmm : mixture model
        Object with ``fit``/``predict`` and fitted ``means_``, ``weights_`` and
        ``covariances_`` (or the legacy ``covars_``) attributes.
    X : array-like, shape (n_samples, n_features)
        Data to fit and plot.
    label : bool, default True
        Colour the points by predicted component when true.
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to ``plt.gca()``.
    dims : pair of int, default (0, 4)
        Column indices of ``X`` shown on the x and y axes. The component means
        and covariances are restricted to these two dimensions before the
        ellipses are drawn.

    Notes
    -----
    Covariances are expected per component (matrix, per-feature vector or
    scalar); a single covariance shared by all components is not handled.
    """
    ax = ax if ax is not None else plt.gca()
    X = np.asarray(X)
    x_dim, y_dim = dims
    shown = [x_dim, y_dim]

    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, x_dim], X[:, y_dim], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, x_dim], X[:, y_dim], s=40, zorder=2)
    ax.axis('equal')

    # GaussianMixture stores the covariances in `covariances_`; fall back to
    # `covars_` for objects that still use the older attribute name.
    covariances = getattr(gmm, 'covariances_', None)
    if covariances is None:
        covariances = gmm.covars_

    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, covariances, gmm.weights_):
        # Keep only the two plotted dimensions so the ellipse matches the scatter.
        pos = np.asarray(pos)[shown]
        covar = np.asarray(covar)
        if covar.ndim == 2:
            covar = covar[np.ix_(shown, shown)]
        elif covar.ndim == 1:
            covar = covar[shown]
        else:
            # Scalar variance: the same value applies to both plotted axes.
            covar = np.full(2, covar)
        # Draw on the same axes as the scatter, not on whatever is current.
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)

In [None]:
from sklearn.mixture import GaussianMixture

## Fit a 4-component Gaussian mixture on every selected feature and colour each
## point by its predicted component. The scatter shows columns 0 and 4 of the feature matrix.
X = data.values
## random_state pins the initialisation so the component labels are the same on every run
gmm = GaussianMixture(n_components=4, verbose=1, verbose_interval=10, max_iter=500,
                      random_state=0).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 4], c=labels, s=40, cmap='viridis')
plt.xlabel(data.columns[0])
plt.ylabel(data.columns[4])
plt.title('Gaussian mixture components');

In [None]:
## Distribution of the feature at position 1 of the K-means feature list
cylinders = features_to_be_included_4_kmeans[1]
fig, ax = plt.subplots()
ax.hist(kmeans_data[cylinders])
## Label the figure so it can be read on its own; the trailing ';' hides the repr
ax.set(title='Distribution of {}'.format(cylinders), xlabel=cylinders, ylabel='count');

In [None]:
## Distribution of the feature at position 2 of the K-means feature list
cubicinches = features_to_be_included_4_kmeans[2]
fig, ax = plt.subplots()
ax.hist(kmeans_data[cubicinches])
## Label the figure so it can be read on its own; the trailing ';' hides the repr
ax.set(title='Distribution of {}'.format(cubicinches), xlabel=cubicinches, ylabel='count');

In [None]:
## Distribution of the feature at position 3 of the K-means feature list
hp = features_to_be_included_4_kmeans[3]
fig, ax = plt.subplots()
ax.hist(kmeans_data[hp])
## Label the figure so it can be read on its own; the trailing ';' hides the repr
ax.set(title='Distribution of {}'.format(hp), xlabel=hp, ylabel='count');

In [None]:
## Distribution of the feature at position 4 of the K-means feature list
weightlbs = features_to_be_included_4_kmeans[4]
fig, ax = plt.subplots()
ax.hist(kmeans_data[weightlbs])
## Label the figure so it can be read on its own; the trailing ';' hides the repr
ax.set(title='Distribution of {}'.format(weightlbs), xlabel=weightlbs, ylabel='count');

In [None]:
## Distribution of the feature at position 5 of the K-means feature list
timeto60 = features_to_be_included_4_kmeans[5]
fig, ax = plt.subplots()
ax.hist(kmeans_data[timeto60])
## Label the figure so it can be read on its own; the trailing ';' hides the repr
ax.set(title='Distribution of {}'.format(timeto60), xlabel=timeto60, ylabel='count');

In [None]:
## Finding the optimal number of clusters using silhouette coefficients and k-means scores.
## For K = 1..10 this records the silhouette coefficient and the value of `kmeans.score`
## so the two curves can be plotted against K afterwards.
silhoutte_coefficients = []
kmeans_scores = []
X = []  # the K values tried, used as the x axis of the plots
## The feature matrix is the same for every K, so select it once instead of on each iteration
data = kmeans_data[features_to_be_included_4_kmeans]
for i in range(1, 11):
    ## random_state pins the centroid initialisation so the scores are reproducible
    model = KMeans(init='k-means++', n_clusters=i, random_state=0)
    kmeans = model.fit(data)
    label = kmeans.labels_
    ## silhouette_score needs at least two clusters; 0 is recorded as a placeholder for K = 1
    if i > 1:
        sil_coeff = silhouette_score(data, label, metric='euclidean')
    else:
        sil_coeff = 0
    score_from_kmeans_fit = kmeans.score(data)
    kmeans_scores.append(score_from_kmeans_fit)
    silhoutte_coefficients.append(sil_coeff)
    X.append(i)

    print("Number of Clusters = {}: The Silhouette Coefficient is {}\n Number of Clusters = {}: Score from Kmeans fit {}".format(i, sil_coeff,i, score_from_kmeans_fit))

In [None]:
## Fit K-means with the chosen cluster count (explicit k-means++ seeding) and report
## both the k-means score and the silhouette coefficient
finalized_cluster_count = 2
data = kmeans_data[features_to_be_included_4_kmeans]
model = KMeans(n_clusters=finalized_cluster_count, init='k-means++')
kmeans = model.fit(data)
label = kmeans.labels_
Score = kmeans.score(data)
sil_coeff = silhouette_score(data, label, metric='euclidean')
print(Score)
print("Number of Clusters = {}, The Silhouette Coefficient is {}".format(finalized_cluster_count, sil_coeff))

In [None]:
## Same evaluation as the previous cell, this time leaving KMeans' `init` argument unset
finalized_cluster_count = 2
data = kmeans_data[features_to_be_included_4_kmeans]
model = KMeans(n_clusters=finalized_cluster_count)
kmeans = model.fit(data)
label, Score = kmeans.labels_, kmeans.score(data)
sil_coeff = silhouette_score(data, label, metric='euclidean')
print(Score)
print("Number of Clusters = {}, The Silhouette Coefficient is {}".format(finalized_cluster_count, sil_coeff))

In [None]:
## Display the k-means scores collected by the sweep over K
kmeans_scores

In [None]:
### Elbow method: plot the number of clusters K against the k-means score to find the optimal K
L = plt.plot(X, kmeans_scores, linewidth=5)
plt.xlabel('number of clusters K')
plt.ylabel('k-means score')
plt.title('Elbow method');
## The elbow method shows that the k-means score flattens at K = 5

In [None]:
### Plot the number of clusters K against the silhouette coefficient to find the optimal K
SHS = plt.plot(X, silhoutte_coefficients, linewidth=5)
plt.xlabel('number of clusters K')
plt.ylabel('silhouette coefficient')
plt.title('Silhouette method');
## Silhouette scores range from -1 to 1.
## They reward dense clusters that lie far apart from each other.
## There are two silhouette peaks, at K = 2 and K = 5

In [None]:
# Although the silhouette curve has two peaks (K = 2 and K = 5),
# the silhouette and elbow methods together point to K = 5 as the optimal number of clusters

In [None]:
## Final model: 5 clusters fitted on the two features that are plotted afterwards.
## random_state pins the initialisation so the cluster numbering -- which the plots
## below colour by -- is the same on every run.
model = KMeans(n_clusters=5, random_state=0)
test_features = ['mpg', 'weightlbs']
## NOTE: this rebinds the K-means feature list to just these two columns
features_to_be_included_4_kmeans = test_features
data = kmeans_data[features_to_be_included_4_kmeans]
kmeans = model.fit(data)


In [None]:
import collections

## Assign every row to a cluster with the fitted model, then tally how many rows each cluster got
predict_label_per_point = model.predict(data)
collections.Counter(predict_label_per_point)

In [None]:
## mpg vs weight, coloured by assigned cluster (cluster k uses color_map[k]).
## `plt` is already imported at the top of the notebook, so it is not re-imported here.
color_map = ['red', 'green', 'blue', 'orange', 'magenta']
## vmin/vmax pin label k to color_map[k] regardless of which labels actually occur
plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=predict_label_per_point, s=20,
            cmap=colors.ListedColormap(color_map), vmin=0, vmax=len(color_map) - 1)
plt.xlabel(data.columns[0])
plt.ylabel(data.columns[1])
plt.title('K-means clusters')
plt.show()

In [None]:
## mpg vs weight, coloured by assigned cluster, with the fitted centroids overlaid.
## The colour and size of every cluster are shown in a legend computed from the
## predicted labels, so the description cannot drift away from `color_map` or from
## the actual cluster sizes the way a hand-written comment can.
color_map = ['red', 'lightgreen', 'lightblue', 'magenta', 'orange']
fig, ax = plt.subplots()
for cluster_id, cluster_color in enumerate(color_map):
    in_cluster = predict_label_per_point == cluster_id
    ax.scatter(data.iloc[:, 0][in_cluster], data.iloc[:, 1][in_cluster],
               color=cluster_color,
               label='cluster {} ({} points)'.format(cluster_id, int(in_cluster.sum())))
centers = model.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='black', marker='*', s=200, label='centroids')
ax.set(xlabel=data.columns[0], ylabel=data.columns[1], title='K-means clusters and centroids')
ax.legend()
plt.show()