<h1>Gaussian Mixture Model on Cars Dataset</h1>

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from matplotlib import colors
import matplotlib
import pylab as pl
import pandas as pd
%matplotlib inline

In [None]:
## Load the cars table and replace every missing (NA) field with zero
data = pd.read_csv('cars.csv').fillna(0)
## Turn the string brand column into a pandas categorical ...
data.brand = pd.Categorical(data.brand)
## ... and keep its integer category codes as an extra numeric column
data = data.assign(brand_categorical_variable=data.brand.cat.codes)


In [None]:
# Columns kept for modelling; every later cell works on this subset only.
features_to_be_included = [
    'mpg',
    'cylinders',
    'cubicinches',
    'hp',
    'weightlbs',
    'time-to-60',
    'year',
    'brand_categorical_variable',
]
data = data.loc[:, features_to_be_included]
# Preview the first two rows of the reduced table.
data.head(2)



<h1>Code adapted from the professor's assigned reading links</h1>

In [None]:
from matplotlib.patches import Ellipse
import numpy as np
def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw 1-, 2- and 3-sigma ellipses for a 2-D Gaussian.

    Parameters
    ----------
    position : sequence of two floats
        Centre (x, y) of the ellipses.
    covariance : array-like
        Either a full ``(2, 2)`` covariance matrix, a length-2 array of
        per-axis variances (axis-aligned ellipse), or a single scalar
        variance shared by both axes (circle).
    ax : matplotlib Axes, optional
        Axes to draw on; the current axes (``plt.gca()``) when omitted.
    **kwargs
        Forwarded unchanged to ``matplotlib.patches.Ellipse``
        (e.g. ``alpha``).

    Raises
    ------
    ValueError
        If ``covariance`` is neither ``(2, 2)`` nor broadcastable to two
        per-axis variances.
    """
    if ax is None:
        ax = plt.gca()
    covariance = np.asarray(covariance)

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        # The singular values are the variances along the principal axes;
        # the first left-singular vector gives the ellipse orientation.
        U, s, _ = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        # Axis-aligned case. Broadcasting lets a scalar (spherical) variance
        # work as well as a length-2 (diagonal) one.
        angle = 0
        width, height = 2 * np.sqrt(np.broadcast_to(covariance, (2,)))

    # Draw the Ellipse: width/height are full axis lengths (2 * sigma), so
    # multiplying by nsig gives the nsig-sigma contour.
    for nsig in range(1, 4):
        # `angle` is keyword-only in recent Matplotlib releases, so it must
        # not be passed positionally.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
        
def plot_gmm(gmm, X, label=True, ax=None):
    """Fit ``gmm`` to ``X`` and plot the points with per-component ellipses.

    Parameters
    ----------
    gmm : estimator
        Mixture model exposing ``fit``, ``predict``, ``means_``,
        ``covariances_`` and ``weights_``. It is REFIT IN PLACE on ``X``, so
        any previous fit held by this object is overwritten; pass a separate
        estimator if the earlier fit is still needed.
    X : ndarray
        Data to fit and plot. Columns 0 and 1 are used as the x and y
        coordinates, and each row of ``gmm.means_`` is used as an ellipse
        centre, so ``X`` is expected to have exactly two columns.
    label : bool, default True
        Colour the points by their predicted component when true.
    ax : matplotlib Axes, optional
        Axes to draw on; the current axes (``plt.gca()``) when omitted.
    """
    if ax is None:
        ax = plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')

    # Scale transparency so the heaviest component gets alpha 0.2 and lighter
    # components are proportionally fainter.
    w_factor = 0.2 / gmm.weights_.max()
    # NOTE(review): iterating covariances_ alongside means_ assumes one
    # covariance entry per component; confirm before using a covariance type
    # that shares a single matrix across components.
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # Forward `ax` so the ellipses land on the same axes as the scatter
        # instead of whatever plt.gca() happens to be.
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)

In [None]:
from sklearn.mixture import GaussianMixture
# Shared settings; a fixed random_state makes Restart & Run All reproducible.
gmm_params = dict(n_components=5, covariance_type='full', verbose=1,
                  verbose_interval=10, max_iter=5000, random_state=0)

# Cluster on every selected feature and keep the per-row component labels.
gmm = GaussianMixture(**gmm_params).fit(data.values)
labels = gmm.predict(data.values)

# Show the clustering in the (mpg, time-to-60) plane.
test_features = ['mpg', 'time-to-60']
X = data[test_features].values
plt.scatter(X[:, 0], X[:, 1], c=labels, s=5, cmap='viridis')
plt.xlabel(test_features[0])
plt.ylabel(test_features[1]);

# plot_gmm refits the estimator it is given on X. Passing `gmm` itself would
# silently replace the all-feature model above with a two-feature one, so a
# separate estimator is used for the 2-D overlay.
gmm_2d = GaussianMixture(**gmm_params)
plot_gmm(gmm_2d, X)

<h1>Identifying the optimal number of components (n_components) using the AIC/BIC criteria</h1>

In [None]:
# Candidate component counts, defined once so the fitted models and the
# x-axis values cannot drift apart.
n_components = list(range(1, 26))

# One full-covariance GMM per candidate; the fixed random_state keeps the
# curves identical across reruns.
gmm_models = [GaussianMixture(n_components=k,
                              covariance_type='full',
                              max_iter=500,
                              random_state=0).fit(data.values)
              for k in n_components]

# Plot both information criteria against the number of components.
plt.plot(n_components, [m.bic(data.values) for m in gmm_models], label='BIC')
plt.plot(n_components, [m.aic(data.values) for m in gmm_models], label='AIC')
plt.legend(loc='best')
plt.xlabel('number_of_components')
plt.ylabel('information criterion');

In [None]:
# Density estimation using the GMM as a generator: fit on all selected
# features, then draw 500 synthetic rows (genX) with their source
# component indices (genY).
gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=0)
model = gmm.fit(data.values)
(genX, genY) = model.sample(500)

test_features = ['mpg', 'time-to-60']
# Look the plotted columns up by name rather than hard-coding positions 0 and
# 5, which would silently plot the wrong features if the column order changed.
x_col, y_col = (data.columns.get_loc(name) for name in test_features)
plt.scatter(genX[:, x_col], genX[:, y_col], c=genY, s=20, cmap='viridis')

X = data[test_features].values
# plot_gmm refits its estimator on X. `gmm` and `model` are the same object,
# so a fresh estimator with the same settings is passed to keep the
# all-feature generator intact.
plot_gmm(GaussianMixture(n_components=5, covariance_type='full',
                         random_state=0), X)