In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import datasets, metrics

In [2]:
import os
import warnings
from urllib.parse import quote

from sqlalchemy import create_engine

# Silence all library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the fallbacks
# are the values this notebook previously hard-coded.
postgres_user = os.environ.get('HEARTDISEASE_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HEARTDISEASE_PG_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HEARTDISEASE_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('HEARTDISEASE_PG_PORT', '5432')
postgres_db = os.environ.get('HEARTDISEASE_PG_DB', 'heartdisease')

# Percent-encode the user name and password: characters such as '@', ':' or
# '/' would otherwise be misread as URL delimiters when the URL is parsed.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Load the whole table into a DataFrame. The engine is disposed even when the
# query fails so the connection pool is never left open.
try:
    heartdisease = pd.read_sql_query('select * from heartdisease', con=engine)
finally:
    engine.dispose()

In [3]:
# Keep an even number of rows: when the row count is odd, the final row is
# dropped (the count minus its remainder modulo 2).
rows = heartdisease.shape[0] - heartdisease.shape[0] % 2
df = heartdisease.iloc[:rows, :]

# Break into a set of features (first 13 columns) and a variable for the
# known outcome (the 14th column).
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Some cells hold the placeholder string '?'; replace it with 0 so the
# features can be scaled.
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# (The previous np.where(y > 0, 0, 1) produced the opposite encoding. ARI is
# invariant to relabeling, so clustering scores are unaffected by the fix.)
y = np.where(y > 0, 1, 0)

# Normalize: standardize the features with StandardScaler.
X_std = StandardScaler().fit_transform(X)

# Create the two-feature PCA for graphing purposes.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)


1. Apply GMM to the heart disease data by setting n_components=2. Get the ARI and silhouette scores for your solution and compare them with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm performs better?

In [7]:
from sklearn.mixture import GaussianMixture

# Two-component Gaussian mixture with the default ('full') covariance type,
# seeded for reproducibility.
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=123,
)

# Fit on the standardized features and keep the predicted component labels.
clusters = gmm_cluster.fit_predict(X_std)

In [8]:
# Agreement between the cluster labels and the known outcome.
gmm_ari = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(gmm_ari))

# Silhouette score of the clusters in the standardized feature space.
gmm_silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(gmm_silhouette))

Adjusted Rand Index of the GMM solution: 0.18230716541111341
The silhoutte score of the GMM solution: 0.13560123273712887


The k-means solution still performs best, with an ARI of 0.44 and a silhouette score of 0.17. Among the agglomerative clustering solutions, the average linkage method scored best, with an ARI of 0.29 and a silhouette score of 0.15. The GMM solution scores worst so far, with an ARI of 0.18 and a silhouette score of 0.14.

2. The scikit-learn implementation of GMM has a parameter called covariance_type, which determines the type of covariance parameters to use. Specifically, there are four types you can specify:

full: This is the default. Each component has its own general covariance matrix.

tied: All components share the same general covariance matrix.

diag: Each component has its own diagonal covariance matrix.

spherical: Each component has its own single variance.

Try all of these. Which one performs better in terms of ARI and silhouette scores?

In [11]:
# 'tied' covariance type: both components share one general covariance matrix.
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='tied',
    random_state=123,
)
clusters = gmm_cluster.fit_predict(X_std)

# Agreement between the cluster labels and the known outcome.
gmm_ari = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(gmm_ari))

# Silhouette score of the clusters in the standardized feature space.
gmm_silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(gmm_silhouette))

Adjusted Rand Index of the GMM solution: 0.18230716541111341
The silhoutte score of the GMM solution: 0.13560123273712887


In [12]:
# 'diag' covariance type: each component has its own diagonal covariance matrix.
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='diag',
    random_state=123,
)
clusters = gmm_cluster.fit_predict(X_std)

# Agreement between the cluster labels and the known outcome.
gmm_ari = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(gmm_ari))

# Silhouette score of the clusters in the standardized feature space.
gmm_silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(gmm_silhouette))

Adjusted Rand Index of the GMM solution: 0.18230716541111341
The silhoutte score of the GMM solution: 0.13560123273712887


In [13]:
# 'spherical' covariance type: each component has its own single variance.
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='spherical',
    random_state=123,
)
clusters = gmm_cluster.fit_predict(X_std)

# Agreement between the cluster labels and the known outcome.
gmm_ari = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(gmm_ari))

# Silhouette score of the clusters in the standardized feature space.
gmm_silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(gmm_silhouette))

Adjusted Rand Index of the GMM solution: 0.2060175349560907
The silhoutte score of the GMM solution: 0.12345483213377387


The GMM with the spherical covariance type had the best ARI score at 0.21; however, it also had the worst silhouette score at 0.12. The full, tied, and diag covariance types all produced identical scores (an ARI of 0.18 and a silhouette score of 0.14).