#1 Apply GMM to the heart disease dataset by setting n_components=2. Get ARI and silhouette scores for your solution and compare it with those of the k-means and hierarchical clustering solutions that you implemented in the previous checkpoint assignments. Which algorithm performs best?

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
warnings.filterwarnings("ignore")

In [None]:
# Connection settings for the heart disease Postgres database.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited in (or committed with) the notebook.
# The fallbacks are the values that were previously hardcoded here, so the
# notebook still runs unchanged when no variables are set.
postgres_user = os.environ.get('HEARTDISEASE_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HEARTDISEASE_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HEARTDISEASE_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HEARTDISEASE_DB_PORT', '5432')
postgres_db = os.environ.get('HEARTDISEASE_DB_NAME', 'heartdisease')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

try:
    # Load the whole table into a DataFrame.
    heartdisease_df = pd.read_sql_query('select * from heartdisease', con=engine)
finally:
    # No need for an open connection, as we're only doing a single query.
    # Disposing in `finally` releases the engine even if the query fails.
    engine.dispose()

In [None]:
# Define the features and the outcome: the first 13 columns are the features
# and the 14th column is the raw outcome.
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# Any positive raw outcome maps to 1 and everything else to 0. The two output
# values used to be swapped, which contradicted this encoding. ARI is
# invariant to label permutation, so the clustering scores are unaffected.
y = np.where(y > 0, 1, 0)

# Standardize the data.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [None]:
# Fit a two-component Gaussian mixture on the standardized features and use
# the most likely component of each row as its cluster label.
# random_state is fixed because the fit depends on random initialisation;
# without it the ARI and silhouette scores change from run to run.
gmm = GaussianMixture(n_components=2, random_state=123)
clusters = gmm.fit_predict(X_std)

# ARI compares the clusters against the binarized diagnosis; the silhouette
# score only looks at how well separated the clusters are in feature space.
print(f"The GMM models adjusted rand index score is {metrics.adjusted_rand_score(y, clusters):.4f}")
print(f"The GMM models silhouette score is {metrics.silhouette_score(X_std, clusters, metric='euclidean'):.4f}")

The GMM models adjusted rand index score is 0.4207
The GMM models silhouette score is 0.1612


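The GMM model's ARI and silhouette scores are higher than those of the k-means and hierarchical clustering solutions. Note that GMM is randomly initialized, so these scores can vary from run to run unless `random_state` is fixed.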

#2 The scikit-learn implementation of GMM has a parameter called covariance_type. This parameter determines the type of covariance parameters to use. There are four types that you can specify:

1. full: This is the default. Each component has its own general covariance matrix.
2. tied: All components share the same general covariance matrix.
3. diag: Each component has its own diagonal covariance matrix.
4. spherical: Each component has its own single variance.

Try all of these. Which one performs best in terms of ARI and silhouette scores?

In [None]:
# The four covariance structures supported by GaussianMixture.
covariance = ['full','tied','diag','spherical']

# Fit one two-component GMM per covariance type and report both scores.
# The same fixed random_state is used for every fit so that differences
# between covariance types are not confounded by random initialisation.
for c in covariance:
    gmm = GaussianMixture(n_components=2, covariance_type=c, random_state=123)
    clusters = gmm.fit_predict(X_std)
    print(f'The GMM model with {c} covariance has an adjusted rand index score of {metrics.adjusted_rand_score(y, clusters):.4f}')
    print(f"The GMM model with {c} covariance has a silhouette score of {metrics.silhouette_score(X_std, clusters, metric='euclidean'):.4f}")

The GMM model with full covariance has an adjusted rand index score of 0.1839
The GMM model with full covariance has a silhouette score of 0.1363
The GMM model with tied covariance has an adjusted rand index score of 0.4558
The GMM model with tied covariance has a silhouette score of 0.1672
The GMM model with diag covariance has an adjusted rand index score of 0.3788
The GMM model with diag covariance has a silhouette score of 0.1584
The GMM model with spherical covariance has an adjusted rand index score of 0.2077
The GMM model with spherical covariance has a silhouette score of 0.1247


The GMM model with the 'tied' covariance type had the best ARI (0.4558) and silhouette (0.1672) scores.