In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import sklearn.tree as tree
from sklearn.metrics import plot_confusion_matrix, adjusted_rand_score

# Cluster
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from mpl_toolkits.mplot3d import Axes3D

# dimensionality reduction
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import TSNE

import pickle
from sklearn.preprocessing import MinMaxScaler
import os

dir_in = '../data/'
dir_out = '../plots/'
dataName = 'Heart Disease'
target = 'target'
fn_scaler = "../data/heart_scaler.sav"
random_state = 109

In [2]:
# load scaled data
df = pd.read_csv(dir_in + 'heart_None_None.csv')
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3,target
0,0.708333,1.0,0.481132,0.244292,1.0,0.603053,0.0,0.370968,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,0.166667,1.0,0.339623,0.283105,0.0,0.885496,0.0,0.564516,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,0.25,0.0,0.339623,0.178082,0.0,0.770992,0.0,0.225806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,0.5625,1.0,0.245283,0.251142,0.0,0.816794,0.0,0.129032,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
4,0.583333,0.0,0.245283,0.520548,0.0,0.70229,1.0,0.096774,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1


In [3]:
X = df.drop(['target'],axis=1)
y = df['target']

In [4]:
# load scaler
scaler_x = pickle.load(open(fn_scaler, 'rb'))
X_origin = X.copy()
X_origin.iloc[:,:] = scaler_x.inverse_transform(X_origin)
X_origin.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,56.0,1.0,120.0,236.0,0.0,178.0,0.0,0.8,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,57.0,0.0,120.0,354.0,0.0,163.0,1.0,0.6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [5]:
DR = 'None'
Cluster = 'None'

n_clusters = 4
n_components = n_clusters

def scatter(data, var1, var2, clusters=[]):
    if len(clusters) == 0:
        clusters = np.zeros(len(data))
    fig = plt.figure(figsize=(7, 7))
    plt.rcParams.update({'font.size': 15})
    plt.scatter(data[var1], data[var2], c = clusters, s=20, cmap='jet')
    plt.xlabel(var1)
    plt.ylabel(var2)
    plt.title(dataName +
              '\nDimension Reduction: '+ DR +
              '\nCluster: ' + Cluster + ', n=' + str(n_clusters))
    #plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(dir_out + dataName + '_' + DR + '_' + Cluster + '_Scatter.png')
    plt.close()

In [6]:
var1 = 'age'
var2 = 'trestbps'
scatter(X, var1, var2, [])

## 1. Cluster without Dimension Reduction

In [7]:
df_scores = pd.DataFrame(columns = ["Dimension Reduction", "KMeans", "EM"])
df_scores.set_index("Dimension Reduction", inplace = True)
df_scores

Unnamed: 0_level_0,KMeans,EM
Dimension Reduction,Unnamed: 1_level_1,Unnamed: 2_level_1


In [8]:
DR = 'None'

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)

y_kmeans = kmeans.predict(X)
centers = kmeans.cluster_centers_
scatter(X, var1, var2, y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)


# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X)

y_em = em.predict(X)
#centers = em.cluster_centers_
scatter(X, var1, var2, y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)


## 2. Cluster with Dimension Reduction

In [9]:
# PCA ######################################
DR = 'PCA'
Cluster = 'None'
pca = PCA(n_components=10, random_state = random_state)
X_PCA = pca.fit_transform(X)
X_PCA = pd.DataFrame(X_PCA)
print(pca.explained_variance_ratio_)

# export dimension reduction data
df_DR = X_PCA
df_DR['target'] = y
df_DR.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# KMneas
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_PCA)
y_kmeans = kmeans.predict(X_PCA)
centers = kmeans.cluster_centers_
scatter(X_PCA, X_PCA.columns[0], X_PCA.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# export Kmeans data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_kmeans
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_PCA)
y_em = em.predict(X_PCA)
#centers = em.cluster_centers_
scatter(X_PCA,  X_PCA.columns[0], X_PCA.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)

# export EM data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_em
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)


[0.28558813 0.15275067 0.10153692 0.09213523 0.07234356 0.0670144
 0.05139202 0.04006441 0.03105716 0.02706583]


In [10]:
# ICA ######################################
DR = 'ICA'
Cluster = 'None'
ica = FastICA(n_components=n_components, random_state = random_state)
X_ICA = ica.fit_transform(X)
X_ICA = pd.DataFrame(X_ICA)
#print(ica.)

# export dimension reduction data
df_DR = X_ICA
df_DR['target'] = y
df_DR.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_ICA)
y_kmeans = kmeans.predict(X_ICA)
centers = kmeans.cluster_centers_
scatter(X_ICA, X_ICA.columns[0], X_ICA.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# export Kmeans data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_kmeans
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_ICA)
y_em = em.predict(X_ICA)
#centers = em.cluster_centers_
scatter(X_ICA,  X_ICA.columns[0], X_ICA.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)

# export EM data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_em
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)


In [25]:
# RP ######################################
DR = 'RP'
Cluster = 'None'
rp = GaussianRandomProjection(n_components = 5, random_state = random_state)
X_RP = rp.fit_transform(X)
X_RP = pd.DataFrame(X_RP)

# export dimension reduction data
df_DR = X_RP
df_DR['target'] = y
df_DR.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_RP)
y_kmeans = kmeans.predict(X_RP)
centers = kmeans.cluster_centers_
scatter(X_RP, X_RP.columns[0], X_RP.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# export Kmeans data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_kmeans
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_RP)
y_em = em.predict(X_RP)
#centers = em.cluster_centers_
scatter(X_RP,  X_RP.columns[0], X_RP.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)

# export EM data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_em
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)


In [12]:
# TSNE ######################################
DR = 'TSNE'
Cluster = 'None'
tsne = TSNE(n_components = 3, random_state = random_state)
X_tsne = tsne.fit_transform(X)
X_tsne = pd.DataFrame(X_tsne)

# export dimension reduction data
df_DR = X_tsne
df_DR['target'] = y
df_DR.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_tsne)
y_kmeans = kmeans.predict(X_tsne)
centers = kmeans.cluster_centers_
scatter(X_tsne, X_tsne.columns[0], X_tsne.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# export Kmeans data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_kmeans
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_tsne)
y_em = em.predict(X_tsne)
#centers = em.cluster_centers_
scatter(X_tsne,  X_tsne.columns[0], X_tsne.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)

# export EM data
df_DR_Cluster = df_DR.copy() 
df_DR_Cluster['Cluster'] = y_em
df_DR_Cluster.to_csv(dir_in + 'heart_'+ DR + '_' + Cluster +'.csv', index = False)


In [26]:
df_scores

Unnamed: 0_level_0,KMeans,EM
Dimension Reduction,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.225301,0.204973
PCA,0.241297,0.199059
ICA,0.510821,0.611519
RP,0.17912,0.816602
TSNE,0.0949935,0.784008


## 3. NN after Dimensionality Reduction

### Run the following

python NN.py PCA

python NN.py ICA

python NN.py RP

python NN.py TSNE

## 4. NN after Dimensionality Reduction and Cluatering

### Run the following

python NN.py PCA KMneas

python NN.py PCA EM

python NN.py ICA KMneas

python NN.py ICA EM

python NN.py RP KMneas

python NN.py Rp EM

python NN.py TSNE KMneas

python NN.py TSNE EM