In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import sklearn.tree as tree
from sklearn.metrics import plot_confusion_matrix, adjusted_rand_score

# Cluster
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D

# dimensionality reduction
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import TSNE

import pickle
from sklearn.preprocessing import MinMaxScaler
import os

dir_in = '../data/'
dir_out = '../plots/'
dataName = 'Gestures'
target = 'gestures'
fn_scaler = "../data/gestures_scaler.sav"

random_state = 233

In [2]:
# load scaled data
df = pd.read_csv(dir_in + 'gestures_None_None.csv')
df.head()

Unnamed: 0,muscle reading 1 sensor 1,muscle reading 1 sensor 2,muscle reading 1 sensor 3,muscle reading 1 sensor 4,muscle reading 1 sensor 5,muscle reading 1 sensor 6,muscle reading 1 sensor 7,muscle reading 1 sensor 8,muscle reading 2 sensor 1,muscle reading 2 sensor 2,...,muscle reading 7 sensor 8,muscle reading 8 sensor 1,muscle reading 8 sensor 2,muscle reading 8 sensor 3,muscle reading 8 sensor 4,muscle reading 8 sensor 5,muscle reading 8 sensor 6,muscle reading 8 sensor 7,muscle reading 8 sensor 8,gestures
0,0.625551,0.556701,0.567164,0.638462,0.56338,0.437751,0.07451,0.244094,0.42616,0.555556,...,0.413223,0.728395,0.566524,0.72,0.632,0.502347,0.47451,0.270588,0.557769,0
1,0.303965,0.505155,0.41791,0.523077,0.629108,0.485944,0.639216,0.464567,0.506329,0.529915,...,0.42562,0.670782,0.575107,0.693333,0.632,0.544601,0.584314,0.937255,0.553785,0
2,0.427313,0.494845,0.373134,0.515385,0.469484,0.465863,0.192157,0.551181,0.464135,0.568376,...,0.18595,0.506173,0.579399,0.626667,0.528,0.516432,0.584314,0.94902,0.685259,0
3,0.519824,0.551546,0.492537,0.592308,0.568075,0.578313,0.917647,0.448819,0.396624,0.538462,...,0.371901,0.432099,0.566524,0.706667,0.68,0.638498,0.654902,0.968627,0.665339,0
4,0.537445,0.536082,0.492537,0.561538,0.502347,0.53012,0.301961,0.523622,0.493671,0.547009,...,0.68595,0.333333,0.515021,0.64,0.64,0.422535,0.407843,0.062745,0.219124,0


In [3]:
X = df.drop([target],axis=1)
y = df[target]

In [4]:
# load scaler
scaler_x = pickle.load(open(fn_scaler, 'rb'))
X_origin = X.copy()
X_origin.iloc[:,:] = scaler_x.inverse_transform(X_origin)
X_origin.head()

Unnamed: 0,muscle reading 1 sensor 1,muscle reading 1 sensor 2,muscle reading 1 sensor 3,muscle reading 1 sensor 4,muscle reading 1 sensor 5,muscle reading 1 sensor 6,muscle reading 1 sensor 7,muscle reading 1 sensor 8,muscle reading 2 sensor 1,muscle reading 2 sensor 2,...,muscle reading 7 sensor 7,muscle reading 7 sensor 8,muscle reading 8 sensor 1,muscle reading 8 sensor 2,muscle reading 8 sensor 3,muscle reading 8 sensor 4,muscle reading 8 sensor 5,muscle reading 8 sensor 6,muscle reading 8 sensor 7,muscle reading 8 sensor 8
0,26.0,4.0,5.0,8.0,-1.0,-13.0,-109.0,-66.0,-9.0,2.0,...,21.0,-28.0,61.0,4.0,8.0,5.0,4.0,-7.0,-59.0,16.0
1,-47.0,-6.0,-5.0,-7.0,13.0,-1.0,35.0,-10.0,10.0,-4.0,...,-105.0,-25.0,47.0,6.0,6.0,5.0,13.0,21.0,111.0,15.0
2,-19.0,-8.0,-8.0,-8.0,-21.0,-6.0,-79.0,12.0,1.315614e-14,5.0,...,-128.0,-83.0,7.0,7.0,1.0,-8.0,7.0,21.0,114.0,48.0
3,2.0,3.0,0.0,2.0,2.364775e-14,22.0,106.0,-14.0,-16.0,-2.0,...,-54.0,-38.0,-11.0,4.0,7.0,11.0,33.0,39.0,119.0,43.0
4,6.0,0.0,0.0,-2.0,-14.0,10.0,-51.0,5.0,7.0,0.0,...,60.0,38.0,-35.0,-8.0,2.0,6.0,-13.0,-24.0,-112.0,-69.0


In [5]:
DR = 'None'
Cluster = 'None'

# parameter for cluster, not for Dimension Reduction
n_clusters = 4
n_components = n_clusters

def scatter(data, var1, var2, clusters=[]):
    if len(clusters) == 0:
        clusters = np.zeros(len(data))
    fig = plt.figure(figsize=(7, 7))
    plt.rcParams.update({'font.size': 15})
    plt.scatter(data[var1], data[var2], c = clusters, s=5, cmap='jet')
    plt.xlabel(var1)
    plt.ylabel(var2)
    plt.title(dataName +
              '\nDimension Reduction: '+ DR +
              '\nCluster: ' + Cluster + ', n=' + str(n_clusters))
    #plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(dir_out + dataName + '_' + DR + '_' + Cluster + '_Scatter.png')
    plt.close()

In [6]:
var1 = 'muscle reading 1 sensor 1'
var2 = 'muscle reading 1 sensor 2'
scatter(X, var1, var2, [])

## 1. Cluster without Dimension Reduction

In [7]:
df_scores = pd.DataFrame(columns = ["Dimension Reduction", "KMeans", "EM"])
df_scores.set_index("Dimension Reduction", inplace = True)
df_scores

Unnamed: 0_level_0,KMeans,EM
Dimension Reduction,Unnamed: 1_level_1,Unnamed: 2_level_1


In [8]:
DR = 'None'

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)

y_kmeans = kmeans.predict(X)
centers = kmeans.cluster_centers_
scatter(X, var1, var2, y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X)

y_em = em.predict(X)
#centers = em.cluster_centers_
scatter(X, var1, var2, y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)

## 2. Cluster with Dimension Reduction


In [50]:
# PCA
DR = 'PCA'
Cluster = 'None'
pca = PCA(n_components=32, random_state = random_state)
X_PCA = pca.fit_transform(X)
X_PCA = pd.DataFrame(X_PCA)
print(pca.explained_variance_ratio_)

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_PCA)
y_kmeans = kmeans.predict(X_PCA)
centers = kmeans.cluster_centers_
scatter(X_PCA, X_PCA.columns[0], X_PCA.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_PCA)
y_em = em.predict(X_PCA)
scatter(X_PCA,  X_PCA.columns[0], X_PCA.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)


[0.06355192 0.06068093 0.05232303 0.04682559 0.0444351  0.04126497
 0.03950002 0.03874431 0.03412771 0.02693589 0.02604264 0.02536223
 0.02469688 0.02411636 0.02279136 0.02208154 0.02161354 0.02055802
 0.01881599 0.01807665 0.01708849 0.01645508 0.01575867 0.01515992
 0.01365084 0.01282818 0.01258667 0.01174399 0.01145099 0.01113386
 0.01071465 0.01007469]


In [51]:
# ICA
DR = 'ICA'
Cluster = 'None'
ica = FastICA(n_components=32, random_state = random_state)
X_ICA = ica.fit_transform(X)
X_ICA = pd.DataFrame(X_ICA)

#print(ica.)


# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_ICA)
y_kmeans = kmeans.predict(X_ICA)
centers = kmeans.cluster_centers_
scatter(X_ICA, X_ICA.columns[0], X_ICA.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_ICA)
y_em = em.predict(X_ICA)
#centers = em.cluster_centers_
scatter(X_ICA,  X_ICA.columns[0], X_ICA.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)


In [52]:
# RP
DR = 'RP'
Cluster = 'None'
rp = GaussianRandomProjection(n_components = 32, random_state = random_state)
X_RP = rp.fit_transform(X)
X_RP = pd.DataFrame(X_RP)

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_RP)
y_kmeans = kmeans.predict(X_RP)
centers = kmeans.cluster_centers_
scatter(X_RP, X_RP.columns[0], X_RP.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_kmeans)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_RP)
y_em = em.predict(X_RP)
#centers = em.cluster_centers_
scatter(X_RP,  X_RP.columns[0], X_RP.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y, y_em)


In [12]:
# TSNE ######################################
DR = 'TSNE'
Cluster = 'None'
tsne = TSNE(n_components = 3, random_state = random_state)

############ TSNE is too slow on large datasize, So we do TSNE on 20% of the data
RANDOM_STATE_DATA = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=RANDOM_STATE_DATA) 
############

X_tsne = tsne.fit_transform(X_train)
X_tsne = pd.DataFrame(X_tsne)

# KMeans
Cluster = 'KMeans'
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_tsne)
y_kmeans = kmeans.predict(X_tsne)
centers = kmeans.cluster_centers_
scatter(X_tsne, X_tsne.columns[0], X_tsne.columns[1], y_kmeans)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y_train, y_kmeans)

# EM
Cluster = 'EM'
em = GaussianMixture(n_components=n_components)
em.fit(X_tsne)
y_em = em.predict(X_tsne)
#centers = em.cluster_centers_
scatter(X_tsne,  X_tsne.columns[0], X_tsne.columns[1],  y_em)
df_scores.loc[DR, Cluster] = adjusted_rand_score(y_train, y_em)


In [53]:
df_scores

Unnamed: 0_level_0,KMeans,EM
Dimension Reduction,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.000802377,0.636161
PCA,0.00108241,0.507594
ICA,0.00902957,0.655531
RP,0.00649537,0.41519
TSNE,0.124712,0.188173


In [14]:
np.shape(X)

(11678, 64)

In [15]:
y

0        0
1        0
2        0
3        0
4        0
        ..
11673    3
11674    3
11675    3
11676    3
11677    3
Name: gestures, Length: 11678, dtype: int64