In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import phate

import seaborn as sns
from ggplot import *
import matplotlib.pyplot as plt
%matplotlib inline

# 0.load data

In [None]:
name = ["ASD","HP","MI","NORM","DCM"]

# Load every group's feature matrix and sample names, cap each group at 4000
# randomly chosen rows, tag the rows with their group, and stack all groups
# into `data_sum`.
frames = []
for index, i in enumerate(name):
    data_dir = "..." + i + "_data.txt"
    data = pd.read_csv(data_dir, header=None)

    txt_dir = "..." + i + "_name.txt"
    txt = pd.read_csv(txt_dir, header=None)

    print(i)
    print(data.shape)

    # Name the columns for EVERY group, not only for the subsampled ones.
    # Otherwise a group with <= 4000 rows keeps integer column labels, has no
    # "name" column to read from `txt`, and does not line up with the other
    # groups when they are concatenated.
    feat_cols = ['pixel_' + str(col) for col in range(data.shape[1])]
    data.columns = feat_cols
    txt.columns = ["name"]  # the name file is expected to hold a single column

    if data.shape[0] > 4000:
        # Keep the same 4000 random row positions in the features and the names.
        rndperm = np.random.permutation(data.shape[0])
        keep = rndperm[:4000]
        data = data.iloc[keep].reset_index(drop=True)
        txt = txt.iloc[keep].reset_index(drop=True)

    data["name"] = txt["name"].tolist()
    data["label"] = index + 1   # 1-based numeric group id
    data["color"] = i           # group name, used later to colour the plots

    frames.append(data)

# Concatenate once at the end instead of growing the frame inside the loop.
data_sum = pd.concat(frames, ignore_index=True)

# 1.PCA analysis

In [None]:
# Fit a 10-component PCA on the feature columns of a copy of `data_sum` and
# store the first four component scores as new columns of `df`.
pca = PCA(n_components=10)
df = data_sum.copy()
pca_result = pca.fit_transform(df[feat_cols].values)

for component_idx, column_name in enumerate(['pca-one', 'pca-two', 'pca-three', 'pca-four']):
    df[column_name] = pca_result[:, component_idx]

variance_ratio = pca.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(variance_ratio))

In [None]:
# Scatter plot of the first two principal components, one colour per group
# (the 'color' column holds the group name).

# Rows are shuffled before plotting -- presumably so that no single group is
# systematically drawn on top of the others.
rndperm = np.random.permutation(df.shape[0])
chart = ggplot( df.loc[rndperm,:], aes(x='pca-one', y='pca-two', color='color') ) \
        + geom_point(size=5,alpha=1) \
        + ggtitle("First and Second Principal Components colored by group")
chart

In [None]:
# 2-D kernel-density contour plot of PC1 vs PC2 for one selected group.

# `number` indexes into `name` to choose the group that is plotted here.
# NOTE(review): this list is ordered differently from the one used when the
# data is loaded and contains "PFO", which is not loaded there -- selecting
# it would give an empty subset. Confirm which list is intended.
name = ["ASD","DCM","HP","MI","NORM","PFO"]
number = 1

# Filter once and reuse the subset for both axes.
selected_group = df[df["color"] == name[number]]

f, ax = plt.subplots(figsize=(10, 10))
ax = sns.kdeplot(
    selected_group["pca-one"].values,
    selected_group["pca-two"].values,
    cmap="rainbow",
    n_levels=18,
    shade_lowest=False,
)
ax.set_xlabel("PC1", fontsize=25)
ax.set_ylabel("PC2", fontsize=25)
# Fixed axis limits.
ax.set_xlim(-2, 12)
ax.set_ylim(-7, 19)

# 2.k-means clustering

In [None]:
from copy import deepcopy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Plot defaults for the figures below: 9x9-inch canvas, then the ggplot style sheet.
plt.rcParams.update({'figure.figsize': (9, 9)})
plt.style.use('ggplot')

In [None]:
# Rows of the currently selected group (`name[number]`), re-indexed from 0.
# Filtering the frame directly keeps every column's dtype; rebuilding it from
# `.values` would turn all columns of this mixed-type frame into `object`.
data = df[df["color"] == name[number]].reset_index(drop=True)

# First two principal components of the group, and a scatter plot of them.
f1 = data['pca-one'].values
f2 = data['pca-two'].values
# (n_points, 2) array: one row per observation, columns = (PC1, PC2).
X = np.column_stack((f1, f2))
plt.scatter(f1, f2, c='black', s=7)

In [None]:
# Euclidean distance calculator
def dist(a, b, ax=1):
    """Return the norm of ``a - b`` taken along axis ``ax``.

    ``ax`` is forwarded unchanged as the ``axis`` argument of
    ``np.linalg.norm``: the default ``1`` yields one distance per row of a
    2-D difference, while ``None`` reduces the whole difference to a single
    scalar.
    """
    difference = a - b
    return np.linalg.norm(difference, axis=ax)

# Number of clusters
k = 2
# Initial centroids: k distinct observations drawn at random from X.
# Drawing integer coordinates from [0, max(X) - 10) instead raises as soon as
# max(X) <= 10, never produces negative coordinates, and applies one range to
# both axes. Sampled data points always lie inside the data, so each cluster
# normally starts with at least one member.
init_rows = np.random.choice(len(X), size=k, replace=False)
C = X[init_rows].astype(np.float32)
# X / Y coordinates of the initial centroids (copies, so that later in-place
# updates of C do not change them).
C_x = C[:, 0].copy()
C_y = C[:, 1].copy()
print(C)



In [None]:
# k-means iterations on X, starting from the centroids in C.
# Safety cap: stop after this many passes even if the centroids still move.
MAX_ITER = 300

# To store the value of centroids when it updates
C_old = np.zeros(C.shape)
# Cluster labels (0 .. k-1), one per row of X
clusters = np.zeros(len(X), dtype=int)
# Error func. - distance between new centroids and old centroids
error = dist(C, C_old, None)
for _ in range(MAX_ITER):
    # Assign every point to its closest centroid in one vectorised step:
    # the (n, 1, 2) - (1, k, 2) difference gives an (n, k) distance matrix.
    point_to_centroid = dist(X[:, np.newaxis, :], C[np.newaxis, :, :], 2)
    clusters = np.argmin(point_to_centroid, axis=1)
    # Storing the old centroid values
    C_old = C.copy()
    # Move each centroid to the mean of its members
    for i in range(k):
        members = X[clusters == i]
        # An empty cluster keeps its previous centroid. Taking the mean of
        # no points would give NaN, and a NaN error can never compare equal
        # to zero, so the iteration would never terminate.
        if len(members) > 0:
            C[i] = members.mean(axis=0)
    error = dist(C, C_old, None)
    # Converged: no centroid moved during this pass
    if error == 0:
        break

colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for i in range(k):
    points = X[clusters == i]
    # Skip empty clusters; cycle the palette if k exceeds its length
    if len(points) > 0:
        ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i % len(colors)])
# Final centroids
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='#050505')

# 3.PHATE

In [None]:
# Inputs for PHATE: the feature matrix and the numeric group label of each row.
# Columns are selected by name rather than by hard-coded position, so the cell
# does not silently pick the wrong columns when the number of features changes
# (and stays consistent with the PCA cell, which also uses `feat_cols`).
tree_data = data_sum[feat_cols].values
tree_clusters = data_sum["label"].values
print(tree_data.shape, tree_clusters.shape)
tree_clusters

In [None]:
# Fit PHATE (configured with k=15, t=100) on the feature matrix and plot the
# first two embedding coordinates, coloured by the numeric group label.
phate_settings = dict(k=15, t=100)
phate_operator = phate.PHATE(**phate_settings)
tree_phate = phate_operator.fit_transform(tree_data)

plt.scatter(x=tree_phate[:, 0], y=tree_phate[:, 1], c=tree_clusters)
plt.show()

In [None]:
# Put the 2-D PHATE embedding into a frame together with each row's numeric
# label and group name, then draw a scatter plot coloured by group.
tree_phate2 = pd.DataFrame(data = tree_phate, columns=["tree_phate_1","tree_phate_2"])
tree_phate2["label"] = tree_clusters
# Select the group name by column name instead of a hard-coded position.
tree_phate2["color"] = data_sum["color"].values

sns.set(style="ticks")
f, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): `markers` and `sizes` normally only take effect together with
# `style=` / `size=`; confirm whether `marker` / `s` was intended here.
ax = sns.scatterplot(x="tree_phate_1", y='tree_phate_2', hue='color', palette="husl", markers="o", sizes=5,
                     data=tree_phate2, edgecolor="w", linewidth=0, ax=ax)