In [15]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, MeanShift
from sklearn.metrics import silhouette_score

from IPython.display import Markdown, display

In [16]:
df = pd.read_excel('adel.xlsx', sheet_name=1, index_col=0)

In [17]:
tsne = TSNE(n_components=6, method='exact')

In [18]:
reduced = tsne.fit_transform(df)

## DBSCAN

In [19]:
eps = 0.5

while True:
    labels = DBSCAN(eps=eps).fit_predict(reduced)
    if sum(labels == -1) <= 60:
        break
    
    eps += 0.1

print('eps =', eps)
print('labels =', np.unique(labels))
print('number of individual clusters =', sum(labels == -1))
print('silhouette score =', silhouette_score(reduced, labels))

table = 'Cluster|# Points\n---:|:---\n'
for i in range(len(np.unique(labels))):
    table += str(i-1) + '|'  # To account for -1
    table += str(sum(labels == i-1))
    table += '\n'

display(Markdown(table))

eps = 8.599999999999985
labels = [-1  0  1  2  3  4  5  6]
number of individual clusters = 40
silhouette score = -0.010803931


Cluster|# Points
---:|:---
-1|40
0|12
1|152
2|294
3|52
4|22
5|41
6|5


A Silhouette score of 0 indicates a worthless pursuit.

## Mean Shift

In [20]:
ms = MeanShift()
preds = ms.fit_predict(reduced)
print(np.unique(preds))

[0 1 2 3 4 5 6 7 8]


In [21]:
table = 'Cluster|# Points\n---:|:---\n'
for i in range(len(np.unique(preds))):
    table += str(i) + '|'
    table += str(sum(preds == i))
    table += '\n'

display(Markdown(table))

Cluster|# Points
---:|:---
0|609
1|2
2|1
3|1
4|1
5|1
6|1
7|1
8|1


## PCA and then DBSCAN

In [22]:
from sklearn.decomposition import PCA

In [23]:
pca = PCA(n_components=6)

In [24]:
reduced = pca.fit_transform(df)

In [26]:
eps = 0.5

while True:
    labels = DBSCAN(eps=eps).fit_predict(reduced)
    if sum(labels == -1) <= 60:
        break
    
    eps += 0.1

print('eps =', eps)
print('labels =', np.unique(labels))
print('number of individual clusters =', sum(labels == -1))
print('silhouette score =', silhouette_score(reduced, labels))

table = 'Cluster|# Points\n---:|:---\n'
for i in range(len(np.unique(labels))):
    table += str(i-1) + '|'  # To account for -1
    table += str(sum(labels == i-1))
    table += '\n'

display(Markdown(table))

eps = 139.49999999999636
labels = [-1  0  1  2  3]
number of individual clusters = 56
silhouette score = 0.594985427555076


Cluster|# Points
---:|:---
-1|56
0|533
1|9
2|15
3|5
