In [1]:
from sklearn.cluster import DBSCAN, SpectralClustering, MeanShift,KMeans
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score



import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import pickle

from collections import Counter

plt.style.use("seaborn")
%matplotlib inline
pd.set_option("display.max_columns", 101)

In [2]:
df = pickle.load(open('model_df.p','rb'))

In [3]:
df.dropna(inplace=True)

In [4]:
df.shape

(18537, 24)

In [5]:
X = Normalizer()
X = X.fit_transform(df)

In [20]:
db = DBSCAN(eps=.30, min_samples=5).fit(X)

Counter(db.fit_predict(X))

Counter({0: 18496, 1: 7, -1: 29, 2: 5})

In [18]:
db = DBSCAN(eps=5, min_samples=5).fit(X)

Counter(db.fit_predict(X))


Counter({0: 18537})

In [8]:

km = KMeans(n_clusters=6)
km.fit(X)
pred =km.predict(X)
print(Counter(pred))


Counter({2: 4958, 0: 4625, 1: 3768, 5: 2465, 3: 2355, 4: 366})


In [25]:
scores = []
cluster_count = []
for i in range(2,29,1):
    km = KMeans(n_clusters=i)
    km.fit(X)
    pred =km.predict(X)
    scores.append(silhouette_score(X, pred))
    cluster_count.append(i)
    print(Counter(pred))
    print(silhouette_score(X, pred))
    print('------')





Counter({0: 9941, 1: 8596})
0.371993871202
------
Counter({2: 7637, 0: 7446, 1: 3454})
0.310384471388
------
Counter({2: 7056, 1: 6061, 0: 3339, 3: 2081})
0.325062795068
------
Counter({0: 5060, 2: 5042, 1: 4080, 4: 2343, 3: 2012})
0.288621692642
------
Counter({3: 4967, 0: 4632, 2: 3755, 1: 2462, 5: 2355, 4: 366})
0.301907526664
------
Counter({2: 4248, 3: 3889, 6: 3437, 0: 2380, 5: 2226, 1: 2005, 4: 352})
0.263307330647
------
Counter({7: 3817, 2: 3404, 0: 2972, 6: 2284, 1: 2126, 5: 2028, 3: 1582, 4: 324})
0.268940251216
------
Counter({2: 3807, 1: 3277, 0: 2840, 4: 2471, 8: 2182, 6: 2138, 5: 1359, 3: 318, 7: 145})
0.267703223277
------
Counter({7: 3442, 1: 3225, 0: 2858, 6: 2009, 2: 1962, 3: 1859, 5: 1754, 9: 974, 8: 309, 4: 145})
0.26788872001
------
Counter({7: 3419, 3: 3227, 9: 2851, 2: 2002, 5: 1953, 8: 1860, 1: 1763, 4: 961, 6: 309, 0: 142, 10: 50})
0.271274022418
------
Counter({6: 2730, 8: 2639, 0: 2555, 7: 2455, 3: 1861, 2: 1840, 1: 1671, 4: 1340, 5: 945, 9: 309, 11: 142, 10

In [None]:
sns.lineplot(cluster_count,scores)

In [None]:
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(X)
    Sum_of_squared_distances.append(km.inertia_)


plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [23]:
scores = []
EP =[]
number =[]
eps= [.001,.005,.01,.05]
size= [5,10,15,20,25,30,35]
for ep in eps:
    for siz in size:
        db = DBSCAN(eps=ep, min_samples=siz)   
        pred = db.fit_predict(X)
        
        print('ep:'+ str(ep) + ' size:'+ str(siz))
        print(Counter(pred))
        try:
            scores.append(silhouette_score(X, pred))
        except:
            scores.append(np.nan)
        EP.append(ep)
        number.append(siz)



ep:0.001 size:5
Counter({-1: 18537})
ep:0.001 size:10
Counter({-1: 18537})
ep:0.001 size:15
Counter({-1: 18537})
ep:0.001 size:20
Counter({-1: 18537})
ep:0.001 size:25
Counter({-1: 18537})
ep:0.001 size:30
Counter({-1: 18537})
ep:0.001 size:35
Counter({-1: 18537})
ep:0.005 size:5
Counter({-1: 18537})
ep:0.005 size:10
Counter({-1: 18537})
ep:0.005 size:15
Counter({-1: 18537})
ep:0.005 size:20
Counter({-1: 18537})
ep:0.005 size:25
Counter({-1: 18537})
ep:0.005 size:30
Counter({-1: 18537})
ep:0.005 size:35
Counter({-1: 18537})
ep:0.01 size:5
Counter({-1: 18454, 7: 17, 3: 13, 8: 11, 1: 9, 2: 7, 9: 6, 6: 5, 5: 5, 0: 5, 4: 5})
ep:0.01 size:10
Counter({-1: 18537})
ep:0.01 size:15
Counter({-1: 18537})
ep:0.01 size:20
Counter({-1: 18537})
ep:0.01 size:25
Counter({-1: 18537})
ep:0.01 size:30
Counter({-1: 18537})
ep:0.01 size:35
Counter({-1: 18537})
ep:0.05 size:5
Counter({0: 16057, -1: 2325, 2: 78, 7: 17, 10: 17, 4: 11, 5: 6, 1: 5, 3: 5, 11: 5, 6: 4, 9: 4, 8: 3})
ep:0.05 size:10
Counter({0: 1563

In [None]:
sns.lineplot(EP,scores)

In [None]:
sns.lineplot(number,scores)

In [None]:
df['preds'] = pred

In [None]:
scores[2]

In [None]:
sns.pairplot(df)

In [32]:
from sklearn.cluster import AgglomerativeClustering


links=[]
score=[]
clusters=[]
for linkage in ('ward', 'average', 'complete'):
    for number in range(3,12,1):
        clustering = AgglomerativeClustering(linkage=linkage, n_clusters=number)
        pred = clustering.fit_predict(X)
        try:
            print('linkage:' + str(linkage) +' clusters:' +str(number))
            print(silhouette_score(X,pred))
            print(Counter(pred))
            
        except:
            pass
        links.append(linkage)
        score.append(silhouette_score(X,pred))
        clusters.append(number)



linkage:ward clusters:3
0.271786906222
Counter({0: 9285, 1: 6702, 2: 2550})
linkage:ward clusters:4
0.284560913325
Counter({0: 7569, 3: 6702, 2: 2550, 1: 1716})
linkage:ward clusters:5
0.23098170093
Counter({1: 6702, 4: 4014, 3: 3555, 2: 2550, 0: 1716})
linkage:ward clusters:6
0.239494143021
Counter({0: 6702, 4: 4014, 3: 3555, 2: 2550, 5: 1388, 1: 328})
linkage:ward clusters:7
0.240853832294
Counter({4: 4014, 2: 3644, 3: 3555, 6: 3058, 0: 2550, 5: 1388, 1: 328})
linkage:ward clusters:8
0.225024708219
Counter({4: 4014, 2: 3644, 1: 3555, 6: 3058, 3: 1625, 5: 1388, 7: 925, 0: 328})
linkage:ward clusters:9
0.227460737986
Counter({1: 4014, 2: 3644, 0: 3555, 6: 3058, 3: 1625, 5: 1388, 7: 925, 4: 185, 8: 143})
linkage:ward clusters:10
0.206878087902
Counter({1: 4014, 0: 3644, 2: 3058, 4: 2772, 3: 1625, 5: 1388, 7: 925, 6: 783, 9: 185, 8: 143})
linkage:ward clusters:11
0.175626604562
Counter({1: 4014, 0: 3058, 4: 2772, 6: 2128, 3: 1625, 10: 1516, 5: 1388, 7: 925, 2: 783, 9: 185, 8: 143})
linka