In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from numpy.random import random_sample
from math import sqrt, log

In [2]:
df=pd.read_csv('./whiskies.csv')
#print(df)
df_col_sliced=df.loc[:,
             ['Body',
              'Sweetness',
              'Smoky',
              'Medicinal',
              'Tobacco',
              'Honey',
              'Spicy',
              'Winey',
              'Nutty',
              'Malty',
              'Fruity',
              'Floral',
             ]]
#print(df_col_sliced)
df_to_array=df_col_sliced.as_matrix()
#print(df_to_array)

In [3]:
x_scaled=preprocessing.scale(df_to_array)
print(x_scaled)

[[-0.07542547 -0.40765084  0.54172898 ...,  0.31606376  0.25509887
   0.35566418]
 [ 1.00567291  0.99466805 -0.62298833 ...,  1.91497456  1.54559904
   0.35566418]
 [-1.15652385  0.99466805  0.54172898 ...,  0.31606376  1.54559904
   0.35566418]
 ..., 
 [-2.23762223  0.99466805 -0.62298833 ...,  0.31606376 -1.0354013
   0.35566418]
 [-0.07542547 -0.40765084 -0.62298833 ..., -1.28284703 -2.32590146
  -1.99719114]
 [-0.07542547  0.99466805 -1.78770563 ...,  0.31606376  0.25509887
  -0.82076348]]




In [4]:
def gap_stat_calculator(data, max_k=10, ref_data_numbers=5):
    shape=data.shape
    #print(shape)
    
    min_max_diff=data.max()-data.min()
    #print(min_max_diff)
    
    gaps=np.zeros(max_k)
    sk=np.zeros(max_k)
    for k in range(1,max_k+1):
        
        y_pred_actual=KMeans(n_clusters=k).fit(data)
        y_pred_actual_inertia=y_pred_actual.inertia_
        #print("actual inertia={} at n_clusters={}".format(y_pred_actual_inertia,k))
        
        ref_inertia=np.zeros(ref_data_numbers)
        #print(ref_inertia)
        
        for rk in range(1,ref_data_numbers+1):
            x_ref=random_sample((shape[0],shape[1]))*min_max_diff+data.min()
            #print(x_ref)
            #print(x_ref.shape)
            
            y_pred_ref=KMeans(n_clusters=k).fit(x_ref)
            ref_inertia[rk-1]=log(y_pred_ref.inertia_)
            #print("ref inertia={} at n_clusters={} at {}th loop".format(y_pred_ref.inertia_,k,rk))
            
        ref_inertia_mean=np.mean(ref_inertia)
        #print(ref_inertia)
        print("mean ref inertia={}".format(ref_inertia_mean))
        sd=np.sqrt(sum((ref_inertia-ref_inertia_mean)**2)/ref_data_numbers)
        print(sd)
        sk[k-1]=sd
        gaps[k-1]=ref_inertia_mean-log(y_pred_actual_inertia)
    sk=sk*np.sqrt(1+(1/ref_data_numbers))
    print(sk)
    print(gaps)
    return gaps, sk

    
testing=gap_stat_calculator(x_scaled)

mean ref inertia=8.154682152518031
0.0220193506588
mean ref inertia=8.040909477002774
0.0365667332946
mean ref inertia=7.953448012792272
0.0324961080403
mean ref inertia=7.890860475229131
0.0204106388263
mean ref inertia=7.830493643200654
0.0169772638982
mean ref inertia=7.793065663161293
0.0334033402809
mean ref inertia=7.728022072882473
0.0301455202549
mean ref inertia=7.718972186946796
0.0109992589567
mean ref inertia=7.68396796986994
0.0304193301345
mean ref inertia=7.602894875297362
0.0124477404229
[ 0.02412099  0.04005685  0.0355977   0.02235873  0.01859766  0.03659153
  0.03302276  0.01204908  0.03332271  0.01363582]
[ 1.21542821  1.29779658  1.35193902  1.36880047  1.37632262  1.41090189
  1.39136233  1.44801022  1.43814047  1.37911573]


In [5]:
for x in range(0,10):
    gapk=testing[0][x]
    #print("k={}".format(gapk))
    if x<9:
        gapk1=testing[0][x+1]
        #print("k+1={}".format(gapk1))
        gapkm=gapk1-testing[1][x+1]
        #print("gapkm={}".format(gapkm))
        print(gapk-gapkm,x)

-0.0423115213441 0
-0.0185447405205 1
0.00549728972616 2
0.0110755028648 3
0.00201225935534 4
0.05256232154 5
-0.044598800463 6
0.0431924494622 7
0.0726605644786 8


In [7]:
distortions=[]
for x in range(1,20):
    y_pred_inertia=KMeans(n_clusters=x,init='k-means++').fit(x_scaled)
    distortions.append(y_pred_inertia.inertia_)
    if x==1:
        prev_inertia=y_pred_inertia.inertia_
        #print("num_c= 1 current={:.2f}".format(prev_inertia))
    else:
        print("num_c= {} prev={:.2f} current={:.2f}".format(x,prev_inertia,y_pred_inertia.inertia_))
        diminishing_return=prev_inertia-y_pred_inertia.inertia_
        print("dimin={}".format(diminishing_return))
        prev_inertia=y_pred_inertia.inertia_

num_c= 2 prev=1032.00 current=848.20
dimin=183.80301722674108
num_c= 3 prev=848.20 current=736.61
dimin=111.58243134411464
num_c= 4 prev=736.61 current=673.14
dimin=63.47268404813326
num_c= 5 prev=673.14 current=633.67
dimin=39.47582285454382
num_c= 6 prev=633.67 current=585.18
dimin=48.48437011293515
num_c= 7 prev=585.18 current=555.32
dimin=29.85857380311927
num_c= 8 prev=555.32 current=542.92
dimin=12.40662488722262
num_c= 9 prev=542.92 current=520.26
dimin=22.657034480501352
num_c= 10 prev=520.26 current=488.23
dimin=32.02829862633615
num_c= 11 prev=488.23 current=475.36
dimin=12.871907655491952
num_c= 12 prev=475.36 current=442.95
dimin=32.40678201112246
num_c= 13 prev=442.95 current=438.03
dimin=4.918015565168616
num_c= 14 prev=438.03 current=410.81
dimin=27.222712581359247
num_c= 15 prev=410.81 current=394.38
dimin=16.42821633427019
num_c= 16 prev=394.38 current=393.53
dimin=0.8570787696705793
num_c= 17 prev=393.53 current=371.11
dimin=22.41529163831899
num_c= 18 prev=371.11 cur

In [None]:
plt.plot(range(1,20),distortions,marker='o') 
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [6]:
y_pred=KMeans(n_clusters=3).fit(x_scaled)

In [7]:
print(y_pred.cluster_centers_)

[[ 0.90739124 -1.04506852  1.60056289  2.21563617  1.05523818 -0.93096057
   0.09078782 -0.46509205  0.09834071 -0.5560694  -0.8007649  -1.35550333]
 [ 0.46512372  0.04604057  0.0621395  -0.37595879 -0.17923531  0.75237801
   0.33756492  0.7545676   0.43872743  0.45714413  0.14123121  0.00965604]
 [-0.62915878  0.24220426 -0.48094963 -0.28266827 -0.13447852 -0.37415332
  -0.30428959 -0.50095819 -0.39020684 -0.22990578  0.0977208   0.35566418]]


In [51]:
score=[0,1,2]

def cluster_selector(characters,cluster_centers):
    print(characters)
    centers_selector=cluster_centers[0:,characters]
    #print(centers_selector[0])
    max_buffer=np.zeros(len(characters))
  
    for i,centers in enumerate(centers_selector):
        print(centers)
        inner_sum=sum(centers)
        print(inner_sum)
        max_buffer[i]=inner_sum
        
        
    print(max_buffer)
    print(max_buffer.max())
    cluster_index=np.where(max_buffer==max_buffer.max())[0][0]+1
    print(cluster_index)
    
    
    
cluster_selector(score,y_pred.cluster_centers_)

[0, 1, 2]
[ 0.90739124 -1.04506852  1.60056289]
1.46288561247
[ 0.46512372  0.04604057  0.0621395 ]
0.573303786992
[-0.62915878  0.24220426 -0.48094963]
-0.867904158411
[ 1.46288561  0.57330379 -0.86790416]
1.46288561247
1
