In [22]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
df=pd.read_csv('./whiskies.csv')
print(df)

    RowID      Distillery  Body  Sweetness  Smoky  Medicinal  Tobacco  Honey  \
0       1       Aberfeldy     2          2      2          0        0      2   
1       2        Aberlour     3          3      1          0        0      4   
2       3          AnCnoc     1          3      2          0        0      2   
3       4          Ardbeg     4          1      4          4        0      0   
4       5         Ardmore     2          2      2          0        0      1   
5       6     ArranIsleOf     2          3      1          1        0      1   
6       7    Auchentoshan     0          2      0          0        0      1   
7       8       Auchroisk     2          3      1          0        0      2   
8       9        Aultmore     2          2      1          0        0      1   
9      10        Balblair     2          3      2          1        0      0   
10     11       Balmenach     4          3      2          0        0      2   
11     12        Belvenie     3         

In [3]:
df_col_sliced=df.loc[:,
             ['Body',
              'Sweetness',
              'Smoky',
              'Medicinal',
              'Tobacco',
              'Honey',
              'Spicy',
              'Winey',
              'Nutty',
              'Malty',
              'Fruity',
              'Floral',
             ]]
print(df_col_sliced)

    Body  Sweetness  Smoky  Medicinal  Tobacco  Honey  Spicy  Winey  Nutty  \
0      2          2      2          0        0      2      1      2      2   
1      3          3      1          0        0      4      3      2      2   
2      1          3      2          0        0      2      0      0      2   
3      4          1      4          4        0      0      2      0      1   
4      2          2      2          0        0      1      1      1      2   
5      2          3      1          1        0      1      1      1      0   
6      0          2      0          0        0      1      1      0      2   
7      2          3      1          0        0      2      1      2      2   
8      2          2      1          0        0      1      0      0      2   
9      2          3      2          1        0      0      2      0      2   
10     4          3      2          0        0      2      1      3      3   
11     3          2      1          0        0      3      2    

In [4]:
df_to_array=df_col_sliced.as_matrix()
print(df_to_array)

[[2 2 2 ..., 2 2 2]
 [3 3 1 ..., 3 3 2]
 [1 3 2 ..., 2 3 2]
 ..., 
 [0 3 1 ..., 2 1 2]
 [2 2 1 ..., 1 0 0]
 [2 3 0 ..., 2 2 1]]


In [5]:
x_scaled=preprocessing.scale(df_to_array)
print(x_scaled)

[[-0.07542547 -0.40765084  0.54172898 ...,  0.31606376  0.25509887
   0.35566418]
 [ 1.00567291  0.99466805 -0.62298833 ...,  1.91497456  1.54559904
   0.35566418]
 [-1.15652385  0.99466805  0.54172898 ...,  0.31606376  1.54559904
   0.35566418]
 ..., 
 [-2.23762223  0.99466805 -0.62298833 ...,  0.31606376 -1.0354013
   0.35566418]
 [-0.07542547 -0.40765084 -0.62298833 ..., -1.28284703 -2.32590146
  -1.99719114]
 [-0.07542547  0.99466805 -1.78770563 ...,  0.31606376  0.25509887
  -0.82076348]]




In [6]:
y_pred = KMeans(n_clusters=3).fit(x_scaled)
print(y_pred.cluster_centers_)

[[ 1.46900079 -1.20897592  2.20561084  2.63786667  1.86551036 -1.29843095
   0.24061677 -0.43708231 -0.21960293 -0.59759955 -0.66668696 -1.49300786]
 [ 0.43509321 -0.01811782  0.12113662 -0.32947549 -0.18942991  0.69458549
   0.39830712  0.71404043  0.38272363  0.40489214  0.07586274 -0.00379983]
 [-0.60340375  0.21197844 -0.46046963 -0.15358021 -0.14509525 -0.37014095
  -0.37263659 -0.52664836 -0.28467047 -0.24169582  0.04501745  0.24622904]]


In [7]:
distortions=[]
for x in range(1,20):
    y_pred_inertia=KMeans(n_clusters=x,init='k-means++').fit(x_scaled)
    distortions.append(y_pred_inertia.inertia_)
    if x==1:
        prev_inertia=y_pred_inertia.inertia_
        #print("num_c= 1 current={:.2f}".format(prev_inertia))
    else:
        print("num_c= {} prev={:.2f} current={:.2f}".format(x,prev_inertia,y_pred_inertia.inertia_))
        diminishing_return=prev_inertia-y_pred_inertia.inertia_
        print("dimin={}".format(diminishing_return))
        prev_inertia=y_pred_inertia.inertia_

num_c= 2 prev=1032.00 current=848.20
dimin=183.80301722674108
num_c= 3 prev=848.20 current=736.61
dimin=111.58243134411464
num_c= 4 prev=736.61 current=673.14
dimin=63.47268404813326
num_c= 5 prev=673.14 current=633.67
dimin=39.47582285454382
num_c= 6 prev=633.67 current=585.18
dimin=48.48437011293515
num_c= 7 prev=585.18 current=555.32
dimin=29.85857380311927
num_c= 8 prev=555.32 current=542.92
dimin=12.40662488722262
num_c= 9 prev=542.92 current=520.26
dimin=22.657034480501352
num_c= 10 prev=520.26 current=488.23
dimin=32.02829862633615
num_c= 11 prev=488.23 current=475.36
dimin=12.871907655491952
num_c= 12 prev=475.36 current=442.95
dimin=32.40678201112246
num_c= 13 prev=442.95 current=438.03
dimin=4.918015565168616
num_c= 14 prev=438.03 current=410.81
dimin=27.222712581359247
num_c= 15 prev=410.81 current=394.38
dimin=16.42821633427019
num_c= 16 prev=394.38 current=393.53
dimin=0.8570787696705793
num_c= 17 prev=393.53 current=371.11
dimin=22.41529163831899
num_c= 18 prev=371.11 cur

In [None]:
plt.plot(range(1,20),distortions,marker='o') 
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [8]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from numpy.random import random_sample
from math import sqrt, log

from sklearn.datasets import load_iris

# famous iris data set
iris = load_iris()
iris_data = pd.DataFrame(iris['data'], columns=iris['feature_names'])
iris_target = iris['target']

In [12]:
print(iris_data)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
5                  5.4               3.9                1.7               0.4
6                  4.6               3.4                1.4               0.3
7                  5.0               3.4                1.5               0.2
8                  4.4               2.9                1.4               0.2
9                  4.9               3.1                1.5               0.1
10                 5.4               3.7                1.5               0.2
11                 4.8               3.4                1.6     

In [13]:
def get_rand_data(col):
    rng = col.max() - col.min()
    return pd.Series(random_sample(len(col))*rng + col.min())


In [27]:
ref=get_rand_data(iris_data)

ValueError: operands could not be broadcast together with shapes (4,) (150,) 

In [15]:
print(ref)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0             7.105345          2.746960           4.304048          1.144057
1             6.793938          3.886503           5.911577          1.722092
2             7.583165          2.738122           1.105403          0.756726
3             6.039297          2.792666           3.345708          0.709844
4             7.211497          3.813749           3.097155          1.799075
5             5.136044          4.213264           6.136564          0.733970
6             5.921667          2.188325           3.250919          0.756957
7             5.673111          4.007073           6.252518          2.002909
8             5.532621          3.202849           5.944684          2.227087
9             6.136528          2.197926           5.762372          1.277872
10            5.963337          2.609697           4.174087          1.630203
11            7.825752          2.727166           1.362037     

In [16]:
ref_1=df_col_sliced.apply(get_rand_data)

In [26]:
print(len(ref_1))

86


In [25]:
scaler=preprocessing.scale(ref_1)
print(scaler)

[[-0.81518206 -0.75646452 -1.50661737 ...,  1.16578256  1.22606206
   0.57355643]
 [ 1.35667581  0.41893839 -0.8647689  ...,  1.16311168 -1.21051764
   0.81575761]
 [ 0.65100912  1.04497924  1.41500518 ...,  0.53990469  1.43172597
  -1.52835159]
 ..., 
 [ 0.73419137 -1.04677406 -0.71358302 ..., -0.38603835 -0.90320581
   1.73926895]
 [ 1.20918901 -1.15689814  0.4823619  ...,  0.24757752  0.84673444
   1.60962648]
 [ 0.00304255  0.67158746 -0.36726049 ...,  1.52840104  1.52809889
  -0.01923261]]


In [21]:
print(test.cluster_centers_)

[[ 1.49576087  2.67190435  1.70537075  1.54521595  0.50575308  1.20639199
   1.45728915  1.24443578  1.85427307  1.31613935  1.81595305  1.52492316]
 [ 1.75096873  2.26357661  2.06739333  1.91160405  0.46199949  2.39361436
   1.40384501  3.32890924  2.25456663  1.50576289  1.36274464  1.77344474]
 [ 2.34158588  2.51307183  1.99204236  3.22748562  0.50771541  2.44337692
   1.42723238  1.20625486  1.74688801  1.51086821  1.51419116  2.89644953]]


In [31]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from numpy.random import random_sample
from math import sqrt, log

from sklearn.datasets import load_iris

# famous iris data set
iris = load_iris()
iris_data = pd.DataFrame(iris['data'], columns=iris['feature_names'])
iris_target = iris['target']


# returns series of random values sampled between min and max values of passed col
def get_rand_data(col):
	rng = col.max() - col.min()
	return pd.Series(random_sample(len(col))*rng + col.min())

def iter_kmeans(df, n_clusters, num_iters=5):
	rng =  range(1, num_iters + 1)
	vals = pd.Series(index=rng)
	for i in rng:
		k = KMeans(n_clusters=n_clusters, n_init=3)
		k.fit(df)
		print ("Ref k: %s" % k.get_params()['n_clusters'])
		vals[i] = k.inertia_
	return vals

def gap_statistic(df, max_k=10):
	gaps = pd.Series(index = range(1, max_k + 1))
	for k in range(1, max_k + 1):
		km_act = KMeans(n_clusters=k, n_init=3)
		km_act.fit(df)

		# get ref dataset
		ref = df.apply(get_rand_data)
		ref_inertia = iter_kmeans(ref, n_clusters=k).mean()

		gap = log(ref_inertia - km_act.inertia_)

		print ("Ref: {}   Act: {}  Gap: {}".format( ref_inertia, km_act.inertia_, gap))
		gaps[k] = gap

	return gaps


In [35]:
test=gap_statistic(df_col_sliced)

Ref k: 1
Ref k: 1
Ref k: 1
Ref k: 1
Ref k: 1
Ref: 1086.9518095930994   Act: 665.8372093023256  Gap: 6.042905006356841
Ref k: 2
Ref k: 2
Ref k: 2
Ref k: 2
Ref k: 2
Ref: 945.4307732132318   Act: 533.4213381555155  Gap: 6.021046249712836
Ref k: 3
Ref k: 3
Ref k: 3
Ref k: 3
Ref k: 3
Ref: 826.8337921706973   Act: 449.73501969728386  Gap: 5.932507149083764
Ref k: 4
Ref k: 4
Ref k: 4
Ref k: 4
Ref k: 4
Ref: 770.8375206779396   Act: 405.0233918128655  Gap: 5.902125359773661
Ref k: 5
Ref k: 5
Ref k: 5
Ref k: 5
Ref k: 5
Ref: 745.2293797246473   Act: 386.6430934656741  Gap: 5.882169317994075
Ref k: 6
Ref k: 6
Ref k: 6
Ref k: 6
Ref k: 6
Ref: 698.4572189447351   Act: 360.715873015873  Gap: 5.8222803537995675
Ref k: 7
Ref k: 7
Ref k: 7
Ref k: 7
Ref k: 7
Ref: 671.4240522216945   Act: 351.33731170789986  Gap: 5.768592023168144
Ref k: 8
Ref k: 8
Ref k: 8
Ref k: 8
Ref k: 8
Ref: 656.6744300326961   Act: 332.54285714285714  Gap: 5.78114952252661
Ref k: 9
Ref k: 9
Ref k: 9
Ref k: 9
Ref k: 9
Ref: 612.7396557

In [39]:
print(test)

1     6.042905
2     6.021046
3     5.932507
4     5.902125
5     5.882169
6     5.822280
7     5.768592
8     5.781150
9     5.685487
10    5.550354
dtype: float64


In [37]:
plt.plot(range(1,10),test[0],marker='o') 
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

KeyError: 0