<H1>Import libraries</H1>

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score


<H2>Generate data set for clustering</H2>

In [None]:
# Build a synthetic data set made of `number_sets_to_generate` blobs and keep
# both the raw arrays (X, y) and a tidy DataFrame view of them.
number_sets_to_generate = 4

X, y = make_blobs(
    centers=number_sets_to_generate,
    cluster_std=0.5,
    random_state=0,
)

# Only the first two feature columns are used; `set_number` is the blob each row came from.
generated_2_sets = pd.DataFrame(
    {
        'x_axis': X[:, 0],
        'y_axis': X[:, 1],
        'set_number': y,
    }
)
display(generated_2_sets.head(10))

# Shared palette for the scatter plots in the following cells.
cmap4 = colors.ListedColormap(
    [
        "darkorange", "gold", "lawngreen", "lightseagreen", "red",
        "blue", "magenta", "black", "pink", "brown",
    ]
)
plt.scatter(
    generated_2_sets.x_axis,
    generated_2_sets.y_axis,
    c=generated_2_sets.set_number,
    cmap=cmap4,
)
plt.title("Generated data set for clustering")
plt.show()



<H2>split on train and test set</H2>

In [None]:
# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
# Features and labels are both passed as DataFrames (note the double brackets).
X_train, X_test, y_train, y_test = train_test_split(
    generated_2_sets[['x_axis', 'y_axis']],
    generated_2_sets[['set_number']],
    test_size=0.33,
    random_state=0,
)

<H2>run kmeans algorithm</H2>

In [None]:
# Fit k-means on the training features using the true number of generated sets.
# `fit` returns the estimator itself, so construction and fitting can be chained.
kmeans = KMeans(
    n_clusters=number_sets_to_generate,
    n_init='auto',
    random_state=0,
).fit(X_train)

# Trailing expression so the cell still displays the fitted estimator.
kmeans

<H2>Plot the results</H2>

In [None]:
# Plot the training points coloured by their k-means label, together with the
# fitted centroids, and report the silhouette score in the title.

# Centroids get labels that continue after the cluster labels
# (0 .. number_sets_to_generate-1), so they take their own colours from cmap4.
clusters_num = range(number_sets_to_generate, 2 * number_sets_to_generate)

clusters = pd.DataFrame({
    'x_axis': X_train['x_axis'],
    'y_axis': X_train['y_axis'],
    'set_number': kmeans.labels_,
})
center_sets = pd.DataFrame({
    'x_axis': kmeans.cluster_centers_[:, 0],
    'y_axis': kmeans.cluster_centers_[:, 1],
    'set_number': clusters_num,
})

df_res = pd.concat([clusters, center_sets])

score = silhouette_score(clusters[['x_axis', 'y_axis']], kmeans.labels_)

plt.scatter(df_res['x_axis'], df_res['y_axis'], c=df_res['set_number'], cmap=cmap4)
# The cluster count comes from the variable the model was built with, so the
# title stays correct if number_sets_to_generate is changed.
plt.title(
    f"Clustering output when we assume {number_sets_to_generate} clusters "
    f"Silhouette score = {score!s}"
)
plt.show()

<H2>For 3 clusters</H2>

In [None]:
# Re-run k-means assuming 3 clusters and plot the points plus centroids.
number_clusters = 3
kmeans_3 = KMeans(n_clusters=number_clusters, n_init='auto', random_state=0).fit(X_train)

# Centroid labels start at number_sets_to_generate, above the cluster labels 0..2.
clusters_num = range(number_sets_to_generate, number_sets_to_generate + number_clusters)

# Training points tagged with their assigned cluster.
clusters = pd.DataFrame(
    {
        'x_axis': X_train.x_axis,
        'y_axis': X_train.y_axis,
        'set_number': kmeans_3.labels_,
    }
)
# One row per centroid, tagged with its own label.
center_sets = pd.DataFrame(
    {
        'x_axis': kmeans_3.cluster_centers_[:, 0],
        'y_axis': kmeans_3.cluster_centers_[:, 1],
        'set_number': clusters_num,
    }
)

df_res = pd.concat([clusters, center_sets])

plt.scatter(df_res.x_axis, df_res.y_axis, c=df_res.set_number, cmap=cmap4)
plt.title("Clustering output when we assume 3 clusters")
plt.show()


<H2>For 5 clusters</H2>

In [None]:
# Re-run k-means assuming 5 clusters and plot the points plus centroids.
number_clusters = 5
kmeans_5 = KMeans(n_clusters=number_clusters, random_state=0, n_init='auto')
kmeans_5.fit(X_train)

# Cluster labels run 0 .. number_clusters-1, so centroid labels must start at
# number_clusters. Starting at number_sets_to_generate (4) made the first
# centroid share label 4 with a real cluster.
clusters_num = range(number_clusters, 2 * number_clusters)

clusters = pd.DataFrame({
    'x_axis': X_train['x_axis'],
    'y_axis': X_train['y_axis'],
    'set_number': kmeans_5.labels_,
})
center_sets = pd.DataFrame({
    'x_axis': kmeans_5.cluster_centers_[:, 0],
    'y_axis': kmeans_5.cluster_centers_[:, 1],
    'set_number': clusters_num,
})

# Kept so the combined frame is still available under its usual name.
df_res = pd.concat([clusters, center_sets])

# Points are coloured by cluster; centroids are drawn as a separate layer with
# their own marker so they cannot be confused with any cluster's points.
plt.scatter(clusters['x_axis'], clusters['y_axis'], c=clusters['set_number'], cmap=cmap4)
plt.scatter(
    center_sets['x_axis'], center_sets['y_axis'],
    c='black', marker='X', s=150, edgecolors='white',
)
plt.title(f"Clustering output when we assume {number_clusters} clusters")
plt.show()

<H2>For 6 clusters</H2>

In [None]:
# Re-run k-means assuming 6 clusters and plot the points plus centroids.
number_clusters = 6
kmeans_6 = KMeans(n_clusters=number_clusters, random_state=0, n_init='auto')
kmeans_6.fit(X_train)

# Cluster labels run 0 .. number_clusters-1, so centroid labels must start at
# number_clusters. Starting at number_sets_to_generate (4) made two centroids
# share labels 4 and 5 with real clusters.
clusters_num = range(number_clusters, 2 * number_clusters)

clusters = pd.DataFrame({
    'x_axis': X_train['x_axis'],
    'y_axis': X_train['y_axis'],
    'set_number': kmeans_6.labels_,
})
center_sets = pd.DataFrame({
    'x_axis': kmeans_6.cluster_centers_[:, 0],
    'y_axis': kmeans_6.cluster_centers_[:, 1],
    'set_number': clusters_num,
})

# Kept so the combined frame is still available under its usual name.
df_res = pd.concat([clusters, center_sets])

# Points are coloured by cluster; centroids are drawn as a separate layer with
# their own marker so they cannot be confused with any cluster's points.
plt.scatter(clusters['x_axis'], clusters['y_axis'], c=clusters['set_number'], cmap=cmap4)
plt.scatter(
    center_sets['x_axis'], center_sets['y_axis'],
    c='black', marker='X', s=150, edgecolors='white',
)
# The previous title ("Three normally-distributed clusters") was a copy-paste
# leftover; this matches the titles of the 3- and 5-cluster plots.
plt.title(f"Clustering output when we assume {number_clusters} clusters")
plt.show()

<H2>How to choose the proper number of clusters?</H2>

<H2>Calinski Harabasz score</H2>

In [None]:
# Calinski-Harabasz score of each fitted model on the training features,
# listed in the order k = 3, 4, 5, 6 (`kmeans` is the 4-cluster model).
ch_index_3, ch_index, ch_index_5, ch_index_6 = (
    calinski_harabasz_score(X_train, fitted.labels_)
    for fitted in (kmeans_3, kmeans, kmeans_5, kmeans_6)
)

calinski_score = pd.DataFrame(
    {
        'clusters_number': [3, 4, 5, 6],
        'calinski_metrix': [ch_index_3, ch_index, ch_index_5, ch_index_6],
    }
)

display(calinski_score)

# Markers for the individual scores, then a line joining them.
plt.scatter(
    calinski_score.clusters_number,
    calinski_score.calinski_metrix,
    marker='^', ls='--', c='b', lw=2,
)
plt.plot(calinski_score.clusters_number, calinski_score.calinski_metrix)
plt.title("Calinski Harabasz score")
plt.show()



<H2>Let's normalize variables</H2>

In [None]:
# Normalize each variable (column) to zero mean and unit variance.
#
# preprocessing.normalize() was used here before, but by default it rescales
# every *row* to unit length, which projects 2-D points onto the unit circle
# instead of normalizing the variables. StandardScaler works per column.
# The scaler is fitted on the training data only and then reused for the test
# data, so both sets share the same transformation.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

kmeans_norm = KMeans(n_clusters = 4, random_state = 0, n_init='auto')
kmeans_norm.fit(X_train_norm)


In [None]:
# Plot the normalized training points coloured by their k-means label,
# together with the centroids of the model fitted on the normalized data.

# Centroid labels continue right after the cluster labels (0 .. n_clusters-1),
# derived from the model instead of a hard-coded list.
clusters_num = range(kmeans_norm.n_clusters, 2 * kmeans_norm.n_clusters)

clusters = pd.DataFrame({
    'x_axis': X_train_norm[:, 0],
    'y_axis': X_train_norm[:, 1],
    'set_number': kmeans_norm.labels_,
})
center_sets = pd.DataFrame({
    'x_axis': kmeans_norm.cluster_centers_[:, 0],
    'y_axis': kmeans_norm.cluster_centers_[:, 1],
    'set_number': clusters_num,
})

df_res = pd.concat([clusters, center_sets])

plt.scatter(df_res['x_axis'], df_res['y_axis'], c=df_res['set_number'], cmap=cmap4)
# The previous title ("Three normally-distributed clusters") was a copy-paste
# leftover and did not describe this plot.
plt.title(f"Clustering output on normalized data ({kmeans_norm.n_clusters} clusters)")
plt.show()





In [None]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score
import matplotlib.pyplot as plt

In [None]:
# Load the iris data set and take the first four feature columns.
# NOTE: this rebinds X, which earlier held the make_blobs samples.
iris = load_iris()
X = iris.data[:, 0:4]

In [None]:
# Fit k-means on the iris features for k = 2..6 and compare the
# Calinski-Harabasz score of each fit.
#
# One loop replaces five copy-pasted stanzas. n_init='auto' is passed
# explicitly to match the other KMeans calls in this notebook.
iris_models = {}
iris_ch_scores = {}
for number_clusters in range(2, 7):
    model = KMeans(n_clusters=number_clusters, random_state=30, n_init='auto')
    model.fit(X)
    iris_models[number_clusters] = model
    iris_ch_scores[number_clusters] = calinski_harabasz_score(X, model.labels_)

# Keep the per-k names so other cells (e.g. the one reading kmeans_ir_3) still work.
kmeans_ir_2, kmeans_ir_3, kmeans_ir_4, kmeans_ir_5, kmeans_ir_6 = (
    iris_models[k] for k in range(2, 7)
)
ch_index_ir_2, ch_index_ir_3, ch_index_ir_4, ch_index_ir_5, ch_index_ir_6 = (
    iris_ch_scores[k] for k in range(2, 7)
)

calinski_score_ir = pd.DataFrame({
    'clusters_number': list(iris_ch_scores.keys()),
    'calinski_metrix': list(iris_ch_scores.values()),
})

display( calinski_score_ir )

# Markers for the individual scores, then a line joining them.
plt.scatter( calinski_score_ir['clusters_number'], calinski_score_ir['calinski_metrix'], marker='^', ls='--', c='b', lw=2 )
plt.plot(calinski_score_ir['clusters_number'], calinski_score_ir['calinski_metrix'])
plt.title("Calinski Harabasz score")
plt.show()



In [None]:
# Put the iris features next to the 3-cluster k-means label and the true
# species index, then count rows per (cluster_label, target) pair to see how
# the clusters line up with the species.
df_iris = pd.DataFrame({
    'sepal_length': X[:, 0],
    'sepal_width': X[:, 1],
    'petal_length': X[:, 2],
    'petal_width': X[:, 3],  # was misspelled 'etal_width'
    'cluster_label': kmeans_ir_3.labels_,
    'target': iris.target,
})

display( df_iris )

# 'count' on sepal_length is used as a row counter for each group.
grouped = df_iris.groupby(['cluster_label', 'target']).agg({ 'sepal_length': 'count', })

grouped

In [None]:
#iris

<h1>How k-means works</h1>

In [None]:
# Six sample points for the hand-worked k-means walk-through; every point
# carries set_number 0, so they are all drawn in a single colour.
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665],
        'set_number': [0] * 6,
    }
)

# NOTE: rebinds cmap4 with a different palette than the one used earlier.
cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Input set")
plt.show()


# Reference rows kept from the original cell (x_axis, y_axis, set_number) --
# presumably pasted from the generated data set; the first row lost its index.
#    3.136885 	1.565928 	1
#1 	1.481533 	0.678754 	1
#2 	0.349872 	4.692533 	0
#3 	0.532377 	3.313389 	0
#4 	2.649282 	1.056135 	1
#5 	1.209101 	3.535665 	0
#6 	0.999149 	4.210195 	0
#7 	3.016739 	1.637921 	1
#8 	0.782607 	4.152636 	0
#9 	1.653563 	0.552889 	1

In [None]:
# Step 1: the six sample points (set_number 0) plus two initial centroid
# markers appended as the last two rows (set_number 3 and 4).
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 2, 0.5],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 1, 4],
        'set_number': [0, 0, 0, 0, 0, 0, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("First step, centroids")
plt.show()

In [None]:
# Step 2: the second centroid marker (last row) now sits at the coordinates
# of the fourth sample point (0.532377, 3.313389).
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 2, 0.532377],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 1, 3.313389],
        'set_number': [0, 0, 0, 0, 0, 0, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Second step")
plt.show()

In [None]:
# Step 3: the first centroid marker (second-to-last row) now sits at the
# coordinates of the second sample point (1.481533, 0.678754).
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 1.481533, 0.532377],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 0.678754, 3.313389],
        'set_number': [0, 0, 0, 0, 0, 0, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Third step")
plt.show()

In [None]:
# Step 4: the fourth and sixth sample points are labelled 1, and the second
# centroid marker (last row) is placed at the mean of those two points.
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 1.481533, 0.8707389],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 0.678754, 3.424527],
        'set_number': [0, 0, 0, 1, 0, 1, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Fourth step")
plt.show()

In [None]:
# Step 5: the third sample point also gets label 1, and the second centroid
# marker (last row) moves to the mean of the three points labelled 1.
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 1.481533, 0.69711666],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 0.678754, 3.8471956],
        'set_number': [0, 0, 1, 1, 0, 1, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Fifth step")
plt.show()

In [None]:
# Step 6: the second and fifth sample points are labelled 2, and the first
# centroid marker (second-to-last row) moves to the mean of those two points.
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 2.0654075, 0.69711666],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 0.8674445, 3.8471956],
        'set_number': [0, 2, 1, 1, 2, 1, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Sixth step")
plt.show()

In [None]:
# Final step: every sample point is labelled 1 or 2, and the first centroid
# marker (second-to-last row) sits at the mean of the three points labelled 2.
# The last two rows (set_number 3 and 4) are the centroid markers.
df_dample = pd.DataFrame(
    {
        'x_axis': [3.136885, 1.481533, 0.349872, 0.532377, 2.649282, 1.209101, 2.4225666, 0.69711666],
        'y_axis': [1.565928, 0.678754, 4.692533, 3.313389, 1.056135, 3.535665, 1.10027233, 3.8471956],
        'set_number': [2, 2, 1, 1, 2, 1, 3, 4],
    }
)

cmap4 = colors.ListedColormap(
    ["grey", "blue", "red", "green", "black", "magenta", "pink", "brown"]
)
plt.scatter(df_dample.x_axis, df_dample.y_axis, c=df_dample.set_number, cmap=cmap4)
plt.title("Final step")
plt.show()

In [None]:


# Cross-check the hand-worked example with scikit-learn's KMeans.
#
# In df_dample the rows with set_number 3 and 4 are the hand-placed centroid
# markers, not observations. They are excluded so that the fit and the
# silhouette score are computed on the six sample points only.
sample_points = df_dample.loc[~df_dample['set_number'].isin([3, 4]), ['x_axis', 'y_axis']]

# NOTE: this rebinds `kmeans` and `score`, which earlier cells used for the
# 4-cluster model; re-run those cells before reusing their results.
# n_init='auto' matches the other KMeans calls in this notebook.
kmeans = KMeans(n_clusters=2, random_state=42, n_init='auto')
out = kmeans.fit_predict(sample_points)

score = silhouette_score(sample_points, out)

clusters = pd.DataFrame({
    'x_axis': sample_points['x_axis'],
    'y_axis': sample_points['y_axis'],
    'set_number': kmeans.labels_,
})
# The fitted centroids get labels 2 and 3, above the cluster labels 0 and 1.
center_sets = pd.DataFrame({
    'x_axis': kmeans.cluster_centers_[:, 0],
    'y_axis': kmeans.cluster_centers_[:, 1],
    'set_number': [2, 3],
})

df_res = pd.concat([clusters, center_sets])

plt.scatter(df_res['x_axis'], df_res['y_axis'], c=df_res['set_number'], cmap=cmap4)
plt.title("Silhouette score = " + str(score))
plt.show()



In [None]:
# Hand check: mean x_axis of the three sample points labelled 2 in the final
# step; it agrees (to the digits shown there) with that step's centroid x of 2.4225666.
(1.481533+2.649282+3.136885)/3

In [None]:
# Hand check: mean y_axis of the same three sample points; it agrees (to the
# digits shown there) with the final step's centroid y of 1.10027233.
(0.678754+1.056135+1.565928)/3