In [None]:
import copy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# Load the unlabeled iris measurements. A forward slash is used as the path
# separator because it is accepted on Windows as well as POSIX systems, whereas
# a literal backslash is only a separator on Windows.
iris = pd.read_csv("data_files/iris_dataset_no_names.csv")

In [None]:
# Preview the first five rows of the loaded frame
iris.head(n=5)

In [None]:
# Scatter-plot matrix over the columns of the frame
g = sns.pairplot(data=iris)

In [None]:
# Scatter plot of petal width against sepal length
sepal_len = iris['sepal_length']
petal_wid = iris['petal_width']
plt.plot(sepal_len, petal_wid, 'o')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Width')
plt.show()

# K-means

The idea is to find $K$ groups of observations (clusters), denoted by $C_k$, which are similar to one another. The mathematical objective is to partition observations into $K$ sets so as to minimize the within-cluster sum of squares:

$$ \underset{C_k,\ \mu_k}{\text{minimize}} \quad \displaystyle \sum_{k=1}^K \sum_{\mathrm{x}_n \in C_k} \| \mathrm{x}_n - \mu_k \|^2 $$

where $\mu_k$ is the mean point of $C_k$, and is referred to as *centroid*.

## Approach: Iterative Refinement (Lloyd's algorithm)

- Step 0: Start with an initial guess of a set of centroids $\mu_k$.
- Step 1: Create clusters containing points closest in distance to each centroid
- Step 2: Update the centroids as the means of all points in each cluster.
- Step 3: Repeat Steps 1 and 2 until the cluster assignments and the centroids no longer change (or a maximum number of iterations is reached).

In [None]:
# Draw one random point inside the data's bounding box: one uniform sample
# per column, between that column's minimum and maximum.
data = np.array(iris)
mins = np.min(data, axis=0)
maxs = np.max(data, axis=0)
random_point = [np.random.uniform(low=lo, high=hi) for lo, hi in zip(mins, maxs)]
print(random_point)

In [None]:
K = 2  # Number of clusters

# Step 0: Initial guess -- K random centroids, each coordinate drawn uniformly
# between the corresponding column's min and max.
mu0 = [
    [np.random.uniform(low=lo, high=hi) for lo, hi in zip(mins, maxs)]
    for _centroid in range(K)
]

print("Initial Guess", mu0)

In [None]:
# Add a 'Cluster' label column, with every row initially set to 0.
iris['Cluster']=0

In [None]:
# Names of the four measurement columns used as clustering features,
# followed by one data point (the row labelled 1) shown as a plain list.
rel_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
sample_row = iris.loc[1, rel_cols]
sample_row.tolist()

In [None]:
# Step 1: Create clusters containing points closest in distance to each centroid
#
# All point-to-centroid Euclidean distances are computed in one vectorized
# operation and the labels are written back with a single column assignment.
# The previous row-by-row `iris.Cluster.loc[index] = ...` was a chained
# assignment: pandas warns about it, and with Copy-on-Write enabled it does
# not update `iris` at all.
points = iris[rel_cols].to_numpy(dtype=float)   # one row per observation
centroids = np.asarray(mu0, dtype=float)        # one row per centroid
distances = np.linalg.norm(
    points[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
)
# argmin picks the first minimum on ties, matching np.argmin in the old loop
iris['Cluster'] = distances.argmin(axis=1)


In [None]:
# Preview the first five rows, now including the cluster labels
iris.head(n=5)

In [None]:
# Pair plot of the feature columns, colored by the current cluster assignment.
# `height` is the current name of the per-facet size keyword; seaborn renamed
# it from `size` in 0.9 and newer releases no longer accept `size`.
sns.pairplot(x_vars=rel_cols, y_vars=rel_cols, data=iris, hue="Cluster", height=5)

In [None]:
# Mean of each feature column over the rows currently labelled as cluster 0
in_cluster_0 = iris['Cluster'] == 0
[iris.loc[in_cluster_0, col].mean() for col in rel_cols]

In [None]:
# Step 2: Update the centroids as the means of all points in each cluster.
mu1 = []
for cluster_id in range(K):
    members = iris['Cluster'] == cluster_id
    column_means = [iris.loc[members, col].mean() for col in rel_cols]
    mu1.append(np.array(column_means))

print("Updated Guess:", mu1)

In [None]:
# Put it all together: alternate Step 1 (assign points) and Step 2 (update
# centroids) until the centroids stop moving or the iteration cap is reached.

# A cluster with no members has a NaN mean. Keep such a centroid where it was;
# otherwise `diff` becomes NaN, `diff > tolerance` evaluates to False, and the
# loop would stop as if it had converged.
mu1 = [np.where(np.isnan(new), old, new) for new, old in zip(mu1, mu0)]

diff = sum(np.linalg.norm(mu1[k] - mu0[k]) for k in range(K))
n = 2       # iteration counter; presumably 2 because one pass was already run by hand
nmax = 100  # upper bound on the iteration counter

while diff > .00001 and n < nmax:
    print("Iteration:", n)
    n += 1

    mu0 = mu1

    # Step 1: label every row with the index of its nearest centroid.
    # A single column assignment replaces the chained
    # `iris.Cluster.loc[index] = ...`, which pandas warns about and which
    # has no effect under Copy-on-Write.
    points = iris[rel_cols].to_numpy(dtype=float)
    centroids = np.asarray(mu0, dtype=float)
    distances = np.linalg.norm(
        points[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
    )
    iris['Cluster'] = distances.argmin(axis=1)

    # `height` is the current name of seaborn's former `size` keyword
    sns.pairplot(x_vars=rel_cols, y_vars=rel_cols, data=iris, hue="Cluster", height=5)

    # Step 2: per-cluster column means; empty clusters keep their old centroid
    means = [
        np.array([iris.loc[iris['Cluster'] == k, col].mean() for col in rel_cols])
        for k in range(K)
    ]
    mu1 = [np.where(np.isnan(new), old, new) for new, old in zip(means, mu0)]

    print("Updated Guess", mu1)
    # Total distance the centroids moved in this iteration
    diff = sum(np.linalg.norm(mu1[k] - mu0[k]) for k in range(K))
    print("diff=", diff)
    


In [None]:
# Report the final centroids and the cluster label of every observation
print("Cluster Centers:", mu1)
cluster_labels = iris['Cluster'].tolist()
print(cluster_labels)

In [None]:
#Using other libraries
from sklearn.cluster import KMeans

# Determining number of clusters: fit K-means for each candidate cluster count
# and record the model's inertia.
# - random_state pins the centroid initialisation so the curve is reproducible.
# - n_init is set explicitly instead of relying on a version-dependent default.
# - The loop variable is not called `n`, so it no longer overwrites the
#   iteration counter used by the hand-written K-means loop.
nClusters = range(2, 10)
sumDistances = []
for n_candidate in nClusters:
    kmeans = KMeans(n_clusters=n_candidate, n_init=10, random_state=0).fit(iris[rel_cols])
    sumDistances.append(kmeans.inertia_)  # Proxy for SSE

In [None]:
# Elbow plot: recorded inertia for each candidate number of clusters
cluster_counts = nClusters
inertias = sumDistances
plt.plot(cluster_counts, inertias, '-')
plt.xlabel('nClusters')
plt.ylabel('Sum Of Distances')
plt.show()

In [None]:
# Fit the final 3-cluster model on the four measurement columns only.
# By this point `iris` also carries the 'Cluster' column written by the
# hand-written K-means; fitting on the whole frame would feed those stale
# labels to KMeans as an extra feature.
# random_state / n_init are fixed so the labels are reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(iris[rel_cols])
iris['Cluster'] = kmeans.labels_
g = sns.pairplot(iris, hue='Cluster')

In [None]:
# Load the version of the dataset that includes the species column.
# NOTE(review): the unlabeled file is read from a `data_files` directory while
# this one is read from the working directory -- confirm the intended location.
names_csv = "iris_dataset.csv"
irisNames = pd.read_csv(names_csv)
irisNames.head(n=5)

In [None]:
# Pair plot of the labeled data, colored by species
g = sns.pairplot(data=irisNames, hue='species')

In [None]:
# Contingency table of assigned cluster (rows) against species (columns).
# NOTE(review): assumes both files list the observations in the same order,
# since the two columns are paired by row index -- confirm.
pd.crosstab(index=iris['Cluster'], columns=irisNames['species'])