In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm

df_iris = pd.read_csv("IRIS.csv")

In [2]:
iris_points  = df_iris.drop("species", axis=1).values

In [3]:
model = KMeans(n_clusters=3, n_init= 1000, n_jobs= 6).fit_predict(iris_points)

In [4]:
df_iris['kmeans'] = model
df_iris['kmeans'] = df_iris['kmeans']+1
df_iris['ID'] = range(1, 1+len(df_iris))

In [5]:
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,kmeans,ID
0,5.1,3.5,1.4,0.2,Iris-setosa,2,1
1,4.9,3.0,1.4,0.2,Iris-setosa,2,2
2,4.7,3.2,1.3,0.2,Iris-setosa,2,3
3,4.6,3.1,1.5,0.2,Iris-setosa,2,4
4,5.0,3.6,1.4,0.2,Iris-setosa,2,5


In [6]:
df_inf_norm_0 = pd.read_csv("iris_Assignment_INF_NORM.csv")
df_inf_norm_1 = pd.read_csv("iris_Centroid_INF_NORM.csv")

In [7]:
df_inf_norm_0 = df_inf_norm_0.drop("Unnamed: 0", axis=1)

In [8]:
df_last_assignment =  df_inf_norm_0[df_inf_norm_0['iteration'] == 16]

In [9]:
df_merge = df_iris.merge(df_last_assignment,left_on='ID',right_on='P_i').drop(['iteration','P_i'],axis =1)
df_merge = df_merge.rename(index=str, columns={"C_j": "inf_norm"})
df_merge.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,kmeans,ID,inf_norm
0,5.1,3.5,1.4,0.2,Iris-setosa,2,1,1
1,4.9,3.0,1.4,0.2,Iris-setosa,2,2,1
2,4.7,3.2,1.3,0.2,Iris-setosa,2,3,1
3,4.6,3.1,1.5,0.2,Iris-setosa,2,4,1
4,5.0,3.6,1.4,0.2,Iris-setosa,2,5,1


In [10]:
df_1_norm_0 = pd.read_csv("iris_Assignment_1_NORM.csv")
df_1_norm_1 = pd.read_csv("iris_Centroid_1_NORM.csv")

In [11]:
df_1_norm_0 = df_1_norm_0.drop("Unnamed: 0", axis=1)

In [12]:
df_1_last_assignment =  df_1_norm_0[df_1_norm_0['iteration'] == 11]

In [13]:
df_merge = df_merge.merge(df_last_assignment,left_on='ID',right_on='P_i').drop(['iteration','P_i'],axis =1)
df_merge = df_merge.rename(index=str, columns={"C_j": "1_norm"})
df_merge

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,kmeans,ID,inf_norm,1_norm
0,5.1,3.5,1.4,0.2,Iris-setosa,2,1,1,1
1,4.9,3.0,1.4,0.2,Iris-setosa,2,2,1,1
2,4.7,3.2,1.3,0.2,Iris-setosa,2,3,1,1
3,4.6,3.1,1.5,0.2,Iris-setosa,2,4,1,1
4,5.0,3.6,1.4,0.2,Iris-setosa,2,5,1,1
5,5.4,3.9,1.7,0.4,Iris-setosa,2,6,1,1
6,4.6,3.4,1.4,0.3,Iris-setosa,2,7,1,1
7,5.0,3.4,1.5,0.2,Iris-setosa,2,8,1,1
8,4.4,2.9,1.4,0.2,Iris-setosa,2,9,1,1
9,4.9,3.1,1.5,0.1,Iris-setosa,2,10,1,1


In [14]:
df_slice = df_merge[['ID','species', 'kmeans', 'inf_norm','1_norm']]

## Analyzing Clusters
### Proper Classification
Since all centroids are ambiguously labeled, I am going to be looking at which assignment are to where. 

In [15]:
T2 = pd.crosstab(df_slice.kmeans,[df_slice.species]).round(2)
T2

species,Iris-setosa,Iris-versicolor,Iris-virginica
kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2,36
2,50,0,0
3,0,48,14


In [16]:
T2 = pd.crosstab(df_slice.inf_norm,[df_slice.species],)
T2

species,Iris-setosa,Iris-versicolor,Iris-virginica
inf_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50,0,0
2,0,42,3
3,0,8,47


In [75]:
T2 = pd.crosstab([df_slice.species],df_slice['1_norm'])
T2

1_norm,1,2,3
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,50,0,0
Iris-versicolor,0,42,8
Iris-virginica,0,3,47


In [76]:
T2 = pd.crosstab([df_slice.kmeans],df_slice['inf_norm'])
T2

inf_norm,1,2,3
kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,38
2,50,0,0
3,0,45,17


In [77]:
T2 = pd.crosstab([df_slice.kmeans],df_slice['1_norm'])
T2

1_norm,1,2,3
kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,38
2,50,0,0
3,0,45,17


Iris-Setosa from Kmeans is all pointing to the same set of 50 as the INF Norm and 1-Norm case. In the case of Iris-Versicolor and Iris-Virginica they pointing to different subclusters.

### Looking at Each Cluster

In [78]:
temp = df_slice.loc[(df_slice['kmeans'] == 2) | (df_slice['inf_norm'] == 1)]
temp.count()

ID          50
species     50
kmeans      50
inf_norm    50
1_norm      50
dtype: int64

In [79]:
temp = df_slice.loc[(df_slice['kmeans'] == 2) | (df_slice['1_norm'] == 1)]
temp.count()

ID          50
species     50
kmeans      50
inf_norm    50
1_norm      50
dtype: int64

it appears that for Iris-Setosa it is fairly consistent and homogenous between all 3 techniques.

In [80]:
not_iris_setosa =  df_slice.loc[(df_slice['species'] != 'Iris-setosa')]
not_iris_setosa.head()

Unnamed: 0,ID,species,kmeans,inf_norm,1_norm
50,51,Iris-versicolor,3,3,3
51,52,Iris-versicolor,3,2,2
52,53,Iris-versicolor,1,3,3
53,54,Iris-versicolor,3,2,2
54,55,Iris-versicolor,3,2,2


In [81]:
iris_3 =df_slice.loc[(df_slice['kmeans'] == 3)]
iris_3.head()

Unnamed: 0,ID,species,kmeans,inf_norm,1_norm
50,51,Iris-versicolor,3,3,3
51,52,Iris-versicolor,3,2,2
53,54,Iris-versicolor,3,2,2
54,55,Iris-versicolor,3,2,2
55,56,Iris-versicolor,3,2,2


I am going to assume from above that where KMEANS thinks it is cluster 3 that the associate cluster in the bilinear case is 2 thus any points with respects to KMEAns assignnment 3 that is is assigned to 3 is misclassified

In [95]:
iris_3 = iris_3.loc[(iris_3['inf_norm'] == 3)|(iris_3['1_norm'] == 3)]

In [96]:
df_merge[df_merge['ID'].isin(iris_3['ID'])]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,kmeans,ID,inf_norm,1_norm
50,7.0,3.2,4.7,1.4,Iris-versicolor,3,51,3,3
58,6.6,2.9,4.6,1.3,Iris-versicolor,3,59,3,3
72,6.3,2.5,4.9,1.5,Iris-versicolor,3,73,3,3
76,6.8,2.8,4.8,1.4,Iris-versicolor,3,77,3,3
83,6.0,2.7,5.1,1.6,Iris-versicolor,3,84,3,3
86,6.7,3.1,4.7,1.5,Iris-versicolor,3,87,3,3
101,5.8,2.7,5.1,1.9,Iris-virginica,3,102,3,3
114,5.8,2.8,5.1,2.4,Iris-virginica,3,115,3,3
119,6.0,2.2,5.0,1.5,Iris-virginica,3,120,3,3
123,6.3,2.7,4.9,1.8,Iris-virginica,3,124,3,3
