In [1]:
import numpy as np
import pandas as pd
#from sklearn.cluster import KMeans
import scipy 
import sklearn
from collections import Counter
from sklearn.metrics import multilabel_confusion_matrix
from scipy import spatial

In [2]:
dataset = pd.read_csv('data.csv')
labels = pd.read_csv('label.csv',names=['label'],header=None)

In [3]:
dataset.count()

0        9999
0.1      9999
0.2      9999
0.3      9999
0.4      9999
         ... 
0.663    9999
0.664    9999
0.665    9999
0.666    9999
0.667    9999
Length: 784, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
train_values, test_values = train_test_split( dataset, test_size=0.08, random_state=50)
train_label_val, test_label_val = train_test_split( labels, test_size=0.08, random_state=50)

In [5]:
def calculate_SSE(centroid_value_dict, centroid_dict,data):
    sse = 0
    for i in centroid_dict:
        sse_cluster = 0
        for j in centroid_dict[i]:
            dp = list(data.iloc[int(j)])
            for a,b in zip(centroid_value_dict[i],dp):
                sse_cluster += (a-b)**2
        sse+=sse_cluster
    return sse   
    
def Initialize_Centroids(data,K):
    p = data.shape[0]
    centroid_value_dict={}
    for i in range(K):
        r = np.random.randint(0, p-1)
        centroid_value_dict[i] = data.iloc[r]
    return centroid_value_dict

def jaccard_similarity(centroid, dp):
    top = len(list(set(centroid).intersection(dp)))
    bottom = (len(set(centroid)) + len(set(dp))) - top
    return float(top) / bottom

def train_Kmeans(data,K,max_iter=20,mode=1,tol=10):
    centroid_value_dict = Initialize_Centroids(data,K)
    count = 0
    centroid_dict = {}
    flag = False
    while((count<max_iter) and not flag):
            
        for i in list(centroid_value_dict.keys()):
            centroid_dict[i]=[]
        for i in range(data.shape[0]):
            x = data.iloc[i]
            if mode==1 :
                distance_measure = [np.linalg.norm(x-centroid_value_dict[j])  for j in centroid_value_dict]
                idx = np.argmin(distance_measure)
                centroid_dict[idx].append(i)
            elif mode==2 :
                distance_measure = [jaccard_similarity(list(x),centroid_value_dict[j]) for j in centroid_value_dict]
                idx = np.argmax(distance_measure)
                centroid_dict[idx].append(i)
            elif mode==3 :
                distance_measure = [1-scipy.spatial.distance.cosine(x,list(centroid_value_dict[j]))  for j in centroid_value_dict]
                idx = np.argmax(distance_measure)
                centroid_dict[idx].append(i)
                
            prev_centroids=dict(centroid_value_dict)
        for i in centroid_dict:
            if len(centroid_dict[i]):
                dps_centroid = centroid_dict[i]
                centroid_value_dict[i] = np.average(data.iloc[dps_centroid],axis=0)
        current_tol=-1
        for i in centroid_value_dict:
            prev_centroid_point = prev_centroids[i]
            new_centroid_point = centroid_value_dict[i]
            change = np.sum(np.absolute(new_centroid_point-prev_centroid_point))
            current_tol = max(change, current_tol)
                
        print("Iteration ",count,": ",current_tol)
            
        count+=1
        if (current_tol<10):
            flag = True
            break
    return centroid_value_dict,centroid_dict


In [6]:
def predict_cluster_labels(C, S, labels):
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Counter(labels_of_points)
        try:
            cluster_labels[c] = max(counter, key=counter.get)
        except:
            cluster_labels[c] = np.random.randint(0,9)
    return cluster_labels

In [7]:
def accuracy(centroids, centroid_Labels, test_data, true_labels, mode=1):
    y_true = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if mode==1:
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            classification = distances.index(min(distances))
            y_pred.append(centroid_Labels[classification])
        elif mode==2:
            similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification]) 
        elif mode==3:
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
    denominator = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_true[i] == y_pred[i]:
            correctly_classified += 1
    accuracy = correctly_classified/denominator
    return accuracy

In [8]:
centroids1,clusters1 = train_Kmeans(dataset,10, max_iter=100,mode=1)

Iteration  0 :  25624.32177263969
Iteration  1 :  6119.102931870913
Iteration  2 :  5338.716647614565
Iteration  3 :  3496.4544506585753
Iteration  4 :  2094.7911908646
Iteration  5 :  1569.5301794453508
Iteration  6 :  1167.220228384992
Iteration  7 :  1420.5918003000609
Iteration  8 :  1198.0205572743366
Iteration  9 :  971.532362479424
Iteration  10 :  1047.037965260546
Iteration  11 :  1007.4094803548795
Iteration  12 :  861.5118947615188
Iteration  13 :  827.3789802507811
Iteration  14 :  983.4956716374859


In [9]:
Euclidean_SSE =calculate_SSE(centroids1,clusters1,dataset)

In [10]:
print("Euclidean SSE:",Euclidean_SSE)

Euclidean SSE: 25434834684.725998


In [11]:
cluster_labels_euc = predict_cluster_labels(centroids1,clusters1,labels)
cluster_labels_euc

array([8, 6, 5, 0, 1, 7, 3, 9, 2, 0])

In [12]:
Accuracy_Euclidean = accuracy(centroids1, cluster_labels_euc,test_values,test_label_val)
Accuracy_Euclidean

0.09875

In [13]:
centroids2,clusters2 =train_Kmeans(dataset,10, max_iter=100,mode=2)

Iteration  0 :  37217.776639344265
Iteration  1 :  9612.821133966794
Iteration  2 :  3276.5830141153647
Iteration  3 :  2355.113684198609
Iteration  4 :  2837.229764956018
Iteration  5 :  2553.6423672288493
Iteration  6 :  820.6426359053817
Iteration  7 :  1942.9338571839382
Iteration  8 :  1119.9329459585301
Iteration  9 :  1246.5237283728375
Iteration  10 :  1646.1386026867992
Iteration  11 :  116.34187960928277
Iteration  12 :  0.0


In [14]:
Jaccard_SSE =calculate_SSE(centroids2,clusters2,dataset)

In [15]:
print("Jacard SSE:",Jaccard_SSE)

Jacard SSE: 34361687572.938736


In [16]:
cluster_labels_jac = predict_cluster_labels(centroids2,clusters2,labels)
cluster_labels_jac

array([1, 6, 5, 1, 2, 1, 7, 8, 8, 0])

In [17]:
Accuracy_Jaccard = accuracy(centroids2, cluster_labels_jac,test_values,test_label_val,mode=2)
Accuracy_Jaccard

0.1025

In [18]:
centroids3,clusters3 =train_Kmeans(dataset,10, max_iter = 100,mode=3)

Iteration  0 :  27198.213917525776
Iteration  1 :  6398.648635498203
Iteration  2 :  3557.2468412234307
Iteration  3 :  2470.6077521838856
Iteration  4 :  2089.1670165310943
Iteration  5 :  2392.819234819235
Iteration  6 :  2275.8581003627733
Iteration  7 :  1937.2847105493174
Iteration  8 :  1399.0095963185058
Iteration  9 :  1001.597255531487
Iteration  10 :  780.8525273202531
Iteration  11 :  782.6767824709202
Iteration  12 :  910.6318325553382
Iteration  13 :  689.1045614401096
Iteration  14 :  584.052697821099
Iteration  15 :  549.1289843639472
Iteration  16 :  665.7922189280317
Iteration  17 :  653.3415893950885
Iteration  18 :  597.3564354692385
Iteration  19 :  759.9313321559746
Iteration  20 :  898.3902089532753
Iteration  21 :  1232.8237647790536
Iteration  22 :  1328.1480376240393
Iteration  23 :  1311.4781847526244
Iteration  24 :  1133.3659827653707
Iteration  25 :  673.77949533985
Iteration  26 :  656.3510791366907
Iteration  27 :  757.0128450727791
Iteration  28 :  671.4

In [19]:
Cosine_SSE = calculate_SSE(centroids3,clusters3,dataset)

In [20]:
cluster_labels_cos = predict_cluster_labels(centroids3,clusters3,labels)
cluster_labels_cos

array([0, 9, 1, 2, 7, 3, 0, 2, 8, 3])

In [21]:
Accuracy_Cosine = accuracy(centroids3, cluster_labels_cos,test_values,test_label_val,mode=3)

In [22]:
print("Euclidean accuracy:",Accuracy_Euclidean)
print("Jacard accuracy:",Accuracy_Jaccard)
print("Cosine accuracy :",Accuracy_Cosine)

Euclidean accuracy: 0.09875
Jacard accuracy: 0.1025
Cosine accuracy : 0.1


In [23]:
print("Euclidean SSE:",Euclidean_SSE)
print("Jacard SSE:",Jaccard_SSE)
print("Cosine SSE :",Cosine_SSE)

Euclidean SSE: 25434834684.725998
Jacard SSE: 34361687572.938736
Cosine SSE : 25604633179.236507
