In [1]:
#Import the libraries
import pandas as pd
import numpy as np
from scipy import spatial
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns





In [2]:
#Load Data
df = pd.read_csv('./kmeans_data/data.csv',header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Load Label
labels_data = pd.read_csv('./kmeans_data/label.csv', header = None)
labels_data

Unnamed: 0,0
0,7
1,2
2,1
3,0
4,4
...,...
9995,2
9996,3
9997,4
9998,5


In [4]:
#Define function for Euclidean, Cosine and Jarcard similarity
def E_F(featureset, centroid):
    distance = spatial.distance.euclidean(featureset,centroid)
    return distance

def C_F(featureset, centroid):
    distance = 1 - spatial.distance.cosine(featureset, centroid)
    return distance

def J_F(featureset, centroid):
    distance = 1 - spatial.distance.jaccard(featureset, centroid)
    return distance

def SSE(k, classes, centroids):
    sse = 0
    for i in range(k):
        for features in classes[i]:
            sse += np.linalg.norm(features - centroids[i])
    return sse

In [5]:
#K-means clustering 
class K_Means:
    def __init__(self, k=2, dist_function = E_F, max_iter=10, stop_sse_incr = False, stop_centroid_no_update = False, tol=0.001):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.dist_function = dist_function
        self.iteration = 0
        self.sse = 0
        self.stop_sse_incr = stop_sse_incr
        self.stop_centroid_no_update = stop_centroid_no_update

    def fit(self,data):

        self.centroids = {}

        for i in range(self.k):
            self.centroids[i] = data[i]

        for i in range(self.max_iter):
            self.classifications = {}
            self.iteration += 1

            for i in range(self.k):
                self.classifications[i] = []

            for featureset in data:
                distances = [self.dist_function(featureset, self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)

            prev_centroids = dict(self.centroids)

            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification],axis=0)

            optimized = True
            
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if np.sum(current_centroid - original_centroid) > self.tol:
                    optimized = False

            prev_sse = self.sse 
            self.sse = SSE(self.k, self.classifications, self.centroids)
            
            if prev_sse != 0 and self.sse > prev_sse and self.stop_sse_incr:
                break
            if optimized and self.stop_centroid_no_update:
                break

    def predict(self,data):
        distances = [self.dist_function(data, self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

In [6]:
X = df.values

# Question 1

In [8]:
#Euclidean similarity SSE and Iterations
euclid_model = K_Means(10, E_F, 80)
euclid_model.fit(X)
print('Euclidean Model SSE:', euclid_model.sse, ' and iteration number: ', euclid_model.iteration)

Euclidean Model SSE: 15633272.774959167  and iteration number:  80


In [9]:
#Cosine similarity SSE and Iterations
cosine_model = K_Means(10, C_F, 80)
cosine_model.fit(X)
print('Cosine Model SSE:', cosine_model.sse, ' and iteration number: ', cosine_model.iteration)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Cosine Model SSE: 17773106.44981459  and iteration number:  80


In [10]:
#Jarcard similarity SSE and Iterations
jaccard_model = K_Means(10, J_F, 80)
jaccard_model.fit(X)
print('Jarcard Model SSE:', jaccard_model.sse, ' and iteration number: ', jaccard_model.iteration)

Jarcard Model SSE: 18375324.140570614  and iteration number:  80


# Question 2

In [12]:
#label each cluster using the majority vote label of the data points in that cluster
#Euclidean
euclid_model = K_Means(10, E_F, 80, True, True)
euclid_model.fit(X)
print('Euclidean Model SSE:', euclid_model.sse, ' and iteration number: ', euclid_model.iteration)

Euclidean Model SSE: 16227220.727338197  and iteration number:  2


In [13]:
#Cosine
cosine_model = K_Means(10, C_F, 80, True, True)
cosine_model.fit(X)
print('Cosine Model SSE:', cosine_model.sse, ' and iteration number: ', cosine_model.iteration)

Cosine Model SSE: 17417922.14993949  and iteration number:  2


In [14]:
#Jarcard
jaccard_model = K_Means(10, J_F, 80, True, True)
jaccard_model.fit(X)
print('Jarcard Model SSE:', jaccard_model.sse, ' and iteration number: ', jaccard_model.iteration)

Jarcard Model SSE: 18375324.140570614  and iteration number:  2


In [15]:
#Calculate Accuracy
from sklearn.metrics import accuracy_score
prediction = []
prediction2 = []
prediction3 = []
for i in X:
    prediction.append(euclid_model.predict(i))
    prediction2.append(cosine_model.predict(i))
    prediction3.append(jaccard_model.predict(i))
print('Euclidean Accuracy : {0:4f}'.format(accuracy_score(labels_data, prediction)))
print('Cosine Accuracy : {0:4f}'.format(accuracy_score(labels_data, prediction2)))
print('Jaccard Accuracy : {0:4f}'.format(accuracy_score(labels_data, prediction3)))


Euclidean Accuracy : 0.086300
Cosine Accuracy : 0.088300
Jaccard Accuracy : 0.098000


# Question 3

In [17]:
#Euclidean
euclid_model = K_Means(10, E_F, 80, True, False)
euclid_model.fit(X)
print('Euclidean Model SSE:', euclid_model.sse, ' and iteration number: ', euclid_model.iteration)

Euclidean Model SSE: 15634107.344752382  and iteration number:  41


In [18]:
#Cosine
cosine_model = K_Means(10, C_F, 80, True, False)
cosine_model.fit(X)
print(str(cosine_model.sse) + ' ' + str(cosine_model.iteration))

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


17417922.14993949 2


In [19]:
#Jarcard
jaccard_model = K_Means(10, J_F, 80, True, False)
jaccard_model.fit(X)
print('Jarcard Model SSE:', jaccard_model.sse, ' and iteration number: ', jaccard_model.iteration)

Jarcard Model SSE: 18375324.140570614  and iteration number:  2


# Question 4

In [21]:
#Euclidean
euclid_model = K_Means(10, E_F, 80, False, True)
euclid_model.fit(X)
print('Euclidean Model SSE:', euclid_model.sse, ' and iteration number: ', euclid_model.iteration)

Euclidean Model SSE: 16227220.727338197  and iteration number:  2


In [22]:
#Cosine
cosine_model = K_Means(10, C_F, 80, False, True)
cosine_model.fit(X)
print(str(cosine_model.sse) + ' ' + str(cosine_model.iteration))

17773106.44981459 80


In [23]:
#Jarcard
jaccard_model = K_Means(10, J_F, 80, False, True)
jaccard_model.fit(X)
print('Jarcard Model SSE:', jaccard_model.sse, ' and iteration number: ', jaccard_model.iteration)

Jarcard Model SSE: 18375324.140570614  and iteration number:  2
