# Problem 1: Kmeans


## Libraries

In [87]:
import pandas as pd

import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import random

import math
from scipy import spatial
from sklearn.metrics import jaccard_score

from sklearn.metrics import accuracy_score

## Defining the class objects (Fresh Copy)

In [104]:
# Writing K means 

# Ref : https://github.com/scikit-learn/scikit-learn/tree/main/sklearn/cluster



class K_Means:

    def __init__(self, dist_func, k =10, max_iterations = 100, sse_cond = False, centroid_cond = False, tolerance = 0.0001):
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations
        self.dist_func = dist_func
        self.sse_cond = sse_cond
        self.centroid_cond = centroid_cond
    
        
    def calc_sse(self, data):
        sse = 0
        for i in range(self.k):
            for features in self.classes[i]:
                sse += np.linalg.norm(features - self.centroids[i])  
        return sse
        
    
    def fit(self, data):

        self.centroids = {}
        sse = 0
        iteration = 0

        for i in range(self.k):
            self.centroids[i] = data[i]

        for j in range(self.max_iterations):
            iteration += 1
            self.classes = {}
            for i in range(self.k):
                self.classes[i] = []

            for features in data:
                distances = [self.dist_func(features, self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classes[classification].append(features)

            previous = dict(self.centroids)

            for classification in self.classes:
                self.centroids[classification] = np.average(self.classes[classification], axis = 0)
    
            isOptimal = True

            for centroid in self.centroids:
                original_centroid = previous[centroid]
                curr = self.centroids[centroid]
                if np.sum(curr - original_centroid) > self.tolerance:
                    isOptimal = False

            sse_prev = sse
            #####
            sse = calc_sse2(data, self.k, self.classes, self.centroids)
            
#             print(sse_prev,sse)
            if sse_prev != 0 and sse > sse_prev and self.sse_cond:
                break
            
            if isOptimal and self.centroid_cond:
                break
            
        return iteration, sse
                
    
    def pred(self, data):
        distances = [self.dist_func(data, self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification
    
# Fuctions for calculating the attributes and similarity
    
def calc_sse2(data, k, classes, centroids):
    sse = 0
    for i in range(k):
        for features in classes[i]:
            sse += np.linalg.norm(features - centroids[i]) 

    return sse

def Euclidean_distance(feat_one, feat_two):

    squared_distance = 0

    for i in range(len(feat_one)):
        squared_distance += (feat_one[i] - feat_two[i])**2

    dist = np.sqrt(squared_distance)

    return dist

def Cosine_sim(feat_one, feat_two):

    result = 1 - spatial.distance.cosine(feat_one, feat_two)

    return result

def Jaccard_scr(list1, list2):
    smax = 0
    smin = 0
    for i in range(list1.size):
        smin += min(list1[i], list2[i])
    for i in range(list1.size):
        smax += max(list1[i], list2[i])
    return 1 - smin/smax




### Testing the Functions 

In [89]:
df = pd.read_csv('data.csv',header=None)
print(df.head())
X = df.values

   0    1    2    3    4    5    6    7    8    9    ...  774  775  776  777  \
0    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
1    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
2    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
3    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
4    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   

   778  779  780  781  782  783  
0    0    0    0    0    0    0  
1    0    0    0    0    0    0  
2    0    0    0    0    0    0  
3    0    0    0    0    0    0  
4    0    0    0    0    0    0  

[5 rows x 784 columns]


In [90]:
## puting K =10 and max iter by default is 500


km = K_Means(Euclidean_distance,k=10,max_iterations=60)
sse, it = km.fit(X)

print(sse, it)

60 15633152.831533125


In [91]:
Cosine_model = K_Means(Cosine_sim, 10,60)
sse,it = Cosine_model.fit(X)

print(sse, it)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


60 18375324.14057061


In [92]:
Jaccard_model = K_Means(Jaccard_scr, 10,60)
sse, it = Jaccard_model.fit(X)

print(sse, it)

60 15739269.25110784


## Q2

In [105]:

km = K_Means(Euclidean_distance, 10,60,True, True)
sse,it = km.fit(X)

print(sse, it)

2 16227220.727338199


In [106]:
Cosine_model = K_Means(Cosine_sim, 10,60,True, True)
sse,it = Cosine_model.fit(X)

print(sse, it)

2 17417922.14993949


In [107]:
Jaccard_model = K_Means(Jaccard_scr, 10,60,True, True)
sse,it = Jaccard_model.fit(X)

print(sse, it)

12 15734270.096031422


In [96]:
Jaccard_model.centroids

{0: array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.000

## Calculating the Accuracy


In [63]:
df_labels = pd.read_csv("label.csv", header = None)

In [108]:

pred1 = []
pred2 = []
pred3 = []


for i in X:
    pred1.append(km.pred(i))
    pred2.append(Cosine_model.pred(i))
    pred3.append(Jaccard_model.pred(i))
    
    
print('Euclidean Accuracy : {0:4f}'.format(accuracy_score(df_labels, pred1)))
print('Cosine Accuracy : {0:4f}'.format(accuracy_score(df_labels, pred2)))
print('Jaccard Accuracy : {0:4f}'.format(accuracy_score(df_labels, pred3)))


Euclidean Accuracy : 0.086300
Cosine Accuracy : 0.088300
Jaccard Accuracy : 0.056800


## Q3

In [109]:

km = K_Means(Euclidean_distance, 10,60,True, False)
it, sse = km.fit(X)

print(sse, it)

15634107.34475238 41


In [110]:
Cosine_model = K_Means(Cosine_sim, 10,60,True, False)
it, sse = Cosine_model.fit(X)

print(sse, it)

17417922.14993949 2


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [111]:
Jaccard_model = K_Means(Jaccard_scr, 10,60,True, False)
it, sse = Cosine_model.fit(X)

print(sse, it)

17417922.14993949 2


## Q4

In [101]:

km = K_Means(Euclidean_distance, 10,60,False, True)
it, sse = km.fit(X)

print(sse, it)

16227220.727338199 2


In [102]:
Cosine_model = K_Means(Cosine_sim, 10,60,False, False)
it, sse = Cosine_model.fit(X)

print(sse, it)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


18375324.14057061 60


In [103]:
Jaccard_model = K_Means(Jaccard_scr, 10,60,False, False)
it, sse = Jaccard_model.fit(X)

print(sse, it)

15739269.25110784 60


# TASK 2


In [None]:
red = np.array([[4.7,3.2], [4.9,3.1], [5.0,3.0], [4.6,2.9]])
blue = np.array([[5.9,3.2], [6.7,3.1], [6.0,3.0], [6.2,2.8]])

In [None]:
from scipy.spatial import distance

distance.cdist(red, blue, 'euclidean')

In [None]:
red = np.array([[4.7,3.2], [4.9,3.1], [5.0,3.0], [4.6,2.9]])
blue = np.array([[5.9,3.2], [6.7,3.1], [6.0,3.0], [6.2,2.8]])

from scipy.spatial import distance

distance.cdist(red, blue, 'euclidean')

In [3]:
distance.cdist(red, blue, 'euclidean').max()

2.109502310972899

In [4]:
distance.cdist(red, blue, 'euclidean').min()

0.9219544457292891

In [5]:
distance.cdist(red, blue, 'euclidean').mean()

1.4128564854895742

In [8]:
## Old Code not working (Will See later)
