In [21]:

# Names of the form __<name>__ (double underscores, e.g. __init__) are special methods defined by Python.
import numpy as np

class K_Means:
    """Plain k-means clustering (Lloyd's algorithm) on NumPy arrays.

    Parameters
    ----------
    k : int
        Number of clusters to form.
    tol : float
        Convergence threshold, expressed in percent. Iteration stops once,
        for every centroid, the summed absolute percent change of its
        coordinates between two consecutive rounds is not greater than `tol`.
        A small non-zero threshold avoids endless tiny oscillations caused by
        boundary points.
    max_iter : int
        Upper bound on the number of assign/update rounds.

    Attributes (set by `fit`)
    -------------------------
    centroids : dict
        Maps cluster index -> centroid array, e.g. {0: [1, 2], 1: [5, 8]}.
    classifications : dict
        Maps cluster index -> list of rows assigned to it in the last round.
    """

    def __init__(self, k = 2, tol = 0.001, max_iter = 300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, data):
        """Compute the centroids for `data`.

        The first `k` rows of `data` are used as the initial centroids.
        `data` may be anything `np.asarray` can turn into a float array
        with one sample per row.
        """
        data = np.asarray(data, dtype=float)

        # Copy the seed rows so the centroids never alias the caller's array.
        self.centroids = {i: data[i].copy() for i in range(self.k)}

        for _ in range(self.max_iter):
            # Assignment step: put every row into the bucket of its nearest centroid.
            self.classifications = {j: [] for j in range(self.k)}
            for featureset in data:
                self.classifications[self._nearest(featureset)].append(featureset)

            # Shallow copy is enough: update below rebinds entries, it never
            # mutates the centroid arrays in place.
            prev_centroids = dict(self.centroids)

            # Update step: move each centroid to the mean of its members.
            for label, members in self.classifications.items():
                # An empty cluster (possible when seed rows are duplicates)
                # keeps its previous centroid instead of becoming NaN.
                if members:
                    self.centroids[label] = np.average(members, axis = 0)

            still_moving = any(
                self._percent_shift(prev_centroids[c], self.centroids[c]) > self.tol
                for c in self.centroids
            )
            if not still_moving:
                # Every centroid moved by at most `tol` percent: converged.
                break

    def predict(self, data):
        """Return the index of the centroid closest to the single sample `data`."""
        return self._nearest(np.asarray(data, dtype=float))

    def _nearest(self, point):
        """Index of the centroid with the smallest Euclidean distance to `point`."""
        distances = [np.linalg.norm(point - self.centroids[c]) for c in self.centroids]
        return distances.index(min(distances))

    @staticmethod
    def _percent_shift(previous, current):
        """Summed absolute percent change between two centroid positions.

        Absolute values are used so that movements in opposite directions
        cannot cancel each other out (or sum to a negative number) and be
        mistaken for convergence.
        """
        shift = current - previous
        # A zero coordinate in `previous` makes the ratio undefined; silence
        # the warnings and resolve those entries explicitly below.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            percent = np.abs(shift / previous) * 100.0
        # 0/0 -> NaN means "did not move", so it contributes nothing.
        # x/0 -> inf is kept: the coordinate did move, so we are not converged.
        percent = np.where(shift == 0, 0.0, percent)
        return np.sum(percent)


# Backward-compatible alias: `predict` used to be (accidentally) defined at
# module level, so `predict(clf, sample)` keeps working.
predict = K_Means.predict


In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Load the data set, encode missing values and drop the non-feature columns.
# `drop(columns=...)` is used because passing the axis positionally
# (`df.drop(['id'], 1)`) raises a TypeError on pandas >= 2.0.
df = (
    pd.read_csv("breast-cancer-dataset.data.txt")
    # Missing entries are written as '?' in the file.
    # NOTE(review): the -99999 sentinel dominates Euclidean distances, so the
    # rows containing it end up in a cluster of their own (see the centroid
    # with -9.9999e+04 in the output). Consider dropping or imputing them.
    .replace('?', -99999)
    .drop(columns = ['id', 'label'])
)

# k-means works on a plain float matrix with one sample per row.
data = df.astype(float).to_numpy()

clf = K_Means(k = 3)
clf.fit(data)
clf.centroids


{0: array([ 3.3750e+00,  2.4375e+00,  2.8750e+00,  1.8125e+00,  2.4375e+00,
        -9.9999e+04,  3.1250e+00,  2.7500e+00,  1.0000e+00]),
 1: array([7.16450216, 6.77922078, 6.71861472, 5.73160173, 5.46320346,
        7.92640693, 6.0952381 , 6.03896104, 2.56277056]),
 2: array([3.05088496, 1.29646018, 1.42477876, 1.34734513, 2.09513274,
        1.30530973, 2.09070796, 1.25      , 1.11283186])}