In [1]:
import math
import random
import time
from datetime import datetime
from statistics import mode

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
get_ipython().run_line_magic('matplotlib', 'inline')


In [2]:

######################################################################
# This section contains functions for loading CSV (comma separated values)
# files and converting them to a dataset of instances.
# Each instance is a tuple of attributes. The entire dataset is a list
# of tuples.
######################################################################

# Loads a CSV file into a list of tuples.
# Ignores the first row of the file (header).
# Numeric attributes are converted to floats, nominal attributes
# are represented with strings.
# Parameters:
#   fileName: name of the CSV file to be read
# Returns: a list of tuples
def loadCSV(fileName):
    """Load a CSV file into a list of tuples, skipping the header row.

    Every non-blank line after the first is converted with ``lineToTuple``.

    Parameters:
        fileName: name of the CSV file to be read
    Returns: a list of tuples, one per data line; an empty list when the
        file is empty or contains only the header
    """
    # The context manager closes the file even if reading raises.
    with open(fileName, "rt") as fileHandler:
        lines = fileHandler.readlines()
    # Slicing drops the header and, unlike ``del lines[0]``, is safe for an
    # empty file. Whitespace-only lines (e.g. a trailing blank line) are
    # skipped so they do not turn into bogus ('',) instances.
    return [lineToTuple(line) for line in lines[1:] if line.strip()]

In [3]:
# Converts a comma separated string into a tuple
# Parameters
#   line: a string
# Returns: a tuple
def lineToTuple(line):
    """Split one comma separated line into a tuple of fields.

    Surrounding whitespace and every double quote are removed before the
    split; fields accepted by ``stringsToNumbers`` are converted to floats.
    """
    # trim whitespace/newline, drop quotes, then split on commas
    fields = line.strip().replace('"', '').split(",")
    # in-place conversion of numeric-looking fields
    stringsToNumbers(fields)
    return tuple(fields)

In [4]:
# Destructively converts all the string elements representing numbers
# to floating point numbers.
# Parameters:
#   myList: a list of strings
# Returns None
def stringsToNumbers(myList):
    """Replace, in place, every element that passes ``isValidNumberString``
    with its float value. Returns None."""
    for position, item in enumerate(myList):
        if isValidNumberString(item):
            myList[position] = float(item)

# Checks if a given string looks like a decimal number (an optional leading
# minus sign followed by digits and decimal point characters).
# Parameters:
#   s: the string to be checked
# Returns: True if the string is accepted as a number, False otherwise
def isValidNumberString(s):
    """Tell whether ``s`` can be safely passed to ``float``.

    Accepted form: an optional leading "-" followed by ASCII digits with at
    most one decimal point and at least one digit ("12", "-3.5", "5.", ".5").

    Parameters:
        s: the string to be checked
    Returns: True if the string is a plain decimal number, False otherwise
    """
    if len(s) == 0:
        return False
    if len(s) > 1 and s[0] == "-":
        s = s[1:]
    # More than one decimal point ("1.2.3") makes float() raise ValueError.
    if s.count(".") > 1:
        return False
    digits = s.replace(".", "")
    # Require at least one digit so that "." and "-." are rejected too.
    return len(digits) > 0 and all(c in "0123456789" for c in digits)

In [5]:

######################################################################
# This section contains functions for clustering a dataset
# using the k-means algorithm.
######################################################################

def distance(instance1, instance2):
    """Squared Euclidean distance between a centroid and an instance.

    Parameters:
        instance1: the centroid, either labelled (name followed by the
            attribute values, as built by ``meanInstance``) or a plain
            sequence of attribute values
        instance2: a plain sequence of attribute values
    Returns: the sum of squared attribute differences (no square root is
        taken), or infinity when either argument is None
    """
    # Identity check: ``== None`` is evaluated element-wise by array types.
    if instance1 is None or instance2 is None:
        return float("inf")
    # Skip the leading name of a labelled centroid, mirroring what
    # ``euclidean`` and ``manhatten`` do, so unlabelled centroids stay aligned.
    if len(instance1) == len(instance2) + 1:
        instance1 = instance1[1:]
    sumOfSquares = 0
    for i in range(len(instance1)):
        sumOfSquares += (instance1[i] - instance2[i])**2
    return sumOfSquares


In [6]:
def meanInstance(name, instanceList):
    """Build the labelled mean of a list of instances.

    Returns a tuple ``(name, mean_0, mean_1, ...)`` with one mean per
    attribute of the first instance, or None when ``instanceList`` is empty.
    """
    count = len(instanceList)
    if count == 0:
        return None
    width = len(instanceList[0])
    # running per-attribute totals, accumulated instance by instance
    totals = [0] * width
    for row in instanceList:
        for col in range(width):
            totals[col] = totals[col] + row[col]
    return tuple([name] + [total / float(count) for total in totals])

In [7]:

def assign(instance, centroids, dist):
    """Return the index of the centroid closest to ``instance``.

    ``dist`` is called as ``dist(centroid, instance)``; on ties the lowest
    index wins because only a strictly smaller distance replaces the best.
    """
    bestIndex = 0
    bestDistance = dist(centroids[0], instance)
    for index, centroid in enumerate(centroids[1:], start=1):
        candidate = dist(centroid, instance)
        if candidate < bestDistance:
            bestIndex, bestDistance = index, candidate
    return bestIndex


In [8]:
def createEmptyListOfLists(numSubLists):
    """Return a list holding ``numSubLists`` distinct empty lists."""
    return [[] for _ in range(numSubLists)]

In [9]:
def assignAll(instances, centroids, labels, dist):
    """Distribute every instance (and its label) to its nearest centroid.

    Returns ``(clusters, classclusters)``: two parallel lists with one
    sub-list per centroid, holding the instances and the matching
    ``labels[i]`` entries respectively.
    """
    groupCount = len(centroids)
    clusters = createEmptyListOfLists(groupCount)
    classclusters = createEmptyListOfLists(groupCount)
    for position, instance in enumerate(instances):
        target = assign(instance, centroids, dist)
        clusters[target].append(instance)
        # labels are looked up by position, exactly like the instances
        classclusters[target].append(labels[position])
    return clusters, classclusters


In [10]:
def computeCentroids(clusters):
    """Return one ``meanInstance`` per cluster, named "centroid0",
    "centroid1", ... in cluster order."""
    return [
        meanInstance("centroid" + str(index), cluster)
        for index, cluster in enumerate(clusters)
    ]

In [11]:
def kmeans(instances, k, dist,labels, animation=False, initCentroids=None):
    """Cluster ``instances`` with k-means until the centroids stop changing.

    Parameters:
        instances: list of attribute sequences
        k: number of clusters to pick when no usable ``initCentroids`` is given
        dist: callable ``dist(centroid, instance)`` returning a distance
        labels: per-instance labels, indexed like ``instances``
        animation: accepted for compatibility; it has no effect because the
            canvas painting it used to drive is disabled
        initCentroids: optional sequence of at least ``k`` starting centroids;
            otherwise ``k`` instances are sampled at random
    Returns: a dict with keys "clusters", "centroids", "withinss" and
        "labels" (the labels grouped by cluster)
    Raises: ValueError when there are no initial centroids to start from.

    The loop has no iteration cap. An empty cluster produces a None centroid
    (see ``meanInstance``), which ``dist`` must be able to handle.
    """
    # ``is None``: ``== None`` is element-wise for numpy arrays and would make
    # the condition ambiguous when the centroids are passed as an array.
    if initCentroids is None or len(initCentroids) < k:
        # randomly select k initial centroids
        random.seed(time.time())
        centroids = random.sample(instances, k)
    else:
        # Plain lists keep the convergence comparison below well defined for
        # any sequence type (lists, tuples, array rows).
        centroids = [list(centroid) for centroid in initCentroids]
    if len(centroids) == 0:
        raise ValueError("kmeans needs at least one initial centroid")
    prevCentroids = []
    iteration = 0
    while (centroids != prevCentroids):
        iteration += 1
        clusters, votelabels = assignAll(instances, centroids, labels, dist)
        prevCentroids = centroids
        centroids = computeCentroids(clusters)
    # Only the final value is reported, so compute it once after convergence.
    withinss = computeWithinss(clusters, centroids, dist)
    print('Number of iterations are ', iteration)
    result = {}
    result["clusters"] = clusters
    result["centroids"] = centroids
    result["withinss"] = withinss
    result["labels"] = votelabels
    return result


In [12]:
def computeWithinss(clusters, centroids, dist):
    """Sum ``dist(centroid, instance)`` over every instance of every cluster,
    pairing ``centroids[i]`` with ``clusters[i]``."""
    total = 0
    for index, centroid in enumerate(centroids):
        for member in clusters[index]:
            total += dist(centroid, member)
    return total


In [13]:

# Repeats k-means clustering n times, and returns the clustering
# with the smallest withinss
def repeatedKMeans(instances, k, n, dist=None, labels=None):
    """Run ``kmeans`` ``n`` times and keep the clustering with the smallest
    withinss.

    Parameters:
        instances: list of attribute sequences
        k: number of clusters
        n: number of trials
        dist: distance callable forwarded to ``kmeans``; defaults to the
            notebook's ``euclidean`` function
        labels: per-instance labels forwarded to ``kmeans``; defaults to one
            None per instance
    Returns: the best clustering dict, or ``{"withinss": inf}`` when no trial
        produced a finite withinss
    """
    # ``kmeans`` requires dist and labels; calling it with only
    # (instances, k) raises TypeError, so supply them here.
    if dist is None:
        dist = euclidean
    if labels is None:
        labels = [None] * len(instances)
    bestClustering = {"withinss": float("inf")}
    # Stays None when n < 1 or no trial improves on infinity.
    minWithinssTrial = None
    for i in range(1, n+1):
        print ("k-means trial %d," % i )
        trialClustering = kmeans(instances, k, dist, labels)
        print ("withinss: %.1f" % trialClustering["withinss"])
        if trialClustering["withinss"] < bestClustering["withinss"]:
            bestClustering = trialClustering
            minWithinssTrial = i
    print("Trial with minimum withinss:", minWithinssTrial)
    return bestClustering

In [14]:

######################################################################
# This section contains functions for visualizing datasets and
# clustered datasets.
######################################################################

def printTable(instances):
    """Print one line per labelled instance: the name, a tab, then every
    remaining value formatted with two decimals. None entries are skipped."""
    for instance in instances:
        # Identity check: ``!= None`` is element-wise for array-like rows.
        if instance is None:
            continue
        values = "".join("%.2f " % value for value in instance[1:])
        print(instance[0] + "\t" + values)


In [15]:
def extractAttribute(instances, index):
    """Return the list of ``instance[index]`` for every instance, in order."""
    return [instance[index] for instance in instances]

In [16]:

def paintCircle(canvas, xc, yc, r, color):
    """Draw an outlined oval of radius ``r`` centred on (xc, yc)."""
    left, top = xc - r, yc - r
    right, bottom = xc + r, yc + r
    canvas.create_oval(left, top, right, bottom, outline=color)


In [17]:

def paintSquare(canvas, xc, yc, r, color):
    """Draw a filled square with half-side ``r`` centred on (xc, yc)."""
    left, top = xc - r, yc - r
    right, bottom = xc + r, yc + r
    canvas.create_rectangle(left, top, right, bottom, fill=color)


In [18]:

def drawPoints(canvas, instances, color, shape):
    """Paint every instance on the canvas as a square or a circle.

    Attributes 1 and 2 of each instance are scaled into the canvas area
    inside the margin, using the bounds stored in ``canvas.data``, and
    shifted by a small random jitter. The global random generator is
    reseeded with 0 first, so the jitter sequence is the same on every call.
    """
    random.seed(0)
    width = canvas.winfo_reqwidth()
    height = canvas.winfo_reqheight()
    settings = canvas.data
    margin = settings["margin"]
    minX, minY = settings["minX"], settings["minY"]
    maxX, maxY = settings["maxX"], settings["maxY"]
    usableWidth = float(width - 2*margin)
    usableHeight = float(height - 2*margin)
    scaleX = usableWidth / (maxX - minX)
    scaleY = usableHeight / (maxY - minY)
    wantSquare = (shape == "square")
    for instance in instances:
        jitterX = 5*(random.random()-0.5)
        x = jitterX+margin+(instance[1]-minX)*scaleX
        jitterY = 5*(random.random()-0.5)
        # canvas y grows downwards, hence the subtraction from the height
        y = jitterY+height-margin-(instance[2]-minY)*scaleY
        if wantSquare:
            paintSquare(canvas, x, y, 5, color)
        else:
            paintCircle(canvas, x, y, 5, color)
    canvas.update()

In [19]:
def connectPoints(canvas, instances1, instances2, color):
    """Draw a line from every point of ``instances1`` to every point of
    ``instances2``, using attributes 1 and 2 scaled with the bounds stored
    in ``canvas.data``."""
    width = canvas.winfo_reqwidth()
    height = canvas.winfo_reqheight()
    settings = canvas.data
    margin = settings["margin"]
    minX, minY = settings["minX"], settings["minY"]
    maxX, maxY = settings["maxX"], settings["maxY"]
    scaleX = float(width - 2*margin) / (maxX - minX)
    scaleY = float(height - 2*margin) / (maxY - minY)

    def toScreen(point):
        # data coordinates -> canvas coordinates (y axis flipped)
        return (margin + (point[1]-minX)*scaleX,
                height - margin - (point[2]-minY)*scaleY)

    for p1 in instances1:
        for p2 in instances2:
            x1, y1 = toScreen(p1)
            x2, y2 = toScreen(p2)
            canvas.create_line(x1, y1, x2, y2, fill=color)
    canvas.update()

In [20]:
def mergeClusters(clusters):
    """Flatten a list of clusters into one list of instances, keeping order."""
    return [instance for cluster in clusters for instance in cluster]

In [21]:

def prepareWindow(instances):
    """Create a 500x500 white Tk canvas prepared for plotting ``instances``.

    Stores the margin and, through ``setBounds2D``, the data bounds in
    ``canvas.data``, paints the axes and returns the canvas. ``Tk`` and
    ``Canvas`` come from the notebook's ``from tkinter import *`` cell.
    """
    width = 500
    height = 500
    margin = 50
    root = Tk()
    canvas = Canvas(root, width=width, height=height, background="white")
    canvas.pack()
    # ad-hoc attribute used by the drawing helpers to share plot settings
    canvas.data = {}
    canvas.data["margin"] = margin
    setBounds2D(canvas, instances)
    paintAxes(canvas)
    canvas.update()
    return canvas

In [22]:
def setBounds2D(canvas, instances):
    """Store the min/max of attributes 1 (X) and 2 (Y) of ``instances`` in
    ``canvas.data`` under the keys "minX", "minY", "maxX" and "maxY"."""
    xs = extractAttribute(instances, 1)
    ys = extractAttribute(instances, 2)
    for key, pick, values in (("minX", min, xs), ("minY", min, ys),
                              ("maxX", max, xs), ("maxY", max, ys)):
        canvas.data[key] = pick(values)

In [23]:
def paintAxes(canvas):
    """Draw both axes with arrows and label their ends with the data bounds
    stored in ``canvas.data``."""
    width = canvas.winfo_reqwidth()
    height = canvas.winfo_reqheight()
    settings = canvas.data
    margin = settings["margin"]
    minX, minY = settings["minX"], settings["minY"]
    maxX, maxY = settings["maxX"], settings["maxY"]
    labelFont = "Sans 11"

    # horizontal axis and its two labels
    canvas.create_line(margin/2, height-margin/2, width-5, height-margin/2,
                       width=2, arrow=LAST)
    for xPos, bound in ((margin, minX), (width-margin, maxX)):
        canvas.create_text(xPos, height-margin/4,
                           text=str(bound), font=labelFont)

    # vertical axis and its two labels
    canvas.create_line(margin/2, height-margin/2, margin/2, 5,
                       width=2, arrow=LAST)
    for yPos, bound in ((height-margin, minY), (margin, maxY)):
        canvas.create_text(margin/4, yPos,
                           text=str(bound), font=labelFont, anchor=W)
    canvas.update()



In [24]:
def showDataset2D(instances):
    """Open a new window (``prepareWindow``) and paint ``instances`` in it."""
    canvas = prepareWindow(instances)
    paintDataset2D(canvas, instances)

def paintDataset2D(canvas, instances):
    """Clear the canvas, redraw the axes and paint every instance as a blue
    circle."""
    canvas.delete(ALL)
    paintAxes(canvas)
    drawPoints(canvas, instances, "blue", "circle")
    canvas.update()


In [25]:

def showClusters2D(clusteringDictionary):
    """Open a window and paint a clustering result.

    ``clusteringDictionary`` must provide the keys "clusters", "centroids"
    and "withinss"; the withinss value is shown as the title.
    """
    clusters = clusteringDictionary["clusters"]
    centroids = clusteringDictionary["centroids"]
    withinss = clusteringDictionary["withinss"]
    # the window bounds are computed over all instances of all clusters
    canvas = prepareWindow(mergeClusters(clusters))
    paintClusters2D(canvas, clusters, centroids,
                    "Withinss: %.1f" % withinss)


In [26]:

def paintClusters2D(canvas, clusters, centroids, title=""):
    """Repaint the canvas with every cluster in its own colour.

    Instances are drawn as circles; each non-None centroid is drawn as a
    square and connected to the instances of its cluster. Colours repeat
    when there are more clusters than colours. ``title`` is written at the
    top of the canvas.
    """
    canvas.delete(ALL)
    paintAxes(canvas)
    colors = ["blue", "red", "green", "brown", "purple", "orange"]
    for clusterIndex, instances in enumerate(clusters):
        color = colors[clusterIndex%len(colors)]
        centroid = centroids[clusterIndex]
        drawPoints(canvas, instances, color, "circle")
        # A None centroid has no coordinates: skip both its marker and the
        # connecting lines (previously only the marker was guarded).
        if centroid is not None:
            drawPoints(canvas, [centroid], color, "square")
            connectPoints(canvas, [centroid], instances, color)
    width = canvas.winfo_reqwidth()
    canvas.create_text(width/2, 20, text=title, font="Sans 14")
    canvas.update()


In [27]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score



In [28]:
def accuracy(clustering):
    """Fraction of instances whose label equals the majority label of their
    cluster.

    Parameters:
        clustering: dict whose "labels" entry is a list of per-cluster label
            lists (as returned by the k-means functions)
    Returns: a float in [0, 1]; 0.0 when there are no labels at all
    """
    correct_pred = 0
    total = 0
    for clust in clustering['labels']:
        if len(clust) == 0:
            continue
        total += len(clust)
        # The majority label is the same for the whole cluster: compute once.
        majority = mode(clust)
        correct_pred += sum(1 for label in clust if label == majority)
    # Divide by the real number of labels instead of a hard-coded 150.
    if total == 0:
        return 0.0
    return correct_pred/total


In [29]:

def sse(clustering, centroids):
    """Sum ``distance(centroids[i], instance)`` over every instance of every
    cluster ``clustering[i]``."""
    total = 0
    for index, cluster in enumerate(clustering):
        for point in cluster:
            total = total + distance(centroids[index], point)
    return total



In [30]:

def cosine(centroid, data):
    """Cosine distance (1 - cosine similarity) between a centroid and an
    instance.

    Parameters:
        centroid: attribute values, optionally preceded by a name
        data: attribute values of one instance
    Returns: ``1 - cosine_similarity`` of the two vectors
    """
    # Drop the leading name of a labelled centroid. Comparing with len(data)
    # instead of the literal 5 makes this work for any number of attributes,
    # the same way ``euclidean`` and ``manhatten`` do.
    if len(centroid) == len(data) + 1:
        centroid = centroid[1:]
    # cosine_similarity expects 2-D inputs: one row per vector
    x1 = np.array(centroid).reshape(1, -1)
    y1 = np.array(data).reshape(1, -1)
    ans = cosine_similarity(x1, y1)
    return 1-ans[0][0]


In [31]:

def jaccard(centroid, data):
    """Generalized Jaccard distance: 1 - sum(min) / sum(max) over the
    attributes.

    Parameters:
        centroid: attribute values, optionally preceded by a name
        data: attribute values of one instance
    Returns: the distance as a float; 0.0 when both sums are zero (e.g. two
        all-zero vectors), which would otherwise divide by zero
    """
    # Drop the leading name of a labelled centroid; comparing with len(data)
    # instead of the literal 5 supports any number of attributes.
    if len(centroid) == len(data) + 1:
        centroid = centroid[1:]
    jacSumnum = [min(centroid[i], data[i]) for i in range(len(data))]
    jacSumden = [max(centroid[i], data[i]) for i in range(len(data))]
    numerator = sum(jacSumnum)
    denominator = sum(jacSumden)
    if denominator == 0 and numerator == 0:
        return 0.0
    return 1-(numerator/denominator)



In [32]:
def kmeans_sse(instances, k, dist,labels, animation=False, initCentroids=None):
    """k-means that stops as soon as the SSE no longer decreases.

    Parameters:
        instances: list of attribute sequences
        k: number of clusters to pick when no usable ``initCentroids`` is given
        dist: callable ``dist(centroid, instance)`` used for the assignment
            and for withinss
        labels: per-instance labels, indexed like ``instances``
        animation: accepted for compatibility; it has no effect because the
            canvas painting it used to drive is disabled
        initCentroids: optional sequence of at least ``k`` starting centroids;
            otherwise ``k`` instances are sampled at random
    Returns: a dict with keys "clusters", "centroids", "withinss" and
        "labels" (the labels grouped by cluster)

    The stopping criterion uses ``sse`` (which is based on ``distance``),
    whatever ``dist`` is. The state of the last iteration is returned, even
    when its SSE is higher than that of the previous iteration.
    """
    # ``is None``: ``== None`` is element-wise for numpy arrays.
    if initCentroids is None or len(initCentroids) < k:
        # randomly select k initial centroids
        random.seed(time.time())
        centroids = random.sample(instances, k)
    else:
        centroids = [list(centroid) for centroid in initCentroids]
    # Start from infinity rather than the sentinels 99999/1000000, which made
    # the loop stop after one pass whenever the first SSE was >= 99999.
    new_sse = float("inf")
    iteration = 0
    while True:
        iteration += 1
        clusters, votelabels = assignAll(instances, centroids, labels, dist)
        centroids = computeCentroids(clusters)
        prev_sse = new_sse
        new_sse = sse(clusters, centroids)
        if not new_sse < prev_sse:
            break
    # Only the final value is reported, so compute it once after the loop.
    withinss = computeWithinss(clusters, centroids, dist)
    print('Number of iterations are ', iteration)
    print('prev sse', prev_sse)
    print('new sse', new_sse)
    result = {}
    result["clusters"] = clusters
    result["centroids"] = centroids
    result["withinss"] = withinss
    result["labels"] = votelabels
    return result


In [33]:
def kmeans_100(instances, k, dist,labels, animation=False, initCentroids=None,
               maxIterations=100):
    """k-means that runs a fixed number of iterations (100 by default),
    without any convergence check.

    Parameters:
        instances: list of attribute sequences
        k: number of clusters to pick when no usable ``initCentroids`` is given
        dist: callable ``dist(centroid, instance)`` returning a distance
        labels: per-instance labels, indexed like ``instances``
        animation: accepted for compatibility; it has no effect because the
            canvas painting it used to drive is disabled
        initCentroids: optional sequence of at least ``k`` starting centroids;
            otherwise ``k`` instances are sampled at random
        maxIterations: number of assign/update passes to perform (>= 1)
    Returns: a dict with keys "clusters", "centroids", "withinss" and
        "labels" (the labels grouped by cluster)
    Raises: ValueError when ``maxIterations`` is smaller than 1.
    """
    if maxIterations < 1:
        raise ValueError("maxIterations must be at least 1")
    # ``is None``: ``== None`` is element-wise for numpy arrays.
    if initCentroids is None or len(initCentroids) < k:
        # randomly select k initial centroids
        random.seed(time.time())
        centroids = random.sample(instances, k)
    else:
        centroids = [list(centroid) for centroid in initCentroids]
    for iteration in range(1, maxIterations + 1):
        clusters, votelabels = assignAll(instances, centroids, labels, dist)
        centroids = computeCentroids(clusters)
    # Only the final value is reported, so compute it once after the loop.
    withinss = computeWithinss(clusters, centroids, dist)
    print('Number of iterations are ', iteration)
    result = {}
    result["clusters"] = clusters
    result["centroids"] = centroids
    result["withinss"] = withinss
    result["labels"] = votelabels
    return result

In [34]:
# Load the football dataset (columns: Team, winsInSeason2016, winsInSeason2017).
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine; consider a path relative to a configurable data directory.
data=pd.read_csv(r'C:\Users\97150\Desktop\Kmeans ml\football.csv')
data


Unnamed: 0,Team,winsInSeason2016,winsInSeason2017
0,X1,3,5
1,X2,3,4
2,X3,2,8
3,X4,2,3
4,X5,6,2
5,X6,6,4
6,X7,7,3
7,X8,7,4
8,X9,8,5
9,X10,7,6


In [35]:

# Keep the team names; they are passed as labels to the k-means runs below.
LabQ1= data.Team

In [36]:
# Display the team labels.
LabQ1

0     X1
1     X2
2     X3
3     X4
4     X5
5     X6
6     X7
7     X8
8     X9
9    X10
Name: Team, dtype: object

In [37]:
# Keep only the numeric columns for clustering.
# NOTE(review): rebinding `data` makes this cell non-idempotent — running it a
# second time fails because 'Team' is already gone; a new name would avoid that.
data = data.drop(['Team'], axis=1)
data


Unnamed: 0,winsInSeason2016,winsInSeason2017
0,3,5
1,3,4
2,2,8
3,2,3
4,6,2
5,6,4
6,7,3
7,7,4
8,8,5
9,7,6


In [38]:

# Same values as the initial centroids used in the first two football runs.
# NOTE(review): array1 is not referenced in any other visible cell.
array1 = np.array([[4, 6], [5, 4]])

In [39]:

# Convert the frame to a plain list of [wins2016, wins2017] rows for k-means.
dataset_football = data.values.tolist()

In [40]:

# Per-season win columns.
# NOTE(review): x and y are not referenced in any other visible cell.
x=data.winsInSeason2016
y=data.winsInSeason2017

In [41]:

def manhatten(x,y):
    """Manhattan (city-block) distance between centroid ``x`` and instance
    ``y``: the sum of absolute attribute differences.

    When ``x`` has exactly one element more than ``y`` its first element is
    taken to be the centroid name and is skipped.
    """
    if len(x) == len(y)+1:
        x = x[1:]
    return sum([abs(x[i]-y[i]) for i in range(len(x))])


In [42]:
import math

def euclidean(centroid, data):
    """Euclidean distance between a centroid and an instance.

    When ``centroid`` has exactly one element more than ``data`` its first
    element is taken to be the centroid name and is skipped.
    """
    if len(centroid) == len(data)+1:
        centroid = centroid[1:]
    squares = [(centroid[i] - data[i])**2 for i in range(len(data))]
    return math.sqrt(sum(squares))

In [43]:
# Display the football rows that are fed to k-means.
dataset_football

[[3, 5],
 [3, 4],
 [2, 8],
 [2, 3],
 [6, 2],
 [6, 4],
 [7, 3],
 [7, 4],
 [8, 5],
 [7, 6]]

In [44]:
# Football data, k=2, Manhattan distance, initial centroids (4,6) and (5,4).
# The positional True is `animation`.
clustering = kmeans(dataset_football, 2, manhatten,LabQ1, True, initCentroids=[[4,6],[5,4]])
printTable(clustering["centroids"])
print(clustering['clusters'])
print('The SSE is: ',sse(clustering['clusters'],clustering['centroids']))


Number of iterations are  2
centroid0	4.00 6.33 
centroid1	5.57 3.57 
[[[3, 5], [2, 8], [7, 6]], [[3, 4], [2, 3], [6, 2], [6, 4], [7, 3], [7, 4], [8, 5]]]
The SSE is:  54.09523809523809


In [45]:

# Football data, k=2, Euclidean distance, initial centroids (4,6) and (5,4).
clustering = kmeans(dataset_football, 2, euclidean,LabQ1, True, initCentroids=[[4,6],[5,4]])
printTable(clustering["centroids"])
print(clustering['clusters'])
print('The SSE is:',sse(clustering['clusters'],clustering['centroids']))


Number of iterations are  3
centroid0	2.50 5.00 
centroid1	6.83 4.00 
[[[3, 5], [3, 4], [2, 8], [2, 3]], [[6, 2], [6, 4], [7, 3], [7, 4], [8, 5], [7, 6]]]
The SSE is: 27.833333333333332


In [46]:

# Football data, k=2, Manhattan distance, initial centroids (3,3) and (8,3).
clustering = kmeans(dataset_football, 2, manhatten, LabQ1, True, initCentroids=[[3,3],[8,3]])
printTable(clustering["centroids"])
print(clustering['clusters'])
print('The SSE is:',sse(clustering['clusters'],clustering['centroids']))


Number of iterations are  2
centroid0	2.50 5.00 
centroid1	6.83 4.00 
[[[3, 5], [3, 4], [2, 8], [2, 3]], [[6, 2], [6, 4], [7, 3], [7, 4], [8, 5], [7, 6]]]
The SSE is: 27.833333333333332


In [47]:

# Football data, k=2, Manhattan distance, initial centroids (3,2) and (4,8).
clustering = kmeans(dataset_football, 2, manhatten, LabQ1, True, initCentroids=[[3,2],[4,8]])
printTable(clustering["centroids"])
print(clustering['clusters'])
print('The SSE is:',sse(clustering['clusters'],clustering['centroids']))


Number of iterations are  2
centroid0	4.86 3.57 
centroid1	5.67 6.33 
[[[3, 5], [3, 4], [2, 3], [6, 2], [6, 4], [7, 3], [7, 4]], [[2, 8], [8, 5], [7, 6]]]
The SSE is: 57.904761904761905


In [48]:
# Column names for the iris file, which is read with header=None below.
names = [
  'sepal_length',
  'sepal_width',
  'petal_length',
  'petal_width',
  'class',
]


In [49]:
# Display the column names.
names

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

In [50]:
# Load the iris data; the file has no header row, so column names are supplied.
# NOTE(review): absolute, machine-specific path — consider a path relative to a
# configurable data directory.
iris_df = pd.read_csv('/Users/97150/Desktop/Kmeans ml/iris.data', header=None, names=names)
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [51]:

# True class of every row, in row order; used to score the clusterings.
labels = iris_df['class'].values.tolist()


In [52]:
# Display the labels.
# NOTE(review): this dumps the whole list; a slice such as labels[:5] would
# keep the output short.
labels

['Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',


In [53]:
import numpy as np
# Preview of the frame without missing rows.
# NOTE(review): the result is only displayed, not assigned — iris_df itself is
# unchanged by this cell.
iris_df.dropna()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [54]:
# List the distinct class labels.
iris_df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [55]:
# Y is a second name for the same DataFrame object (no copy is made).
# NOTE(review): Y is not referenced in any other visible cell.
Y = iris_df

In [56]:
# Numeric iris attributes only.
# NOTE(review): this rebinds `data`, which earlier held the football frame.
data=iris_df.drop(['class'], axis=1)

In [57]:
import math
import random
import time
from tkinter import *

In [58]:
import math

def euclidean(centroid, data):
    """Euclidean distance between a centroid and an instance.

    When ``centroid`` has exactly one element more than ``data`` its first
    element is taken to be the centroid name and is skipped.

    NOTE(review): this repeats the definition from the earlier football
    section of the notebook; one of the two can be removed.
    """
    if len(centroid) == len(data)+1:
        centroid = centroid[1:]
    squares = [(centroid[i] - data[i])**2 for i in range(len(data))]
    return math.sqrt(sum(squares))

In [59]:
# Convert the iris attributes to a plain list of rows for k-means.
# NOTE(review): the bare expression dumps every row; a slice would keep the
# output short.
dataset_iris=data.values.tolist()
dataset_iris


[[5.1, 3.5, 1.4, 0.2],
 [4.9, 3.0, 1.4, 0.2],
 [4.7, 3.2, 1.3, 0.2],
 [4.6, 3.1, 1.5, 0.2],
 [5.0, 3.6, 1.4, 0.2],
 [5.4, 3.9, 1.7, 0.4],
 [4.6, 3.4, 1.4, 0.3],
 [5.0, 3.4, 1.5, 0.2],
 [4.4, 2.9, 1.4, 0.2],
 [4.9, 3.1, 1.5, 0.1],
 [5.4, 3.7, 1.5, 0.2],
 [4.8, 3.4, 1.6, 0.2],
 [4.8, 3.0, 1.4, 0.1],
 [4.3, 3.0, 1.1, 0.1],
 [5.8, 4.0, 1.2, 0.2],
 [5.7, 4.4, 1.5, 0.4],
 [5.4, 3.9, 1.3, 0.4],
 [5.1, 3.5, 1.4, 0.3],
 [5.7, 3.8, 1.7, 0.3],
 [5.1, 3.8, 1.5, 0.3],
 [5.4, 3.4, 1.7, 0.2],
 [5.1, 3.7, 1.5, 0.4],
 [4.6, 3.6, 1.0, 0.2],
 [5.1, 3.3, 1.7, 0.5],
 [4.8, 3.4, 1.9, 0.2],
 [5.0, 3.0, 1.6, 0.2],
 [5.0, 3.4, 1.6, 0.4],
 [5.2, 3.5, 1.5, 0.2],
 [5.2, 3.4, 1.4, 0.2],
 [4.7, 3.2, 1.6, 0.2],
 [4.8, 3.1, 1.6, 0.2],
 [5.4, 3.4, 1.5, 0.4],
 [5.2, 4.1, 1.5, 0.1],
 [5.5, 4.2, 1.4, 0.2],
 [4.9, 3.1, 1.5, 0.1],
 [5.0, 3.2, 1.2, 0.2],
 [5.5, 3.5, 1.3, 0.2],
 [4.9, 3.1, 1.5, 0.1],
 [4.4, 3.0, 1.3, 0.2],
 [5.1, 3.4, 1.5, 0.2],
 [5.0, 3.5, 1.3, 0.3],
 [4.5, 2.3, 1.3, 0.3],
 [4.4, 3.2, 1.3, 0.2],
 [5.0, 3.5,

In [78]:
# Iris, k=3, cosine distance, random initial centroids, run until the
# centroids stop changing. sse() is always based on distance(), whatever
# metric was used for clustering.
clustering = kmeans(dataset_iris, 3, cosine,labels, True)
printTable(clustering["centroids"])
#print(clustering)
print('SSE is ',sse(clustering['clusters'],clustering['centroids']))
print("Accuracy is ",accuracy(clustering))


Number of iterations are  6
centroid0	5.01 3.42 1.46 0.24 
centroid1	6.54 2.96 5.50 1.99 
centroid2	5.94 2.76 4.21 1.30 
SSE is  92.07870917874394
Accuracy is  0.9733333333333334


In [77]:
# Iris, k=3, Jaccard distance, random initial centroids, run until the
# centroids stop changing.
clustering = kmeans(dataset_iris, 3, jaccard,labels, True)
printTable(clustering["centroids"])
print('The SSE is: ',sse(clustering['clusters'],clustering['centroids']))
print("The Accuracy is: ",accuracy(clustering))


Number of iterations are  11
centroid0	5.89 2.74 4.41 1.43 
centroid1	6.87 3.09 5.72 2.07 
centroid2	5.01 3.42 1.46 0.24 
The SSE is:  79.18674974533108
The Accuracy is:  0.88


In [76]:
# Iris, k=3, Euclidean distance, random initial centroids, run until the
# centroids stop changing.
clustering = kmeans(dataset_iris, 3, euclidean,labels, True)
printTable(clustering["centroids"])
#print(clustering)
print('The SSE: ',sse(clustering['clusters'],clustering['centroids']))
print("The Accuracy is: ",accuracy(clustering))


Number of iterations are  8
centroid0	6.85 3.07 5.74 2.07 
centroid1	5.90 2.75 4.39 1.43 
centroid2	5.01 3.42 1.46 0.24 
The SSE:  78.94084142614602
The Accuracy is:  0.8933333333333333


In [63]:

import statistics 
from statistics import mode 

# Majority label of the third cluster of the most recent clustering.
print(mode(clustering['labels'][2]))

Iris-versicolor


In [64]:


# Number of clusters in the most recent clustering.
len(clustering['labels'])

3

In [65]:

# True labels grouped by cluster.
# NOTE(review): this dumps every label; consider a per-cluster summary instead.
clustering['labels']


[['Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa',
  'Iris-setosa'],
 ['Iris-versicolor',
  'Iris-versicolor',
  'Iris-virginica',
  'Iris-virginica',
  'Iris-virginica',
  'Iris-virginica',
  'Iris-virginica',
  'Iris

In [66]:

# Ad-hoc check: does the first label of cluster 0 equal the majority label of
# cluster 1? Prints 'True' only when it does.
if clustering['labels'][0][0] == mode(clustering['labels'][1]):
    print('True')


In [75]:
# Iris, k=3, cosine distance, stopping when the SSE stops decreasing.
# Wall-clock time is measured around the clustering call only.
before= datetime.now()
clustering = kmeans_sse(dataset_iris, 3, cosine, labels, True)
after= datetime.now()
printTable(clustering["centroids"])
print("Time Taken: ", after- before)
print('The SSE is: ',sse(clustering['clusters'],clustering['centroids']))
print("The Accuracy is: ",accuracy(clustering))

Number of iterations are  7
prev sse 93.85979393939397
new sse 93.85979393939397
centroid0	6.52 2.97 5.48 1.99 
centroid1	5.01 3.42 1.46 0.24 
centroid2	5.95 2.76 4.20 1.30 
Time Taken:  0:00:00.580112
The SSE is:  93.85979393939397
The Accuracy is:  0.9666666666666667


In [74]:
# Iris, k=3, Jaccard distance, stopping when the SSE stops decreasing.
before= datetime.now()
clustering = kmeans_sse(dataset_iris, 3, jaccard,labels, True)
after= datetime.now()
printTable(clustering["centroids"])
print("The Time Taken: ", after- before)
print("The SSE:" ,sse(clustering['clusters'],clustering['centroids']))
print("The Accuracy: ",accuracy(clustering))

Number of iterations are  5
prev sse 79.14342874302875
new sse 79.18674974533108
centroid0	5.89 2.74 4.41 1.43 
centroid1	5.01 3.42 1.46 0.24 
centroid2	6.87 3.09 5.72 2.07 
The Time Taken:  0:00:00.011968
The SSE: 79.18674974533108
The Accuracy:  0.88


In [69]:
# Iris, k=3, Euclidean distance, stopping when the SSE stops decreasing.
before= datetime.now()
clustering = kmeans_sse(dataset_iris, 3, euclidean,labels, True)
after= datetime.now()
printTable(clustering["centroids"])
print("The Time Taken: ", after- before)
#print(clustering)
print('The SSE: ',sse(clustering['clusters'],clustering['centroids']))
print("The Accuracy: ",accuracy(clustering))

Number of iterations are  7
prev sse 78.940841426146
new sse 78.940841426146
centroid0	5.01 3.42 1.46 0.24 
centroid1	5.90 2.75 4.39 1.43 
centroid2	6.85 3.07 5.74 2.07 
The Time Taken:  0:00:00.013962
The SSE:  78.940841426146
The Accuracy:  0.8933333333333333


In [73]:
# Iris, k=3, cosine distance, fixed 100 iterations (no convergence check).
before= datetime.now()
clustering = kmeans_100(dataset_iris, 3, cosine,labels, True)
after= datetime.now()
printTable(clustering["centroids"])
print("The Time Taken: ", after- before)
#print(clustering)
print("The SSE: ",sse(clustering['clusters'],clustering['centroids']))
print("Accuracy:",accuracy(clustering))

Number of iterations are  100
centroid0	5.95 2.76 4.20 1.30 
centroid1	5.01 3.42 1.46 0.24 
centroid2	6.52 2.97 5.48 1.99 
The Time Taken:  0:00:08.187675
The SSE:  93.85979393939395
Accuracy: 0.9666666666666667


In [71]:
# Iris, k=3, Jaccard distance, fixed 100 iterations (no convergence check).
before= datetime.now()
clustering = kmeans_100(dataset_iris, 3, jaccard,labels, True)
after= datetime.now()
printTable(clustering["centroids"])
print("Time Taken:", after - before)
#print(clustering)
print("The SSE:",sse(clustering['clusters'],clustering['centroids']))
print("Accuracy: ",accuracy(clustering))

Number of iterations are  100
centroid0	5.01 3.42 1.46 0.24 
centroid1	5.69 2.67 4.12 1.27 
centroid2	6.61 3.00 5.39 1.92 
Time Taken: 0:00:00.182283
The SSE: 84.33228455008488
Accuracy:  0.8933333333333333


In [72]:
# Iris, k=3, Euclidean distance, fixed 100 iterations (no convergence check).
before= datetime.now()
clustering = kmeans_100(dataset_iris, 3, euclidean,labels, True)
after= datetime.now()
printTable(clustering["centroids"])
print("Time Taken: ", after- before)
#print(clustering)
print('The SSE:',sse(clustering['clusters'],clustering['centroids']))
print("The Accuracy: ",accuracy(clustering))

Number of iterations are  100
centroid0	6.85 3.07 5.74 2.07 
centroid1	5.90 2.75 4.39 1.43 
centroid2	5.01 3.42 1.46 0.24 
Time Taken:  0:00:00.133616
The SSE: 78.94084142614602
The Accuracy:  0.8933333333333333
