1. Run k-means on AT&T 100 images, set K=10. Obtain confusion matrix. Re-order the confusion matrix using bipartite graph matching and obtain accuracy.

2. Run k-means on AT&T 400 images, set K=40. Obtain confusion matrix. Re-order the confusion matrix and obtain accuracy.

3. Run k-means on Hand-written-letters data, set K=26, as above.

In [1]:
#my Kmeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math as m
import seaborn as sb
from random import randint
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import munkres
from copy import deepcopy
from sklearn.cluster import KMeans
import os

In [2]:
#pick Data from file
def pickDataClass(filename, class_ids):
    cwd = os.getcwd()  # Get the current working directory (cwd)
    with open(filename,'r') as file:
        lines=[line.split(',') for line in file]
    for i in range(len(lines)):
        lines[i][-1]=lines[i][-1].replace('\n', '')
    arr=np.asarray(lines)
    arr=arr.transpose()
    test_data, train_data=[],[]
    for i in range(len(arr)):   
        if str(arr[i][0]) in str(class_ids):
            train_data.append(arr[i])
            np.delete(arr,i,axis=0)
        else:
            test_data.append(arr[i])
    return(arr,train_data, test_data)

In [3]:
#store the selected data
def store(train):
    df=pd.DataFrame(train)
    df.to_csv("train.csv", index=None)


In [4]:
#Read file and create input variable
def readfile():
    train1=pd.read_csv("train.csv")
    train=train1.iloc[:,1:]
    X=np.array(train)
    print("Size of X")
    print(X.shape)
    y_true = np.array(train1.iloc[:,0])
    print("Size of Y")
    print(y_true.shape)
    return X, y_true

In [5]:
#Kmeans
def kmeans(size,X, y_true):
    print("Size of the cluster= "+str(size))
    size=size
    # Number of clusters
    kmeans = KMeans(n_clusters=size)
    # Fitting the input data
    kmeans = kmeans.fit(X)
    # Getting the cluster labels
    labels = kmeans.predict(X)
    for i in range(len(y_true)):
        if y_true[i]==float(10):
            y_true[i]=0.0
    y_pred = labels
    c=confusion_matrix(y_true, y_pred)
    print("Size of confusion Matrix")
    print(np.shape(c))
    print("Confusion Matrix")
    print(c)
    c=c*-1
    mun=munkres.Munkres()
    result=mun.compute(c)
    print("Bipartie Graph")
    print(result)
    sample=deepcopy(y_pred)
    for j in range(len(sample)):
        for i in range(len(result)):
            if sample[j]==result[i][1]:
                y_pred[j]=result[i][0]
                break
    accuracy=(accuracy_score(y_true, y_pred))
    print("Accuracy")
    print(accuracy)

In [6]:
#function to take data according to need and store it
def takeinput(li,filename):
    arr,train, test= pickDataClass(filename,li)
    store(train)

In [7]:
#all the all the process together
def printing(size,filename):
    li=[i for i in range(1,size+1)]
    filename=filename
    takeinput(li,filename)
    X,y_true=readfile()
    kmeans(size,X, y_true)

In [8]:
printing(10,'ATNTFaceImages400.txt')

Size of X
(100, 644)
Size of Y
(100,)
Size of the cluster= 10
Size of confusion Matrix
(10, 10)
Confusion Matrix
[[ 0  0  0  0  1  9  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  7  1]
 [ 0  0  0 10  0  0  0  0  0  0]
 [ 5  0  0  0  0  0  0  5  0  0]
 [ 4  0  0  0  0  0  0  6  0  0]
 [ 0  0  0  0  0  0  0  0  0 10]
 [ 0  0 10  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 10  0  0  0]
 [ 0  0  0  0 10  0  0  0  0  0]
 [ 0 10  0  0  0  0  0  0  0  0]]
Bipartie Graph
[(0, 5), (1, 8), (2, 3), (3, 0), (4, 7), (5, 9), (6, 2), (7, 6), (8, 4), (9, 1)]
Accuracy
0.87


In [9]:
printing(40,'ATNTFaceImages400.txt')

Size of X
(400, 644)
Size of Y
(400,)
Size of the cluster= 40
Size of confusion Matrix
(41, 41)
Confusion Matrix
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 8 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Bipartie Graph
[(0, 6), (1, 34), (2, 5), (3, 23), (4, 28), (5, 10), (6, 16), (7, 0), (8, 12), (9, 11), (10, 38), (11, 33), (12, 25), (13, 20), (14, 13), (15, 19), (16, 31), (17, 40), (18, 26), (19, 22), (20, 17), (21, 2), (22, 15), (23, 3), (24, 18), (25, 27), (26, 1), (27, 35), (28, 30), (29, 24), (30, 8), (31, 9), (32, 21), (33, 4), (34, 37), (35, 14), (36, 32), (37, 36), (38, 39), (39, 29), (40, 7)]
Accuracy
0.7075


In [10]:
printing(26,'HandWrittenLetters.txt')

Size of X
(1014, 320)
Size of Y
(1014,)
Size of the cluster= 26
Size of confusion Matrix
(27, 27)
Confusion Matrix
[[ 0  0  0  0  0  0  0  0  2  0  0  0  0  1  0  0  0  0  4  0  0 21  0 11
   0  0  0]
 [ 0  0  0  0  0  1  0 12  0  0  0  0  1  0  0 16  5  1  0  0  2  0  1  0
   0  0  0]
 [ 0  0  0 11  1  1  0  0  0  2  0  0  0  1  0  0  2  1  0  0  0  0  0 15
   0  5  0]
 [ 0  1  0  0  5  0 28  0  0  0  3  0  0  2  0  0  0  0  0  0  0  0  0  0
   0  0  0]
 [ 0  4  0  0  1  0  0  0  5  0  0  2  3 10  0  0  0  1  0  0  3  0  0  0
   0 10  0]
 [ 0  0  0 18 18  0  1  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
   0  1  0]
 [ 0  0  0  1  0  0  0  0  0  0  0 16  1  0  0 11  9  1  0  0  0  0  0  0
   0  0  0]
 [ 1  6  0  2  0  0  5  1  0  0  0  9  0  1  0  0  0  0  4  0  9  0  0  1
   0  0  0]
 [ 2  0  4  0  0  0  0 18  0  0  0  0  0  0  1  4  0  0  0  0  0  0  6  0
   4  0  0]
 [ 0  0  0  0  1  0  0  0 20  0  0  0  2  0  1  0  0  0  2  0  1 12  0  0
   0  0  0]
 [ 0  0  0  0  0  0  0  0