In [236]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
import random
import math
import seaborn

data = pd.read_csv('A_Z Handwritten Data.csv') # load the dataset from the CSV file

In [237]:
# Class index -> letter (0 -> 'A', ..., 25 -> 'Z'); the first column of `data`
# is expected to hold these integer class ids.
label = dict(enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

_label_col = data.columns[0]

# Replace the whole column instead of writing strings through `.iloc[:, 0] = ...`:
# setting values of an incompatible dtype into an existing integer column is
# deprecated in recent pandas. The dtype guard keeps the cell idempotent, so
# re-running it does not turn the already-mapped letters into NaN.
if pd.api.types.is_numeric_dtype(data[_label_col]):
    data[_label_col] = data[_label_col].map(label)

In [238]:
SAMPLES_PER_CLASS = 80   # rows drawn for every letter
SAMPLING_SEED = 42       # fixed so the sub-sample is the same on every run

row_count = data['0'].value_counts() # number of rows available for each letter
min_row_count = row_count.min()

# Never ask for more rows than the rarest letter has; sampling without
# replacement would raise otherwise.
_n_per_class = int(min(SAMPLES_PER_CLASS, min_row_count))

# `groupby(...).sample` draws per group and keeps the grouping column, unlike
# `groupby(...).apply(lambda g: g.sample(...))`, whose handling of the grouping
# column is deprecated in recent pandas.
short_data = (
    data.groupby('0')
    .sample(n=_n_per_class, replace=False, random_state=SAMPLING_SEED)
    .reset_index(drop=True)  # back to a plain 0..n-1 index
)

In [239]:
class DataProcessing:
    """Static pandas helpers: shuffle rows, split train/test, split features/labels."""

    @staticmethod
    def shuffling(data_list):
        """Shuffle the rows of ``data_list`` in place, uniformly at random.

        Every ordering of the rows is equally likely, including orderings in
        which some rows keep their position. Whole rows are moved, so each
        label stays attached to its features. Afterwards the index is
        ``0..len(data_list)-1``.

        The permutation is drawn with the ``random`` module, so calling
        ``random.seed(...)`` beforehand makes the result reproducible.

        Parameters
        ----------
        data_list : pandas.DataFrame
            Frame to shuffle; it is modified in place.

        Returns
        -------
        None
        """
        row_count = len(data_list)
        # A uniformly random permutation of the row positions.
        new_positions = random.sample(range(row_count), row_count)
        # Tag every row with its target position and let pandas reorder the
        # frame in one vectorised step (keeps column dtypes intact).
        data_list.index = new_positions
        data_list.sort_index(inplace=True)

    @staticmethod
    def train_test_split(data_list,prc):
        """Split ``data_list`` positionally into a leading train part and the rest.

        Parameters
        ----------
        data_list : pandas.DataFrame
            Rows to split; shuffle beforehand if the order is not random.
        prc : float
            Fraction of the rows that goes to the train part.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            ``(train, test)``; both get a fresh ``0..n-1`` index.
        """
        train_len = round(len(data_list) * prc)

        train = data_list.iloc[:train_len].reset_index(drop=True)
        test = data_list.iloc[train_len:].reset_index(drop=True)

        return train, test

    @staticmethod
    def label_split(data_list, label_col='0'):
        """Separate a frame into feature rows and labels.

        Parameters
        ----------
        data_list : pandas.DataFrame
            Frame containing the label column and the feature columns.
        label_col : str, optional
            Name of the label column (default ``'0'``). It may sit at any
            position; all other columns are treated as features.

        Returns
        -------
        tuple of (list of list, list)
            ``(feature_list, label_list)``, both in row order. Rows are taken
            by position, so the frame's index does not matter.
        """
        feature_list = data_list.drop(columns=[label_col]).to_numpy().tolist()
        label_list = data_list[label_col].tolist()

        return feature_list, label_list

In [240]:
# Sanity check: show (rows, columns) of the down-sampled frame.
print(short_data.shape)

(2080, 785)


In [241]:
# Shuffle the sampled rows in place before they are split into train and test.
DataProcessing.shuffling(short_data)

In [242]:
# The first 60% of the (shuffled) rows become the training set, the rest the test set.
# NOTE(review): the recorded cell output shows the two split sizes, but nothing in
# this cell prints them — presumably left over from an earlier version; confirm.
train_data, test_data = DataProcessing.train_test_split(short_data, 0.6)

1248   832


In [243]:
def Minkowski_dist(x,y,m):
    """Return the Minkowski distance of order ``m`` between ``x`` and ``y``.

    ``m = 1`` is the Manhattan distance, ``m = 2`` the Euclidean distance and
    ``m = math.inf`` the Chebyshev distance (largest coordinate difference).

    Parameters
    ----------
    x, y : sequence of numbers
        Vectors of equal length.
    m : int or float
        Order of the distance; must be positive.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the vectors differ in length or ``m`` is not positive.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length (got {len(x)} and {len(y)})"
        )
    if m <= 0:
        raise ValueError(f"m must be positive (got {m})")

    chebyshev = math.isinf(m)

    # Accumulate by hand rather than via the builtin sum(): notebook cells may
    # have rebound that name to a plain variable.
    res = 0
    for a, b in zip(x, y):
        diff = abs(a - b)
        if chebyshev:
            if diff > res:
                res = diff
        else:
            res += diff ** m

    if chebyshev:
        return float(res)

    return math.pow(res, 1.0 / m)

def KNN_algorithm(list, data, k, m):
    """Classify one sample by a brute-force k-nearest-neighbours majority vote.

    Parameters
    ----------
    list : pandas.DataFrame
        Training rows, in the layout accepted by
        ``DataProcessing.label_split``. (The parameter name hides the builtin
        ``list`` inside this function; it is kept for existing callers.)
    data : sequence of numbers
        Feature vector of the sample to classify.
    k : int
        Number of nearest neighbours that vote; must be at least 1.
    m : int or float
        Order of the Minkowski distance passed to ``Minkowski_dist``.

    Returns
    -------
    The label with the most votes among the ``k`` nearest training rows. On a
    tie, the tied label that occurs earliest in the training set wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1 (got {k})")

    feature_list, label_list = DataProcessing.label_split(list)
    if not label_list:
        raise ValueError("training set is empty")

    # (distance, row position) pairs: sorting orders by distance first and
    # falls back to the row position for equal distances.
    distances = sorted(
        (Minkowski_dist(features, data, m), position)
        for position, features in enumerate(feature_list)
    )

    # One zeroed counter per label, in order of first appearance; that order
    # is what decides ties below.
    votes = dict.fromkeys(label_list, 0)
    for _, position in distances[:k]:
        votes[label_list[position]] += 1

    # max() returns the first key holding the highest count.
    return max(votes, key=votes.get)

In [244]:


# Evaluate the hand-written k-NN on the held-out test split.
K_NEIGHBOURS = 1      # best result was observed for k=1, m=2
MINKOWSKI_ORDER = 2

# `test_features` and `total` are used (rather than `data`, `sum` or `all`) so
# that the raw DataFrame and the Python builtins stay usable after this cell.
test_features, actual = DataProcessing.label_split(test_data)

# One prediction per test row; `actual` and `predicted` are aligned by position.
predicted = [
    KNN_algorithm(train_data, features, K_NEIGHBOURS, MINKOWSKI_ORDER)
    for features in test_features
]

total = len(actual)
good = 0
for truth, guess in zip(actual, predicted):
    if truth == guess:
        good += 1
bad = total - good

# Guard against an empty test split instead of dividing by zero.
accuracy = round(good / total * 100, 2) if total else 0.0

print(f"k = {K_NEIGHBOURS}, m = {MINKOWSKI_ORDER} (k - ilość sąsiadów branych pod uwagę)")
print(f"All: {total}, good: {good}, bad: {bad}")
print(f"Test statistic: {accuracy}%")

k = 1, m = 2 (k - ilość sąsiadów branych pod uwagę)
All: 832, good: 609, bad: 223
Test statistic: 73.2%


In [245]:
# Rows are the true letters, columns the predicted ones. Passing `labels` pins
# both axes to A..Z, so the matrix keeps its 26x26 layout even when a letter
# happens to be missing from this test split.
confusion_matrix = metrics.confusion_matrix(
    actual, predicted, labels=sorted(label.values())
)
print(confusion_matrix)

[[22  0  0  0  0  0  0  0  0  0  0  1  0  1  0  5  0  1  0  0  1  0  0  0
   0  0]
 [ 0 13  3  0  4  0  0  0  0  0  0  0  1  0  0  4  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0 30  0  1  0  0  0  0  0  0  2  0  0  2  0  0  2  0  0  1  0  0  0
   1  0]
 [ 1  2  1 17  0  0  0  0  2  0  0  0  0  1  3  1  1  0  0  0  0  0  0  0
   1  0]
 [ 0  3  3  0 23  2  1  1  0  1  0  2  0  1  0  1  0  0  0  0  0  0  0  0
   1  0]
 [ 1  0  0  0  2 26  0  0  0  0  0  0  0  0  0  1  0  0  0  1  0  0  0  1
   0  0]
 [ 0  2  2  0  0  0 18  0  0  0  0  1  0  0  0  0  1  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  2  0 29  0  0  0  0  2  4  0  0  0  0  0  0  0  0  1  0
   0  0]
 [ 0  0  0  0  0  0  0  0 27  1  0  1  0  0  0  0  0  0  0  1  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  0  1 25  0  0  0  0  0  0  0  0  0  4  1  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  1  0  0 23  2  0  1  0  0  0  2  0  0  0  0  0  4
   0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0 32  0  0  0  0  0  0  0  0  0  0  1  0
   0  0]
 [ 0