In [16]:
import pandas as pd
import seaborn as sb

import random
import math

In [17]:
class DataProcessing:
    @staticmethod
    def shuffling(data_list):
        for i in range(len(data_list)-1,0,-1):
            index = random.randint(0,i-1)
            data_list.loc[i], data_list.loc[index] = data_list.loc[index], data_list.loc[i]

    @staticmethod
    def normalize(data_list):
        for col in data_list.columns:
            if type(data_list[col].loc[0]) is not type("text"):
                min1 = float("inf")
                max1 = float("-inf")

                for x in data_list[col]:
                    min1 = min(min1,x)
                    max1 = max(max1,x)
                    
                for i in range(len(data_list[col])):
                    data_list.at[i, col] -= min1  
                    data_list.at[i, col] /= (max1 - min1)

    @staticmethod
    def train_test_split(data_list):
        train_len = round((len(data_list)) * 0.7)
        #test_len = round((len(data_list)) * 0.3)
        
        train = data_list[0:train_len]
        test = data_list[train_len:len(data_list)]

        test = test.reset_index(drop=True) #resetowanie indeksacji dataframeu

        return train, test
    
    @staticmethod
    def label_split(data_list):
        feature_list = []
        label_list = []

        sum = 0
        for col in data_list.columns:
            if type(data_list[col].loc[0]) is not type("text"):
                sum +=1
        
        for i in range(len(data_list)):
            feature_list.append(data_list.loc[i].to_list()[:sum])

        for i in range(len(data_list)):
            label_list.append(data_list.loc[i].to_list()[sum:])
        
        return feature_list, label_list

In [18]:
def Minkowski_dist(x,y,m):
    res=0
    for i in range(len(x)):
        res += (abs(x[i] - y[i]))**m

    res = math.pow(res,1.0/m)

    return res

def KNN_algorithm(list, data, k, m):

    distances = []

    feature_list, label_list = DataProcessing.label_split(list)

    for i, elem in enumerate(feature_list):
       distances.append([Minkowski_dist(elem,data,m),i])

    distances = sorted(distances)

    k_dist = distances[:k]

    lw={}

    for x in label_list:
        lw[x[0]] = 0

    #print(lw)

    for elem in k_dist:
        lw[label_list[elem[1]][0]] += 1

    #print(lw)

    max_elem = float("-inf")

    for elem in lw:
        if lw[elem] > max_elem:
            max_elem = lw[elem]
            category = elem

    return category

In [19]:
data = pd.read_csv("iris.csv")

#print(len(data)) # dlugosc datasetu
#print(data.loc[5]) # uzyskiwanie danych o jednym rekordzie
#print(data.loc[5].to_list()) # wypisywanie rekordu w postaci listy

#sb.pairplot(data, hue="variety") # wyswietlanie wykresow kazdej cechy
#sb.violinplot(data, x="sepal.width", y="variety", inner="quartile")
#print(data.describe()) # wypisuje uśrednione info o datasecie
#print(data)

DataProcessing.shuffling(data)
DataProcessing.normalize(data)
train_data, test_data = DataProcessing.train_test_split(data)

#print(train_data)
#print(test_data)



In [20]:
all=0
good=0
bad=0

sum = 0
for col in data.columns:
    if col != "variety":
        sum +=1

for i in range(len(test_data)-1):

    result = KNN_algorithm(train_data, test_data.loc[i].to_list()[:sum], 4, 2)

    if result == test_data.loc[i].to_list()[sum:][0]:
        good += 1
    else:
        bad += 1

    all += 1

print("k = 4, m = 2 (k - ilość sąsiadów branych pod uwagę)")
print(f"All: {all}, good: {good}, bad: {bad}")
print(f"Test statistic: {round(good/all,4)*100}%")

k = 4, m = 2 (k - ilość sąsiadów branych pod uwagę)
All: 44, good: 42, bad: 2
Test statistic: 95.45%
