# K Nearest Neighbor(KNN)

## Import Necessary Module

In [1]:
import numpy as np
import os
import math
import random

## Load Data to Matrix and Map it to Numeric Values

In [2]:
# Load the file and convert it to numpy matrix.

# Load necessary modules
import numpy as np
import csv



def load_text_data(file, delimiter, header, label_col_is_str , label_col):
    """
    Loading the text data and convert it to numpy matrix.
    If there is a label column with string data type, it will convert the label to Intiger and replace the string. And also return
    the mapping of the string mapping in a dictionary.
    
    Input:
    file: file name
    delimiter: "," or "\t" etc.
    header : True / False
    label_col_is_str: True/ False
    label_col : the column index of the label.
    
    Output:
    
    Data: in numpy matrix.
    label_map: in dictionary.
    
    """
    
    text_file = open(file, "rt")
    reader = csv.reader(text_file, delimiter = delimiter)
    x = list(reader)
    text_file.close()
    
    # if there is a header, remove it.
    if header:
        x.pop(0)
    Data = np.asmatrix(x)
    
    if label_col_is_str:
        try:
            Data, Label_Map = map_label(Data, label_col)
            return Data, Label_Map
        except ValueError:
            print("The header is string data type. Please specify the header = True in the function.")
            Data = None
    else:
        return Data
            




# mapping of the string labels.


# mapping the label
def map_label(Data, label_column):
    """
    If the label column is string, create a map for the label and convert it to intiger.
    
    Input:
    1. Data: in numpy matrix.
    2. label_column: the column number containing the label.
    
    Output:
    1. Data: in numpy matrix in float data type.
    """
    Label_Map = {}
    
    Label = np.unique(np.array(Data[:, label_column]))
    
    for i in range(len(Label)):
        Label_Map[Label[i]] = i
    
    mapping = lambda label, label_map : label_map[label]
    
    Data[:, label_column] = np.vectorize(mapping)(Data[:, label_column], Label_Map)
    
    Data = Data.astype(float) 
    
    return Data, Label_Map

## Split the Data to Train and Test

In [3]:
# train-test data
class split_train_test:

    
    
    def __init__(self, data, percentage, label):
        '''
        Input:
        data: a numpy matrix.
        percentage: percentage of samples for the training. percentage must be greater than 0 and less than 100.
        label: column index of the label.
        
        
        '''

        # import necessary module

        import numpy as np

        # make an one dimensional matrix n*1 to a vecto

        make_vector = lambda X : np.squeeze(np.asarray(X))


        if percentage <= 0 or percentage >= 100:
            print("The percentage of training data must be greater than 0 and less than 100.")
        else:
            import random;
        
            n = data.shape[0]  # take the number of rows in original data.
            l = int(n*percentage/100)
        
        
            # Take random integers to slice the data
            train_rows = random.sample(range(n), l)
            test_rows  = [i for i in range(n) if i not in train_rows]
            
            # Split the data to train and test.
            self.train = data[train_rows,:]
            self.test  = data[test_rows, :]
            
            # Split both train and test data to feature and label.
            self.train_feature = np.delete(self.train, label, 1)
            self.train_label = make_vector(self.train[:, label])
            
            # Split both train and test data to feature and label.
            self.test_feature = np.delete(self.test, label, 1)
            self.test_label = make_vector(self.test[:, label])
        

## KNN Algorithm

In [4]:
import numpy as np
import math

## Function for euclidean Distance
def euclidean_distance(x, y):
    # import math
    if len(x) != len(y):
        print('Length of two features/vectors are not same.')
    else:
        n = len(x)
        d2 = 0
        for i in range(n):
            d2 = d2 + (x[i] - y[i])**2
        d = math.sqrt(d2)
    
    return d



def K_nearest_neighbour(feature_sample, feature_label, test_sample, k):
    
    """
    Input: 
    feature_sample: feature matrix, returned from train test split. in numpy matrix
    feature_label : feature label matrix, returned from train test split. in numpy matrix
    test_sample :   test feature matrix, returned from train test split. in numpy matrix
    k : number of items to be considered as "nearest neighbours." Data type: intiger.
    
    Output:
    Predicted label: A list of predicted lables corresponding to the given feature.
    
    In the implementation "np.squeeze(np.asarray(MATRIX))" was used to convert matrix to array for easy of calculation.
    """
    
    m = test_sample.shape[0]                        # number of samples in the test data.
    n = feature_sample.shape[0]                     # number of samples in the train data
    
    predicted_label_list = [] 
    
    for i in range(m):
        X = test_sample[i,:]
        
        D = np.full([n,2], np.nan)                  # make an empty matrix to store feature label and corresponding distance. 1st Column 
                                                    # is for label and 2nd column is for corresponding distance from the test feature.
        D[:, 0] = np.squeeze(np.asarray(feature_label))
        
        for j in range(n):
            D[j,1] = euclidean_distance(np.squeeze(np.asarray(X)), np.squeeze(np.asarray(feature_sample[j,:])))
        
        Z = D[np.argsort(D[:, 1]),][:, 0]
        Z = Z[0:k]

        lab, count = np.unique(Z, return_counts = True)
        
        predicted_label = lab[np.argsort(count)[0],]
        
        predicted_label_list.append(predicted_label)
        
    return predicted_label_list

## Performance and Accurace Measurement 

In [5]:
def confusion_matrix(true_label, pred_label, percentage = True):
    """
    Input: 
    1. true_label: The True label.
    2. pred_label: The predicted label.
    3. percentage: If the result wanted in percentage. True/False
    
    Output:
    1. total accuracy
    2. confusion table. True label * Predicted label
    """


    import numpy as np

        


    if len(pred_label) == len(true_label):
        n = len(pred_label)
        accuracy = round((sum(np.asarray(pred_label) == np.asarray(true_label))/n) * 100, 2)
        
        p_l = np.unique(pred_label)
        t_l = np.unique(true_label)
        n = len(p_l)
        confusion_matrix = np.zeros([n, n])

        for i in range(n):
            t = t_l[i]
            for j in range(n):
                p = p_l[j]
                c = sum([pred_label[k] == p and true_label[k] == t for k in range(len(pred_label))])
                confusion_matrix[i,j] = c
        
        if percentage:
            confusion_matrix = np.around((confusion_matrix/len(pred_label)) * 100, decimals = 2)
        
        
        
        return accuracy, confusion_matrix



    else:
        print("Predicted label and True label lengths are not same.")

# Implementation with Iris Dataset

## Step 1: Load the Dataset

In [6]:
os.chdir(r"C:\Users\tahsi\OneDrive - University of Eastern Finland\Python Algorithm and Data Structure\GitHub\ml-algorithms-python-numpy\1. Codes")
Iris, d_map = load_text_data(file = "Iris.txt", delimiter = ",", header = False , label_col_is_str = True , label_col = 4)

In [7]:
# first 5 rows of the data. The 5th column was the label. The label was string. The function mapped it to integer. The map of label
# is stored in the d_map
print(Iris[0:5 ,:])

[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


In [8]:
print(d_map)

{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}


## Step 2: Split the Data to Train and Test

In [9]:
train_test_data = split_train_test(data = Iris, percentage = 50, label = 4)

train_data_feature = train_test_data.train_feature
train_data_label = train_test_data.train_label
test_feature = train_test_data.test_feature
true_label = train_test_data.test_label

## Step 3: Implementation of KNN Algorithm

In [10]:
predicted_label = K_nearest_neighbour(feature_sample = train_data_feature, feature_label = train_data_label, test_sample = test_feature, k = 10)

## Step 4: Performance Measurement

In [11]:
acuraccy, con_table = confusion_matrix(true_label = true_label, pred_label = predicted_label, percentage = True)

In [12]:
print(acuraccy)

69.33


In [13]:
print(con_table)

[[32.    0.    0.  ]
 [ 0.   21.33 16.  ]
 [ 0.   14.67 16.  ]]
