In [1]:
import math

import pandas as pd
import numpy as np

In [None]:
def euc_dist(vec1, vec2):
    """
    Return the Euclidean distance between two feature vectors.

    Parameters
    ----------
    vec1, vec2 : iterable of numbers
        Feature values of the two points (lists, tuples, pandas rows, ...).
        The vectors are paired element by element with ``zip``, so when
        their lengths differ the surplus values of the longer one are ignored.

    Returns
    -------
    float
        Square root of the sum of squared element-wise differences
        (``0.0`` when either vector is empty).
    """
    # Generator keeps memory flat; sum() consumes it exactly once.
    squared_diffs = ((a - b) ** 2 for a, b in zip(vec1, vec2))
    return math.sqrt(sum(squared_diffs))

In [None]:
def neighbor_ident(database, inputdata):
    """Rank every ``database`` row by its distance to each ``inputdata`` row.

    Parameters
    ----------
    database : pandas.DataFrame
        Archived (training) points, one per row.
    inputdata : pandas.DataFrame
        Points whose neighbours are wanted, one per row.

    Returns
    -------
    pandas.DataFrame
        One column per ``inputdata`` index label. Row ``r`` of a column holds
        the ``database`` index label of the r-th closest point (row 0 is the
        nearest). The frame is empty when ``inputdata`` has no rows and has
        zero rows when ``database`` has none.

    Notes
    -----
    Whole rows are handed to ``euc_dist`` unchanged.
    NOTE(review): this presumably requires every column to be numeric, so any
    label column must be dropped by the caller -- confirm against the data.
    """
    ranked_neighbors = {}

    for query_label, query_row in inputdata.iterrows():
        # One distance per archived row, keyed by that row's index label.
        distances = pd.Series(
            [euc_dist(query_row, stored_row) for _, stored_row in database.iterrows()],
            index=database.index,
            dtype=float,
        )
        # Only the ordering matters, so keep the labels and drop the distances.
        # mergesort is stable: equidistant neighbours keep their database order.
        ranked_neighbors[query_label] = list(
            distances.sort_values(kind="mergesort").index
        )

    # Build the frame in one go instead of inserting column after column.
    return pd.DataFrame(ranked_neighbors)

In [None]:
def class_prediction(inputdata, database, k):
    """Predict a class for every row of ``inputdata`` by k-nearest-neighbour vote.

    Parameters
    ----------
    inputdata : pandas.DataFrame
        Points to classify, one per row.
    database : pandas.DataFrame
        Training points; must provide a ``Type`` column with the class labels.
    k : int
        Number of nearest neighbours that vote. Values larger than the number
        of training rows simply use every training row.

    Returns
    -------
    pandas.Series
        Integer class codes, one per ``inputdata`` row, indexed ``0..n-1``.
        Codes are those produced by ``pd.factorize(database["Type"])``, i.e.
        numbered in order of first appearance in the training data. ``-1`` is
        returned for a row whose neighbours carry no class label at all.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    # Turn the string class names into integer codes (missing labels become -1).
    class_codes, _ = pd.factorize(database["Type"])

    if len(inputdata) == 0:
        return pd.Series([], dtype="int64")

    # Look codes up by index label so a database without a 0..n-1 index
    # (e.g. a train/test split) still resolves to the right row.
    code_by_label = pd.Series(class_codes, index=database.index)

    # The ranking does not depend on k: compute it once and slice the top k.
    ranked = neighbor_ident(database, inputdata)

    predictions = []
    for position in range(ranked.shape[1]):
        nearest_labels = ranked.iloc[:k, position].tolist()
        votes = code_by_label.loc[nearest_labels].to_numpy()
        votes = votes[votes >= 0]  # unlabelled neighbours do not vote
        if votes.size == 0:
            predictions.append(-1)
            continue
        # np.unique returns the codes sorted, and argmax picks the first
        # maximum, so a tie goes to the lowest class code.
        codes, counts = np.unique(votes, return_counts=True)
        predictions.append(int(codes[np.argmax(counts)]))

    return pd.Series(predictions, dtype="int64")

In [None]:
def accuracy_check(k, inputdata, database=None):
    """Count how many rows of ``inputdata`` are classified correctly for a given k.

    Parameters
    ----------
    k : int
        Number of neighbours passed on to ``class_prediction``.
    inputdata : pandas.DataFrame
        Testing data. The value at column position 3 of each row is read as
        its true class label.
        NOTE(review): position 3 is taken over from the existing code --
        confirm it matches the data schema.
    database : pandas.DataFrame, optional
        Training data with a ``Type`` column. When omitted, a notebook-level
        variable named ``database`` is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    int
        Number of rows whose predicted class code equals the code of their
        true label. A true label that never occurs in ``database["Type"]``
        can never count as correct.

    Raises
    ------
    NameError
        If ``database`` is not passed and no global of that name exists.
    """
    if len(inputdata) == 0:
        return 0

    if database is None:
        try:
            database = globals()["database"]
        except KeyError:
            raise NameError(
                "name 'database' is not defined; pass it as the third argument"
            ) from None

    # Encode the true labels with the same factorization class_prediction
    # applies to the training labels, so both sides share one code table.
    _, class_names = pd.factorize(database["Type"])
    code_of = {name: code for code, name in enumerate(class_names)}
    actual_codes = np.array(
        [code_of.get(label, -1) for label in inputdata.iloc[:, 3]]
    )

    # Predict once for the whole frame rather than once per row.
    predicted_codes = np.asarray(class_prediction(inputdata, database, k))

    return int(np.sum(predicted_codes == actual_codes))

In [None]:
def k_choice(database, inputdata):
    """Score every candidate k on the testing data.

    Parameters
    ----------
    database : pandas.DataFrame
        Training data; its row count is the largest k that is tried.
    inputdata : pandas.DataFrame
        Testing data handed to ``accuracy_check``.

    Returns
    -------
    dict
        Maps each k in ``1..len(database)`` to the percentage (0-100) of
        ``inputdata`` rows predicted correctly. Empty when either frame has
        no rows, since there is then nothing to score.
    """
    sample_count = len(inputdata)
    if sample_count == 0:
        # Avoids dividing by zero below.
        return {}

    # NOTE(review): accuracy_check is called in its two-argument form, so the
    # training data it scores against is whatever that function resolves on
    # its own; ``database`` here only bounds the range of k.
    return {
        k: (accuracy_check(k, inputdata) / sample_count) * 100
        for k in range(1, len(database) + 1)
    }

In [None]:
def knn_interface(database, inputdata, k):
    """Classify ``inputdata`` and return human-readable class names.

    Parameters
    ----------
    database : pandas.DataFrame
        Training data with a ``Type`` column of class names.
    inputdata : pandas.DataFrame
        Points to classify, one per row.
    k : int
        Number of neighbours passed on to ``class_prediction``.

    Returns
    -------
    pandas.Series
        One class name per ``inputdata`` row, indexed ``0..n-1``. A predicted
        code with no matching name in ``database["Type"]`` is reported as
        ``'Unknown'``. Empty when ``inputdata`` has no rows.
    """
    if len(inputdata) == 0:
        return pd.Series([], dtype=object)

    # class_prediction returns pd.factorize codes of database["Type"];
    # recover the matching names from the same factorization.
    _, class_names = pd.factorize(database["Type"])

    # Predict once for the whole frame rather than once per row.
    predicted_codes = class_prediction(inputdata, database, k)

    def decode(code):
        # Out-of-range (or NaN) codes have no name to map back to.
        if 0 <= code < len(class_names):
            return class_names[int(code)]
        return 'Unknown'

    return pd.Series([decode(code) for code in predicted_codes], dtype=object)