In [153]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
import sys

In [154]:
# loading the files
df_train = pd.read_csv("../CSV_files/sonar_train.csv")
df_test = pd.read_csv("../CSV_files/sonar_test.csv")

In [183]:
# minkowski distance implementation
def minkDist(q, a, b):
    '''q = 1(man) or 2(euc); a and b are any same-sized tuples'''
    z = []
    zSum = 0
    if (len(a) != len(b)): # checks tuples are of the same length
        print("tuple length mismatch")
    else:      
        for i in range(0, len(a)): # fills z with distance by plane, to the necessary power
            z.append(abs(a[i]-b[i])**q)  
        for j in range(0, len(z)): # sums all planar distances
            zSum += z[j]
        dist = zSum**(1/q) #uses the necessary root for the values
            
        return dist
    
a = (2,2)
b = (5,6)

print("Manhattan Distance: ", minkDist(1, a, b)) # 7 expected
print("Euclidean Distance: ", minkDist(2, a, b)) # 5 expected

Manhattan Distance:  7.0
Euclidean Distance:  5.0


In [186]:
# neighrest neighbour implementation
def nN(distType, newObj, otherObjs):
    '''newObj is a tuple, otherObjs is a list of tuples, returns nearest object'''
    closestVal = sys.maxsize
    closestObj = -1
    for i in otherObjs:
        dFromNew = minkDist(distType, newObj, i)
        if (dFromNew < closestVal):
            closestVal = dFromNew
            closestObj = i
    return closestObj

a = (-4, 7)
bs = ((0,0), (-3,5), (10,12), (-4,6), (2,-9)) 

print("Nearest Neighbour: ", nN(2, a, bs)) # (-4, 6) expected

Nearest Neighbour:  (-4, 6)


In [196]:
def dfToTupList(df):
    '''converts pandas dataframe to lists of rows, minus the Class Parameter'''
    recList = list(df.to_records(index=False))
    tupList = []
    for obj in recList:
        oVals = tuple(obj)[:-1]
        tupList.append(oVals)
    return(tupList)

print("outer return type: ", type(dfToTupList(df_train))) # list expected
print("inner return type: ", type(dfToTupList(df_train)[0])) # tuple expected

outer return type:  <class 'list'>
inner return type:  <class 'tuple'>


In [202]:
def getPredictions(q, testList, trainList):
    '''2 lists of tuples required'''
    testPreds = []
    for obj in testList:
        closest = nN(q, obj, trainList)
        index = trainList.index(closest) # indexes are the same, predicted class from the training data
        testPreds.append(df_train["Class"][index])
    return(testPreds)

print("Predictions: ", getPredictions(2, dfToTupList(df_test), dfToTupList(df_train))) #expected list of classes

Predictions:  ['R', 'M', 'M', 'R', 'R', 'M', 'M', 'M', 'M', 'M', 'R', 'R', 'R', 'R', 'M', 'M', 'M', 'R', 'M', 'M', 'M', 'R', 'R', 'R', 'R', 'M', 'R', 'R', 'M', 'M', 'M', 'M', 'M', 'R', 'M', 'M', 'M', 'M', 'M', 'R', 'R', 'M', 'M', 'M', 'M', 'R', 'R', 'M', 'R', 'R', 'M', 'R', 'R', 'M', 'M', 'R', 'M', 'R', 'M', 'M', 'R', 'M', 'M', 'R', 'M', 'M', 'M', 'M', 'M']


In [175]:
def modelEvaluator(predictions, actual):
    '''returns model evaluation statistics - accuracy, sensitivity, specificity and precision'''
    # "True" means correct, "Positive" means desireable (metal)
    tp = tn = fp = fn = 0
    for i in actual:
        for j in predictions:
            if (j == "M"):
                if (j == i):
                    tp +=1
                elif (j != i):
                    fp += 1
            elif (j == "R"):
                if (j == i):
                    tn +=1
                elif (j != i):
                    fn += 1

    totLen = tp + fp + tn + fn
    # accuracy: 
    accScore = 100 * (tn+tp) / totLen
    # sensitivity
    senScore = 100 * tp / (tp + fn) 
    # specificity
    speScore = 100 * tn / (tn + fp) 
    # precision
    preScore = 100 * tp / (tp + fp)

    print("accuracy: ", accScore, "%")
    print("sensitivity: ", senScore, "%")
    print("specificity: ", speScore, "%")
    print("precision: ", preScore, "%")
    
    # to complete the task at the end of part 1b
    return(accScore)

In [204]:
def trainEval(q, testData, trainData):
    
    testList = dfToTupList(testData)
    trainList = dfToTupList(trainData)
    predList = getPredictions(q, testList, trainList)
    acc = modelEvaluator(predList, df_test["Class"])
    
    return(acc)

In [205]:
# run the function and find the best accuracy
accuracies = []
for i in range(1, 21): #they want accuracies between 1 and 20
    print("q=", i, ": ")
    accuracies.append(trainEval(i, df_train, df_test))

# to find the highest accuracy. Note: I initiate index counting variables to -1 to recognise potential errors
topAcc = 0
topQ = -1
for i in range(0, len(accuracies)):
    if (i > topAcc):
        topAcc = accuracies[i]
        topQ = i+1 # accuracies is 0 indexed so i is always 1 lower than the q value. Order used is always same

print("most accurate q value = ", topQ)

q= 1 : 
accuracy:  49.45261182358461 %
sensitivity:  42.44604316546763 %
specificity:  57.55395683453237 %
precision:  53.6231884057971 %
q= 2 : 
accuracy:  49.40047961630695 %
sensitivity:  41.726618705035975 %
specificity:  58.273381294964025 %
precision:  53.6231884057971 %
q= 3 : 
accuracy:  49.40047961630695 %
sensitivity:  41.726618705035975 %
specificity:  58.273381294964025 %
precision:  53.6231884057971 %
q= 4 : 
accuracy:  49.29621520175164 %
sensitivity:  40.28776978417266 %
specificity:  59.71223021582734 %
precision:  53.6231884057971 %
q= 5 : 
accuracy:  49.244082994473985 %
sensitivity:  39.568345323741006 %
specificity:  60.431654676258994 %
precision:  53.6231884057971 %
q= 6 : 
accuracy:  49.244082994473985 %
sensitivity:  39.568345323741006 %
specificity:  60.431654676258994 %
precision:  53.6231884057971 %
q= 7 : 
accuracy:  49.19195078719633 %
sensitivity:  38.84892086330935 %
specificity:  61.15107913669065 %
precision:  53.6231884057971 %
q= 8 : 
accuracy:  49.19