# Project Navigation Guidelines

# Trial Description

# Core Libraries

In [1]:
import pandas as pd #from pandas import read_csv, pivot_table
import numpy as np
import math
import operator

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing

import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Exploratory Data Analysis

# Prepare/Load Relevant Dataset

In [2]:
# Read csv file into dataframe.
csvfile = pd.read_csv('distanceLog_130320.csv')
relDf = pd.DataFrame(csvfile)
# print(relDf)
relDf.columns = relDf.columns.str.strip() #strip unintended whitespaces

# print("Columns: ", relDf.columns)
relDf = relDf[['Date','Time', 'ID2', 'RSSI', 'Label']] #select relevant columns
# df = df.drop(columns=['Type', 'ID0', 'ID1', 'Sensor Data', 'Temperature', 'Humidity', 'Raw Data'])
print(relDf)

         Date       Time      ID2  RSSI  Label
0     3/12/20   15:46:25   0x0001   192      1
1     3/12/20   15:46:26   0x0003   190      1
2     3/12/20   15:46:26   0x0002   192      1
3     3/12/20   15:46:26   0x0006   192      1
4     3/12/20   15:46:27   0x0001   191      1
...       ...        ...      ...   ...    ...
9463  3/12/20   16:44:59   0x0006   191      5
9464  3/12/20   16:45:00   0x0004   191      5
9465  3/12/20   16:45:00   0x0005   177      5
9466  3/12/20   16:45:01   0x0002   177      5
9467  3/12/20   16:45:01   0x0003   179      5

[9468 rows x 5 columns]


In [None]:
# Resample/Reshape Dataset

In [3]:
# Preparation for Date/Time Differences
# ----------------------------------------

# Combine date and time to datetime
readings = relDf.copy()
readings['Datetime'] = pd.to_datetime(readings['Date'].apply(str)+' '+readings['Time']) #create new Datetime field
dateTime = readings['Datetime']
readings = readings.drop(columns=['Datetime'])
readings.insert(0, 'Datetime', dateTime)
readings = readings.drop(columns=['Date','Time'])
readings.tail()

from datetime import datetime as dt
earliestDate = readings.Datetime.dt.date.min()
print(readings.Datetime.dt.date.iloc[-1])
# print(earliestDate - readings.Datetime.dt.date.iloc[-1])

# Function that expresses timestamps in seconds.
def time_to_sec(timeArray):
    timeSecArray = []
    for i in range(len(timeArray)):
        seconds = (timeArray[i].hour * 60 + timeArray[i].minute) * 60 + timeArray[i].second
        timeSecArray.append(seconds)
    return(timeSecArray)

# Function that calculates day difference between dates in seconds
def date_to_sec(dateArray):
    dateSecArray = []
    for i in range(len(dateArray)):
        dayDiff = dateArray[i] - earliestDate
        dateSecArray.append(dayDiff.total_seconds())
    return(dateSecArray)


readings['DateSec'] = date_to_sec(readings.Datetime.dt.date)
readings['TimeSec'] = time_to_sec(readings.Datetime.dt.time)
readings['DatetimeSec'] = readings['DateSec'] + readings['TimeSec']
readings = readings[['DatetimeSec', 'ID2', 'RSSI', 'Label']]
print(readings.head(15))

# pd.set_option('display.max_columns', 10)
# pd.set_option('display.min_rows', 50)
# pd.set_option('display.width', 500)

readings.DatetimeSec = (readings.DatetimeSec - readings.DatetimeSec.min()) // 2
time_diff_btw_rows = readings.DatetimeSec.diff().fillna(0)
# print(time_diff_btw_rows)
time_diff_btw_rows -= (time_diff_btw_rows > 1) & (readings.DatetimeSec % 2 > 0)
readings.DatetimeSec -= (time_diff_btw_rows - 1).clip(lower=0).cumsum()
readings.ID2 = readings.ID2.transform(lambda hexadecimal: int(hexadecimal, 16))
resampled_readings = readings.groupby(["DatetimeSec", "Label", "ID2"]).mean().reset_index()
pivoted_readings = pd.pivot_table(resampled_readings, values="RSSI", columns=["ID2"], index=["DatetimeSec", "Label"]).fillna(0)

writeFile = True
if (writeFile == True):
    pivoted_readings.to_csv('distanceLog_resample.csv')
    print("FILE WRITTEN")

pivoted_readings.head()

2020-03-12
    DatetimeSec      ID2  RSSI  Label
0       56785.0   0x0001   192      1
1       56786.0   0x0003   190      1
2       56786.0   0x0002   192      1
3       56786.0   0x0006   192      1
4       56787.0   0x0001   191      1
5       56787.0   0x0005   188      1
6       56788.0   0x0004   193      1
7       56789.0   0x0001   192      1
8       56790.0   0x0003   191      1
9       56790.0   0x0002   183      1
10      56790.0   0x0006   190      1
11      56790.0   0x0005   184      1
12      56791.0   0x0004   185      1
13      56791.0   0x0001   191      1
14      56793.0   0x0005   183      1
FILE WRITTEN


Unnamed: 0_level_0,ID2,1,2,3,4,5,6
DatetimeSec,Label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-40211.0,5,0.0,0.0,182.0,0.0,179.0,0.0
-40210.0,5,177.0,177.0,0.0,189.0,0.0,0.0
-40209.0,5,177.0,178.0,181.0,0.0,180.0,187.0
-40208.0,5,184.0,0.0,0.0,186.0,185.0,190.0
-40207.0,5,185.0,178.0,190.0,185.0,181.0,0.0


In [4]:
resampledDS = pd.read_csv('distanceLog_resample.csv')
tempSeries = resampledDS['Label']
resampledDS = resampledDS.drop(columns=['Label', 'DatetimeSec'])
resampledDS["Label"] = tempSeries
resampledDS.head()
lastIndex = len(resampledDS)
numCol = len(resampledDS.columns)

ips_data = resampledDS.iloc[:lastIndex, :numCol-1]
# print(ips_data)
ips_labels = resampledDS.iloc[:lastIndex, numCol-1:]
# print(ips_labels)


# #dataset.iloc[rowrange, columnrange]
X = ips_data.to_numpy()
y = ips_labels.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True,test_size=0.3) 
y_train = y_train.reshape(len(y_train),)

k_value_rootN = math.ceil(math.sqrt(len(X_train)))
print("estimated k value: ", k_value_rootN)
print("X_test: ", X_test)
print("y_test: ", y_test)

estimated k value:  41
X_test:  [[183. 185.   0. 183. 187.   0.]
 [185. 185. 191.   0.   0.   0.]
 [188. 187.   0. 187. 188. 178.]
 ...
 [  0.   0. 179. 192. 177.   0.]
 [180. 192.   0. 190. 190. 190.]
 [192. 182.   0.   0. 180. 190.]]
y_test:  [[4]
 [4]
 [4]
 [1]
 [3]
 [3]
 [1]
 [4]
 [1]
 [2]
 [1]
 [1]
 [2]
 [4]
 [1]
 [5]
 [2]
 [5]
 [5]
 [2]
 [3]
 [1]
 [1]
 [4]
 [2]
 [2]
 [5]
 [5]
 [2]
 [2]
 [4]
 [1]
 [2]
 [2]
 [4]
 [2]
 [3]
 [1]
 [4]
 [2]
 [3]
 [4]
 [5]
 [4]
 [2]
 [2]
 [1]
 [3]
 [3]
 [5]
 [4]
 [3]
 [3]
 [5]
 [4]
 [2]
 [2]
 [3]
 [5]
 [2]
 [1]
 [5]
 [1]
 [1]
 [4]
 [5]
 [3]
 [2]
 [2]
 [3]
 [1]
 [5]
 [4]
 [1]
 [1]
 [4]
 [3]
 [2]
 [5]
 [2]
 [1]
 [3]
 [1]
 [5]
 [3]
 [5]
 [1]
 [1]
 [5]
 [4]
 [1]
 [1]
 [1]
 [1]
 [3]
 [1]
 [2]
 [4]
 [4]
 [1]
 [2]
 [1]
 [5]
 [1]
 [4]
 [1]
 [1]
 [2]
 [1]
 [4]
 [1]
 [5]
 [2]
 [4]
 [2]
 [3]
 [2]
 [3]
 [4]
 [4]
 [2]
 [2]
 [2]
 [4]
 [2]
 [2]
 [5]
 [2]
 [2]
 [4]
 [5]
 [3]
 [1]
 [3]
 [4]
 [3]
 [3]
 [4]
 [4]
 [4]
 [5]
 [2]
 [5]
 [2]
 [4]
 [5]
 [4]
 [3]
 [3]
 [5]
 [4]


# Parameter Optimisation

In [5]:
# k_range = list(range(1, 31))
# print("k_range: ", k_range)

# classifier = KNeighborsClassifier()

# param_grid = dict(n_neighbors=k_range)
# print("param_grid: ", param_grid)

# grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy')

# grid.fit(X,y)
# print("best_estimator: ", grid.best_estimator_)

# KNN Algorithm

In [6]:
n_samples = 20

#Function calculates euclidean distance between two n-dimensional data instances 
def euclideanDistance(instance1, instance2):
    #handles if instances are lists or tuples:
    instance1 = np.array(instance1) 
    instance2 = np.array(instance2)
    
    '''
    https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.norm.html
    uses 2-norm frobenius norm and returns euclidean distance
    '''
    return np.linalg.norm(instance1 - instance2) #euclidean distance

In [7]:
#Function finds nearest neighbours; nearest -> smallest euclidean distance
def get_neighbors(training_set, 
                  labels, 
                  test_instance, 
                  k, 
                  distance=euclideanDistance):
    """
    get_neighbors calculates a list of the k nearest neighbors
    of an instance 'test_instance'.
    The list neighbors contains 3-tuples with  
    (index, dist, label)
    where
    index    is the index from the training_set, 
    dist     is the distance between the test_instance and the 
             instance training_set[index]
    distance is a reference to a function used to calculate the 
             distances
    """
    distances = [] #empty distance array
    
    #calculates euclidean distance between test_instance and ALL other instances in training_set
    for index in range(len(training_set)):
        dist = euclideanDistance(test_instance, training_set[index])
        distances.append((training_set[index], dist, labels[index]))
    distances.sort(key=lambda x: x[1])
    neighbors = distances[:k]
    return neighbors # The list neighbors contains 3-tuples with (index, dist, label)

In [8]:
# TEST OUTPUT

outArray = []
for i in range(n_samples):
    neighbors = get_neighbors(X_train, 
                              y_train, 
                              X_test[i], 
                              k_value_rootN, 
                              distance=euclideanDistance)
#     print(i,
#           X_test[i],
#           y_test[i],
#           neighbors)
    
    outArray.append([i,
          X_test[i],
          y_test[i],
          neighbors])

out_df = pd.DataFrame(outArray, columns=['i', 'X_test', 'y_test', 'neighbours'])
out_df.head()
# out_df.tail()

Unnamed: 0,i,X_test,y_test,neighbours
0,0,"[183.0, 185.0, 0.0, 183.0, 187.0, 0.0]",[4],"[([186.0, 186.0, 0.0, 186.0, 188.0, 0.0], 4.47..."
1,1,"[185.0, 185.0, 191.0, 0.0, 0.0, 0.0]",[4],"[([189.0, 183.0, 191.0, 0.0, 0.0, 0.0], 4.4721..."
2,2,"[188.0, 187.0, 0.0, 187.0, 188.0, 178.0]",[4],"[([187.0, 186.0, 0.0, 185.0, 188.0, 179.0], 2...."
3,3,"[193.0, 187.0, 0.0, 0.0, 187.0, 0.0]",[1],"[([194.0, 188.0, 0.0, 0.0, 187.0, 0.0], 1.4142..."
4,4,"[182.0, 188.0, 188.0, 0.0, 190.0, 189.0]",[3],"[([185.0, 191.0, 188.0, 0.0, 189.0, 192.0], 5...."


In [11]:
from collections import Counter

#Function enables voting mechanic in KNN for Classification according to majority class vote
def vote(neighbors):
    class_counter = Counter() #A Counter is a dict subclass for counting hashable objects. 
    for neighbor in neighbors:
        class_counter[neighbor[2]] += 1 #neighbor[2] -> label for neighbor(s)
    return class_counter.most_common(1)[0][0]

In [12]:
k_value_knn = 1
for i in range(5):
    neighbors = get_neighbors(X_train, 
                              y_train, 
                              X_test[i], 
                              k_value_knn, 
                              distance=euclideanDistance)
    print("index: ", i, 
          ", result of vote: ", vote(neighbors), 
          ", label: ", y_train[i], 
          ", data: ", X_train[i])

index:  0 , result of vote:  4 , label:  4 , data:  [183.   0. 185. 183. 177. 182.]
index:  1 , result of vote:  1 , label:  1 , data:  [192.   0.   0.   0. 185.   0.]
index:  2 , result of vote:  4 , label:  4 , data:  [185.   0. 189. 182. 182.   0.]
index:  3 , result of vote:  1 , label:  4 , data:  [185. 184. 182. 177. 184. 182.]
index:  4 , result of vote:  3 , label:  1 , data:  [193.   0.   0. 190. 190. 194.]


In [13]:
#Function returns vote 'probability' - i.e. distribution/percentage majority vote
def vote_prob(neighbors):
    class_counter = Counter() # Counter object - https://docs.python.org/2/library/collections.html
    for neighbor in neighbors:
        class_counter[neighbor[2]] += 1 #add to count of target (class)
        
    # aggregates into tuples ~ zip(*iterables), 
    # Return a list of the n most common elements and their counts from the most common to the least.    
    labels, votes = zip(*class_counter.most_common()) #returns list of sorted most common [labels], [votes]
    #print("L|V: ", labels, votes)
    #print("Class Counter: ", class_counter.most_common)
    winner = class_counter.most_common(1)[0][0]       #majority label
    votes4winner = class_counter.most_common(1)[0][1] #majority vote count
    return winner, votes4winner/sum(votes)            #returns majority label, majority proportion

In [15]:
k_value_knn = k_value_rootN
writeKNN = True
fileNameKNN = 'distanceRSS_KNN.csv'
resultsKNN = pd.DataFrame(0, index=range(len(X_test)), columns=['index', 'data', 'vote_result', 'vote_prob', 'label', 'prediction'])

indexArray = []
dataArray = []
voteArray = []
probArray = []
labelArray = []
predArray = []
for i in range(len(X_test)):
    neighbors = get_neighbors(X_train, 
                              y_train, 
                              X_test[i], 
                              k_value_knn, 
                              distance=euclideanDistance)
    indexArray.append(i)
    dataArray.append(X_test[i])
    voteArray.append(vote(neighbors))
    probArray.append(vote_prob(neighbors))
    labelArray.append(y_test[i][0])
    predArray.append("CORRECT" if (vote(neighbors) == y_test[i]) else "WRONG")
#     print("index: ", i, 
#           ", result of vote: ", vote(neighbors), 
#           ", vote_prob: ", vote_prob(neighbors), 
#           ", label: ", y_test[i], "prediction: "
#         "CORRECT" if (vote(neighbors) == y_test[i]) else "WRONG"
#           ", data: ", X_test[i])

resultsKNN['index'] = indexArray
resultsKNN['data'] = dataArray
resultsKNN['vote_result'] = voteArray
resultsKNN['vote_prob'] = probArray
resultsKNN['label'] = labelArray 
resultsKNN['prediction'] = predArray

if (writeKNN == True):
    resultsKNN.to_csv(fileNameKNN, index=False)
    print("File "+fileNameKNN+" has been written.")
resultsKNN.head(20)

File distanceRSS_KNN.csv has been written.


Unnamed: 0,index,data,vote_result,vote_prob,label,prediction
0,0,"[183.0, 185.0, 0.0, 183.0, 187.0, 0.0]",4,"(4, 0.3902439024390244)",4,CORRECT
1,1,"[185.0, 185.0, 191.0, 0.0, 0.0, 0.0]",1,"(1, 0.2926829268292683)",4,WRONG
2,2,"[188.0, 187.0, 0.0, 187.0, 188.0, 178.0]",4,"(4, 0.3902439024390244)",4,CORRECT
3,3,"[193.0, 187.0, 0.0, 0.0, 187.0, 0.0]",2,"(2, 0.4146341463414634)",1,WRONG
4,4,"[182.0, 188.0, 188.0, 0.0, 190.0, 189.0]",1,"(1, 0.5853658536585366)",3,WRONG
5,5,"[184.0, 0.0, 187.0, 189.0, 189.0, 0.0]",2,"(2, 0.43902439024390244)",3,WRONG
6,6,"[191.0, 188.0, 196.0, 0.0, 0.0, 194.0]",1,"(1, 0.2926829268292683)",1,CORRECT
7,7,"[188.0, 185.0, 0.0, 185.0, 189.0, 0.0]",1,"(1, 0.2926829268292683)",4,WRONG
8,8,"[194.0, 191.0, 194.0, 0.0, 185.0, 0.0]",1,"(1, 0.4878048780487805)",1,CORRECT
9,9,"[191.0, 189.0, 0.0, 0.0, 193.0, 0.0]",2,"(2, 0.5609756097560976)",2,CORRECT


# Rank (Harmonic) Weighted KNN

In [18]:
#Function for weighted KNN voting mechanic; harmonic weights based on ranking of datapoint (vote += 1/rank)
def vote_harmonic_weights(neighbors, all_results=True):
    class_counter = Counter()
    number_of_neighbors = len(neighbors)
    for index in range(number_of_neighbors):
        #weighted count of votes
#         print("Index: ", index)
#         print(class_counter[neighbors[index][2]])
#         print(neighbors[index][2][0])
        class_counter[neighbors[index][2]] += ( 1/(index+1) ) #add (1/ neighbour_rank) for each vote; index+1 = neighbour_rank
    labels, votes = zip(*class_counter.most_common())
    #print(labels, votes)
    winner = class_counter.most_common(1)[0][0]
    votes4winner = class_counter.most_common(1)[0][1]
    if all_results:
        total = sum(class_counter.values(), 0.0)
        
        for key in class_counter:
             class_counter[key] /= total #returns vote proportion for key ("class") in class_counter
        return winner, class_counter.most_common()
    else:
        return winner, votes4winner / sum(votes)

In [20]:
k_value_rwknn = k_value_rootN
writeRWKNN = True
fileNameRWKNN = 'distanceRSS_RWKNN.csv'
resultsRWKNN = pd.DataFrame(0, index=range(len(X_test)), columns=['index', 'data', 'label', 'vote_result', 'prediction'])

indexArray = []
dataArray = []
labelArray = []
voteArray = []
predArray = []

for i in range(len(X_test)):
    neighbors = get_neighbors(X_train, 
                              y_train, 
                              X_test[i], 
                              k_value_rwknn, 
                              distance=euclideanDistance)
    
    indexArray.append(i)
    dataArray.append(X_test[i])
    labelArray.append(y_test[i])
    voteResRank = vote_harmonic_weights(neighbors,all_results=True)[0]
    voteArray.append(voteResRank)
    predArray.append("CORRECT" if (voteResRank == y_test[i]) else "WRONG")
    
#     print("index:", i, 
#           ",result of vote: ", 
#           vote_harmonic_weights(neighbors,
#                                 all_results=True), 
#           ",label: ", y_test[i], 
#         ",prediction: ",
#         "CORRECT" if ( vote_harmonic_weights(neighbors,
#                                 all_results=True)[0] == y_test[i]) else "WRONG")
    
resultsRWKNN['index'] = indexArray
resultsRWKNN['data'] = dataArray
resultsRWKNN['label'] = labelArray
resultsRWKNN['vote_result'] = voteArray
resultsRWKNN['prediction'] = predArray

if (writeRWKNN == True):
    resultsRWKNN.to_csv(fileNameRWKNN, index=False)
    print("File "+fileNameRWKNN+" has been written.")
resultsRWKNN


File distanceRSS_RWKNN.csv has been written.


Unnamed: 0,index,data,label,vote_result,prediction
0,0,"[183.0, 185.0, 0.0, 183.0, 187.0, 0.0]",[4],4,CORRECT
1,1,"[185.0, 185.0, 191.0, 0.0, 0.0, 0.0]",[4],1,WRONG
2,2,"[188.0, 187.0, 0.0, 187.0, 188.0, 178.0]",[4],4,CORRECT
3,3,"[193.0, 187.0, 0.0, 0.0, 187.0, 0.0]",[1],1,CORRECT
4,4,"[182.0, 188.0, 188.0, 0.0, 190.0, 189.0]",[3],5,WRONG
...,...,...,...,...,...
714,714,"[184.0, 179.0, 182.0, 183.0, 0.0, 177.0]",[3],4,WRONG
715,715,"[194.0, 0.0, 196.0, 188.0, 0.0, 191.0]",[1],1,CORRECT
716,716,"[0.0, 0.0, 179.0, 192.0, 177.0, 0.0]",[5],5,CORRECT
717,717,"[180.0, 192.0, 0.0, 190.0, 190.0, 190.0]",[3],3,CORRECT


# Distance Weighted KNN

In [23]:
#Function for distance weighted KNN voting mechanic; weight based on euclidean distance (vote += 1/distance)
def vote_distance_weights(neighbors, all_results=True):
    debug = False
    class_counter = Counter()
    number_of_neighbors = len(neighbors)
    for index in range(number_of_neighbors):
        dist = neighbors[index][1]
        label = neighbors[index][2]
        if (debug == True):
            print("Neighbour record: Label is ", label, "with distance ", dist)
            print("Existing count for ", label, "is ", class_counter[label[0]])
            print("Adding count by ", (1 / (1 if dist == 0 else dist)))
        class_counter[label] += (1 / (1 if dist == 0 else dist)) #sensitivity of distance weight can be adjusted here.
        
        
    labels, votes = zip(*class_counter.most_common())
    #print(labels, votes)
    winner = class_counter.most_common(1)[0][0]
    votes4winner = class_counter.most_common(1)[0][1]
    if (debug == True):
        print("COUNTER: ", class_counter)
        print("----------------------------------------------------")
        print("winner: ", winner)
        print("votes: ", votes4winner)
    
    if all_results:
        total = sum(class_counter.values(), 0.0)
        
        for key in class_counter:
            class_counter[key] /= total
        if (debug == True):
            print("total votes: ", total)
            print("AFTER NORMALISING: ", class_counter)
        return winner, class_counter.most_common()
    else:
        return winner, votes4winner / sum(votes)

In [24]:
k_value_dwknn = 1
testRange = 5
for i in range(testRange):
    neighbors = get_neighbors(X_train, 
                              y_train, 
                              X_test[i], 
                              k_value_dwknn, 
                              distance=euclideanDistance)
    res = vote_distance_weights(neighbors, all_results=True)
    
    print("index: ", i, 
      ", result of vote: ", res,
    ",label: ", y_test[i], 
    ",prediction: ",
    "CORRECT" if ( res[0] == y_test[i]) else "WRONG")
    print("\n\n")


index:  0 , result of vote:  (4, [(4, 1.0)]) ,label:  [4] ,prediction:  CORRECT



index:  1 , result of vote:  (1, [(1, 1.0)]) ,label:  [4] ,prediction:  WRONG



index:  2 , result of vote:  (4, [(4, 1.0)]) ,label:  [4] ,prediction:  CORRECT



index:  3 , result of vote:  (1, [(1, 1.0)]) ,label:  [1] ,prediction:  CORRECT



index:  4 , result of vote:  (3, [(3, 1.0)]) ,label:  [3] ,prediction:  CORRECT





In [26]:
k_value_dwknn = k_value_rootN
writeDWKNN = True
fileNameDWKNN = 'distanceRSS_DWKNN.csv'
resultsDWKNN = pd.DataFrame(0, index=range(len(X_test)), columns=['index', 'data', 'label', 'vote_result', 'prediction'])

indexArray = []
dataArray = []
labelArray = []
voteArray = []
predArray = []

# need to normalise distance!!!!

for i in range(len(X_test)):
    neighbors = get_neighbors(X_train, 
                              y_train, 
                              X_test[i], 
                              k_value_dwknn, 
                              distance=euclideanDistance)
    vote_distance_weights(neighbors, all_results=True)
    
    indexArray.append(i)
    dataArray.append(X_test[i])
    labelArray.append(y_test[i])
    voteDistWRes = vote_distance_weights(neighbors,all_results=True)[0]
    voteArray.append(voteDistWRes)
    predArray.append("CORRECT" if (voteDistWRes == y_test[i]) else "WRONG")

#     print("index: ", i, 
#           ", result of vote: ", vote_distance_weights(neighbors,
#                                                       all_results=True),
#         ",label: ", y_test[i], 
#         ",prediction: ",
#         "CORRECT" if ( vote_harmonic_weights(neighbors,
#                                 all_results=True)[0] == y_test[i]) else "WRONG")
    
resultsDWKNN['index'] = indexArray
resultsDWKNN['data'] = dataArray
resultsDWKNN['label'] = labelArray
resultsDWKNN['vote_result'] = voteArray
resultsDWKNN['prediction'] = predArray


if (writeDWKNN == True):
    resultsDWKNN.to_csv(fileNameDWKNN, index=False)
    print("File "+fileNameDWKNN+" has been written.")
resultsDWKNN



File distanceRSS_DWKNN.csv has been written.


Unnamed: 0,index,data,label,vote_result,prediction
0,0,"[183.0, 185.0, 0.0, 183.0, 187.0, 0.0]",[4],4,CORRECT
1,1,"[185.0, 185.0, 191.0, 0.0, 0.0, 0.0]",[4],1,WRONG
2,2,"[188.0, 187.0, 0.0, 187.0, 188.0, 178.0]",[4],4,CORRECT
3,3,"[193.0, 187.0, 0.0, 0.0, 187.0, 0.0]",[1],2,WRONG
4,4,"[182.0, 188.0, 188.0, 0.0, 190.0, 189.0]",[3],1,WRONG
...,...,...,...,...,...
714,714,"[184.0, 179.0, 182.0, 183.0, 0.0, 177.0]",[3],4,WRONG
715,715,"[194.0, 0.0, 196.0, 188.0, 0.0, 191.0]",[1],1,CORRECT
716,716,"[0.0, 0.0, 179.0, 192.0, 177.0, 0.0]",[5],5,CORRECT
717,717,"[180.0, 192.0, 0.0, 190.0, 190.0, 190.0]",[3],3,CORRECT


# Data Analysis via Visualisation

In [None]:

KNN_kcomp1 = pd.read_csv('distanceRSS_KNN.csv')
#count number of CORRECT/ WRONG.
KNN_kcomp1_results = KNN_kcomp1.groupby('prediction').count().data
count_correct_KNN_kcomp1 = (KNN_kcomp1_results.CORRECT)
count_wrong_KNN_kcomp1 = (KNN_kcomp1_results.WRONG)

RWKNN_kcomp1 = pd.read_csv('distanceRSS_RWKNN.csv')
#count number of CORRECT/ WRONG.
RWKNN_kcomp1_results = RWKNN_kcomp1.groupby('prediction').count().data
count_correct_RWKNN_kcomp1 = (RWKNN_kcomp1_results.CORRECT)
count_wrong_RWKNN_kcomp1 = (RWKNN_kcomp1_results.WRONG)

DWKNN_kcomp1 = pd.read_csv('distanceRSS_DWKNN.csv')
#count number of CORRECT/ WRONG.
DWKNN_kcomp1_results = DWKNN_kcomp1.groupby('prediction').count().data
count_correct_DWKNN_kcomp1 = (DWKNN_kcomp1_results.CORRECT)
count_wrong_DWKNN_kcomp1 = (DWKNN_kcomp1_results.WRONG)

# create plot
n_groups = 1 # number of groups
fig, ax = plt.subplots()
index = np.arange(n_groups) # x location for groups
bar_width = 1

#first bar - rects1+rects2
rects1 = plt.bar(index, count_correct_KNN_kcomp1, bar_width, bottom = count_wrong_KNN_kcomp1, 
                 color = 'g')
rects2 = plt.bar(index, count_wrong_KNN_kcomp1, bar_width, 
                 color = 'r')

rects3 = plt.bar(index + 2*bar_width, count_correct_RWKNN_kcomp1, bar_width, bottom = count_wrong_RWKNN_kcomp1, 
                 color = 'g')
rects4 = plt.bar(index + 2*bar_width, count_wrong_RWKNN_kcomp1, bar_width, 
                 color = 'r')

rects5 = plt.bar(index + 4*bar_width, count_correct_DWKNN_kcomp1, bar_width, bottom = count_wrong_DWKNN_kcomp1, 
                 color = 'g')
rects6 = plt.bar(index + 4*bar_width, count_wrong_DWKNN_kcomp1, bar_width,
                 color = 'r')

plt.xlabel('Prediction')
plt.ylabel('Count')
plt.title('Prediction Results)
plt.xticks([0,2*bar_width, 4*bar_width, 6*bar_width, 8*bar_width, 10*bar_width], ["KNN","RWKNN","DWKNN"])
# plt.legend(loc='best', bbox_to_anchor=(1,1))
plt.figure(figsize=(30,30))
plt.show()

In [None]:
'''
KNN_kcomp1 = pd.read_csv('E1_KNN_T1_k1.csv')
#count number of CORRECT/ WRONG.
KNN_kcomp1_results = KNN_kcomp1.groupby('prediction').count().data
count_correct_KNN_kcomp1 = (KNN_kcomp1_results.CORRECT)
count_wrong_KNN_kcomp1 = (KNN_kcomp1_results.WRONG)

DWKNN_kcomp1 = pd.read_csv('E1_DWKNN_T1_k1.csv')
#count number of CORRECT/ WRONG.
DWKNN_kcomp1_results = DWKNN_kcomp1.groupby('prediction').count().data
count_correct_DWKNN_kcomp1 = (DWKNN_kcomp1_results.CORRECT)
count_wrong_DWKNN_kcomp1 = (DWKNN_kcomp1_results.WRONG)

RWKNN_kcomp1 = pd.read_csv('E1_RWKNN_T1_k1.csv')
#count number of CORRECT/ WRONG.
RWKNN_kcomp1_results = RWKNN_kcomp1.groupby('prediction').count().data
count_correct_RWKNN_kcomp1 = (RWKNN_kcomp1_results.CORRECT)
count_wrong_RWKNN_kcomp1 = (RWKNN_kcomp1_results.WRONG)

KNN_kcomp2 = pd.read_csv('E1_KNN_T1_k38.csv')
#count number of CORRECT/ WRONG.
KNN_kcomp2_results = KNN_kcomp2.groupby('prediction').count().data
count_correct_KNN_kcomp2 = (KNN_kcomp2_results.CORRECT)
count_wrong_KNN_kcomp2 = (KNN_kcomp2_results.WRONG)

RWKNN_kcomp2 = pd.read_csv('E1_RWKNN_T1_k38.csv')
#count number of CORRECT/ WRONG.
RWKNN_kcomp2_results = RWKNN_kcomp2.groupby('prediction').count().data
count_correct_RWKNN_kcomp2 = (RWKNN_kcomp2_results.CORRECT)
count_wrong_RWKNN_kcomp2 = (RWKNN_kcomp2_results.WRONG)

DWKNN_kcomp2 = pd.read_csv('E1_DWKNN_T1_k38.csv')
#count number of CORRECT/ WRONG.
DWKNN_kcomp2_results = DWKNN_kcomp2.groupby('prediction').count().data
count_correct_DWKNN_kcomp2 = (DWKNN_kcomp2_results.CORRECT)
count_wrong_DWKNN_kcomp2 = (DWKNN_kcomp2_results.WRONG)


# create plot
n_groups = 1 # number of groups
fig, ax = plt.subplots()
index = np.arange(n_groups) # x location for groups
bar_width = 1

#first bar - rects1+rects2
rects1 = plt.bar(index, count_correct_KNN_kcomp1, bar_width, bottom = count_wrong_KNN_kcomp1, 
#                  label='KNN_1_CORRECT', 
                 color = 'g')
rects2 = plt.bar(index, count_wrong_KNN_kcomp1, bar_width, 
                 color = 'r')

rects3 = plt.bar(index + 2*bar_width, count_correct_KNN_kcomp2, bar_width, bottom = count_wrong_KNN_kcomp2, 
                 color = 'g')
rects4 = plt.bar(index + 2*bar_width, count_wrong_KNN_kcomp2, bar_width, 
                 color = 'r')

rects5 = plt.bar(index + 4*bar_width, count_correct_DWKNN_kcomp1, bar_width, bottom = count_wrong_DWKNN_kcomp1, 
                 color = 'g')
rects6 = plt.bar(index + 4*bar_width, count_wrong_DWKNN_kcomp1, bar_width,
                 color = 'r')

rects7 = plt.bar(index + 6*bar_width, count_correct_DWKNN_kcomp2, bar_width, bottom = count_wrong_DWKNN_kcomp2,
                 color = 'g')
rects8 = plt.bar(index + 6*bar_width, count_wrong_DWKNN_kcomp2, bar_width,
                 color = 'r')

rects9 = plt.bar(index + 8*bar_width, count_correct_RWKNN_kcomp1, bar_width, bottom = count_wrong_RWKNN_kcomp1,  
                 color = 'g')
rects10 = plt.bar(index + 8*bar_width, count_wrong_RWKNN_kcomp1, bar_width, 
                 color = 'r')

rects9 = plt.bar(index + 10*bar_width, count_correct_RWKNN_kcomp2, bar_width, bottom = count_wrong_RWKNN_kcomp2,  
                 color = 'g')
rects10 = plt.bar(index + 10*bar_width, count_wrong_RWKNN_kcomp2, bar_width, 
                 color = 'r')

plt.xlabel('Prediction')
plt.ylabel('Count')
plt.title('Prediction Results for KNN and DWKNN')
plt.xticks([0,2*bar_width, 4*bar_width, 6*bar_width, 8*bar_width, 10*bar_width], ["KNN_1", "KNN_38", "DWKNN_1", "DWKNN_38", "RWKNN_1", "RWKNN_38"])
# plt.legend(loc='best', bbox_to_anchor=(1,1))
plt.figure(figsize=(30,30))
plt.show()'''

In [None]:
'''KNNsummary1 = KNN_kcomp1.groupby('prediction').count().data
RWKNNsummary1 = RWKNN_kcomp1.groupby('prediction').count().data
DWKNNsummary1 = DWKNN_kcomp1.groupby('prediction').count().data

KNNsummary2 = KNN_kcomp2.groupby('prediction').count().data
RWKNNsummary2 = RWKNN_kcomp2.groupby('prediction').count().data
DWKNNsummary2 = DWKNN_kcomp2.groupby('prediction').count().data

print("--------------")
print("Summary")
print("--------------")
print("KNNcomp1: \n", KNNsummary1)
print("Accuracy: ", KNN_kcomp1_results.CORRECT / 
      (KNN_kcomp1_results.CORRECT + KNN_kcomp1_results.WRONG))
print("\nKNNcomp2: \n", KNNsummary2)
print("Accuracy: ", KNN_kcomp2_results.CORRECT / 
      (KNN_kcomp2_results.CORRECT + KNN_kcomp2_results.WRONG))
print("--------------")
print("RWKNNcomp1: \n", RWKNNsummary1)
print("Accuracy: ", RWKNN_kcomp1_results.CORRECT / 
      (RWKNN_kcomp1_results.CORRECT + RWKNN_kcomp1_results.WRONG))
print("\nRWKNNcomp2: \n", RWKNNsummary2)
print("Accuracy: ", RWKNN_kcomp2_results.CORRECT / 
      (RWKNN_kcomp2_results.CORRECT + RWKNN_kcomp2_results.WRONG))
print("--------------")
print("DWKNNcomp1: \n", DWKNNsummary1)
print("Accuracy: ", DWKNN_kcomp1_results.CORRECT / 
      (DWKNN_kcomp1_results.CORRECT + DWKNN_kcomp1_results.WRONG))
print("\nDWKNNcomp2: \n", DWKNNsummary2)
print("Accuracy: ", DWKNN_kcomp2_results.CORRECT / 
      (DWKNN_kcomp2_results.CORRECT + DWKNN_kcomp2_results.WRONG))'''