# TFM

data: https://drive.google.com/drive/folders/1fx_j6gmiATvhEaBtUwEPuCOl8peJ2zDM

code source: https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/

## 1. Simple k-NN implementation

### Euclidean distance

In [24]:
from math import sqrt
import pandas as pd
from random import seed
from random import randrange
from IPython.core.debugger import Pdb;

In [30]:
# Small two-feature sample used to exercise the k-NN helpers in this notebook.
# The last element of each row is presumably a class label; euclidean_distance
# leaves the final element of a row out of the calculation.
# NOTE(review): every row carries label 0 -- confirm a single class is intended.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 0],
    [5.332441248, 2.088626775, 0],
    [6.922596716, 1.77106367, 0],
    [8.675418651, -0.242068655, 0],
    [7.673756466, 3.508563011, 0],
]

In [26]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, skipping the last element.

    Only positions ``0 .. len(row1) - 2`` are compared, so the final element
    of ``row1`` (presumably a class label) never contributes to the result.
    ``row2`` is indexed at the same positions as ``row1``.
    """
    squared_total = 0.0
    compared_positions = len(row1) - 1
    for idx in range(compared_positions):
        delta = row1[idx] - row2[idx]
        squared_total += delta ** 2
    return sqrt(squared_total)

In [31]:
# Distance from the first sample to every sample in the toy data; the first
# value printed is the row compared with itself.
row0 = dataset[0]
for row in dataset:
    distance = euclidean_distance(row1=row0, row2=row)
    print(distance)

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


### Get k-nearest neighbors

In [28]:
def get_neighbors(train, test_row, k):
    """Return the ``k`` rows of ``train`` that are closest to ``test_row``.

    Distances are computed with ``euclidean_distance``. Rows are returned
    nearest first; rows at equal distance keep their order from ``train``.

    Args:
        train: iterable of rows to search.
        test_row: the row whose neighbors are wanted.
        k: number of neighbors to return.

    Returns:
        A list of at most ``k`` rows taken from ``train``. When ``k`` is larger
        than the number of training rows, every row is returned (the previous
        index-based loop raised IndexError in that case). A ``k`` of zero or
        less yields an empty list.
    """
    scored = [(euclidean_distance(test_row, train_row), train_row) for train_row in train]
    # Sort on the distance only, so rows themselves are never compared on ties.
    scored.sort(key=lambda pair: pair[0])
    # max(k, 0) keeps a negative k from being read as "all but the last |k|" by the slice.
    return [train_row for _, train_row in scored[:max(k, 0)]]

In [32]:
# Three closest rows to the first sample. The sample is also part of the
# training rows, so it comes back as its own nearest neighbor.
neighbors = get_neighbors(train=dataset, test_row=dataset[0], k=3)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


### Data from CSV

In [7]:
# header=None: the first line of the file is read as data and the columns get
# integer labels.
df = pd.read_csv(
    'data/DSI1_trncrd.csv',
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4
0,73.756532,54.633661,3.7,1,1
1,73.756532,54.633661,3.7,1,1
2,73.756532,54.633661,3.7,1,1
3,76.199587,53.462735,3.7,1,1
4,76.199587,53.462735,3.7,1,1


In [8]:
# For now, drop the 'floor' and 'building' columns (labels 3 and 4) and keep x, y, z.
# The frame is rebound rather than mutated in place, and errors="ignore" makes
# the cell safe to re-run: the earlier positional in-place drop raised an
# IndexError on a second run because only three columns were left.
df = df.drop(columns=[3, 4], errors="ignore")
df.head()

Unnamed: 0,0,1,2
0,73.756532,54.633661,3.7
1,73.756532,54.633661,3.7
2,73.756532,54.633661,3.7
3,76.199587,53.462735,3.7
4,76.199587,53.462735,3.7


In [9]:
# First line: number of rows that repeat an earlier row; second line: total row count.
print(df.duplicated().sum(), len(df), sep="\n")

1139
1369


In [10]:
# Remove repeated rows in place, keeping the first occurrence of each; the
# surviving rows keep their original index labels.
df.drop_duplicates(inplace=True, keep='first')
print(df.duplicated().sum())  # sanity check: count of duplicates still present
df.head()

0


Unnamed: 0,0,1,2
0,73.756532,54.633661,3.7
3,76.199587,53.462735,3.7
9,78.208971,54.11325,3.7
15,78.743835,56.006966,3.7
21,89.441223,63.336084,3.7


In [39]:
# Inspect the frame as it currently stands (the rendering elides the middle rows).
df

Unnamed: 0,0,1,2
0,73.756532,54.633661,3.7
3,76.199587,53.462735,3.7
9,78.208971,54.113250,3.7
15,78.743835,56.006966,3.7
21,89.441223,63.336084,3.7
...,...,...,...
1339,56.656774,41.075178,3.7
1345,55.245240,43.412544,3.7
1351,56.095192,46.736461,3.7
1357,59.661980,46.159711,3.7


In [42]:
# Both the training rows and the query row are passed as plain Python lists so
# that euclidean_distance indexes them by position. Passing the query as a
# pandas Series made row[i] a label lookup, which only works while the columns
# are labelled 0, 1, 2.
# NOTE(review): euclidean_distance skips the last element of each row, so with
# [x, y, z] rows the z coordinate does not contribute -- confirm this is intended.
neighbors = get_neighbors(df.to_numpy().tolist(), df.iloc[0].tolist(), 3)
for neighbor in neighbors:
    print(neighbor)

[73.75653190807489, 54.633660946947444, 3.7]
[71.8551465320582, 53.11990258599164, 3.7]
[76.19958735171623, 53.462734781284276, 3.7]
