# TFM

data: https://drive.google.com/drive/folders/1fx_j6gmiATvhEaBtUwEPuCOl8peJ2zDM

Source (k-NN from-scratch tutorial that section 1 follows): https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/

## 1. Simple k-NN implementation

### Euclidean distance

In [1]:
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats

In [20]:
# Small hand-written toy data set used to try out the k-NN helpers below.
# Each row has three values; the last one is 0 in every row and is presumably a
# class label (euclidean_distance ignores the last element of each row).
dataset = [[2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,0],
    [5.332441248,2.088626775,0],
    [6.922596716,1.77106367,0],
    [8.675418651,-0.242068655,0],
    [7.673756466,3.508563011,0]]

In [2]:
# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2, has_label=True):
    """Return the Euclidean distance between two indexable rows.

    Parameters
    ----------
    row1, row2 : sequences of numbers
        Rows to compare. ``row2`` must be at least as long as the compared
        part of ``row1``, otherwise indexing it raises ``IndexError``.
    has_label : bool, default True
        When True (the historical behaviour) the last element of the rows is
        treated as a non-feature column (e.g. a class label) and is left out
        of the distance. Pass False for rows that contain only features,
        such as the RSS fingerprint rows loaded from CSV, so that the last
        feature is not silently dropped.

    Returns
    -------
    float
        Square root of the summed squared differences; 0.0 when no element
        is compared.
    """
    # number of leading elements that take part in the distance
    n_features = len(row1) - 1 if has_label else len(row1)
    distance = 0.0
    for i in range(n_features):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

In [21]:
# Print the distance from the first toy row to every row (itself included,
# which is why the first printed value is 0.0).
row0 = dataset[0]
for row in dataset:
    print(euclidean_distance(row0, row))

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


### Get k-nearest neighbors

In [23]:
# Locate the most similar neighbors and return list of indexes
def get_neighbors(train, test_row, k):
    """Return the indexes of the ``k`` rows of ``train`` closest to ``test_row``.

    Distances come from ``euclidean_distance(test_row, train_row)``; note that
    with its default arguments that helper ignores the last element of each
    row.

    Parameters
    ----------
    train : iterable of rows
        Candidate rows; the returned values are positions in this iterable.
    test_row : row
        The row whose neighbours are searched.
    k : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of int
        Indexes ordered from nearest to farthest. Ties keep their original
        order because the sort is stable. If ``k`` exceeds the number of
        training rows, all rows are returned instead of raising
        ``IndexError``; ``k <= 0`` yields an empty list.
    """
    if k <= 0:
        # explicit guard: a negative k must not be interpreted as a slice
        # "from the end"
        return []
    distances = [
        (idx, euclidean_distance(test_row, train_row))
        for idx, train_row in enumerate(train)
    ]
    distances.sort(key=lambda tup: tup[1])
    # slicing (unlike indexing 0..k-1) tolerates k > len(distances)
    return [idx for idx, _ in distances[:k]]

In [24]:
# Indexes of the 3 toy rows nearest to the first toy row, one per line.
neighbors = get_neighbors(dataset, dataset[0], 3)
print(*neighbors, sep="\n")

0
4
1


### Data from CSV

In [38]:
# TRAIN data
# Fingerprints: read the header-less CSV and keep it as a plain list of rows.
df_trnrss = pd.read_csv('data/DSI1_trnrss.csv', header=None).values.tolist()

# Coordinates: read the header-less CSV, then discard the columns at
# positions 3 and 4 ('floor' and 'building') so only x, y, z remain for now.
# The result is also stored as a plain list of rows.
df_trncrd = pd.read_csv('data/DSI1_trncrd.csv', header=None)
df_trncrd = df_trncrd.drop(columns=df_trncrd.columns[[3, 4]]).values.tolist()

In [12]:
# TEST data
# Fingerprints: read the header-less CSV and keep it as a plain list of rows.
df_tstrss = pd.read_csv('data/DSI1_tstrss.csv', header=None).values.tolist()

# Coordinates: read the header-less CSV, then discard the columns at
# positions 3 and 4 ('floor' and 'building') so only x, y, z remain for now.
# Unlike the training coordinates, this one stays a DataFrame.
df_tstcrd = pd.read_csv('data/DSI1_tstcrd.csv', header=None)
df_tstcrd = df_tstcrd.drop(columns=df_tstcrd.columns[[3, 4]])

In [None]:
# first N rows from df used for test and deleted from df
# NOTE(review): `df` is not assigned in any cell visible above and this cell has
# no execution count, so on a fresh kernel it would raise NameError at the
# `rows_test` line — confirm whether this cell is a leftover from an earlier
# version of the notebook.
# NOTE(review): the last line rebinds `df` to itself minus N rows, so every
# re-run of this cell removes another N rows.
N = 10
rows_test = [df.iloc[i] for i in range(N)]
df = df.iloc[N: , :]

In [25]:
k = 3
# One list of k training-row indexes per test fingerprint, in the same order
# as df_tstrss.
all_neighbors = [get_neighbors(df_trnrss, fingerprint, k) for fingerprint in df_tstrss]

In [56]:
# Estimate each test position as the centroid (mean x, mean y) of the
# training coordinates of its nearest neighbours. Only the first two
# coordinate columns are used.
predicted_pos = []

for knn in all_neighbors:
    x = 0
    y = 0
    for i in knn:
        x += df_trncrd[i][0]
        y += df_trncrd[i][1]
    # Average over the neighbours actually present in this list rather than
    # the global `k`: the two can disagree if `k` is edited without
    # re-running the neighbour search, which would silently scale the
    # positions. An empty neighbour list raises ZeroDivisionError.
    n_used = len(knn)
    predicted_pos.append([x/n_used, y/n_used])

In [55]:
# Sanity check: there should be one predicted position per test fingerprint,
# so both printed lengths are expected to match.
for collection in (predicted_pos, df_tstrss):
    print(len(collection))

348
348


In [None]:
# euclidean distance from rows test to closest neighbours
# The neighbour search stores, for the j-th test fingerprint, the indexes of
# its nearest training fingerprints in all_neighbors[j]; pair each test row
# with its own index list and measure the distance to every selected
# training row. (Iterating `neighbors` here would fail: it is the flat list
# of ints produced by the toy example, not a list of (row, rows) pairs.)
error_distances = []

for test_row, knn in zip(df_tstrss, all_neighbors):
    for idx in knn:
        distance = euclidean_distance(test_row, df_trnrss[idx])
        error_distances.append(distance)

In [None]:
# scatter plot
# All distances are drawn on a single horizontal line (y = 0). The y vector is
# sized from the data itself so the plot keeps working whatever number of
# test rows and neighbours produced `error_distances`.
plt.figure(figsize=(10, 10))
plt.scatter(error_distances, [0] * len(error_distances))
plt.xlabel('Euclidean distance')
plt.yticks([])  # the y position carries no information
plt.show()

### ECDF

In [None]:
# ecdf plot

def ecdf(sample):
    """Compute the empirical cumulative distribution function of ``sample``.

    Parameters
    ----------
    sample : scalar or array-like
        Observations; a scalar is promoted to a one-element array.

    Returns
    -------
    quantiles : numpy.ndarray
        The distinct observed values in ascending order.
    cumprob : numpy.ndarray
        For each entry of ``quantiles``, the fraction of observations that
        are less than or equal to it (the last entry is 1.0).
    """
    values = np.atleast_1d(sample)

    # distinct values (sorted) together with how often each one occurs
    quantiles, counts = np.unique(values, return_counts=True)

    # running total of the counts, normalised by the number of observations
    cumprob = counts.cumsum().astype(np.double) / values.size

    return quantiles, cumprob

In [None]:
# a normal distribution with a mean of 0 and standard deviation of 1
n = stats.norm(loc=0, scale=1)

# draw some random samples from it; the fixed random_state makes the sample,
# and therefore the figure, identical on every Restart & Run All
sample = n.rvs(100, random_state=0)

# compute the ECDF of the samples
qe, pe = ecdf(sample)

# evaluate the theoretical CDF over the same range
q = np.linspace(qe[0], qe[-1], 1000)
p = n.cdf(q)

# plot both curves on one axes for visual comparison
fig, ax = plt.subplots(1, 1)
ax.plot(q, p, '-k', lw=2, label='Theoretical CDF')
ax.plot(qe, pe, '-r', lw=2, label='Empirical CDF')
ax.set_xlabel('Quantile')
ax.set_ylabel('Cumulative probability')
ax.legend(fancybox=True, loc='right')

plt.show()