<div style="width: 100%; clear: both;">
<div style="float: left; width: 50%;">
<img src="http://www.uoc.edu/portal/_resources/common/imatges/marca_UOC/UOC_Masterbrand.jpg", align="left">
</div>
<div style="float: right; width: 50%;">
<p style="margin: 0; padding-top: 22px; text-align:right;">TFM</p>
<p style="margin: 0; text-align:right;">Master universitari en Ciència de dades (Data science)</p>
<p style="margin: 0; text-align:right; padding-button: 100px;">Estudis de Informàtica, Multimedia i Telecomunicació</p>
</div>
</div>
<div style="width:100%;">&nbsp;</div>


# TFM

data: https://drive.google.com/drive/folders/1fx_j6gmiATvhEaBtUwEPuCOl8peJ2zDM

code source: https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/

## 1. Simple k-NN implementation

### Euclidean distance

In [53]:
from math import sqrt
import pandas as pd
from random import seed
from random import randrange
from IPython.core.debugger import Pdb;

In [2]:
dataset = [[2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,1],
    [5.332441248,2.088626775,1],
    [6.922596716,1.77106367,1],
    [8.675418651,-0.242068655,1],
    [7.673756466,3.508563011,1]]

In [3]:
# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

In [5]:
row0 = dataset[0]
for row in dataset:
    distance = euclidean_distance(row0, row)
    #print(distance)

### Get nearest neighbors

In [66]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, k):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(k):
        neighbors.append(distances[i][0])
    return neighbors

In [77]:
neighbors = get_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


### Predictions

In [80]:
# Make a classification prediction with neighbors
def predict_classification(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k)
    print(neighbors)
    output_values = [row[-1] for row in neighbors]
    print(output_values)
    prediction = max(set(output_values), key=output_values.count)
    return prediction

In [81]:
prediction = predict_classification(dataset, dataset[5], 3)
print('Expected %d, Got %d.' % (dataset[5][-1], prediction))

[[7.627531214, 2.759262235, 1], [7.673756466, 3.508563011, 1], [6.922596716, 1.77106367, 1]]
[1, 1, 1]
Expected 1, Got 1.


### Data cleaning

In [106]:
df = pd.read_csv('data/DSI1_trncrd.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,73.756532,54.633661,3.7,1,1
1,73.756532,54.633661,3.7,1,1
2,73.756532,54.633661,3.7,1,1
3,76.199587,53.462735,3.7,1,1
4,76.199587,53.462735,3.7,1,1


In [107]:
# for now, drop columns of 'floor' and 'building', keep x,y,z
df.drop(df.columns[[3,4]], axis=1, inplace=True)
df.head()

Unnamed: 0,0,1,2
0,73.756532,54.633661,3.7
1,73.756532,54.633661,3.7
2,73.756532,54.633661,3.7
3,76.199587,53.462735,3.7
4,76.199587,53.462735,3.7


In [108]:
print(df.duplicated().sum())
print(df.shape[0])

1139
1369


In [109]:
# dropping ALL duplicate values
df.drop_duplicates(keep = 'first', inplace = True)
print(df.duplicated().sum())
df.head()

0


Unnamed: 0,0,1,2
0,73.756532,54.633661,3.7
3,76.199587,53.462735,3.7
9,78.208971,54.11325,3.7
15,78.743835,56.006966,3.7
21,89.441223,63.336084,3.7


### Algorithm

In [110]:
# kNN Algorithm
def k_nearest_neighbors(train, test, k):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, k)
        predictions.append(output)
    return(predictions)

In [115]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = dataset.values.tolist()
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [117]:
# Test the kNN on the example dataset
seed(1)
# evaluate algorithm
n_folds = 5
k = 5
#Pdb().set_trace()
scores = evaluate_algorithm(df, k_nearest_neighbors, n_folds, k)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
#label = predict_classification(dataset, row, num_neighbors)
#print('Data=%s, Predicted: %s' % (row, label))

[[1.269764297411208, 7.56145749810766, 3.7], [1.4721479051604598, 10.61165987444968, 3.7], [4.690928810957247, 10.06790923343326, 3.7], [0.09392851724029301, 9.402937219081585, 3.7], [2.667086815935277, 5.760036527981738, 3.7]]
[3.7, 3.7, 3.7, 3.7, 3.7]
[[70.21716613866568, 55.409121133274795, 3.7], [69.65344677241606, 53.74986376611102, 3.7], [66.41728588329538, 53.22290633823879, 3.7], [71.8551465320582, 53.11990258599164, 3.7], [69.0199357823271, 51.08874711537703, 3.7]]
[3.7, 3.7, 3.7, 3.7, 3.7]
[[32.20714129179261, 18.378806080876828, 3.7], [29.303923820913578, 22.016135573495802, 3.7], [28.764563617167877, 17.98594773440199, 3.7], [30.263738829162758, 16.83486919537314, 3.7], [31.714733012284718, 23.331754471314422, 3.7]]
[3.7, 3.7, 3.7, 3.7, 3.7]
[[7.515571991891584, 10.748068473245722, 3.7], [4.690928810957247, 10.06790923343326, 3.7], [6.0450298903818, 13.52371067280567, 3.7], [4.4740866358705516, 13.002485009999448, 3.7], [7.596585906870612, 8.969278116496726, 3.7]]
[3.7, 3.7

In [75]:
row = [76.199587,53.462735,3.7]
prediction = predict_classification(dataset, row, 3)
print('Expected %d, Got %d.' % (row[-1], prediction))

Expected 3, Got 1.
