### Imports

In [1]:
# IMPORTS

import numpy as np
from datascience import *
import math
import statistics
from statistics import mode
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg', warn=False)
%matplotlib inline
plt.style.use('fivethirtyeight')

### Distance formulas 

In [2]:
def euclidean_distance(series1, series2):
    """Return the Euclidean (L2) distance between two indexable sequences.

    Elements are paired by position over the length of ``series1``; the
    result is the square root of the sum of the squared differences.
    """
    squared_gaps = [
        (series1[idx] - series2[idx]) ** 2
        for idx in np.arange(len(series1))
    ]
    return sum(squared_gaps) ** 0.5

In [3]:
def manhattan_distance(series1, series2):
    """Return the Manhattan (L1) distance between two indexable sequences.

    Elements are paired by position over the length of ``series1``; the
    result is the sum of the absolute differences.
    """
    return sum(
        abs(series1[idx] - series2[idx])
        for idx in np.arange(len(series1))
    )

In [4]:
def hamming_distance(series1, series2):
    """Return the number of positions at which the two sequences differ.

    Positions are taken over the length of ``series1``; matching positions
    are counted and subtracted from that length.
    """
    positions = np.arange(len(series1))
    matches = sum(1 for idx in positions if series1[idx] == series2[idx])
    return abs(len(series1) - matches)

### Normalize function

In [9]:
def normalize(series1):
    """Standardize a numeric sequence to z-scores.

    Every value has the mean of ``series1`` subtracted and is then divided by
    the standard deviation as computed by ``np.std`` with its defaults (the
    population standard deviation).

    Parameters
    ----------
    series1 : array-like of numbers
        The values to standardize.

    Returns
    -------
    numpy.ndarray
        Float array with one standardized value per input element. An empty
        input yields an empty array. If all values are equal the standard
        deviation is zero and NumPy's usual division-by-zero result (``nan``)
        is returned, as in the previous implementation.
    """
    values = np.asarray(series1, dtype=float)
    if values.size == 0:
        # Nothing to scale; also avoids the mean/std-of-empty warnings.
        return values
    # Mean and std are computed once and applied to the whole array, rather
    # than being recomputed for each element while growing the result with
    # np.append (which made the old loop quadratic).
    return (values - values.mean()) / values.std()

### Processing the Dataset

In [30]:
# Load the Titanic passenger data from disk and preview its first row.

titanic_original = Table.read_table('titanic.csv')
titanic_original.show(1)

Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,Mr. Owen Harris Braund,male,22,1,0,7.25


In [31]:
# KEEP ONLY THE FEATURE COLUMNS
# Selecting by label yields the same four columns (Pclass, Sex, Age, Fare) as
# the earlier drop-then-select by position, without depending on column order.

titanic_dropped = titanic_original.select('Pclass', 'Sex', 'Age', 'Fare')
titanic_dropped.show(1)

Pclass,Sex,Age,Fare
3,male,22,7.25


In [33]:
# FORMAT AND NORMALIZE COLUMNS

# Encode sex numerically ('male' -> 0, anything else -> 1) in one vectorized
# step instead of growing an array with np.append inside a loop, then
# standardize it like the other features.
sexes = np.array(titanic_dropped.column('Sex'))
new_sex = normalize(np.where(sexes == 'male', 0.0, 1.0))

# Standardize the numeric features so each contributes on a comparable scale
# to the distance computation.
new_pclass = normalize(titanic_dropped.column('Pclass'))
new_fare = normalize(titanic_dropped.column('Fare'))
new_age = normalize(titanic_dropped.column('Age'))

# Class labels are left unscaled; distance_to_row reads this global.
survived = titanic_original.column('Survived')

In [34]:
# Assemble the standardized feature columns into one table and preview it.

titanic_normalized = Table().with_columns(
    'Class', new_pclass,
    'Age', new_age,
    'Fare', new_fare,
    'Sex', new_sex,
)
titanic_normalized.show(1)

Class,Age,Fare,Sex
0.830524,-0.529366,-0.503586,-0.740266


In [35]:
# Split by position: rows 0-749 form the training set, rows 750-886 the test set.

train_rows = np.arange(750)
test_rows = np.arange(750, 887)
train = titanic_normalized.take(train_rows)
test = titanic_normalized.take(test_rows)

### Defining the KNN Classifier

In [17]:
def distance_to_row(row, train_table, labels=None):
    """Rank every training row by its Manhattan distance to ``row``.

    Parameters
    ----------
    row : sequence of numbers
        Feature values of the query point, in the same column order as
        ``train_table``.
    train_table : Table
        Training features, one row per example.
    labels : array-like, optional
        Label for each row of ``train_table``. When omitted, the first
        ``train_table.num_rows`` entries of the global ``survived`` array are
        used, which is the original behaviour.

    Returns
    -------
    Table
        Two columns, ``'Survived'`` and ``'Distance'``, sorted by ascending
        distance.
    """
    n_rows = train_table.num_rows
    if labels is None:
        # Historical default: pairs training row i with survived[i], so it
        # assumes train_table holds the leading rows of the labelled data.
        labels = survived[np.arange(n_rows)]
    # Collect the distances in a list and convert once; np.append copies the
    # whole array on every call, which made the old loop quadratic.
    distances = np.array(
        [manhattan_distance(row, train_table.row(i)) for i in np.arange(n_rows)],
        dtype=float,
    )
    dist_table = Table().with_columns('Survived', labels, 'Distance', distances)
    return dist_table.sort('Distance', descending=False)

In [18]:
def most_common(table, k):
    """Return the most frequent value in column 0 among the first ``k`` rows.

    ``table`` is expected to be ordered already (e.g. by distance), so the
    first ``k`` rows are the ones that vote.
    """
    top_k = table.take(np.arange(k))
    votes = top_k.column(0)
    return mode(votes)

In [19]:
def knn_predict(row_number, k):
    """Classify one test row with k-nearest-neighbours and grade the result.

    The row ``test.row(row_number)`` is compared against every row of the
    global ``train`` table, the ``k`` closest rows vote, and the winning
    label is checked against ``actuals[row_number]``.

    Returns a ``(guess, verdict)`` tuple where ``verdict`` is ``'Correct'``
    or ``'Wrong'``.
    """
    neighbours = distance_to_row(test.row(row_number), train)
    guess = most_common(neighbours, k)
    verdict = 'Correct' if guess == actuals[row_number] else 'Wrong'
    return guess, verdict

In [23]:
# True labels (column 0 of the original table) for rows 750-886, the same
# positions used for the test split.
held_out_rows = titanic_original.take(np.arange(750, 887))
actuals = held_out_rows.column(0)

In [24]:
# Predict a label for every test row with k = 7, keeping only the guess
# (the first element of knn_predict's result).
predictions = [
    knn_predict(row_number, 7)[0]
    for row_number in np.arange(test.num_rows)
]

In [36]:
# The Hamming distance counts the positions where prediction and truth
# disagree; dividing by the number of predictions gives the error rate.
n_wrong = hamming_distance(predictions, actuals)
error = n_wrong / len(predictions)
accuracy = 1 - error
accuracy

0.8832116788321168

In [37]:
# Put predictions next to the true labels for a quick visual comparison.
interp = Table().with_columns(
    'Prediction', predictions,
    'Actual', actuals,
)
interp.show(3)

Prediction,Actual
1,1
1,1
0,0


In [38]:
# Spot-check a single test row: returns (predicted label, 'Correct' or 'Wrong').
knn_predict(row_number=55, k=7)

(1, 'Correct')