## 1. Import packages and import data from pdf 

In [1]:
import requests
import pandas as pd
import numpy as np

In [2]:
from tabula import read_pdf
from tabula import convert_into

In [3]:
convert_into('challenge.pdf','challenge.csv',output_format='csv')

In [3]:
#import the dataset
data = pd.read_csv('challenge.csv', header=None)

In [4]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,5.1,3.5,1.4,0.2,'A',5.1,3.4,1.5,0.2,'A',6.0,2.9,4.5,1.5,'B',7.7,3.8,6.7,2.2,'C'
1,4.9,3.0,1.4,0.2,'A',5.0,3.5,1.3,0.3,'A',5.7,2.6,3.5,1.0,'B',7.7,2.6,6.9,2.3,'C'
2,4.7,3.2,1.3,0.2,'A',4.5,2.3,1.3,0.3,'A',5.5,2.4,3.8,1.1,'B',6.0,2.2,5.0,1.5,'C'
3,4.6,3.1,1.5,0.2,'A',4.4,3.2,1.3,0.2,'A',5.5,2.4,3.7,1.0,'B',6.9,3.2,5.7,2.3,'C'
4,5.0,3.6,1.4,0.2,'A',5.0,3.5,1.6,0.6,'A',5.8,2.7,3.9,1.2,'B',5.6,2.8,4.9,2.0,'C'


In [5]:
data.shape

(39, 20)

In [6]:
data.dtypes

0     float64
1     float64
2     float64
3     float64
4      object
5     float64
6     float64
7     float64
8     float64
9      object
10    float64
11    float64
12    float64
13    float64
14     object
15    float64
16    float64
17    float64
18    float64
19     object
dtype: object

In [7]:
data1 = data.iloc[:,:5]
data2 = data.iloc[:,5:10]
data3 = data.iloc[:,10:15]
data4 = data.iloc[:,15:20]

In [8]:
data2.columns = [0,1,2,3,4]
data3.columns = [0,1,2,3,4]
data4.columns = [0,1,2,3,4]

In [9]:
df= pd.concat([data1,data2,data3,data4], ignore_index=True)

In [10]:
df.dropna(inplace=True)

In [11]:
df.dtypes

0    float64
1    float64
2    float64
3    float64
4     object
dtype: object

In [12]:
colnames_numeric = df.columns[0:4]
df[colnames_numeric].head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## 2. Normalization

Rescale all data attributes into the range 0-1 before calculating similarity

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[colnames_numeric] = scaler.fit_transform(df[colnames_numeric])

In [14]:
df.head()

Unnamed: 0,0,1,2,3,4
0,0.222222,0.625,0.067797,0.041667,'A'
1,0.166667,0.416667,0.067797,0.041667,'A'
2,0.111111,0.5,0.050847,0.041667,'A'
3,0.083333,0.458333,0.084746,0.041667,'A'
4,0.194444,0.666667,0.067797,0.041667,'A'


In [15]:
df.shape

(150, 5)

In [16]:
df1 = df.values.tolist()

Split the data into train and test set

In [17]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df1, test_size=0.33)

In [18]:
print(len(train))
print(len(test))

100
50


## 3. Implementing k-Nearest Neighbors

Define Euclidean distances,calculate the similarities between data instances

In [19]:
import math
def Euclideandist(a,b, length):
    d = 0.0
    for i in range(length):
        d += pow(float(a[i])- float(b[i]),2)
    return math.sqrt(d)

Use the Euclidean distances to collect the k most similar instances for a given unseen instance

In [20]:
import operator
def getNeighbors(train, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(train)):
        dist = Euclideandist(testInstance, train[x], length)
        distances.append((train[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

Making predicted response based on those neighbors

In [21]:
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True) #Sorting it based on votes
    return sortedVotes[0][0] 

Evaluating the accuracy of predictions

In [22]:
def getAccuracy(test, predictions):
    correct = 0
    for x in range(len(test)):
        if test[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(test))) * 100.0

Generating predictions

In [23]:
predictions=[]
k = 7
for x in range(len(test)):
    neighbors = getNeighbors(train, test[x], k)
    result = getResponse(neighbors)
    predictions.append(result)
    print('> predicted=' + repr(result) + ', actual=' + repr(test[x][-1]))

> predicted="'C'", actual="'C'"
> predicted="'A'", actual="'A'"
> predicted="'C'", actual="'B'"
> predicted="'A'", actual="'A'"
> predicted="'B'", actual="'B'"
> predicted="'B'", actual="'B'"
> predicted="'C'", actual="'C'"
> predicted="'B'", actual="'B'"
> predicted="'A'", actual="'A'"
> predicted="'A'", actual="'A'"
> predicted="'A'", actual="'A'"
> predicted="'A'", actual="'A'"
> predicted="'A'", actual="'A'"
> predicted="'B'", actual="'B'"
> predicted="'B'", actual="'B'"
> predicted="'C'", actual="'C'"
> predicted="'A'", actual="'A'"
> predicted="'B'", actual="'B'"
> predicted="'C'", actual="'C'"
> predicted="'A'", actual="'A'"
> predicted="'A'", actual="'A'"
> predicted="'A'", actual="'A'"
> predicted="'C'", actual="'C'"
> predicted="'A'", actual="'A'"
> predicted="'B'", actual="'B'"
> predicted="'B'", actual="'B'"
> predicted="'C'", actual="'B'"
> predicted="'B'", actual="'B'"
> predicted="'C'", actual="'C'"
> predicted="'C'", actual="'C'"
> predicted="'C'", actual="'C'"
> predic

In [24]:
accuracy = getAccuracy(test, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

Accuracy: 96.0%


## 4. Use scikit-learn to train a KNN classifier and evaluate its performance

Creating design matrix X and trget vector y, and split data into train and test sets

In [25]:
X = np.array(df.iloc[:, 0:4])  
y = np.array(df.iloc[:,4])  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
print(X_train.shape)
print(y_train.shape)

(100, 4)
(100,)


In [27]:
# loading library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# instantiate learning model (k = 3)
knn = KNeighborsClassifier(n_neighbors=3)

# fitting the model
knn.fit(X_train, y_train)

# predict the response
pred = knn.predict(X_test)

# evaluate accuracy
accuracy1 = accuracy_score(y_test, pred)
print ('Accuracy: ' + repr(accuracy1))

Accuracy: 0.98


## 5. Alternative Distance Measure: define Manhattan Distance

In [28]:
def manhattanDistance(a, b, length):
    distance = 0
    for i in range(length):
        distance = distance + abs(a[i]-b[i])
    return distance

## 6. KNN for regression

Taking the mean of the output of the k closest in training data points, so that I can use the k neighbors to fill 
out thie missing values

In [29]:
def regression(test_set, train_set, k):
    
    predicted_values = []   
    for n in range(len(test_set)):
        missing = int(np.argwhere(np.isnan(test_set[n]))) 
        av_res = 0
        neighbors = getNeighbors(train_set, test_set[n], k) # k neighbors        
        for m in neighbors:
            av_res += m[missing]       
        predicted_values.append(av_res/k)
        
    return predicted_values

Now take a look at how the knn regression works. Let's check it on a example!

In [33]:
test = np.array([ [0.99 , 0.4, np.nan , 0.2], 
                   [0.5 , 0.44, 0.22 , np.nan]
                 ])

In [34]:
prediction_for_test = regression(test, X_train, k)

In [35]:
print(prediction_for_test)

[0.5302663438256657, 0.25000000000000006]


In [33]:
#END