# DS-SF-27 | Codealong 08 | k-Nearest Neighbors

In [None]:
import os

import numpy as np
import pandas as pd
# Keep rendered frames compact: show at most 10 rows and 10 columns, and
# render frames as HTML tables inside the notebook.
pd.options.display.max_rows = 10
pd.options.display.notebook_repr_html = True
pd.options.display.max_columns = 10

from sklearn import preprocessing, neighbors, grid_search, cross_validation

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Load the iris table; the path is relative to the notebook's working directory.
iris_csv_path = os.path.join('..', 'datasets', 'iris.csv')
df = pd.read_csv(iris_csv_path)

In [None]:
df

## Part A - Activity | Iris Dataset | Exploratory Data Analysis

In [None]:
# One plotting color per row of df, looked up from the row's species.
species_to_color = {'Setosa': 'red', 'Versicolor': 'green', 'Virginica': 'blue'}
color = df['Species'].map(species_to_color)

In [None]:
# TODO

## Part B - First hand-coded classifier

In [None]:
c = df.Species

In [None]:
def my_first_classifier(row):
    """Classify one iris from its petal length alone.

    Parameters
    ----------
    row : mapping-like
        One observation, e.g. a row handed over by `df.apply(..., axis = 1)`.
        Must provide a numeric `'PetalLength'` entry.

    Returns
    -------
    str
        `'Setosa'`, `'Versicolor'` or `'Virginica'` (the same spellings used
        as keys of the species-to-color mapping in Part A).
    """
    # NOTE(review): the cut points below are starting values; confirm them
    # against the Part A plots of this dataset and adjust if needed.
    petal_length = row['PetalLength']

    if petal_length < 2.5:
        return 'Setosa'
    if petal_length < 5:
        return 'Versicolor'
    return 'Virginica'

In [None]:
c_hat = df.apply(my_first_classifier, axis = 1)

c_hat

## Part C - Classification metrics

### Accuracy

In [None]:
# TODO

### Misclassification rate

In [None]:
# TODO

## Part D - Activity | Second hand-coded classifier

In [None]:
def my_second_classifier(row):
    """Classify one iris from its petal length and petal width.

    Parameters
    ----------
    row : mapping-like
        One observation, e.g. a row handed over by `df.apply(..., axis = 1)`.
        Must provide numeric `'PetalLength'` and `'PetalWidth'` entries.

    Returns
    -------
    str
        `'Setosa'`, `'Versicolor'` or `'Virginica'` (the same spellings used
        as keys of the species-to-color mapping in Part A).
    """
    # NOTE(review): the cut points below are starting values; confirm them
    # against the Part A plots of this dataset and adjust if needed.
    if row['PetalLength'] < 2.5:
        return 'Setosa'
    # Petal width, not length, decides between the two remaining species.
    if row['PetalWidth'] < 1.75:
        return 'Versicolor'
    return 'Virginica'

In [None]:
c_hat = df.apply(my_second_classifier, axis = 1)

In [None]:
(c_hat != c).sum()

In [None]:
(c_hat == c).mean()

## Part E - k-Nearest Neighbors (k-NN)

### Feature matrix and label vector

In [None]:
# The four measurements used as features.
feature_columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']

# Min-max scale the features; the scaler is fitted on the full dataset here.
scaler = preprocessing.MinMaxScaler()
scaler.fit(df[feature_columns])

# X ends up as the scaler's transformed output (no longer a DataFrame).
X = scaler.transform(df[feature_columns])

In [None]:
X

In [None]:
c = df.Species

In [None]:
c

In [None]:
# 5-nearest-neighbors with equally weighted votes, fitted on all of (X, c).
knn_5_uniform = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')
model = knn_5_uniform.fit(X, c)

In [None]:
c_hat = model.predict(X)

In [None]:
(c_hat != c).sum()

In [None]:
pd.DataFrame({'c': c, 'c_hat': c_hat})[c_hat != c]

You can measure the accuracy of your predictions either manually or with `.score()`

In [None]:
(c_hat == c).mean()

In [None]:
model.score(X, c)

# weights = 'uniform'

In [None]:
neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'uniform').\
    fit(X, c).\
    score(X, c)

# weights = 'distance'

In [None]:
neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'distance').\
    fit(X, c).\
    score(X, c)

## Part F - What is the best value for `k`?

In [None]:
# Candidate neighborhood sizes: 1 up to (number of rows - 1).
k = range(1, df.shape[0])

score_df = pd.DataFrame({'k': k})

# One classifier per k, each fitted on the full dataset.
score_df['model'] = [
    neighbors.KNeighborsClassifier(n_neighbors = n_neighbors).fit(X, c)
    for n_neighbors in score_df['k']
]

# Each model is scored on the same (X, c) it was fitted on, so this is a
# training-set score.
score_df['score'] = [knn.score(X, c) for knn in score_df['model']]

In [None]:
score_df

In [None]:
score_df.plot(x = 'k', y = 'score')

In [None]:
score_df[score_df.k.isin([99, 100])]

In [None]:
# Look the fitted models up by their k value.
models_by_k = score_df.set_index('k')['model']

c_hat_99 = models_by_k[99].predict(X)
c_hat_100 = models_by_k[100].predict(X)

# Confusion table for k = 99: predicted class (rows) vs. true class (columns).
pd.crosstab(
    c_hat_99,
    c,
    rownames = ['Hypothesized Class (k = 99)'],
    colnames = ['True Class'],
)

In [None]:
pd.crosstab(c_hat_100, c, rownames = ['Hypothesized Class (k = 100)'], colnames = ['True Class'])

## Part G - Validation

60% of the dataset to train the model; the rest to test the model

In [None]:
train_df = df.sample(frac = .6, random_state = 0).sort_index()

In [None]:
train_df

In [None]:
test_df = df.drop(train_df.index)

In [None]:
test_df

The error in the training set is less than the error in the test set

In [None]:
feature_columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']

# Fit the scaler on the training rows only, then scale those same rows.
scaler = preprocessing.MinMaxScaler()
scaler.fit(train_df[feature_columns])
train_X = scaler.transform(train_df[feature_columns])

train_c = train_df['Species']

# 5-NN with equally weighted votes, fitted on the training split.
knn_5_uniform = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')
model = knn_5_uniform.fit(train_X, train_c)

# Score on the training split itself.
model.score(train_X, train_c)

In [None]:
test_X = test_df[ ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'] ]

# Apply the scaler that was fitted on the training rows (do not refit it on the
# test rows): the model was trained on scaled features, so the test features
# must go through the identical transformation before scoring.
test_X = scaler.transform(test_X)

test_c = test_df.Species

# Score on the held-out rows.
model.score(test_X, test_c)

In [None]:
# Candidate neighborhood sizes: 1 up to (number of training rows - 1).
k = range(1, train_df.shape[0])

score_df = pd.DataFrame({'k': k})

# One classifier per k, each fitted on the training split only.
models = pd.Series([
    neighbors.KNeighborsClassifier(n_neighbors = n_neighbors).fit(train_X, train_c)
    for n_neighbors in score_df['k']
])

# Score every model on both splits so the two curves can be compared per k.
score_df['train_score'] = [knn.score(train_X, train_c) for knn in models]
score_df['test_score'] = [knn.score(test_X, test_c) for knn in models]

In [None]:
score_df

In [None]:
score_df.set_index('k').plot()

## Part H - Cross-Validation

In [None]:
train_df.shape

In [None]:
train_df

In [None]:
k_cv = 5 # 5-fold CV

# Candidate k values stop below the size of one CV training split, i.e.
# (k_cv - 1) / k_cv of the training rows. Floor division keeps the bound an
# integer: on Python 3 `/` yields a float, which range() rejects with a
# TypeError. The range is materialized as a list so the parameter grid gets a
# concrete sequence on both Python 2 and Python 3.
k_nn = list(range(1, train_df.shape[0] * (k_cv - 1) // k_cv)) # k-NN

# NOTE(review): `grid_search`, `cross_validation.KFold(n, n_folds = ...)` and
# `grid_scores_` are the older scikit-learn API that this notebook's imports
# target; newer releases expose the equivalents under `model_selection`.
# NOTE(review): KFold is used here without shuffling, so folds follow the row
# order of train_df -- confirm that order does not group rows by species.
gs = grid_search.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': k_nn, 'weights': ['uniform', 'distance']},
    cv = cross_validation.KFold(train_df.shape[0], n_folds = k_cv)
)

gs.fit(train_X, train_c)

# One row per (k, weights) combination with its mean validation score.
score_df = pd.DataFrame({'k': [score.parameters['n_neighbors'] for score in gs.grid_scores_],
    'weights': [score.parameters['weights'] for score in gs.grid_scores_],
    'score': [score.mean_validation_score for score in gs.grid_scores_]})

score_df

In [None]:
# Split the grid-search results by weighting scheme.
score_uniform_df = score_df.loc[score_df['weights'] == 'uniform']
score_distance_df = score_df.loc[score_df['weights'] == 'distance']

# One curve of mean validation score against k per weighting scheme.
plt.plot(score_uniform_df['k'], score_uniform_df['score'], label = 'uniform')
plt.plot(score_distance_df['k'], score_distance_df['score'], label = 'distance')

plt.legend()

In [None]:
gs.best_score_

In [None]:
gs.best_estimator_

In [None]:
gs.best_params_

In [None]:
gs.score(train_X, train_c)

In [None]:
gs.score(test_X, test_c)

## Final model

In [None]:
# TODO