In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import sklearn.pipeline as pl
import sklearn.metrics as sx
import sklearn.preprocessing as sp
import sklearn.model_selection as ms 
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def splitYX(df):
    """Split a frame into its first column and all remaining columns.

    Returns a ``(y, X)`` tuple: ``y`` is the column selected by the first
    label in ``df.columns`` and ``X`` holds every column after it. Rows are
    left untouched.
    """
    target_label = df.columns[0]
    feature_labels = df.columns[1:]
    return df.loc[:, target_label], df.loc[:, feature_labels]
# Read the header-less CSV at ./wine.data; splitYX puts its first column in y
# and every remaining column in X.
y, X = splitYX(pd.read_csv("./wine.data", header=None))

In [None]:
def gen_plain_knn(n=5):
    """Build a one-step pipeline wrapping a k-NN classifier with ``n`` neighbours."""
    classifier = KNeighborsClassifier(n_neighbors=n)
    return pl.Pipeline(steps=[('knn', classifier)])

def gen_preprocessed_knn(n=5):
    """Build a pipeline that applies ``StandardScaler`` and then a k-NN classifier
    with ``n`` neighbours."""
    stages = [
        ('scale', sp.StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=n)),
    ]
    return pl.Pipeline(steps=stages)

def cl_score(cl, X, y, scoring='accuracy'):
    """Return the mean cross-validated score of estimator ``cl`` on ``(X, y)``.

    The data is split with a shuffled 5-fold ``KFold`` seeded with 42, and the
    per-fold scores (metric chosen by ``scoring``) are computed with
    ``n_jobs=4`` before being averaged.
    """
    folds = ms.KFold(n_splits=5, random_state=42, shuffle=True)
    fold_scores = ms.cross_val_score(cl, X, y, n_jobs=4, scoring=scoring, cv=folds)
    return np.mean(fold_scores)

In [None]:
# Candidate n_neighbors values to evaluate: 1 through 50 inclusive.
knns = range(1,51)

In [None]:
# Mean CV accuracy for every candidate k: first on the raw features, then with
# the standardising pipeline.
score_plain_knn = [
    cl_score(gen_plain_knn(n=k), X, y)
    for k in knns
]
score_preprocessed_knn = [
    cl_score(gen_preprocessed_knn(n=k), X, y)
    for k in knns
]

In [None]:
# Show the (k, score) pair with the highest score for the plain k-NN sweep.
"%d, %.2f" % max(zip(knns, score_plain_knn), key=lambda pair: pair[1])

In [None]:
# Show the (k, score) pair with the highest score for the standardised k-NN sweep.
"%d, %.2f" % max(zip(knns, score_preprocessed_knn), key=lambda pair: pair[1])

In [None]:
# Compare mean CV accuracy of the plain and the standardised k-NN pipelines
# across all candidate neighbour counts.
plt.plot(knns, score_plain_knn, label='plain knn')
plt.plot(knns, score_preprocessed_knn, label='preprocessed')
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('n_neighbors')
plt.ylabel('mean CV accuracy')
plt.title('k-NN accuracy vs. number of neighbours')
plt.legend()
pass  # end on a statement so the cell does not echo the Legend repr

## Assignment 2.2

In [None]:
import sklearn.datasets
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the Boston housing dataset object; its .data and .target are used below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs scikit-learn < 1.2 -- confirm the pinned version.
data_obj = sklearn.datasets.load_boston()

In [None]:
# 200 evenly spaced values from 1 to 10 inclusive; each one is later passed to
# metric_score as the regressor's p.
metric_p = np.linspace(1, 10, num=200)

In [None]:
# Rebind the notebook-level X / y (assigned earlier from the wine CSV) to the
# arrays of data_obj; metric_score and nbh_score read these globals.
X, y = data_obj.data, data_obj.target

In [None]:
def metric_score(p):
    """Mean CV negative MSE on the global ``(X, y)`` for a standardised,
    distance-weighted 5-neighbour regressor whose ``p`` parameter is ``p``."""
    scaler = sp.StandardScaler()
    regressor = KNeighborsRegressor(n_neighbors=5, weights='distance', p=p)
    model = pl.Pipeline(steps=[('scale', scaler), ('knn_reg', regressor)])
    return cl_score(model, X, y, scoring='neg_mean_squared_error')

In [None]:
def nbh_score(n):
    """Mean CV negative MSE on the global ``(X, y)`` for a standardised,
    distance-weighted k-NN regressor with ``n`` neighbours and ``p=1``."""
    scaler = sp.StandardScaler()
    regressor = KNeighborsRegressor(n_neighbors=n, weights='distance', p=1)
    model = pl.Pipeline(steps=[('scale', scaler), ('knn_reg', regressor)])
    return cl_score(model, X, y, scoring='neg_mean_squared_error')

In [None]:
# One mean negative-MSE score per candidate p, in the same order as metric_p.
s = list(map(metric_score, metric_p))

In [None]:
# Score every neighbourhood size 1..50 with nbh_score, which varies
# n_neighbors and keeps p=1. (metric_score would instead treat n as the
# Minkowski power p and keep n_neighbors fixed at 5.)
ns = [nbh_score(n) for n in range(1,51)]

In [None]:
# Show the p with the highest (least negative) mean negative MSE and its score.
# p is a float produced by np.linspace, so it is formatted with %.2f; %d would
# truncate it to its integer part.
"%.2f, %.1f" % max(zip(metric_p, s), key=lambda z: z[1])

In [None]:
# Mean CV negative MSE as a function of the regressor's p parameter.
plt.plot(metric_p, s, label='metric')
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('p')
plt.ylabel('mean CV negative MSE')
plt.legend()
pass  # end on a statement so the cell does not echo the Legend repr

In [None]:
# Plot the scores in ns against the integers 1..50, one point per entry.
plt.plot(range(1,51), ns, label='best neighbours')
plt.legend()
# Ending the cell on a statement keeps the Legend repr out of the output.
pass