In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the dataset from the CSV file located in the notebook's working directory.
csv_path = os.path.join('.', 'heart-numerical.csv')
data_df = pd.read_csv(csv_path)

# Preview the first rows to check that the columns were parsed as expected.
data_df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca,disease
0,63,145,233,150,2.3,0,absence
1,67,160,286,108,1.5,3,presence
2,67,120,229,129,2.6,2,presence
3,37,130,250,187,3.5,0,absence
4,41,130,204,172,1.4,0,absence


In [4]:
# Feature matrix: every column except the target, converted to a NumPy array.
X = data_df.drop(columns='disease').values

# Target vector: the 'disease' column, converted to a NumPy array.
y = data_df['disease'].values


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Fit a KNN classifier with default values as a baseline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Two-step pipeline: standardize the features, then classify with
# k-nearest neighbours using the estimator's default settings.
knn_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Train on the training split, then score on the held-out split.
# fit() returns the pipeline itself, so the two calls can be chained.
baseline_accuracy = knn_pipe.fit(X_tr, y_tr).score(X_te, y_te)

print("Baseline accuracy: {:.3f}".format(baseline_accuracy))

Baseline accuracy: 0.747


In [11]:
# Candidate hyperparameter values to explore in the grid search
k_values = np.arange(1, 21)  # number of neighbours: 1, 2, 3, ..., 20
weights_functions = ['uniform', 'distance']  # neighbour weighting schemes
distance_types = [1, 2]  # Minkowski power p: 1 -> L1 distance, 2 -> L2 distance

# Show every parameter name the pipeline exposes; the 'step__param' keys
# are the ones that can be tuned through set_params().
knn_pipe.get_params()

{'memory': None,
 'steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('knn',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
              metric_params=None, n_jobs=None, n_neighbors=5, p=2,
              weights='uniform'))],
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'knn': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=None, n_neighbors=5, p=2,
            weights='uniform'),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'uniform'}

In [21]:
# Grid search using ParameterGrid

from sklearn.model_selection import ParameterGrid

# Map each pipeline parameter name to the list of candidate values to try.
# Setting the 'scaler' step to None skips standardization, so the search
# compares scaled against unscaled features.
param_candidates = {
    'scaler': [None, StandardScaler()],
    'knn__n_neighbors': k_values,
    'knn__weights': weights_functions,
    'knn__p': distance_types,
}
grid = ParameterGrid(param_candidates)

# len(grid) is the number of distinct parameter combinations.
print("Number of combinaison:", len(grid))

Number of combinaison: 160


In [22]:
# Evaluate every parameter combination and collect one result record per run.
# NOTE(review): each configuration is scored on the test split (X_te, y_te),
# so picking the top row selects hyperparameters on the test data and the
# reported best accuracy is optimistic. Consider a separate validation split
# or cross-validation on the training data for model selection.
test_scores = []

for params_dict in grid:
    # Apply this combination to the pipeline and refit on the training split;
    # set_params() returns the pipeline, so fit() can be chained onto it.
    knn_pipe.set_params(**params_dict).fit(X_tr, y_tr)

    # Score the refitted pipeline and store the result next to its parameters.
    accuracy = knn_pipe.score(X_te, y_te)
    params_dict.update(accuracy=accuracy)
    test_scores.append(params_dict)

# One row per combination, best accuracy first.
pd.DataFrame(test_scores).sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy,knn__n_neighbors,knn__p,knn__weights,scaler
27,0.813187,4,1,distance,"StandardScaler(copy=True, with_mean=True, with..."
57,0.813187,8,1,uniform,"StandardScaler(copy=True, with_mean=True, with..."
65,0.802198,9,1,uniform,"StandardScaler(copy=True, with_mean=True, with..."
25,0.802198,4,1,uniform,"StandardScaler(copy=True, with_mean=True, with..."
59,0.802198,8,1,distance,"StandardScaler(copy=True, with_mean=True, with..."
75,0.802198,10,1,distance,"StandardScaler(copy=True, with_mean=True, with..."
19,0.802198,3,1,distance,"StandardScaler(copy=True, with_mean=True, with..."
151,0.791209,19,2,distance,"StandardScaler(copy=True, with_mean=True, with..."
145,0.791209,19,1,uniform,"StandardScaler(copy=True, with_mean=True, with..."
147,0.791209,19,1,distance,"StandardScaler(copy=True, with_mean=True, with..."
