## Grid search to find best hyperparameters

In [8]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Read the raw diabetes table from disk
path = '.'


def load_dataset():
    """Return the contents of 'diabetes.csv' located under `path` as a DataFrame."""
    return pd.read_csv(os.path.join(path, 'diabetes.csv'))


dataset = load_dataset()

# Median imputation: a 0 in the selected columns is treated as a missing value
# NOTE(review): the imputer is fit on the full table, before the train/test
# split below, so the medians also reflect rows that end up in the test set.
from sklearn.impute import SimpleImputer
imputer_median = SimpleImputer(missing_values=0, strategy='median')
impute_cols = slice(1, 6)  # ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
imputed_values = imputer_median.fit_transform(dataset.values[:, impute_cols])
dataset.iloc[:, impute_cols] = pd.DataFrame(imputed_values,
                                            columns=dataset.columns.values[impute_cols])

# Split the table into the target column and the remaining predictor columns
labels = dataset['Outcome']
features = dataset.drop(columns=['Outcome'])

# Hold out 25% of the rows as the test set (fixed seed keeps the split reproducible)
from sklearn.model_selection import train_test_split
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.25, random_state=123)

# Data up-sampling
from sklearn.utils import resample

# Work on a single frame holding predictors and target; the target column is
# called 'balance' for the duration of the resampling step.
df = pd.concat([features_train, labels_train], axis=1).rename(columns={'Outcome': 'balance'})

# Separate majority (balance == 0) and minority (balance == 1) classes
df_majority = df[df['balance'] == 0]
df_minority = df[df['balance'] == 1]

# Upsample the minority class with replacement. The target size is read from
# the data rather than hard-coded, so the two classes end up the same size
# whatever the split above produced.
df_minority_upsampled = resample(df_minority,
                                 replace=True,  # sample with replacement
                                 n_samples=len(df_majority),  # match majority class
                                 random_state=123)  # reproducible results
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Fail loudly if the classes are not the same size after resampling
class_counts = df_upsampled['balance'].value_counts()
assert class_counts.nunique() == 1, class_counts.to_dict()

# Re-prepare the training sets from the balanced frame
features_train = df_upsampled.drop(['balance'], axis=1)
# Keep the target as a 1-D Series named 'Outcome' (the same kind of object as
# labels_test). Selecting a one-column DataFrame and renaming it in place
# operates on a slice of df_upsampled and hands estimators a column vector;
# both only produce warnings, which this notebook silences globally.
labels_train = df_upsampled['balance'].rename('Outcome')

# Standardise the features: the scaler is fitted on the training set only and
# the same fitted scaler is then applied to both training and test features
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler().fit(features_train)
features_train_std = standardScaler.transform(features_train)
features_test_std = standardScaler.transform(features_test)

> Comparison to the original kNN algorithm
```
# implement kNN algorithm
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=7)
# Fit data or train data
classifier.fit(features_train_std, labels_train)
# Predicting with classifier
pred = classifier.predict(features_test_std)
```

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 9 neighbourhood sizes x 2 weightings x 2 metrics = 36 candidates
grid_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    # The metric must be spelled 'euclidean'. A misspelled name is not a valid
    # metric, so those candidates cannot be fitted; with warnings silenced in
    # this notebook such failures would go unnoticed and only 'manhattan'
    # would effectively be searched.
    'metric': ['euclidean', 'manhattan'],
}

# 3-fold cross-validated exhaustive search, run on all available cores.
# NOTE(review): the training set was up-sampled with replacement before this
# search, so copies of the same minority row can fall into both the fitting
# and the validation part of a fold; best_score_ is likely optimistic.
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose=1, cv=3, n_jobs=-1)

# np.ravel hands the estimator a 1-D target whether labels_train is a Series
# or a single-column DataFrame.
gs_results = gs.fit(features_train_std, np.ravel(labels_train))


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    6.4s finished


In [4]:
# Best cross-validation score found by the search (3 folds; the search was
# built without a `scoring` argument, so the estimator's default scorer applies)
gs_results.best_score_

0.8241469816272966

In [5]:
# Estimator configured with the winning hyperparameter combination
gs_results.best_estimator_

KNeighborsClassifier(metric='manhattan', n_neighbors=15, weights='distance')

In [6]:
# Winning hyperparameter combination as a plain dict
gs_results.best_params_

{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}

In [7]:
# Names of every parameter that can be set on the search object, including the
# wrapped estimator's parameters (prefixed with 'estimator__')
gs.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__algorithm', 'estimator__leaf_size', 'estimator__metric', 'estimator__metric_params', 'estimator__n_jobs', 'estimator__n_neighbors', 'estimator__p', 'estimator__weights', 'estimator', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])