In [148]:
import pandas as pd
from  sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
import os 



In [27]:
hr = pd.read_csv("Cases/human-resources-analytics/HR_comma_sep.csv")
y = hr['left']
X = hr.drop('left', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
        random_state=24, test_size=0.3, stratify=y)

ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')
knn = KNeighborsClassifier(n_neighbors =1)
pipe = Pipeline([('CT',ct),('SCL',None),('KNN',knn)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('Accuracy score: ', accuracy_score(y_test, y_pred))

y_pred_prob = pipe.predict_proba(X_test)
print("Log loss: ",log_loss(y_test, y_pred_prob)) # log loss is always more trustworthy than accuracy

Accuracy score:  0.948877528339631
Log loss:  1.8426406489213036


In [35]:
knn = KNeighborsClassifier(n_neighbors = 5)
pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('Accuracy score: ', accuracy_score(y_test, y_pred))

y_pred_prob = pipe.predict_proba(X_test)
print("Log loss: ",log_loss(y_test, y_pred_prob))

Accuracy score:  0.9410980217826184
Log loss:  0.688465995800685


In [49]:
kfold= StratifiedKFold(n_splits= 5, shuffle = True, random_state= 24)
pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])
params={'KNN__n_neighbors': np.arange(1,11) ,'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, param_grid = params, cv= kfold, verbose=3)

gcv.fit(X,y)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.967 total time=   0.2s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.965 total time=   0.2s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.968 total time=   0.2s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.969 total time=   0.2s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.969 total time=   0.2s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.965 total time=   0.2s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.966 total time=   0.2s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.968 total time=   0.2s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.969 total time=   0.2s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.969 total time=   0.2s
[CV 1/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.957 total time=   0.

In [54]:
gcv.best_score_

0.9676558852950983

In [56]:
gcv.best_params_

{'KNN__n_neighbors': 1, 'SCL': MinMaxScaler()}

In [58]:
pd_cv= pd.DataFrame(gcv.cv_results_)
pd_cv.shape

(30, 15)

In [60]:
kfold= StratifiedKFold(n_splits= 5, shuffle = True, random_state= 24)
pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])
params={'KNN__n_neighbors': np.arange(1,11) ,'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, param_grid = params, cv= kfold, verbose=3, scoring = 'neg_log_loss')

gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.202 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.262 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.142 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.106 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.118 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.274 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.238 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.166 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.106 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.106 total time=   0.0s
[CV 1/5] END .....KNN__n_neighbors=1, SCL=None;, score=-1.550 total 

In [66]:
print(gcv.best_score_)
print(gcv.best_params_)
pd_cv= pd.DataFrame(gcv.cv_results_)
pd_cv.shape

-0.46380636508556156
{'KNN__n_neighbors': 10, 'SCL': StandardScaler()}


(30, 15)

## On Glass Dataset

In [71]:
df = pd.read_csv('Cases/Glass Identification/Glass.csv')

In [73]:
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [75]:
y= df['Type']
X = df.drop('Type', axis =1)

In [77]:
# ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude= object)),(ohe, make_column_selector(dtype_include= object)))
# pipe = Pipeline([('CT', ct),('KNN', knn)])

In [85]:
lbl = LabelEncoder() 
y= lbl.fit_transform(y)

In [115]:
kfold= StratifiedKFold(n_splits= 5, shuffle = True, random_state= 24)
pipe = Pipeline([('SCL',scaler_mm),('KNN',knn)])
params={'KNN__n_neighbors': np.arange(1,11) ,'KNN__metric': ['cityblock', 'minkowski','manhattan', 'haversine'],'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, param_grid = params, cv= kfold, verbose=3, scoring = 'neg_log_loss')
gcv.fit(X,y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-10.897 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-6.706 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-10.897 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-9.220 total time=   0.0s
[CV 5/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-12.015 total time=   0.0s
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=-8.382 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=-6.706 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=-10.059 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1,

150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])

In [104]:
gcv.best_params_

{'KNN__n_neighbors': 8, 'SCL': None}

In [106]:
gcv.best_score_

-2.1501310079676896

#### Using different metrics - cityblock, minkowski, manhattan, haversine

In [113]:
kfold= StratifiedKFold(n_splits= 5, shuffle = True, random_state= 24)
pipe = Pipeline([('SCL',scaler_mm),('KNN',knn)])
params={'KNN__n_neighbors': np.arange(1,16) ,'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, param_grid = params, cv= kfold, verbose=3, scoring = 'neg_log_loss')
gcv.fit(X,y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-12.573 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-8.382 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-13.412 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-10.059 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-12.015 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-10.059 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-7.544 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-11.735 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-10.059 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-13.731 total time=   0.0s
[CV 1/5] END ....KNN__n_neighbors=1, SCL=None;, score=-10.89

In [117]:
gcv.best_params_

{'KNN__metric': 'cityblock', 'KNN__n_neighbors': 10, 'SCL': None}

In [119]:
gcv.best_score_

-1.9992426425743097

# KNN for regression

In [123]:
df= pd.read_csv('Datasets/Boston.csv')

In [125]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [128]:
y=df['medv']
X=df.drop('medv', axis=1)

In [130]:
from sklearn.neighbors import KNeighborsRegressor

In [132]:
knnr = KNeighborsRegressor()

In [140]:

pipe = Pipeline([('SCL',None),('KNNR',knnr)])
params={'KNNR__n_neighbors': np.arange(1,11) ,'KNNR__metric': ['cityblock', 'minkowski','manhattan', 'haversine'],'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, cv=5, param_grid = params, verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.412 total time=   0.0s
[CV 2/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.300 total time=   0.0s
[CV 3/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.292 total time=   0.0s
[CV 4/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.456 total time=   0.0s
[CV 5/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=-0.608 total time=   0.0s
[CV 1/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.383 total time=   0.0s
[CV 2/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.438 total time=   0.0s
[CV 3/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.656 total time=   0.0s
[CV 4/5] END KNNR__metric=cityblock, KNNR__n_neigh

150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])

In [166]:
print(gcv.best_score_)
print(gcv.best_params_)

0.7875954797972841
{'KNNR__metric': 'cityblock', 'KNNR__n_neighbors': 3, 'SCL': StandardScaler()}


{'KNNR__metric': 'cityblock', 'KNNR__n_neighbors': 6, 'SCL': StandardScaler()}

In [150]:

kfold= KFold(n_splits=5, random_state=24, shuffle =True)
pipe = Pipeline([('SCL',None),('KNNR',knnr)])
params={'KNNR__n_neighbors': np.arange(1,11) ,'KNNR__metric': ['cityblock', 'minkowski','manhattan', 'haversine'],'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, cv=kfold, param_grid = params, verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.764 total time=   0.0s
[CV 2/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.829 total time=   0.0s
[CV 3/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.723 total time=   0.0s
[CV 4/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.528 total time=   0.0s
[CV 5/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.755 total time=   0.0s
[CV 1/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.677 total time=   0.0s
[CV 2/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.872 total time=   0.0s
[CV 3/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.720 total time=   0.0s
[CV 4/5] END KNNR__metric=cityblock, KNNR__n_neighb

150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])

In [164]:
print(gcv.best_score_)
print(gcv.best_params_)

0.7875954797972841
{'KNNR__metric': 'cityblock', 'KNNR__n_neighbors': 3, 'SCL': StandardScaler()}


In [158]:

pipe = Pipeline([('SCL',None),('KNNR',knnr)])
params={'KNNR__n_neighbors': np.arange(1,11) ,'KNNR__metric': ['cityblock', 'minkowski','manhattan', 'haversine'],'SCL': [scaler_mm, scaler_std, None]}
gcv = GridSearchCV(pipe, cv= KFold(n_splits=5, random_state=24, shuffle =True), param_grid = params, verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.764 total time=   0.0s
[CV 2/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.829 total time=   0.0s
[CV 3/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.723 total time=   0.0s
[CV 4/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.528 total time=   0.0s
[CV 5/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=MinMaxScaler();, score=0.755 total time=   0.0s
[CV 1/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.677 total time=   0.0s
[CV 2/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.872 total time=   0.0s
[CV 3/5] END KNNR__metric=cityblock, KNNR__n_neighbors=1, SCL=StandardScaler();, score=0.720 total time=   0.0s
[CV 4/5] END KNNR__metric=cityblock, KNNR__n_neighb

150 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])

In [162]:
print(gcv.best_score_)
print(gcv.best_params_)

0.7875954797972841
{'KNNR__metric': 'cityblock', 'KNNR__n_neighbors': 3, 'SCL': StandardScaler()}
