## KNN Classifier для Iris DataSet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold 
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the iris dataset and wrap features and labels in DataFrames.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# NOTE(review): the label column is named "MEDV"; later cells select it with
# hue='MEDV', so the name has to stay in sync with them.
df_target = pd.DataFrame(data=iris.target, columns=["MEDV"])
# Features and label side by side in a single frame.
df_full = pd.concat([df, df_target], axis="columns")
# Keep the class label as a categorical column.
df_full = df_full.astype({'MEDV': 'category'})

### Простейшая модель:

In [None]:
# Pairwise feature plots of the full dataset, coloured by the 'MEDV' label
# column, with KDE curves on the diagonal. The trailing ';' suppresses the
# cell's text output.
sns.set()
sns.pairplot(data=df_full, hue='MEDV', diag_kind="kde");

In [None]:
# Split features and labels into train and test parts; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=1,
)

In [None]:
# Baseline model: 3-nearest-neighbours fitted on the unscaled features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
sc_train, sc_test = knn.score(X_train, y_train), knn.score(X_test, y_test)
# Last expression of the cell: shows the (train score, test score) pair.
sc_train, sc_test

In [None]:
# Per-class report for the baseline model on the test split.
pred = knn.predict(X_test)
report = sklearn.metrics.classification_report(y_test, pred, digits=4)
print(report)

In [None]:
# Log loss of the baseline model's predicted class probabilities on the test split.
pred_proba = knn.predict_proba(X_test)
test_log_loss = sklearn.metrics.log_loss(y_test, pred_proba)
# Last expression of the cell: shows the value.
test_log_loss

### Препроцессинг

#### StandardScaler

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
# Scaled training features next to their labels, for plotting.
df_full_std = pd.concat(
    [pd.DataFrame(X_train_std), pd.DataFrame(y_train)],
    axis="columns",
)
# Reuse the column names of the unscaled frame (features + 'MEDV').
df_full_std.columns = df_full.columns

In [None]:
# Pairwise plots of the StandardScaler-transformed training data, coloured by label.
sns.set()
sns.pairplot(data=df_full_std, hue='MEDV', diag_kind="kde");

In [None]:
# Same 3-NN model, now fitted on the StandardScaler-transformed features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_std, y_train)
sc_train, sc_test = knn.score(X_train_std, y_train), knn.score(X_test_std, y_test)

# Last expression of the cell: shows the (train score, test score) pair.
sc_train, sc_test

In [None]:
# Per-class report on the StandardScaler-transformed test split.
pred = knn.predict(X_test_std)
report = sklearn.metrics.classification_report(y_test, pred, digits=4)
print(report)

In [None]:
# Log loss of predicted probabilities on the StandardScaler-transformed test split.
pred_proba = knn.predict_proba(X_test_std)
test_log_loss = sklearn.metrics.log_loss(y_test, pred_proba)
# Last expression of the cell: shows the value.
test_log_loss

#### MinMaxScaler

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
mmsc = MinMaxScaler().fit(X_train)
X_train_mm = mmsc.transform(X_train)
X_test_mm = mmsc.transform(X_test)

In [None]:
# Scaled training features next to their labels, for plotting.
df_full_mm = pd.concat(
    [pd.DataFrame(X_train_mm), pd.DataFrame(y_train)],
    axis="columns",
)
# Reuse the column names of the unscaled frame (features + 'MEDV').
df_full_mm.columns = df_full.columns

In [None]:
# Pairwise plots of the MinMaxScaler-transformed training data, coloured by label.
sns.set()
sns.pairplot(data=df_full_mm, hue='MEDV', diag_kind="kde");

In [None]:
# Same 3-NN model, now fitted on the MinMaxScaler-transformed features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_mm, y_train)
sc_train, sc_test = knn.score(X_train_mm, y_train), knn.score(X_test_mm, y_test)

# Last expression of the cell: shows the (train score, test score) pair.
sc_train, sc_test

In [None]:
# Per-class report on the MinMaxScaler-transformed test split.
pred = knn.predict(X_test_mm)
report = sklearn.metrics.classification_report(y_test, pred, digits=4)
print(report)

In [None]:
# Log loss of predicted probabilities on the MinMaxScaler-transformed test split.
pred_proba = knn.predict_proba(X_test_mm)
test_log_loss = sklearn.metrics.log_loss(y_test, pred_proba)
# Last expression of the cell: shows the value.
test_log_loss

#### RobustScaler

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
robsc = RobustScaler().fit(X_train)
X_train_rob = robsc.transform(X_train)
X_test_rob = robsc.transform(X_test)

In [None]:
# Scaled training features next to their labels, for plotting.
df_full_rob = pd.concat(
    [pd.DataFrame(X_train_rob), pd.DataFrame(y_train)],
    axis="columns",
)
# Reuse the column names of the unscaled frame (features + 'MEDV').
df_full_rob.columns = df_full.columns

In [None]:
# Pairwise plots of the RobustScaler-transformed training data, coloured by label.
sns.set()
sns.pairplot(data=df_full_rob, hue='MEDV', diag_kind="kde");

In [None]:
# Same 3-NN model, now fitted on the RobustScaler-transformed features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_rob, y_train)
sc_train, sc_test = knn.score(X_train_rob, y_train), knn.score(X_test_rob, y_test)

# Last expression of the cell: shows the (train score, test score) pair.
sc_train, sc_test

In [None]:
# Per-class report on the RobustScaler-transformed test split.
pred = knn.predict(X_test_rob)
report = sklearn.metrics.classification_report(y_test, pred, digits=4)
print(report)

In [None]:
# Log loss of predicted probabilities on the RobustScaler-transformed test split.
pred_proba = knn.predict_proba(X_test_rob)
test_log_loss = sklearn.metrics.log_loss(y_test, pred_proba)
# Last expression of the cell: shows the value.
test_log_loss

#### Normalizer

In [None]:
# Fit on the training split, then transform both splits with the same object,
# following the same fit/transform pattern as the other preprocessing cells.
norm = Normalizer().fit(X_train)
X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)

In [None]:
# Normalized training features next to their labels, for plotting.
df_full_norm = pd.concat(
    [pd.DataFrame(X_train_norm), pd.DataFrame(y_train)],
    axis="columns",
)
# Reuse the column names of the unscaled frame (features + 'MEDV').
df_full_norm.columns = df_full.columns

In [None]:
# Pairwise plots of the Normalizer-transformed training data, coloured by label.
sns.set()
sns.pairplot(data=df_full_norm, hue='MEDV', diag_kind="kde");

In [None]:
# Same 3-NN model, now fitted on the Normalizer-transformed features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_norm, y_train)
sc_train, sc_test = knn.score(X_train_norm, y_train), knn.score(X_test_norm, y_test)

# Last expression of the cell: shows the (train score, test score) pair.
sc_train, sc_test

In [None]:
# Per-class report on the Normalizer-transformed test split.
pred = knn.predict(X_test_norm)
report = sklearn.metrics.classification_report(y_test, pred, digits=4)
print(report)

In [None]:
# Log loss of predicted probabilities on the Normalizer-transformed test split.
pred_proba = knn.predict_proba(X_test_norm)
test_log_loss = sklearn.metrics.log_loss(y_test, pred_proba)
# Last expression of the cell: shows the value.
test_log_loss

#### Вывод
Наилучшие результаты получаются при использовании __MinMaxScaler__

### Поиск лучшей комбинации гиперпараметров 

In [None]:
# Recreate the train/test split for the hyperparameter search, using the same
# arguments and seed as the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=1,
)

In [None]:
# Two-step pipeline: a preprocessing step followed by the KNN classifier.
# The MinMaxScaler here is only the starting value; the grid below swaps the
# 'preprocessing' step between several scalers and None.
pipe = Pipeline(steps=[
    ('preprocessing', MinMaxScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Search space handed to the grid search: candidate preprocessing steps and
# KNN settings (neighbour count, p, weighting scheme, search algorithm).
param_grid = dict(
    preprocessing=[MinMaxScaler(), StandardScaler(), RobustScaler(), Normalizer(), None],
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=list(range(1, 11)),
    classifier__p=[1, 2, 3, 4, 6, 8, 10],
    classifier__weights=['uniform', 'distance'],
    classifier__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Shuffled 10-fold splitter with a fixed seed, used as the search's CV scheme.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)


In [None]:
# Exhaustive search over `param_grid` for the pipeline, cross-validated with
# the shuffled 10-fold splitter defined above; train scores are kept so they
# appear in cv_results_.
#
# The former `iid=True` argument is intentionally not passed: `iid` was
# deprecated in scikit-learn 0.22 and removed in 0.24, so passing it raises
# TypeError on current releases. Without it, the reported CV score is the
# plain mean over folds rather than a mean weighted by fold size.
grid = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)
# Last expression of the cell: fit on the training split and show the fitted search object.
grid.fit(X_train, y_train)

In [None]:
# Summarise the finished grid search: best parameters, best mean CV score,
# and the score of the refitted best model on the held-out test split.
print("----------------- Обучили и тестировали -------------------")
best_params = grid.best_params_
print("Наилучшие параметры:\n{}\n".format(best_params))
best_cv_score = grid.best_score_
print("Средняя правильность для наилучшей модели кроссвалидации на валидационных тестовых наборах: {:.6f}\n".format(best_cv_score))
test_score = grid.score(X_test, y_test)
print("Правильность для наилучшей модели на тестовом наборе: {:.6f}\n".format(test_score))

# Full CV results table, best-ranked candidates first, transposed so that each
# candidate becomes a column.
gridresults = pd.DataFrame(grid.cv_results_)
ranked_results = gridresults.sort_values(by=["rank_test_score"])
display(ranked_results.transpose())