# Diabetes Outcome: Classifier Evaluation and MLP

In [1]:
import numpy as np
import pandas as pd

## Classifier evaluation

### Binary classification

#### Loading data and EDA

In [None]:
# Read the diabetes dataset from a CSV file in the working directory and
# show the resulting frame as the cell output.
df_pid = pd.read_csv("diabetes.csv")
df_pid

In [None]:
# Inspect the dtype pandas inferred for each column.
df_pid.dtypes

In [None]:
# Re-type the target column as a pandas categorical (it is a class label,
# not a quantity).
df_pid['Outcome'] = df_pid['Outcome'].astype('category')

In [None]:
# Summary statistics for the columns describe() selects by default.
df_pid.describe()

In [None]:
# Summary of the categorical target column on its own.
df_pid['Outcome'].describe()

In [None]:
import plotly.express as px

In [None]:
# Pairwise scatter plots of the columns of df_pid, coloured by Outcome and
# drawn on a fixed 1800x1800 px canvas.
fig = px.scatter_matrix(df_pid, color='Outcome', height=1800, width=1800)
fig.show()

In [None]:
# One histogram per column, split by Outcome. The loop runs over every column
# of df_pid, so it also draws a histogram of 'Outcome' itself.
for feature in df_pid.columns:
  fig = px.histogram(df_pid, x=feature, color='Outcome')
  fig.show()

#### Data Splitting

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split, and therefore the same scores.
df_pid_train, df_pid_test = train_test_split(df_pid, test_size=0.2,
                                             random_state=42)

In [None]:
# Separate features from the target for both splits. The features are taken
# positionally as "every column except the last".
# NOTE(review): assumes 'Outcome' is the last column of diabetes.csv — confirm.
X_pid_train = df_pid_train.iloc[:, :-1]
y_pid_train = df_pid_train['Outcome']

X_pid_test = df_pid_test.iloc[:, :-1]
y_pid_test = df_pid_test['Outcome']

#### Building a k-NN QuAM

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

In [None]:
# Fit a 3-nearest-neighbour classifier on the (unscaled) training features.
knn_pid = KNeighborsClassifier(n_neighbors=3)
knn_pid.fit(X_pid_train, y_pid_train)

#### Accuracy Score

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the same data the model was fitted on.
yhat_pid_train = knn_pid.predict(X_pid_train)
accuracy_score(y_pid_train, yhat_pid_train)

In [None]:
# Accuracy on the held-out test split.
yhat_pid_test = knn_pid.predict(X_pid_test)
accuracy_score(y_pid_test, yhat_pid_test)

#### Precision Score

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

In [None]:
from sklearn.metrics import precision_score

In [None]:
# Precision on the training predictions, then on the test predictions.
display(precision_score(y_pid_train, yhat_pid_train))
display(precision_score(y_pid_test, yhat_pid_test))

#### Recall Score

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

In [None]:
from sklearn.metrics import recall_score

In [None]:
# Recall on the training predictions, then on the test predictions.
display(recall_score(y_pid_train, yhat_pid_train))
display(recall_score(y_pid_test, yhat_pid_test))

#### F1 Score

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

In [None]:
from sklearn.metrics import f1_score


In [None]:
# F1 score on the training predictions, then on the test predictions.
display(f1_score(y_pid_train, yhat_pid_train))
display(f1_score(y_pid_test, yhat_pid_test))

#### Classification Report

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 tables, first for the training predictions
# and then for the test predictions, with readable class names.
pid_class_names = ['No Diabetes', 'Diabetes']
for heading, y_true, y_pred in [
  ("On training data:", y_pid_train, yhat_pid_train),
  ("On test data:", y_pid_test, yhat_pid_test),
]:
  print(heading)
  print(classification_report(y_true, y_pred, target_names=pid_class_names))

#### Checking against a baseline

In [None]:
# Majority-class baseline on the training split: the accuracy obtained by
# always predicting the most frequent Outcome. It is computed from the labels
# rather than from hard-coded counts, which go stale whenever the split changes.
display(float(y_pid_train.value_counts(normalize=True).max()))

In [None]:
# Model accuracy on the training split, for comparison with the baseline.
display(accuracy_score(y_pid_train, yhat_pid_train))

In [None]:
# Majority-class baseline on the test split: the accuracy of always predicting
# the class that is most frequent in the training labels. It is computed from
# the data rather than from hard-coded counts, which go stale whenever the
# split changes.
majority_class = y_pid_train.mode().iloc[0]
display(float((y_pid_test == majority_class).mean()))

In [None]:
# Model accuracy on the test split, for comparison with the baseline.
display(accuracy_score(y_pid_test, yhat_pid_test))

### Multiclass classification

#### Loading data and EDA

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html

In [None]:
from sklearn.datasets import load_digits

In [None]:
# Load the digits dataset bundled with scikit-learn and print the
# description text stored under its 'DESCR' key.
data = load_digits() 

print(data['DESCR'])

In [None]:
# Put the feature matrix into a DataFrame and append the target as the last
# column, named 'label'.
df_digits = pd.DataFrame(data.data)
df_digits['label'] = data.target

In [None]:
# Show the assembled frame as the cell output.
df_digits

In [None]:
# Inspect the dtype of each column.
df_digits.dtypes

In [None]:
# Re-type the target column as a pandas categorical (it is a class label,
# not a quantity).
df_digits['label'] = df_digits['label'].astype('category')

In [None]:
# Summary statistics for the columns describe() selects by default.
df_digits.describe()

In [None]:
# Summary of the categorical target column on its own.
df_digits['label'].describe()

In [None]:
# Disabled on purpose: scatter matrix of every digits column.
# NOTE(review): presumably switched off because the requested 13000x13000 px
# figure is too heavy to render — confirm before re-enabling.
# px.scatter_matrix(df_digits, color='label', height=13000, width=13000)

In [None]:
# Disabled on purpose: one histogram per column of df_digits.
# NOTE(review): presumably switched off because of the number of figures it
# would produce — confirm before re-enabling.
# for column in df_digits.columns:
#   fig = px.histogram(df_digits, x=column, color='label')
#   fig.show()

#### Data Splitting

In [None]:
# Hold out 20% of the digits for testing. random_state pins the shuffle so the
# confusion matrices, scores and learning curve below are reproducible.
df_digits_train, df_digits_test = train_test_split(df_digits, test_size=0.2,
                                                   random_state=42)

# 'label' was appended as the last column, so "all but the last column" is
# exactly the feature set.
X_digits_train = df_digits_train.iloc[:, :-1]
y_digits_train = df_digits_train['label']

X_digits_test = df_digits_test.iloc[:, :-1]
y_digits_test = df_digits_test['label']

#### Building a k-NN QuAM

In [None]:
# Fit a 5-nearest-neighbour classifier on the digits training split, with
# p=1 as the degree of the distance metric.
knn_digits = KNeighborsClassifier(n_neighbors=5, p=1)
knn_digits.fit(X_digits_train, y_digits_train)

In [None]:
# Predictions on both splits, reused by the metric cells that follow.
yhat_digits_train = knn_digits.predict(X_digits_train)
yhat_digits_test  = knn_digits.predict(X_digits_test)

#### Confusion matrix

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of true labels vs. predictions on the training split.
confusion_matrix(y_digits_train, yhat_digits_train)

In [None]:
# Confusion matrix of true labels vs. predictions on the test split.
confusion_matrix(y_digits_test, yhat_digits_test)

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html

In [None]:
# plot_confusion_matrix was removed from recent scikit-learn releases in favour
# of ConfusionMatrixDisplay.from_estimator, which takes the same leading
# (estimator, X, y) arguments. Bind the old name to whichever is available so
# the plotting cells run unchanged on both older and newer versions.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, "from_estimator"):
  plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
else:
  from sklearn.metrics import plot_confusion_matrix

In [None]:
# Plot of the k-NN model's confusion matrix on the training split.
plot_confusion_matrix(knn_digits, X_digits_train, y_digits_train)

In [None]:
# Plot of the k-NN model's confusion matrix on the test split.
plot_confusion_matrix(knn_digits, X_digits_test, y_digits_test)

#### Other metrics

In [None]:
# Overall accuracy on the training split, then on the test split.
display(accuracy_score(y_digits_train, yhat_digits_train))
display(accuracy_score(y_digits_test,  yhat_digits_test))

In [None]:
# Per-class precision / recall / F1 tables, first for the training predictions
# and then for the test predictions.
for heading, y_true, y_pred in [
  ("On training data:", y_digits_train, yhat_digits_train),
  ("On test data:", y_digits_test, yhat_digits_test),
]:
  print(heading)
  print(classification_report(y_true, y_pred))

In [None]:
# Precision with average=None (no averaging across classes), for the training
# split and then the test split.
display(precision_score(y_digits_train, yhat_digits_train, average=None))
display(precision_score(y_digits_test,  yhat_digits_test,  average=None))

In [None]:
# Recall with average=None (no averaging across classes), for the training
# split and then the test split.
display(recall_score(y_digits_train, yhat_digits_train, average=None))
display(recall_score(y_digits_test,  yhat_digits_test,  average=None))

In [None]:
# F1 with average=None (no averaging across classes), for the training split
# and then the test split.
display(f1_score(y_digits_train, yhat_digits_train, average=None))
display(f1_score(y_digits_test,  yhat_digits_test,  average=None))

## Quick sanity check

In [None]:
# Sanity check with a single neighbour, evaluated on the training split itself.
# NOTE(review): each training point is presumably its own nearest neighbour, so
# this matrix is expected to be purely diagonal — confirm against the output.
nn_digits = KNeighborsClassifier(n_neighbors=1, p=1)
nn_digits.fit(X_digits_train, y_digits_train)

plot_confusion_matrix(nn_digits, X_digits_train, y_digits_train)

In [None]:
# The same 1-nearest-neighbour model evaluated on the held-out test split.
plot_confusion_matrix(nn_digits, X_digits_test,  y_digits_test)

## Choosing best hyperparameters

### Normalization

In [None]:
# Feature statistics of the training split before scaling.
X_pid_train.describe()

In [None]:
# Feature statistics of the test split before scaling.
X_pid_test.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler for the diabetes features; it is fitted on the training split only
# and then re-used to transform the test split.
pid_scaler = StandardScaler()

In [None]:
# Standardise the training features: fit the scaler on the training split and
# build a fresh float DataFrame from the result. Writing the scaled floats back
# through df_pid_train.iloc[...] would assign into a frame derived from df_pid
# whose columns keep whatever dtypes read_csv inferred; integer columns cannot
# hold the scaled values, which newer pandas versions warn about or reject.
X_pid_train = pd.DataFrame(pid_scaler.fit_transform(X_pid_train),
                           columns=X_pid_train.columns,
                           index=X_pid_train.index)
# Rebuild the combined frame (features first, Outcome last), because the
# validation split further down slices df_pid_train positionally.
df_pid_train = pd.concat([X_pid_train, y_pid_train], axis=1)

In [None]:
# Feature statistics of the training split after scaling.
X_pid_train.describe()

In [None]:
# Apply the scaler (transform only, no refit) to the test features and build a
# fresh float DataFrame from the result. Writing the scaled floats back through
# df_pid_test.iloc[...] would assign into a frame derived from df_pid whose
# columns keep whatever dtypes read_csv inferred; integer columns cannot hold
# the scaled values, which newer pandas versions warn about or reject.
X_pid_test = pd.DataFrame(pid_scaler.transform(X_pid_test),
                          columns=X_pid_test.columns,
                          index=X_pid_test.index)
# Keep the combined test frame in step with the scaled features
# (features first, Outcome last).
df_pid_test = pd.concat([X_pid_test, y_pid_test], axis=1)

In [None]:
# Feature statistics of the test split after scaling.
X_pid_test.describe()

In [None]:
# The fitted scaler's mean_ and scale_ attributes.
display(pid_scaler.mean_)
display(pid_scaler.scale_)

### The validation loop

In [None]:
# Carve a validation set (33%) out of the training data for hyperparameter
# selection. random_state pins the shuffle so the selected k and p are
# reproducible across runs.
df_pid_tr, df_pid_va = train_test_split(df_pid_train, test_size=0.33,
                                        random_state=42)

# Features are "every column except the last"; the target is 'Outcome'.
X_pid_tr = df_pid_tr.iloc[:, :-1]
y_pid_tr = df_pid_tr['Outcome']

X_pid_va = df_pid_va.iloc[:, :-1]
y_pid_va = df_pid_va['Outcome']

In [None]:
# Exhaustive search over k (1..9) and the distance degree p (1..4), scored by
# accuracy on the validation split. Only a strictly better score replaces the
# incumbent, so the first combination to reach the top accuracy wins ties.
best_accuracy, best_k, best_degree = 0., 0, 0
for k in range(1, 10):
  for degree in range(1, 5):
    candidate = KNeighborsClassifier(n_neighbors=k, p=degree)
    candidate.fit(X_pid_tr, y_pid_tr)

    candidate_accuracy = accuracy_score(y_pid_va, candidate.predict(X_pid_va))
    if not candidate_accuracy > best_accuracy:
      continue
    best_accuracy, best_k, best_degree = candidate_accuracy, k, degree

print("best k:", best_k)
print("best distance metric: L" + str(best_degree))
print("best accuracy:", best_accuracy)

### The final test

In [None]:
# Refit on the full (scaled) training split with the hyperparameters chosen by
# the validation loop, then score once on the test split.
knn_pid_2 = KNeighborsClassifier(n_neighbors=best_k, p=best_degree)
knn_pid_2.fit(X_pid_train, y_pid_train)

yhat_pid_test_2 = knn_pid_2.predict(X_pid_test)
print("Accuracy on test data:", accuracy_score(y_pid_test, yhat_pid_test_2))

### $k$-Fold cross-validation

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html?highlight=kfold

In [None]:
from sklearn.model_selection import KFold

In [None]:
n_folds = 5
kf = KFold(n_splits=n_folds)

# Hand-rolled cross-validated search over k (1..7) and the distance degree p
# (1..4). KFold is created without shuffling, so the fold indices are the same
# for every combination and can be computed once up front.
fold_indices = list(kf.split(X_pid_train))

best_accuracy, best_k, best_degree = 0., 0, 0
for k in range(1, 8):
  for degree in range(1, 5):
    accuracy_sum_temp = 0.
    for fit_rows, held_out_rows in fold_indices:
      fold_model = KNeighborsClassifier(n_neighbors=k, p=degree)
      fold_model.fit(X_pid_train.iloc[fit_rows, :],
                     y_pid_train.iloc[fit_rows])

      held_out_predictions = fold_model.predict(
        X_pid_train.iloc[held_out_rows, :])
      accuracy_sum_temp += accuracy_score(y_pid_train.iloc[held_out_rows],
                                          held_out_predictions)

    # Mean accuracy across the folds; only a strictly better mean replaces
    # the incumbent.
    accuracy_temp = accuracy_sum_temp / n_folds
    if accuracy_temp > best_accuracy:
      best_accuracy, best_k, best_degree = accuracy_temp, k, degree

print("best k:", best_k)
print("best distance metric: L" + str(best_degree))
print("best accuracy:", best_accuracy)

### Grid search

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Cross-validated grid search (5 folds, scored by accuracy) over
# n_neighbors 2..7 and distance degree p 1..4.
knn_pid_param_grid = {
  'n_neighbors': [2, 3, 4, 5, 6, 7],
  'p': [1, 2, 3, 4],
}
knn_pid_search = GridSearchCV(
  estimator=KNeighborsClassifier(),
  param_grid=knn_pid_param_grid,
  scoring='accuracy',
  cv=5,
)

In [None]:
# Run the grid search on the (scaled) training split.
knn_pid_search.fit(X_pid_train, y_pid_train)

In [None]:
# Show the estimator the search selected.
knn_pid_search.best_estimator_

## Learning curves

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
# Learning curve for a default k-NN classifier on the digits training split:
# 10-fold cross-validated accuracy at 51 training-set fractions spaced evenly
# between 1% and 100%.
data_sizes, training_scores, validation_scores = learning_curve(
  KNeighborsClassifier(),
  X=X_digits_train,
  y=y_digits_train,
  train_sizes=np.linspace(0.01, 1.0, 51),
  cv=10,
  scoring='accuracy',
)

In [None]:
# First value returned by learning_curve: the training-set sizes used.
display(data_sizes)

In [None]:
# Training scores returned by learning_curve, followed by the array's shape.
display(training_scores)
display(training_scores.shape)

In [None]:
# Validation scores returned by learning_curve, followed by the array's shape.
display(validation_scores)
display(validation_scores.shape)

In [None]:
# Collapse axis 1 of the training scores into a mean and a standard deviation.
# NOTE(review): axis 1 is presumably the cross-validation folds, giving one
# value per training-set size — confirm against the shapes displayed above.
training_mean = training_scores.mean(axis=1) 
training_standard_deviation = training_scores.std(axis=1) 

In [None]:
# Collapse axis 1 of the validation scores into a mean and a standard deviation.
# NOTE(review): axis 1 is presumably the cross-validation folds, giving one
# value per training-set size — confirm against the shapes displayed above.
validation_mean = validation_scores.mean(axis=1) 
validation_standard_deviation = validation_scores.std(axis=1)

In [None]:
import plotly.graph_objects as go

In [None]:
fig = go.Figure()

# Each curve is drawn as three traces, in this order: the mean line, an
# invisible lower edge (mean - std) and an invisible upper edge (mean + std).
# The upper edge uses fill='tonexty', which shades the area back to the trace
# added just before it, so the order of the three add_trace calls matters.
curve_specs = [
  ('Training', training_mean, training_standard_deviation,
   'red', 'rgba(255, 0, 0, 0.3)'),
  ('Validation', validation_mean, validation_standard_deviation,
   'blue', 'rgba(0, 0, 255, 0.3)'),
]
for label, mean_scores, std_scores, colour, band_fill in curve_specs:
  fig.add_trace(go.Scatter(x=data_sizes,
                           y=mean_scores,
                           mode='lines',
                           name=label,
                           line=dict(color=colour)))
  fig.add_trace(go.Scatter(x=data_sizes,
                           y=mean_scores - std_scores,
                           mode='lines',
                           name=label + ' lower bound',
                           line=dict(width=0, color=colour),
                           showlegend=False))
  fig.add_trace(go.Scatter(x=data_sizes,
                           y=mean_scores + std_scores,
                           mode='lines',
                           name=label + ' upper bound',
                           line=dict(width=0, color=colour),
                           fill='tonexty',
                           fillcolor=band_fill,
                           showlegend=False))

fig.update_layout(title='Learning curve',
                  xaxis_title='Dataset size',
                  yaxis_title='Accuracy')
fig.show()

That's all Folks!