Standardization and k-NN
---

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame; 'heart-numerical.csv' is resolved
# relative to the notebook's working directory.
data_df = pd.read_csv(filepath_or_buffer='heart-numerical.csv')

# Show summary statistics of the loaded columns as the cell output
data_df.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Target vector: the 'disease' column; feature matrix: every other column
y = data_df['disease'].values
X = data_df.drop(columns='disease').values

# Hold out 30% of the rows as a test set; the fixed seed makes the split
# reproducible across runs of the notebook.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardize the features, then classify with k-NN (k=5).
# The step names 'scaler' and 'knn' are used by later set_params() calls.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
knn_pipe = Pipeline(steps=pipeline_steps)

# Train on the training split
knn_pipe.fit(X_tr, y_tr)

# Score on the held-out split and report it with three decimals
accuracy = knn_pipe.score(X_te, y_te)
print('Accuracy: {:.3f}'.format(accuracy))

In [None]:
from sklearn.base import clone

# Baseline without standardization: copy the pipeline (unfitted) and disable
# its 'scaler' step so k-NN sees the raw feature values.
knn_pipe2 = clone(knn_pipe).set_params(scaler=None)
knn_pipe2.fit(X_tr, y_tr)

# Same evaluation as for the scaled pipeline, for a direct comparison
unscaled_accuracy = knn_pipe2.score(X_te, y_te)
print('Accuracy: {:.3f}'.format(unscaled_accuracy))

In [None]:
import numpy as np

# Candidate neighborhood sizes: k=1 followed by 5, 10, ..., 100
multiples_of_five = np.arange(5, 101, 5)
k_values = np.concatenate(([1], multiples_of_five))
k_values

In [None]:
# One record (dict) per candidate k, collected before building the DataFrame
records = []

for k in k_values:
    # Update the number of neighbors on the shared pipeline and refit it
    knn_pipe.set_params(knn__n_neighbors=k).fit(X_tr, y_tr)

    # Score the refitted pipeline on both splits
    train_acc = knn_pipe.score(X_tr, y_tr)
    test_acc = knn_pipe.score(X_te, y_te)

    records.append({
        'k': k,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc
    })

# Tabulate the results and show the five best k values by test accuracy
gs_results = pd.DataFrame(records)
gs_results.sort_values(by='test_accuracy', ascending=False).head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Validation curves: train and test accuracy as a function of k
fig, ax = plt.subplots()
curves = [
    ('train_accuracy', 'train curve'),
    ('test_accuracy', 'test curve'),
]
for column, curve_label in curves:
    ax.plot(gs_results['k'], gs_results[column], label=curve_label)

ax.set_ylabel('accuracy')
ax.set_xlabel('k')
ax.legend()
plt.show()

In [None]:
# Run the evaluation several times, each on a different random train/test
# split, so that the mean and spread of the accuracies can be computed per k.
run_records = []
for run_idx in range(30):
    # Split into train/test sets, seeded by the run index for reproducibility.
    # Run-local names are used on purpose: rebinding X_tr/X_te/y_tr/y_te here
    # would silently replace the notebook-wide split, so every other cell
    # would end up working on the split of the last run instead.
    X_tr_run, X_te_run, y_tr_run, y_te_run = train_test_split(
        X, y, test_size=0.3, random_state=run_idx)

    # Grid search over the candidate k values on this run's split
    for k in k_values:
        # Fit k-NN model with the current number of neighbors
        knn_pipe.set_params(knn__n_neighbors=k)
        knn_pipe.fit(X_tr_run, y_tr_run)

        # Save k, the run it belongs to, and the train/test performance
        run_records.append({
            'k': k,
            'run_idx': run_idx,
            'train_accuracy': knn_pipe.score(X_tr_run, y_tr_run),
            'test_accuracy': knn_pipe.score(X_te_run, y_te_run)
        })

# Convert results to DataFrame: one row per (run_idx, k) pair
gs_results = pd.DataFrame(run_records)

In [None]:
# Group the per-run results by the number of neighbors k
grouped = gs_results.groupby('k')

# Per-k mean and standard deviation of the accuracies across runs
# (each result is a Series indexed by k)
mean_tr = grouped['train_accuracy'].mean()
std_tr = grouped['train_accuracy'].std()
mean_te = grouped['test_accuracy'].mean()
std_te = grouped['test_accuracy'].std()

In [None]:
fig, ax = plt.subplots()

# Mean train/test accuracy for every candidate k
ax.plot(k_values, mean_tr, label='train')
ax.plot(k_values, mean_te, label='test')

# Mark the k with the highest mean test accuracy
best_k = mean_te.idxmax()
ax.scatter(best_k, mean_te.max(), marker='x', c='red', zorder=10)

# Shade a band of one standard deviation around each mean curve
for center, spread in ((mean_tr, std_tr), (mean_te, std_te)):
    ax.fill_between(k_values, center - spread, center + spread, alpha=0.2)

ax.set_title('Best k: {} with {:.1f}% accuracy'.format(best_k, 100*mean_te[best_k]))
ax.set_ylabel('accuracy')
ax.set_xlabel('k')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Fresh scaler + k-NN pipeline using the selected number of neighbors,
# fitted on the training split currently bound to X_tr / y_tr
tuned_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=best_k)),
]).fit(X_tr, y_tr)

# Predict the test split and print per-class precision/recall/F1
y_te_preds = tuned_knn.predict(X_te)
report = classification_report(y_true=y_te, y_pred=y_te_preds)
print(report)

In [None]:
# Class probabilities for the test split: one row per sample, one column per
# class (column order follows tuned_knn.classes_)
y_te_probs = tuned_knn.predict_proba(X_te)

# Show the first few hard predictions next to their probabilities
n_shown = 5
print('Predictions:', y_te_preds[:n_shown])
print('Probs:')
print(y_te_probs[:n_shown])

In [None]:
from sklearn.metrics import roc_curve

# Scores for the positive class. NOTE(review): column 1 is assumed to be the
# 'presence' class — confirm against tuned_knn.classes_.
presence_scores = y_te_probs[:, 1]

# False/true positive rates at each decision threshold, with 'presence'
# treated as the positive label
fpr, tpr, thresholds = roc_curve(
    y_true=y_te,
    y_score=presence_scores,
    pos_label='presence',
)

# Tabulate the curve points for inspection as the cell output
roc_table = pd.DataFrame({
    'fpr': fpr,
    'tpr (recall)': tpr,
    'thresholds': thresholds
})
roc_table

In [None]:
# Draw the ROC curve: true positive rate against false positive rate
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Recall)')
ax.set_title('ROC curve for "presence"')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve (AUC), computed from the column-1 probabilities
# and displayed as the cell output
presence_scores = y_te_probs[:, 1]
roc_auc_score(y_true=y_te, y_score=presence_scores)

In [None]:
# Custom decision rule: predict 'absence' only when the column-1 probability
# is below the threshold, otherwise predict 'presence'
threshold = 0.2
custom_preds = [
    'absence' if prob < threshold else 'presence'
    for prob in y_te_probs[:, 1]
]

# Per-class metrics under the adjusted threshold
print(classification_report(y_true=y_te, y_pred=custom_preds))