In [1]:
import numpy as np
import pandas as pd

# sklearn tools
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv('/Users/vaibhavrangan/Downloads/Stat_303-3/Datasets/heart_disease_classification.csv')

data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Response and predictors
y = data['target']
X = data.drop('target', axis=1)

In [4]:
# Separate the data into training and test sets

# Inputs:
    # data: X, y
    # Ratio of the datasets: using 80%-20% below
    # random_state for reproducibility
    # stratify: to keep the Class 0 to Class 1 ratio the same on the training and test sets
        # This input is very important for classifiers, not necessary for regressors
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20, stratify = y)

In [5]:
print(y_train.value_counts()/y_train.shape[0])
print(y_test.value_counts()/y_test.shape[0])

target
1    0.545455
0    0.454545
Name: count, dtype: float64
target
1    0.540984
0    0.459016
Name: count, dtype: float64


In [6]:
# IF NECESSARY -- scale the data
scaler = StandardScaler()

scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Cross Validation Functions

cross_val_score

In [7]:
# An array of hyperparam values to try
Ks = np.arange(10, 160, 10)

# An empty list to store the CV scores
scores = []

for K in Ks: # For each hyperparam value
    model = KNeighborsClassifier(n_neighbors=K) # Create the model -- keeping weights as default
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5) # Using the model and data, perform 5-fold cv
    scores.append(cv_scores) # Store the results

# The output of the function is a score for each fold -- accuracy for a classifier by default, can be changed

In [8]:
avg_cv_scores = np.array(scores).mean(axis=1)

print('Best avg CV score:', max(avg_cv_scores)*100)
print('Best K:', Ks[avg_cv_scores.argmax()])

Best avg CV score: 82.25340136054422
Best K: 30


cross_val_predict

In [9]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

# first let's do the greedy method, using the K value found above

tuned_model = KNeighborsClassifier(n_neighbors=Ks[avg_cv_scores.argmax()]) # hyperparameter already tuned

# cross_val_predict to get the cv prediction probabilities, so we can tune the threshold with them
# cross_val_predict output is the training data predictions WHEN EACH OBSERVATION WAS IN THE ASSESSMENT FOLD during cv
cv_probs = cross_val_predict(tuned_model, X_train_scaled, y_train, cv=5, method='predict_proba')[:, 1]

# method='predict_proba' makes the function return probabilities, not class predictions

# tuning the threshold
thrs = np.arange(0, 1.01, 0.01)

accs = []

for thr in thrs: # iterate through the thresholds
    cv_preds = cv_probs > thr # get the cv preds for each threshold
    acc = accuracy_score(y_train, cv_preds) # get the cv accuracy
    accs.append(acc) # store the results

print(np.array(accs).max()) # best accuracy
print(thrs[np.array(accs).argmax()]) # best threshold

0.8223140495867769
0.5


Not guaranteed to be the best combination because we haven't tuned threshold with different K hyperparameters

In [10]:
# the guaranteed method

# K and threshold arrays
Ks = np.arange(10, 160, 10)
thrs = np.arange(0, 1.01, 0.01)

cv_results = pd.DataFrame(columns=['K', 'threshold', 'accuracy']) # empty dataframe to store cv results

counter = 0

for K in Ks: # iterate through the K values
    model = KNeighborsClassifier(n_neighbors=K) # create the model for specific K value
    cv_probs = cross_val_predict(model, X_train_scaled, y_train, cv=5, method='predict_proba')[:, 1]
    
    for thr in thrs: # iterate through the thresholds
        cv_results.loc[counter, "K"] = K
        cv_results.loc[counter, "threshold"] = thr
        cv_results.loc[counter, "accuracy"] = accuracy_score(y_train, cv_probs > thr)
        counter += 1

cv_results.head()



Unnamed: 0,K,threshold,accuracy
0,10,0.0,0.636364
1,10,0.01,0.636364
2,10,0.02,0.636364
3,10,0.03,0.636364
4,10,0.04,0.636364


In [11]:
best_index = cv_results["accuracy"].argmax() # row index of best K and threshold combo

cv_results.iloc[best_index]

# greedy method missed the optimal combination

K                  20
threshold        0.55
accuracy     0.826446
Name: 156, dtype: object

In [12]:
len(cv_results)

1515

cross_validate

In [13]:
# if we want to find the cv results for multiple metrics at the same time, we can use cross_validate

from sklearn.model_selection import cross_validate

Ks = np.arange(10, 160, 10)

scores = []

for K in Ks:
    model = KNeighborsClassifier(n_neighbors=K)
    score = cross_validate(model, X_train_scaled, y_train, cv=5, scoring=['accuracy', 'precision', 'recall'])
    scores.append(score)

df_scores = pd.DataFrame(scores) # the test_... columns are of interest -- they are CV scores not test scores
df_scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall
0,"[0.0005309581756591797, 0.0003910064697265625,...","[0.004470109939575195, 0.0037910938262939453, ...","[0.8775510204081632, 0.8163265306122449, 0.812...","[0.92, 0.8214285714285714, 0.7931034482758621,...","[0.8518518518518519, 0.8518518518518519, 0.884..."
1,"[0.0003268718719482422, 0.00037288665771484375...","[0.0027701854705810547, 0.005188941955566406, ...","[0.8571428571428571, 0.7346938775510204, 0.854...","[0.8846153846153846, 0.7058823529411765, 0.806...","[0.8518518518518519, 0.8888888888888888, 0.961..."
2,"[0.0004458427429199219, 0.0011289119720458984,...","[0.00418400764465332, 0.003712892532348633, 0....","[0.8571428571428571, 0.7346938775510204, 0.875...","[0.8571428571428571, 0.7058823529411765, 0.833...","[0.8888888888888888, 0.8888888888888888, 0.961..."
3,"[0.0003230571746826172, 0.00032782554626464844...","[0.0027909278869628906, 0.005269050598144531, ...","[0.8775510204081632, 0.7551020408163265, 0.812...","[0.8888888888888888, 0.7142857142857143, 0.774...","[0.8888888888888888, 0.9259259259259259, 0.923..."
4,"[0.0006241798400878906, 0.0004286766052246094,...","[0.004588603973388672, 0.005477190017700195, 0...","[0.8571428571428571, 0.7142857142857143, 0.833...","[0.8571428571428571, 0.6756756756756757, 0.8, ...","[0.8888888888888888, 0.9259259259259259, 0.923..."
5,"[0.0003788471221923828, 0.0003268718719482422,...","[0.0027289390563964844, 0.004761219024658203, ...","[0.8367346938775511, 0.7142857142857143, 0.833...","[0.8518518518518519, 0.6756756756756757, 0.8, ...","[0.8518518518518519, 0.9259259259259259, 0.923..."
6,"[0.0003008842468261719, 0.0003142356872558594,...","[0.0027310848236083984, 0.003661632537841797, ...","[0.8979591836734694, 0.7142857142857143, 0.833...","[0.8666666666666667, 0.6756756756756757, 0.8, ...","[0.9629629629629629, 0.9259259259259259, 0.923..."
7,"[0.0002968311309814453, 0.0006330013275146484,...","[0.003280162811279297, 0.0032999515533447266, ...","[0.9183673469387755, 0.6938775510204082, 0.833...","[0.896551724137931, 0.6578947368421053, 0.8, 0...","[0.9629629629629629, 0.9259259259259259, 0.923..."
8,"[0.0003159046173095703, 0.0003330707550048828,...","[0.003093242645263672, 0.0027048587799072266, ...","[0.8979591836734694, 0.6938775510204082, 0.854...","[0.8666666666666667, 0.6578947368421053, 0.806...","[0.9629629629629629, 0.9259259259259259, 0.961..."
9,"[0.00024700164794921875, 0.00028228759765625, ...","[0.002557039260864258, 0.003240823745727539, 0...","[0.8571428571428571, 0.6938775510204082, 0.854...","[0.8125, 0.6578947368421053, 0.806451612903225...","[0.9629629629629629, 0.9259259259259259, 0.961..."


In [14]:
df_scores["avg_cv_accuracy"] = df_scores["test_accuracy"].apply(np.mean)
df_scores["avg_cv_precision"] = df_scores["test_precision"].apply(np.mean)
df_scores["avg_cv_recall"] = df_scores["test_recall"].apply(np.mean)
df_scores

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,avg_cv_accuracy,avg_cv_precision,avg_cv_recall
0,"[0.0005309581756591797, 0.0003910064697265625,...","[0.004470109939575195, 0.0037910938262939453, ...","[0.8775510204081632, 0.8163265306122449, 0.812...","[0.92, 0.8214285714285714, 0.7931034482758621,...","[0.8518518518518519, 0.8518518518518519, 0.884...",0.822109,0.81909,0.87151
1,"[0.0003268718719482422, 0.00037288665771484375...","[0.0027701854705810547, 0.005188941955566406, ...","[0.8571428571428571, 0.7346938775510204, 0.854...","[0.8846153846153846, 0.7058823529411765, 0.806...","[0.8518518518518519, 0.8888888888888888, 0.961...",0.822534,0.795223,0.917379
2,"[0.0004458427429199219, 0.0011289119720458984,...","[0.00418400764465332, 0.003712892532348633, 0....","[0.8571428571428571, 0.7346938775510204, 0.875...","[0.8571428571428571, 0.7058823529411765, 0.833...","[0.8888888888888888, 0.8888888888888888, 0.961...",0.822534,0.790562,0.924786
3,"[0.0003230571746826172, 0.00032782554626464844...","[0.0027909278869628906, 0.005269050598144531, ...","[0.8775510204081632, 0.7551020408163265, 0.812...","[0.8888888888888888, 0.7142857142857143, 0.774...","[0.8888888888888888, 0.9259259259259259, 0.923...",0.818197,0.783049,0.932194
4,"[0.0006241798400878906, 0.0004286766052246094,...","[0.004588603973388672, 0.005477190017700195, 0...","[0.8571428571428571, 0.7142857142857143, 0.833...","[0.8571428571428571, 0.6756756756756757, 0.8, ...","[0.8888888888888888, 0.9259259259259259, 0.923...",0.805952,0.772527,0.924501
5,"[0.0003788471221923828, 0.0003268718719482422,...","[0.0027289390563964844, 0.004761219024658203, ...","[0.8367346938775511, 0.7142857142857143, 0.833...","[0.8518518518518519, 0.6756756756756757, 0.8, ...","[0.8518518518518519, 0.9259259259259259, 0.923...",0.797704,0.770143,0.909402
6,"[0.0003008842468261719, 0.0003142356872558594,...","[0.0027310848236083984, 0.003661632537841797, ...","[0.8979591836734694, 0.7142857142857143, 0.833...","[0.8666666666666667, 0.6756756756756757, 0.8, ...","[0.9629629629629629, 0.9259259259259259, 0.923...",0.809949,0.770373,0.939316
7,"[0.0002968311309814453, 0.0006330013275146484,...","[0.003280162811279297, 0.0032999515533447266, ...","[0.9183673469387755, 0.6938775510204082, 0.833...","[0.896551724137931, 0.6578947368421053, 0.8, 0...","[0.9629629629629629, 0.9259259259259259, 0.923...",0.805782,0.767848,0.939316
8,"[0.0003159046173095703, 0.0003330707550048828,...","[0.003093242645263672, 0.0027048587799072266, ...","[0.8979591836734694, 0.6938775510204082, 0.854...","[0.8666666666666667, 0.6578947368421053, 0.806...","[0.9629629629629629, 0.9259259259259259, 0.961...",0.805867,0.763161,0.947009
9,"[0.00024700164794921875, 0.00028228759765625, ...","[0.002557039260864258, 0.003240823745727539, 0...","[0.8571428571428571, 0.6938775510204082, 0.854...","[0.8125, 0.6578947368421053, 0.806451612903225...","[0.9629629629629629, 0.9259259259259259, 0.961...",0.797704,0.752328,0.947009


In [15]:
df_scores.index = Ks
df_scores.head()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,avg_cv_accuracy,avg_cv_precision,avg_cv_recall
10,"[0.0005309581756591797, 0.0003910064697265625,...","[0.004470109939575195, 0.0037910938262939453, ...","[0.8775510204081632, 0.8163265306122449, 0.812...","[0.92, 0.8214285714285714, 0.7931034482758621,...","[0.8518518518518519, 0.8518518518518519, 0.884...",0.822109,0.81909,0.87151
20,"[0.0003268718719482422, 0.00037288665771484375...","[0.0027701854705810547, 0.005188941955566406, ...","[0.8571428571428571, 0.7346938775510204, 0.854...","[0.8846153846153846, 0.7058823529411765, 0.806...","[0.8518518518518519, 0.8888888888888888, 0.961...",0.822534,0.795223,0.917379
30,"[0.0004458427429199219, 0.0011289119720458984,...","[0.00418400764465332, 0.003712892532348633, 0....","[0.8571428571428571, 0.7346938775510204, 0.875...","[0.8571428571428571, 0.7058823529411765, 0.833...","[0.8888888888888888, 0.8888888888888888, 0.961...",0.822534,0.790562,0.924786
40,"[0.0003230571746826172, 0.00032782554626464844...","[0.0027909278869628906, 0.005269050598144531, ...","[0.8775510204081632, 0.7551020408163265, 0.812...","[0.8888888888888888, 0.7142857142857143, 0.774...","[0.8888888888888888, 0.9259259259259259, 0.923...",0.818197,0.783049,0.932194
50,"[0.0006241798400878906, 0.0004286766052246094,...","[0.004588603973388672, 0.005477190017700195, 0...","[0.8571428571428571, 0.7142857142857143, 0.833...","[0.8571428571428571, 0.6756756756756757, 0.8, ...","[0.8888888888888888, 0.9259259259259259, 0.923...",0.805952,0.772527,0.924501


In [16]:
# return at least 94% cv recall with highest possible cv precision
df_filtered = df_scores[["avg_cv_accuracy", "avg_cv_precision", "avg_cv_recall"]]

# filter for avg_cv_recall above 94%
df_filtered2 = df_filtered[df_filtered["avg_cv_recall"] > 0.94]

# filter for highest avg_cv_precision
df_filtered2.loc[df_filtered2["avg_cv_precision"].idxmax()]

avg_cv_accuracy     0.805867
avg_cv_precision    0.763161
avg_cv_recall       0.947009
Name: 90, dtype: float64

- what if we want to tune with multiple metrics like this, but tuning the threshold as well?
- then we cannot use cross_validate

cv objects

# a. GridSearchCV

In [17]:
from sklearn.model_selection import GridSearchCV

model = KNeighborsClassifier()

grid = {"n_neighbors": np.arange(10, 160, 10), "weights": ["uniform", "distance"]}

gscv = GridSearchCV(model, grid, cv=5, scoring="accuracy")

gscv.fit(X_train_scaled, y_train)

# .fit cross validates the model for each combo in the grid (15 K values * 2 weights = 30 combo)
# best values are found, the tuned model with these values is trained and ready to predict

In [18]:
print(gscv.best_params_)

# best avg CV score
print(gscv.best_score_)

{'n_neighbors': np.int64(30), 'weights': 'uniform'}
0.8225340136054422


after finding the tuned model:
    - for regression tuning is done
    - for classification, we need to tune the threshold as well
    - take gscv.best_estimator_ and use cross_val_predict as above

In [19]:
cv_probs = cross_val_predict(gscv.best_estimator_, X_train_scaled, y_train, cv=5, method='predict_proba')[:, 1]
thresholds = np.arange(0, 1.01, 0.01)

accs = []

for thr in thresholds:
    cv_preds = cv_probs > thr
    acc = accuracy_score(y_train, cv_preds)
    accs.append(acc)

print(np.array(accs).max())
print(thresholds[np.array(accs).argmax()])

0.8223140495867769
0.5


In [20]:
# test evaluation

# from the grid search object use the tuned model
y_pred_probs = gscv.best_estimator_.predict_proba(X_test_scaled)[:, 1]
y_pred_probs = gscv.predict_proba(X_test_scaled)[:, 1] # identical to previous line

tuned_thr = thresholds[np.array(accs).argmax()]
accuracy_score(y_test, y_pred_probs > tuned_thr)

0.7868852459016393

In [21]:
# let's see another attribute

# slicing out the columns of interest -- mean test score has avg cv performances
pd.DataFrame(gscv.cv_results_)[["param_n_neighbors", "param_weights", "mean_test_score"]].head()


Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
0,10,uniform,0.822109
1,10,distance,0.818112
2,20,uniform,0.822534
3,20,distance,0.818452
4,30,uniform,0.822534


In [22]:
# let's add a new input to grid search cv
model = KNeighborsClassifier()

grid = {"n_neighbors": np.arange(10, 160, 10), "weights": ["uniform", "distance"]}

# with multiple metrics, the refit input is used to specify which metric to use
gscv = GridSearchCV(model, grid, cv=5, scoring=["accuracy", "recall"], refit="recall")

gscv.fit(X_train_scaled, y_train)



In [23]:
df_results = pd.DataFrame(gscv.cv_results_)[["param_n_neighbors", "param_weights", "mean_test_accuracy", "mean_test_recall"]]
df_results.head()


Unnamed: 0,param_n_neighbors,param_weights,mean_test_accuracy,mean_test_recall
0,10,uniform,0.822109,0.87151
1,10,distance,0.818112,0.909687
2,20,uniform,0.822534,0.917379
3,20,distance,0.818452,0.925071
4,30,uniform,0.822534,0.924786


In [24]:
# now we can answer questions like "find the tuned model with at least 94% cv recall"
df_filtered = df_results[df_results["mean_test_recall"] > 0.94]
df_filtered.loc[df_filtered["mean_test_accuracy"].idxmax()]

param_n_neighbors           90
param_weights          uniform
mean_test_accuracy    0.805867
mean_test_recall      0.947009
Name: 16, dtype: object

In [25]:
# two more inputs to GridSearchCV

# verbose -- allows us to track where the grid search is at in terms of evaluation by printing outputs
# n_jobs -- how many processing units to use for parallel processing, -1 means use all available units

model = KNeighborsClassifier()

grid = {"n_neighbors": np.arange(10, 160, 10), "weights": ["uniform", "distance"]}

gscv = GridSearchCV(model, grid, cv=5, scoring=["accuracy", "recall"], refit="recall", verbose=3, n_jobs=-1)

gscv.fit(X_train_scaled, y_train)



Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END n_neighbors=10, weights=uniform; accuracy: (test=0.878) recall: (test=0.852) total time=   0.0s
[CV 2/5] END n_neighbors=10, weights=uniform; accuracy: (test=0.816) recall: (test=0.852) total time=   0.0s
[CV 4/5] END n_neighbors=10, weights=distance; accuracy: (test=0.750) recall: (test=0.885) total time=   0.0s
[CV 5/5] END n_neighbors=10, weights=distance; accuracy: (test=0.833) recall: (test=0.923) total time=   0.0s
[CV 2/5] END n_neighbors=10, weights=distance; accuracy: (test=0.776) recall: (test=0.852) total time=   0.0s
[CV 3/5] END n_neighbors=10, weights=uniform; accuracy: (test=0.812) recall: (test=0.885) total time=   0.0s
[CV 4/5] END n_neighbors=10, weights=uniform; accuracy: (test=0.750) recall: (test=0.846) total time=   0.0s
[CV 1/5] END n_neighbors=20, weights=uniform; accuracy: (test=0.857) recall: (test=0.852) total time=   0.0s
[CV 2/5] END n_neighbors=20, weights=uniform; accuracy: (test=0

# b. RandomizedSearchCV

- depending on the grid and the mode, the runtime for tuning the model can be very expensive
- one cheaper option is RandomizedSearchCV

In [26]:
from sklearn.model_selection import RandomizedSearchCV

# a RandomizedSearchCV object is very similar to a GridSearchCV object
# two extra inputs:
    # n_iter: how many random combinations to try
    # random_state: seed to control randomness

model = KNeighborsClassifier()

grid = {"n_neighbors": np.arange(10, 160, 10), "weights": ["uniform", "distance"]}

rscv = RandomizedSearchCV(model, grid, cv=5, scoring=["accuracy", "recall"], refit="accuracy", n_iter=10, random_state=12)

rscv.fit(X_train_scaled, y_train)

RandomizedSearchCV has the exact same methods and attributes as GridSearchCV
everything we did above with gridsearchcv is applicable to randomizedsearchcv

# KFold Objects

- during cross-validaiton we would want to give more settings to the creation of the folds
- we can put these settings together as one object
- we can set the cv input of the cv objects to this object

# a) KFold

In [27]:
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold

# nfolds - number of folds
# shuffle - whether or not to shuffle the data
# random_state - seed to control randomness

cv = KFold(n_splits=5, shuffle = True, random_state=12)

# after the object is ready, use it inside the grid or randomized search

model = KNeighborsClassifier()

grid = {"n_neighbors": np.arange(10, 160, 10), "weights": ["uniform", "distance"]}

gscv = GridSearchCV(model, grid, cv=cv, scoring=["accuracy", "recall"], refit="accuracy", n_jobs=-1)

gscv.fit(X_train_scaled, y_train)

# b) RepeatedKFold

In [28]:
# when we repeat the cross validation process N times, we create k folds N times and tune the model with k*N combinations
# virtually guarantees that any skew or bias in the training observations will be surpressed

cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=12)
# 50 folds

model = KNeighborsClassifier()

grid = {"n_neighbors": np.arange(10, 160, 10), "weights": ["uniform", "distance"]}

gscv = GridSearchCV(model, grid, cv=cv, scoring=["accuracy", "recall"], refit="accuracy", n_jobs=-1)

gscv.fit(X_train_scaled, y_train)

# c) StratifiedKFold

- same as KFold, only makes sure that the class ratio is preserved across all folds
- same inputs

# d) RepeatedStratifiedKFold

- same as RepeatedKFold, only makes sure that the class ratio is preserved across all folds
- same inputs