In [None]:
# sklearn tutorial for mhealth23

# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
# standard imports
import numpy as np
import pandas as pd

# we'll be using the wine dataset as a toy example: https://archive.ics.uci.edu/ml/datasets/wine
from sklearn.datasets import load_wine

# we'll evaluate a decision tree, a random forest, a support vector machine and a k-nearest neighbour classifier
from sklearn.tree import DecisionTreeClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
from sklearn.tree import plot_tree # to visualize the decision tree
from sklearn.neighbors import KNeighborsClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.ensemble import RandomForestClassifier # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.svm import SVC # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

# to create a test and train split
from sklearn.model_selection import train_test_split

# evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# for plotting (you should only need to run the next two lines once)
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-leaflet
# !pip install install nodejs
import matplotlib.pyplot as plt
%matplotlib widget

In [None]:
# load the wine toy dataset and wrap it in pandas objects
wine = load_wine()

# feature matrix: one row per wine, one named column per measured property
X = pd.DataFrame(wine.data, columns = wine.feature_names)

# target: which cultivator the wine came from, kept as a single-column frame named 'label'
y = pd.DataFrame({'label': wine.target})

# quick look at the data: summary statistics, first feature rows, first labels
separator = '#' * 60
for summary in (X.describe(), X.head(), y.head()):
    print(separator, end = '\n\n')
    print(summary)

In [None]:
# DT: quick fit of a deliberately shallow tree
# (max_depth = 2 is NOT the default; everything else is left at its default value)
clf = DecisionTreeClassifier(max_depth = 2)
clf.fit(X, y)

# visualizing the decision tree
# - feature_names is passed as a plain list: some scikit-learn releases (e.g. 1.2.x)
#   reject a pandas Index here and only accept a list
# - plot_tree's max_depth = 1 only limits how many levels are drawn, so the deepest
#   level of the fitted depth-2 tree is truncated in the figure
plt.figure(figsize=(10,5))
plot_tree(clf, feature_names = list(X.columns), max_depth = 1)
plt.show()

In [None]:
# prediction
# NOTE: X is the same data the classifier was fitted on in the cells above,
# so these are predictions on the training samples (no held-out data yet)
y_preds = clf.predict(X)
print(y_preds)

In [None]:
# performance assessment: overall accuracy, then the per-class confusion matrix
train_acc = accuracy_score(y, y_preds)
train_cm = confusion_matrix(y, y_preds)
print(train_acc, train_cm, sep = '\n\n')

In [None]:
# accuracy on the fitting data as a function of tree depth (max_depth = 1 ... 9)
for md in range(1, 10):
    # fit() returns the estimator itself, so construction and fitting can be chained
    clf = DecisionTreeClassifier(max_depth = md).fit(X, y)
    y_preds = clf.predict(X)
    depth_acc = np.round(accuracy_score(y, y_preds), 2)
    print(f"{md}: {depth_acc}")

In [None]:
# refit a depth-3 tree on the full dataset and predict on that same data
clf = DecisionTreeClassifier(max_depth = 3).fit(X, y)
y_preds = clf.predict(X)

# draw the confusion matrix with the dataset's class names as tick labels
cm = confusion_matrix(y, y_preds)
cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = wine['target_names'])
cm_display.plot()
plt.show()

In [None]:
# for each metric: first the class-weighted average, then the per-class values
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   f1        = 2 * (precision * recall) / (precision + recall)
metric_reports = (
    ('precision: {}', precision_score),
    ('recall {}', recall_score),
    ('f1 {}', f1_score),
)
for template, metric in metric_reports:
    weighted = metric(y, y_preds, average = 'weighted')
    per_class = metric(y, y_preds, average = None)
    print(template.format(np.round(weighted, 3)))
    print(np.round(per_class, 3), end = '\n\n')

In [None]:
# Train and test split: 80% of the rows for fitting, 20% held out for testing
# random_state is fixed so the split (and every number computed from it) is the same
# each time the notebook is re-run from a fresh kernel
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

In [None]:
# Generalization error: fit an unrestricted tree on the training part only,
# then compare its performance on the data it has seen vs. the held-out data
clf = DecisionTreeClassifier().fit(X_train, y_train)
train_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)

# performance on the samples used for fitting
print("train accuracy:")
print(confusion_matrix(y_train, train_preds), accuracy_score(y_train, train_preds), sep = '\n')

# blank line + divider, then performance on the held-out samples
print('\n' + '#' * 20)
print("test accuracy:")
print(confusion_matrix(y_test, test_preds), accuracy_score(y_test, test_preds), sep = '\n')

In [None]:
# cross validation: DT
# (implemented as 100 repeated random 80/20 train/test splits per setting, not k-fold)
# common parameter(s) to tune: max_depth
for md in np.arange(10)+1:
    # test accuracies of all repetitions for the current max_depth
    accs = list()
    for i in range(100):
        # seeding with the repetition index makes the results reproducible and
        # evaluates every max_depth on the same 100 splits
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = DecisionTreeClassifier(max_depth = md, random_state = i)
        clf.fit(X_train, y_train)

        test_preds = clf.predict(X_test)

        accs.append(accuracy_score(y_test, test_preds))

    # mean held-out accuracy for this max_depth
    print("{}:\t{}".format(md, np.round(np.mean(accs), 2)))

In [None]:
# cross validation: RF
# (implemented as 10 repeated random 80/20 train/test splits per setting, not k-fold)
# common parameter(s) to tune: max_depth and n_estimators
# (only max_depth is varied here; n_estimators is fixed at 100)
for md in np.arange(5)+1:
    # test accuracies of all repetitions for the current max_depth
    accs = list()
    for i in range(10):
        # seeding with the repetition index makes the results reproducible and
        # evaluates every max_depth on the same 10 splits
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = RandomForestClassifier(max_depth = md, n_estimators = 100, random_state = i)
        # y_train is a single-column DataFrame; flatten it to the 1-d label array
        clf.fit(X_train, np.array(y_train).ravel())

        test_preds = clf.predict(X_test)

        accs.append(accuracy_score(y_test, test_preds))

    # mean held-out accuracy for this max_depth
    print("{}:\t{}".format(md, np.round(np.mean(accs), 2)))

In [None]:
# cross validation: KNN
# (implemented as 10 repeated random 80/20 train/test splits per setting, not k-fold)
# common parameter(s) to tune: n_neighbors
for nb in np.arange(3, 20)+1:  # i.e. n_neighbors = 4 ... 20
    # test accuracies of all repetitions for the current n_neighbors
    accs = list()
    for i in range(10):
        # seeding with the repetition index makes the results reproducible and
        # evaluates every n_neighbors on the same 10 splits
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = KNeighborsClassifier(n_neighbors = nb)
        # y_train is a single-column DataFrame; flatten it to the 1-d label array
        clf.fit(X_train, np.array(y_train).ravel())

        test_preds = clf.predict(X_test)

        accs.append(accuracy_score(y_test, test_preds))

    # mean held-out accuracy for this n_neighbors
    print("{}:\t{}".format(nb, np.round(np.mean(accs), 2)))

In [None]:
# cross validation: SVM
# (implemented as 10 repeated random 80/20 train/test splits per setting, not k-fold)
# common parameter(s) to tune: kernel and C
# (only C is varied here, from 0.1 to 1.0; the kernel is fixed to 'linear')
for c in np.arange(1, 11)/10:
    # test accuracies of all repetitions for the current C
    accs = list()
    for i in range(10):
        # seeding with the repetition index makes the results reproducible and
        # evaluates every C on the same 10 splits
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = i)

        clf = SVC(kernel = 'linear', C = c)
        # y_train is a single-column DataFrame; flatten it to the 1-d label array
        clf.fit(X_train, np.array(y_train).ravel())

        test_preds = clf.predict(X_test)

        accs.append(accuracy_score(y_test, test_preds))

    # mean held-out accuracy for this C
    print("{}:\t{}".format(c, np.round(np.mean(accs), 2)))