## Evaluation of ML models

### 1. Load the libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

### 2. Load the digits dataset

In [12]:
# Load the digits data and print how many samples belong to each class.
dataset = load_digits()
X = dataset.data
y = dataset.target
samples_per_class = np.bincount(dataset.target)
for digit_label, digit_count in zip(dataset.target_names, samples_per_class):
    print(digit_label, digit_count)

# Turn the ten-class target into an imbalanced binary one:
# digit 1 -> label 1, every other digit -> label 0.
# The mask is cast back to y's dtype, producing a new array; y itself is untouched.
y_binary_imbalanced = (y == 1).astype(y.dtype)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


### 3. SVM Classifier & confusion matrix

In [23]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# Split the imbalanced binary labels into train/test sets (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary_imbalanced, random_state=0
)

# Linear-kernel SVM with C=1
svm_linear = SVC(kernel='linear', C=1)
svm_linear.fit(X_train, y_train)
svm_pred = svm_linear.predict(X_test)
confusion_svm_linear = confusion_matrix(y_test, svm_pred)
print("SVM (linear, C=1): \n", confusion_svm_linear)

# RBF-kernel SVM with C=1 (gamma is not set here)
svm_rbf = SVC(kernel='rbf', C=1)
svm_rbf.fit(X_train, y_train)
# Note: svm_pred is rebound, so from here on it holds the RBF predictions.
svm_pred = svm_rbf.predict(X_test)
confusion_svm_rbf = confusion_matrix(y_test, svm_pred)
print("SVM (rbf, C=1): \n", confusion_svm_rbf)

SVM (linear, C=1): 
 [[402   5]
 [  5  38]]
SVM (rbf, C=1): 
 [[407   0]
 [  2  41]]


In [24]:
# RBF-kernel SVM with C=1, refit for each gamma to show how gamma changes the
# confusion matrix on the test set.
# svm_rbf, svm_pred and confusion_svm_rbf are rebound on every iteration, so once
# the loop finishes they hold the results for the last gamma in the list (10).
for gamma_value in [0.01, 0.1, 1, 10]:
    svm_rbf = SVC(kernel='rbf', C=1, gamma=gamma_value).fit(X_train, y_train)
    svm_pred = svm_rbf.predict(X_test)
    confusion_svm_rbf = confusion_matrix(y_test, svm_pred)
    print("SVM (rbf, C=1, gamma = {}): \n".format(gamma_value), confusion_svm_rbf)

SVM (rbf, C=1, gamma = 0.01): 
 [[407   0]
 [ 41   2]]
SVM (rbf, C=1, gamma = 0.1): 
 [[407   0]
 [ 43   0]]
SVM (rbf, C=1, gamma = 1): 
 [[407   0]
 [ 43   0]]
SVM (rbf, C=1, gamma = 10): 
 [[407   0]
 [ 43   0]]


### 4. Classification Report

In [15]:
from sklearn.metrics import classification_report
# Classification report for the RBF SVM with C=1 and gamma left unset.
# The model is fit here under its own name instead of reading the shared
# `svm_pred` variable: other cells rebind `svm_pred` (for example the gamma
# sweep leaves it holding the gamma=10 predictions), so the report would
# otherwise depend on which cell happened to run last.
report_pred = SVC(kernel='rbf', C=1).fit(X_train, y_train).predict(X_test)
print("Classification report: \n", classification_report(y_test, report_pred))

Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       407
           1       1.00      0.95      0.98        43

    accuracy                           1.00       450
   macro avg       1.00      0.98      0.99       450
weighted avg       1.00      1.00      1.00       450



### 5. Cross validation score

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
# Rebuild the data with a boolean target: True where the digit is 1.
dataset = load_digits()
X = dataset.data
y = (dataset.target == 1)
clf = SVC(kernel='linear', C=1)

# 5-fold cross-validation of the same estimator under three scorers.
# The first call passes no `scoring` argument; its result is printed as accuracy.
cv_scores_accuracy = cross_val_score(clf, X, y, cv=5)
cv_scores_auc = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
cv_scores_recall = cross_val_score(clf, X, y, cv=5, scoring='recall')

print('Cross-validation (accuracy)', cv_scores_accuracy)
print('Cross-validation (AUC)', cv_scores_auc)
print('Cross-validation (recall)', cv_scores_recall)

Cross-validation (accuracy) [0.91944444 0.98611111 0.97214485 0.97493036 0.96935933]
Cross-validation (AUC) [0.9641871  0.9976571  0.99372205 0.99699002 0.98675611]
Cross-validation (recall) [0.81081081 0.89189189 0.83333333 0.83333333 0.83333333]


### 6. GridSearchCV Evaluation Metrics

#### 6a. Load the data and set up the classifier and parameter grid

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Binary task: is the digit a 1? (boolean target)
dataset = load_digits()
X = dataset.data
y = (dataset.target == 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Candidate gamma values and an unfitted RBF estimator for the grid search
grid_values = {'gamma': [0.01, 0.1, 1, 10]}
clf = SVC(kernel='rbf')

# RBF model with C=1 fit on the training split, plus its test-set predictions
svm_rbf = SVC(kernel='rbf', C=1)
svm_rbf.fit(X_train, y_train)
svm_pred = svm_rbf.predict(X_test)

#### 6b. Check the ACCURACY

In [17]:
# Grid search over the gamma candidates; no `scoring` argument is given,
# and the best score is reported below as accuracy.
grid_clf_acc = GridSearchCV(estimator=clf, param_grid=grid_values)
grid_clf_acc.fit(X_train, y_train)

# Decision-function scores for the test set (computed but not used in this cell)
y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test)

print(
    'Grid best parameter (max. accuracy): ',
    grid_clf_acc.best_params_,
)
print(
    'Grid best score (accuracy): ',
    grid_clf_acc.best_score_,
)

Grid best parameter (max. accuracy):  {'gamma': 0.01}
Grid best score (accuracy):  0.9205672587085226


#### 6c. Check the evaluation METRICS (precision, recall, accuracy, f1) for each of the provided GAMMA values

In [22]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Binary task: is the digit a 1? (boolean target, so report rows read False/True)
dataset = load_digits()
X = dataset.data
y = (dataset.target == 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit one SVC per gamma (C fixed at 1) and print its test-set metrics.
# No kernel is passed to SVC here; the printed label describes it as rbf.
for this_gamma in (0.01, 0.1, 1.0, 10.0):
    clf = SVC(gamma=this_gamma, C=1)
    clf.fit(X_train, y_train)
    clf_pred = clf.predict(X_test)

    test_accuracy = accuracy_score(y_test, clf_pred)
    test_precision = precision_score(y_test, clf_pred)
    test_recall = recall_score(y_test, clf_pred)
    test_report = classification_report(y_test, clf_pred)

    print('Support Vector Classifier (rbf): gamma = {:.2f}'.format(this_gamma))
    print('Accuracy: {}'.format(test_accuracy))
    print('Precision: {}'.format(test_precision))
    print('Recall: {}'.format(test_recall))
    print("Classification report: \n", test_report)

Support Vector Classifier (rbf): gamma = 0.01
Accuracy: 0.9088888888888889
Precision: 1.0
Recall: 0.046511627906976744
Classification report: 
               precision    recall  f1-score   support

       False       0.91      1.00      0.95       407
        True       1.00      0.05      0.09        43

    accuracy                           0.91       450
   macro avg       0.95      0.52      0.52       450
weighted avg       0.92      0.91      0.87       450

Support Vector Classifier (rbf): gamma = 0.10
Accuracy: 0.9044444444444445
Precision: 0.0
Recall: 0.0
Classification report: 
               precision    recall  f1-score   support

       False       0.90      1.00      0.95       407
        True       0.00      0.00      0.00        43

    accuracy                           0.90       450
   macro avg       0.45      0.50      0.47       450
weighted avg       0.82      0.90      0.86       450

Support Vector Classifier (rbf): gamma = 1.00
Accuracy: 0.9044444444444445


### 7. Simple GridSearch Example Test

In [10]:
from sklearn.datasets import load_iris
from sklearn.svm import SVC

# Hand-rolled grid search over (gamma, C) on the iris data.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print("Size of training set: %d size of test set: %d" % (X_train.shape[0], X_test.shape[0]))

best_score = 0
# Initialised up front so the final print cannot raise NameError (or show a
# stale value left in the kernel) when no combination scores above 0.
best_parameters = None

# NOTE(review): each candidate is scored on X_test and the best one is kept,
# so the test set is used for model selection and "best score" is an
# optimistic estimate rather than an unbiased one.
for gamma in [0.01, 0.1, 1, 10]:
    for C in [0.01, 0.1, 1, 10]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        # Strict '>' keeps the first combination that reaches the top score.
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("best score: ", best_score)
print("best parameters: ", best_parameters)

Size of training set: 112 size of test set: 38
best score:  0.9736842105263158
best parameters:  {'C': 10, 'gamma': 0.01}
