## Evaluation of ML models

### 1. Load the libraries

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

### 2. Load the digits dataset

In [15]:
# Load the scikit-learn digits dataset: feature matrix X and digit labels y.
dataset = load_digits()
X = dataset.data
y = dataset.target

# Print the number of samples per digit class (one "<class> <count>" line each).
samples_per_class = np.bincount(y)
for digit, n_samples in zip(dataset.target_names, samples_per_class):
    print(digit, n_samples)

# Collapse the ten classes into a binary problem: digit 1 stays 1, every other
# digit becomes 0. The result is a new array with the same dtype as `y`, so the
# original labels are left untouched.
y_binary_imbalanced = (y == 1).astype(y.dtype)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


### 3. SVM Classifier & confusion matrix

In [16]:
from sklearn.metrics import confusion_matrix

# Split the features and the binary (digit 1 vs. rest) labels into train/test
# sets; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state = 0)

# Linear kernel with C=1.
svm_linear = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_linear_pred = svm_linear.predict(X_test)
confusion_svm_linear = confusion_matrix(y_test, svm_linear_pred)
print("SVM (linear, C=1): \n", confusion_svm_linear)

# RBF kernel with C=1.
svm_rbf = SVC(kernel='rbf', C=1).fit(X_train, y_train)
svm_rbf_pred = svm_rbf.predict(X_test)
confusion_svm_rbf = confusion_matrix(y_test, svm_rbf_pred)
print("SVM (rbf, C=1): \n", confusion_svm_rbf)

# `svm_pred` is read by the classification-report cell; bind it explicitly to
# the RBF predictions so it is clear which model that report describes.
svm_pred = svm_rbf_pred

SVM (linear, C=1): 
 [[402   5]
 [  5  38]]
SVM (rbf, C=1): 
 [[407   0]
 [  2  41]]


### 4. Classification Report

In [17]:
from sklearn.metrics import classification_report

# Text report comparing the held-out labels with the predictions currently
# stored in `svm_pred`.
report = classification_report(y_test, svm_pred)
print("Classification report: \n", report)

Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       407
           1       1.00      0.95      0.98        43

    accuracy                           1.00       450
   macro avg       1.00      0.98      0.99       450
weighted avg       1.00      1.00      1.00       450



### 5. Cross validation score

In [18]:
from sklearn.model_selection import cross_val_score

# Reload the digits and build a boolean target: True for digit 1, False otherwise.
dataset = load_digits()
X = dataset.data
y = dataset.target == 1
clf = SVC(kernel='linear', C=1)

# Run 5-fold cross-validation once per metric. scoring=None is the default and
# uses the estimator's own score method, which the first label reports as accuracy.
for label, scoring in (
    ('Cross-validation (accuracy)', None),
    ('Cross-validation (AUC)', 'roc_auc'),
    ('Cross-validation (recall)', 'recall'),
):
    print(label, cross_val_score(clf, X, y, cv=5, scoring=scoring))

Cross-validation (accuracy) [0.91944444 0.98611111 0.97214485 0.97493036 0.96935933]
Cross-validation (AUC) [0.9641871  0.9976571  0.99372205 0.99699002 0.98675611]
Cross-validation (recall) [0.81081081 0.89189189 0.83333333 0.83333333 0.83333333]


### 6. GridSearchCV Evaluation Metrics

#### 6a. Tune gamma with GridSearchCV and check accuracy

In [19]:
# Search over the RBF kernel's gamma. This cell reuses the X_train / X_test /
# y_train / y_test split created in an earlier cell.
clf = SVC(kernel='rbf')
grid_values = {'gamma': [0.01, 0.1, 1, 10]}

# No `scoring` argument is passed, so candidates are ranked with the
# estimator's default score (reported below as accuracy).
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values)
grid_clf_acc.fit(X_train, y_train)

# Decision-function scores of the fitted search object on the held-out data.
y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test)

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

Grid best parameter (max. accuracy):  {'gamma': 0.01}
Grid best score (accuracy):  0.9205672587085226


#### 6b. Compare the evaluation metrics (precision, recall, accuracy, f1) across the provided gamma values

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Reload the digits with a boolean target (True for digit 1) and make a
# reproducible train/test split.
dataset = load_digits()
X, y = dataset.data, dataset.target == 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit one RBF-kernel SVC per gamma value and report the test-set metrics.
for this_gamma in [0.01, 0.1, 1.0, 10.0]:
    clf = SVC(kernel='rbf', gamma=this_gamma, C=1).fit(X_train, y_train)
    clf_pred = clf.predict(X_test)
    print('Support Vector Classifier (rbf): gamma = {:.2f}'.format(this_gamma))
    print('Accuracy: {}'.format(accuracy_score(y_test, clf_pred)))
    print('Precision: {}'.format(precision_score(y_test, clf_pred)))
    print('Recall: {}'.format(recall_score(y_test, clf_pred)))
    # F1 is listed among the metrics for this section, so report it as well.
    print('F1: {}'.format(f1_score(y_test, clf_pred)))

### 7. Simple GridSearch Example Test

In [24]:
from sklearn.metrics import precision_score, recall_score

# Reproducible split of the features against the binary (digit 1 vs. rest) labels.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state = 0)

# Track the best precision seen so far and the parameters that produced it.
# best_parameters_p starts as None so the final print cannot raise NameError
# when no candidate achieves a precision above 0.
best_score = 0
best_parameters_p = None

# NOTE(review): candidates are compared on the test split, so the reported
# best score is an optimistic estimate; a separate validation split or
# cross-validation would be needed for an unbiased one.
for gamma in [0.01, 0.1, 1, 10]:
    for C in [0.01, 0.1, 1, 10]:
        svm = SVC(gamma=gamma, C=C).fit(X_train, y_train)
        y_pred = svm.predict(X_test)
        candidate_precision = precision_score(y_test, y_pred)
        # Strict '>' keeps the first combination that reaches the best score.
        if candidate_precision > best_score:
            best_score = candidate_precision
            best_parameters_p = {'C': C, 'gamma': gamma}

print("best precision score: ", best_score)
print("best parameters precision: ", best_parameters_p)

best precision score:  1.0
best parameters precision:  {'C': 1, 'gamma': 0.01}


### 8. Logistic Regression Cross-validation Score

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Score a default-configured logistic regression on the iris data with
# 3-fold cross-validation and print the per-fold scores.
iris = load_iris()
features, labels = iris.data, iris.target
logreg = LogisticRegression()

scores = cross_val_score(estimator=logreg, X=features, y=labels, cv=3)
print("Cross validation scores {}".format(scores))

Cross validation scores [0.98 0.96 0.98]
