In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Load libraries - SVC and GridSearch to be used

In [15]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

### Load dataset and do test-train split

In [16]:
dataset = load_digits()
X, y = dataset.data, dataset.target
for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name,class_count)

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [17]:
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0
print("original labels: ", y[1:30])
print("new labels: ", y_binary_imbalanced[1:30])

original labels:  [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
new labels:  [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [18]:
np.bincount(y_binary_imbalanced)

array([1615,  182], dtype=int64)

In [19]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state = 0)
svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9955555555555555

In [20]:
from sklearn.dummy import DummyClassifier
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_dummy = dummy_majority.predict(X_test)
y_dummy

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
dummy_majority.score(X_train, y_train)

0.896807720861173

In [22]:
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9777777777777777

# Confusion Matrix

In [24]:
from sklearn.metrics import confusion_matrix
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_majority_pred = dummy_majority.predict(X_test)
confusion = confusion_matrix(y_test, y_majority_pred)
print("Most frequent class (dummy classifier): \n", confusion)


Most frequent class (dummy classifier): 
 [[407   0]
 [ 43   0]]


## Proportional Classifier

In [26]:
dummy_majority_prop = DummyClassifier(strategy='stratified').fit(X_train, y_train)
y_class_prop = dummy_majority_prop.predict(X_test)
confusion_prop = confusion_matrix(y_test, y_class_prop)
print("Random class proportional prediction (dummy classifier): \n", confusion_prop)

Random class proportional prediction (dummy classifier): 
 [[368  39]
 [ 38   5]]


## SVM Linear Classifier

In [27]:
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_pred = svm.predict(X_test)
confusion_svm = confusion_matrix(y_test, svm_pred)
print("SVM (linear, C=1): \n", confusion_svm)

SVM (linear, C=1): 
 [[402   5]
 [  5  38]]


## Logistic Regression Classifier

In [29]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
confusion_lr = confusion_matrix(y_test, lr_pred)
print("Logistic Regression: \n", confusion_lr)

Logistic Regression: 
 [[401   6]
 [  8  35]]


## Decision Tree Classifier

In [31]:
from sklearn.tree import  DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
confusion_dt = confusion_matrix(y_test, dt_pred)
print("Decision tree classifier (max_depth = 2): \n", confusion_dt)

Decision tree classifier (max_depth = 2): 
 [[400   7]
 [ 17  26]]


## Classification Report

In [32]:
from sklearn.metrics import classification_report
print("Classification report: \n", classification_report(y_test, dt_pred))

Classification report: 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       407
           1       0.79      0.60      0.68        43

    accuracy                           0.95       450
   macro avg       0.87      0.79      0.83       450
weighted avg       0.94      0.95      0.94       450



In [37]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

classifier = make_pipeline(StandardScaler(), LinearSVC(random_state=random_state))
classifier.fit(X_train, y_train)

ImportError: cannot import name 'available_if' from 'sklearn.utils.metaestimators' (C:\Users\SYED\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\metaestimators.py)

In [38]:
from sklearn.metrics import PrecisionRecallDisplay
display = PrecisionRecallDisplay.from_estimator(estimator, X, y)
_ = display.ax_.set_title("2-class Precision-Recall curve")

AttributeError: type object 'PrecisionRecallDisplay' has no attribute 'from_estimator'