# Classification Model Evaluation

## Import the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Import the dataset

In [2]:
dataset = pd.read_csv("data/Data.csv")
X = dataset.iloc[:, 1:-1]
y = dataset.iloc[:,-1]

In [3]:
dataset.head(5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
X.head(5)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [5]:
y.head(5)

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

## Encode Categorical Variable

In [6]:
from sklearn.preprocessing import LabelEncoder

#We have to encode the y into 0 and 1
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [7]:
print(y[:10])

[0 0 0 0 0 1 0 0 0 0]


## Splitting the dataset into the Training Set and Test Set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
print(len(X_train), len(X_test), len(y_train), len(y_test))

546 137 546 137


## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

## Training the Classification model on the Training set

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

logreg_classifier = LogisticRegression(random_state=0)
logreg_classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=0)

### K-Nearest Neigbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier()

### Support Vector Machines

In [13]:
from sklearn.svm import SVC

svc_lin_classifier = SVC(kernel='linear', random_state=0)
svc_lin_classifier.fit(X_train_scaled, y_train)

SVC(kernel='linear', random_state=0)

### Kernel Support Vector

In [14]:
from sklearn.svm import SVC

svc_rbf_classifier = SVC(kernel='rbf', random_state=0)
svc_rbf_classifier.fit(X_train_scaled, y_train)

SVC(random_state=0)

### Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train_scaled, y_train)

GaussianNB()

### Decision Tree Classification

In [16]:
from sklearn.tree import DecisionTreeClassifier

dtree_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtree_classifier.fit(X_train_scaled, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

### Random Forest Classification

In [17]:
from sklearn.ensemble import RandomForestClassifier

rforest_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rforest_classifier.fit(X_train_scaled, y_train)

RandomForestClassifier(criterion='entropy', random_state=0)

## Evaluating Classification Model

### Calculating Confusion Matrix and Model Accuracy

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred_logreg = logreg_classifier.predict(X_test_scaled)
y_pred_knn = knn_classifier.predict(X_test_scaled)
y_pred_svc_lin = svc_lin_classifier.predict(X_test_scaled)
y_pred_svc_rbf = svc_rbf_classifier.predict(X_test_scaled)
y_pred_gnb = gnb_classifier.predict(X_test_scaled)
y_pred_dtree = dtree_classifier.predict(X_test_scaled)
y_pred_rforest = rforest_classifier.predict(X_test_scaled)


print("LOGISTIC REGRESSION")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_logreg))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_logreg)}")
print("=============================================================")

print("K-NEAREST NEIGHBORS")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_knn))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_knn)}")
print("=============================================================")

print("SVC (Linear Kernel)")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_svc_lin))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_svc_lin)}")
print("=============================================================")

print("SVC (RBF Kernel)")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_svc_rbf))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_svc_rbf)}")
print("=============================================================")

print("Naive Bayes")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_gnb))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_gnb)}")
print("=============================================================")

print("Decision Tree Classification")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_dtree))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_dtree)}")
print("=============================================================")

print("Random Forest Classification")
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred_rforest))
print()
print(f"Model Accuracy : {accuracy_score(y_test, y_pred_rforest)}")
print("=============================================================")

LOGISTIC REGRESSION
Confusion Matrix : 
 [[84  3]
 [ 3 47]]

Model Accuracy : 0.9562043795620438
K-NEAREST NEIGHBORS
Confusion Matrix : 
 [[83  4]
 [ 2 48]]

Model Accuracy : 0.9562043795620438
SVC (Linear Kernel)
Confusion Matrix : 
 [[83  4]
 [ 2 48]]

Model Accuracy : 0.9562043795620438
SVC (RBF Kernel)
Confusion Matrix : 
 [[82  5]
 [ 1 49]]

Model Accuracy : 0.9562043795620438
Naive Bayes
Confusion Matrix : 
 [[80  7]
 [ 0 50]]

Model Accuracy : 0.948905109489051
Decision Tree Classification
Confusion Matrix : 
 [[84  3]
 [ 3 47]]

Model Accuracy : 0.9562043795620438
Random Forest Classification
Confusion Matrix : 
 [[84  3]
 [ 1 49]]

Model Accuracy : 0.9708029197080292


In [19]:
from sklearn.metrics import f1_score, roc_auc_score

pd.set_option("display.precision", 5)

score_list = [["Logistic Regression", accuracy_score(y_test, y_pred_logreg), f1_score(y_test, y_pred_logreg), roc_auc_score(y_test, y_pred_logreg)],
              ["KNN", accuracy_score(y_test, y_pred_knn), f1_score(y_test, y_pred_knn), roc_auc_score(y_test, y_pred_knn)],
              ["SVC (Linear)", accuracy_score(y_test, y_pred_svc_lin), f1_score(y_test, y_pred_svc_lin), roc_auc_score(y_test, y_pred_svc_lin)],
              ["SVC (RBF)", accuracy_score(y_test, y_pred_svc_rbf), f1_score(y_test, y_pred_svc_rbf), roc_auc_score(y_test, y_pred_svc_rbf)],
              ["Naive Bayes", accuracy_score(y_test, y_pred_gnb), f1_score(y_test, y_pred_gnb), roc_auc_score(y_test, y_pred_gnb)],
              ["Decision Tree", accuracy_score(y_test, y_pred_dtree), f1_score(y_test, y_pred_dtree), roc_auc_score(y_test, y_pred_dtree)],
              ["Random Forest", accuracy_score(y_test, y_pred_rforest), f1_score(y_test, y_pred_rforest), roc_auc_score(y_test, y_pred_rforest)]]

df_model_score = pd.DataFrame(score_list, columns=["Model Name","Accuracy Score","F1 Score", "ROC AUC Score"])

df_model_score.sort_values(by=["Accuracy Score"], ascending=False, ignore_index=True, inplace=True)

df_model_score

Unnamed: 0,Model Name,Accuracy Score,F1 Score,ROC AUC Score
0,Random Forest,0.9708,0.96078,0.97276
1,Logistic Regression,0.9562,0.94,0.95276
2,KNN,0.9562,0.94118,0.95701
3,SVC (Linear),0.9562,0.94118,0.95701
4,SVC (RBF),0.9562,0.94231,0.96126
5,Decision Tree,0.9562,0.94,0.95276
6,Naive Bayes,0.94891,0.93458,0.95977
