# Model evaluation of decision tree classification

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import os.path

## Loading the dataset

In [2]:
# Directory that holds the raw CSV files. Set the RAW_DATA_DIR environment
# variable to run this notebook on another machine; when it is not set, the
# original local path is used so existing behaviour is unchanged.
raw_data_file_path = os.environ.get(
    'RAW_DATA_DIR',
    '/Users/vishalsaxena/Documents/DS_Workspace/udemy_ml_algo_python/data/raw/',
)

# Read the CSV and discard the 'Sample code number' column so it is not used
# as a feature. drop() returns a new frame here instead of mutating in place.
dataset = pd.read_csv(os.path.join(raw_data_file_path, 'breast-cancer.csv')).drop(
    columns=['Sample code number']
)
dataset

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
678,3,1,1,1,3,2,1,1,1,2
679,2,1,1,1,2,1,1,1,1,2
680,5,10,10,3,7,3,8,10,2,4
681,4,8,6,4,3,4,10,6,1,4


In [3]:
# Feature matrix: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].values
X

array([[ 5,  1,  1, ...,  3,  1,  1],
       [ 5,  4,  4, ...,  3,  2,  1],
       [ 3,  1,  1, ...,  3,  1,  1],
       ...,
       [ 5, 10, 10, ...,  8, 10,  2],
       [ 4,  8,  6, ..., 10,  6,  1],
       [ 4,  8,  8, ..., 10,  4,  1]], shape=(683, 9))

In [4]:
# Target vector: the last column of the frame ('Class' in the table shown above).
y = dataset.iloc[:, -1].values

## Splitting the dataset into training and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set. random_state=0 pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
X_train, X_test

(array([[ 7,  6, 10, ...,  9, 10,  2],
        [ 8,  3,  8, ...,  8,  9,  8],
        [ 8, 10, 10, ..., 10, 10,  1],
        ...,
        [ 1,  1,  1, ...,  1,  1,  1],
        [ 3,  1,  1, ...,  2,  1,  1],
        [10,  9,  7, ...,  7,  7,  1]], shape=(512, 9)),
 array([[1, 1, 1, ..., 1, 1, 1],
        [3, 1, 1, ..., 2, 1, 1],
        [5, 5, 5, ..., 4, 3, 1],
        ...,
        [5, 1, 1, ..., 3, 1, 2],
        [2, 2, 2, ..., 7, 1, 1],
        [2, 1, 1, ..., 3, 1, 1]], shape=(171, 9)))

In [6]:
# Show the label arrays produced by the split (test labels first, then training labels).
y_test, y_train

(array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2,
        4, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2,
        2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2,
        2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2,
        4, 2, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 2,
        4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 4, 2, 2,
        4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 4, 4, 2, 2,
        2, 2, 4, 4, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2]),
 array([4, 4, 4, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 2, 2,
        2, 4, 4, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 4, 2, 4,
        2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2,
        4, 2, 4, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2,
        2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2

## Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then standardise those same
# features with the fitted statistics. fit() followed by transform() is what
# fit_transform() does in a single call.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)

In [8]:
# Scale the test features with the scaler already fitted on the training data.
# transform() is used (not fit_transform()) so nothing is re-fitted on test data.
# NOTE: this overwrites X_test with its scaled version, so running this cell a
# second time without re-running the split would scale the data twice.
X_test = sc.transform(X_test)

## Training the model

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that uses the entropy split criterion; random_state=0 keeps
# the fitted tree reproducible between runs.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
# Train on the scaled training features and their labels.
classifier.fit(X_train, y_train)

## Predicting the test set results

In [10]:
# Predict a class label for every row of the scaled test features.
y_pred = classifier.predict(X_test)

In [11]:
# Side-by-side comparison, one row per test sample:
# column 0 is the predicted label, column 1 is the actual label.
np.column_stack((y_pred, y_test))

array([[2, 2],
       [2, 2],
       [4, 4],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [4, 4],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [4, 4],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [4, 4],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [4, 4],
       [2, 2],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [4, 4],
       [4, 2],
       [2, 2],
       [4, 4],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2, 2],
       [2,

## Calculating the confusion matrix and accuracy score

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix for the test set: rows are the true labels,
# columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[104,   3],
       [  4,  60]])

In [13]:
# Fraction of test samples predicted correctly, rounded to two decimal places.
np.round(accuracy_score(y_true=y_test, y_pred=y_pred), decimals=2)

np.float64(0.96)