### Original data set is read and records with missing values are dropped

In [302]:
import pandas as pd
import numpy as np

data = pd.read_csv(r'data/breast-cancer-wisconsin.data.txt')

print("Original data with missing values shape=", data.shape)

Original data with missing values shape= (698, 11)


### Data set is split to test and training and saved as csv files

In [303]:
from sklearn.model_selection import train_test_split

test_size = 244/(455+244)
train_data, test_data = train_test_split(data, test_size=test_size)

print(test_data.shape)
print(train_data.shape)

train_data = train_data.replace(to_replace='?', value=np.nan)
train_data = train_data.dropna(axis='index')

(244, 11)
(454, 11)


In [304]:
train_data.to_csv(r'data/breast-cancer-wisconsin-nullDropped_train.csv', index = None, header=False)
test_data.to_csv(r'data/breast-cancer-wisconsin-nullDropped_test.csv', index = None, header=False)

### Importing C4.5 implementation library and load above preprocessed data set

In [305]:
from implementation import loadCSV, growDecisionTreeFrom, classify, plot, entropy, prune

trainingData = loadCSV('data/breast-cancer-wisconsin-nullDropped_train.csv')

### Grow the decision tree from the training data using entropy as the evaluation function and plot tree

In [306]:
decisionTree = growDecisionTreeFrom(trainingData, evaluationFunction=entropy)
plot(decisionTree)

Column 2: x >= 3?
yes -> Column 2: x >= 4?
		yes -> Column 0: x >= 730881?
				yes -> {4: 104}
				no  -> Column 0: x >= 721482?
						yes -> {2: 1}
						no  -> Column 4: x >= 2?
								yes -> {4: 27}
								no  -> Column 1: x >= 8?
										yes -> {4: 3}
										no  -> {2: 1}
		no  -> Column 6: x >= 3?
				yes -> Column 8: x >= 5?
						yes -> Column 3: x >= 5?
								yes -> {4: 5}
								no  -> Column 0: x >= 1239232?
										yes -> {2: 1}
										no  -> Column 0: x >= 1118039?
												yes -> {4: 2}
												no  -> {2: 1}
						no  -> {4: 10}
				no  -> Column 5: x >= 5?
						yes -> {4: 1}
						no  -> {2: 17}
no  -> Column 1: x >= 7?
		yes -> {4: 5}
		no  -> Column 7: x >= 7?
				yes -> {4: 1}
				no  -> {2: 264}


### Collect the predictions from the decision tree for each of the test data features split above

In [307]:
predictions = []

for index, row in test_data.iloc[:,:-1].iterrows():
    a = classify(list(row), decisionTree, dataMissing=True)
    
    listOfItems = a.items()
    
    for item  in listOfItems:
        predictions.append(item[0])
        break

### Extract actual class lables from test data

In [308]:
print(len(predictions))
print(predictions)

244
[2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 4, 4, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 4, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 4, 4, 2, 2, 4, 4, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 4, 2, 2, 4, 4, 2, 2, 4]


In [309]:
actual_class = test_data.iloc[:,-1]
actual_class.shape

(244,)

### Classification report is generated to cross check the precision, recall, f1 score and support of predictions

In [310]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

print(classification_report(actual_class, predictions, target_names=['benign','malignant']))

              precision    recall  f1-score   support

      benign       0.95      0.90      0.92       162
   malignant       0.82      0.90      0.86        82

   micro avg       0.90      0.90      0.90       244
   macro avg       0.89      0.90      0.89       244
weighted avg       0.91      0.90      0.90       244



### Confusion matrix and accuracy of the model is calculated

In [311]:
tn, fp, fn, tp = confusion_matrix(actual_class, predictions).ravel()
print("Confusion matrix = True positives:", tp, ", true negatives:", tn, ", false positives:", fp, ", false negatives:", fn)

print("Accuracy sroce of the model =", accuracy_score(actual_class, predictions, normalize=True))

Confusion matrix = True positives: 74 , true negatives: 146 , false positives: 16 , false negatives: 8
Accuracy sroce of the model = 0.9016393442622951


### Same process is followed by introducing tree pruning with minimal gain 0.2

In [312]:
prune(decisionTree, 0.2, evaluationFunction=entropy)
plot(decisionTree)

Column 2: x >= 3?
yes -> Column 2: x >= 4?
		yes -> Column 0: x >= 730881?
				yes -> {4: 104}
				no  -> Column 0: x >= 721482?
						yes -> {2: 1}
						no  -> Column 4: x >= 2?
								yes -> {4: 27}
								no  -> Column 1: x >= 8?
										yes -> {4: 3}
										no  -> {2: 1}
		no  -> Column 6: x >= 3?
				yes -> Column 8: x >= 5?
						yes -> Column 3: x >= 5?
								yes -> {4: 5}
								no  -> Column 0: x >= 1239232?
										yes -> {2: 1}
										no  -> Column 0: x >= 1118039?
												yes -> {4: 2}
												no  -> {2: 1}
						no  -> {4: 10}
				no  -> Column 5: x >= 5?
						yes -> {4: 1}
						no  -> {2: 17}
no  -> {4: 6, 2: 264}


In [313]:
predictions = []

for index, row in test_data.iloc[:,:-1].iterrows():
    a = classify(list(row), decisionTree, dataMissing=True)
    
    listOfItems = a.items()
    
    for item  in listOfItems:
        predictions.append(item[0])
        break

In [314]:
print(len(predictions))
print(predictions)

244
[4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [315]:
actual_class = test_data.iloc[:,-1]
actual_class.shape

(244,)

In [316]:
print(classification_report(actual_class, predictions, target_names=['benign','malignant']))

              precision    recall  f1-score   support

      benign       0.64      0.04      0.08       162
   malignant       0.33      0.95      0.50        82

   micro avg       0.35      0.35      0.35       244
   macro avg       0.49      0.50      0.29       244
weighted avg       0.54      0.35      0.22       244



In [317]:
tn, fp, fn, tp = confusion_matrix(actual_class, predictions).ravel()
print("Confusion matrix = True positives:", tp, ", true negatives:", tn, ", false positives:", fp, ", false negatives:", fn)

print("Accuracy score of the model =", accuracy_score(actual_class, predictions, normalize=True))

Confusion matrix = True positives: 78 , true negatives: 7 , false positives: 155 , false negatives: 4
Accuracy score of the model = 0.3483606557377049
