### Original data set is read and missing values records are dropped

In [112]:
import pandas as pd
import numpy as np

data = pd.read_csv(r'data/breast-cancer-wisconsin.data.txt')
data_withnan = data.replace(to_replace='?', value=np.nan)

print("Data with missing values replaced with NAN shape=", data_withnan.shape)

cleaned_data = data_withnan.dropna(axis='index')

print("Data with missing values dropped shape=", cleaned_data.shape)

Data with missing values replaced with NAN shape= (698, 11)
Data with missing values dropped shape= (682, 11)


### Data set is split to test and training and saved as csv files

In [113]:
from sklearn.model_selection import train_test_split

test_size = 244/(455+244)
train_data, test_data = train_test_split(cleaned_data, test_size=test_size)

print(test_data.shape)
print(train_data.shape)

(239, 11)
(443, 11)


In [114]:
train_data.to_csv(r'data/breast-cancer-wisconsin-nullDropped_train.csv', index = None, header=False)
test_data.to_csv(r'data/breast-cancer-wisconsin-nullDropped_test.csv', index = None, header=False)

### Importing C4.5 implementation library and load above missing values dropped data set

In [115]:
from implementation import loadCSV, growDecisionTreeFrom, classify, plot

trainingData = loadCSV('data/breast-cancer-wisconsin-nullDropped_train.csv')

### Grow the decision tree from the training data and plot it

In [116]:
decisionTree = growDecisionTreeFrom(trainingData)
plot(decisionTree)

Column 2: x >= 3?
yes -> Column 3: x >= 3?
		yes -> Column 6: x >= 9?
				yes -> {4: 81}
				no  -> Column 2: x >= 5?
						yes -> Column 4: x >= 2?
								yes -> {4: 34}
								no  -> Column 0: x >= 1108370?
										yes -> {4: 5}
										no  -> {2: 2}
						no  -> Column 0: x >= 1213375?
								yes -> {2: 5}
								no  -> Column 4: x >= 4?
										yes -> Column 5: x >= 6?
												yes -> Column 0: x >= 1047630?
														yes -> {4: 2}
														no  -> {2: 1}
												no  -> {4: 7}
										no  -> Column 0: x >= 1041801?
												yes -> Column 6: x >= 3?
														yes -> {4: 4}
														no  -> Column 3: x >= 4?
																yes -> {4: 1}
																no  -> {2: 3}
												no  -> {2: 2}
		no  -> Column 0: x >= 1133041?
				yes -> {2: 15}
				no  -> Column 1: x >= 5?
						yes -> {4: 3}
						no  -> {2: 1}
no  -> Column 6: x >= 5?
		yes -> Column 1: x >= 4?
				yes -> Column 4: x >= 10?
						yes -> {2: 1}
						no  -> {4: 4}
				no  -> {2: 

### Collect the predictions from the decision tree for each of the test data features split above

In [117]:
predictions = []

for index, row in test_data.iloc[:,:-1].iterrows():
    a = classify(list(row), decisionTree, dataMissing=False)
    
    listOfItems = a.items()
    
    for item  in listOfItems:
        predictions.append(item[0])
        break

In [118]:
print(len(predictions))
print(predictions)

239
[2, 4, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 4]


### Extract actual class lables from test data

In [119]:
actual_class = test_data.iloc[:,-1]
actual_class.shape

(239,)

### Classification report is generated to cross check the precision, recall, f1 score and support of predictions

In [120]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

print(classification_report(actual_class, predictions, target_names=['benign','malignant']))

              precision    recall  f1-score   support

      benign       0.89      0.99      0.94       141
   malignant       0.98      0.83      0.90        98

   micro avg       0.92      0.92      0.92       239
   macro avg       0.93      0.91      0.92       239
weighted avg       0.93      0.92      0.92       239



### Confusion matrix and accuracy of the model is calculated

In [121]:
tn, fp, fn, tp = confusion_matrix(actual_class, predictions).ravel()
print("Confusion matrix = True positives:", tp, ", true negatives:", tn, ", false positives:", fp, ", false negatives:", fn)

print("Precision of the model =", accuracy_score(actual_class, predictions, normalize=True))

Confusion matrix = True positives: 81 , true negatives: 139 , false positives: 2 , false negatives: 17
Precision of the model = 0.9205020920502092


### Replacing the missing values in original data with 1 and re-run


In [155]:
data_withOne = data.replace(to_replace='?', value='1')
data_withOne.shape

(698, 11)

In [156]:
test_size = 244/(455+244)
train_data, test_data = train_test_split(data_withOne, test_size=test_size)

print(test_data.shape)
print(train_data.shape)

(244, 11)
(454, 11)


In [157]:
train_data.to_csv(r'data/breast-cancer-wisconsin-null_Imputed_train.csv', index = None, header=False)
test_data.to_csv(r'data/breast-cancer-wisconsin-null_Imputed_test.csv', index = None, header=False)

In [158]:
trainingData = loadCSV('data/breast-cancer-wisconsin-null_Imputed_train.csv')
decisionTree = growDecisionTreeFrom(trainingData)
plot(decisionTree)

Column 3: x >= 3?
yes -> Column 6: x >= 2?
		yes -> Column 4: x >= 6?
				yes -> {4: 64}
				no  -> Column 1: x >= 7?
						yes -> Column 4: x >= 5?
								yes -> Column 0: x >= 1214966?
										yes -> {4: 4}
										no  -> Column 0: x >= 1213375?
												yes -> {2: 1}
												no  -> {4: 1}
								no  -> {4: 49}
						no  -> Column 7: x >= 7?
								yes -> {4: 9}
								no  -> Column 8: x >= 3?
										yes -> Column 0: x >= 1065726?
												yes -> {4: 10}
												no  -> Column 0: x >= 846832?
														yes -> {2: 3}
														no  -> Column 0: x >= 616240?
																yes -> Column 0: x >= 695091?
																		yes -> {4: 1}
																		no  -> {2: 1}
																no  -> {4: 1}
										no  -> {2: 4}
		no  -> Column 1: x >= 7?
				yes -> {4: 9}
				no  -> Column 4: x >= 3?
						yes -> Column 0: x >= 1201936?
								yes -> {4: 3}
								no  -> Column 1: x >= 4?
										yes -> {2: 2}
										no  -> {4: 1}
						no  -> {2: 21}
no  -

In [159]:
predictions = []

for index, row in test_data.iloc[:,:-1].iterrows():
    a = classify(list(row), decisionTree, dataMissing=False)
    
    listOfItems = a.items()
    
    for item  in listOfItems:
        predictions.append(item[0])
        break

In [160]:
print(len(predictions))
print(predictions)

244
[4, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 4, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 4, 4, 2, 2, 2]


In [161]:
actual_class = test_data.iloc[:,-1]
actual_class.shape

(244,)

In [162]:
print(classification_report(actual_class, predictions, target_names=['benign','malignant']))

              precision    recall  f1-score   support

      benign       0.88      0.97      0.92       160
   malignant       0.94      0.74      0.83        84

   micro avg       0.89      0.89      0.89       244
   macro avg       0.91      0.86      0.87       244
weighted avg       0.90      0.89      0.89       244



In [163]:
tn, fp, fn, tp = confusion_matrix(actual_class, predictions).ravel()
print("Confusion matrix = True positives:", tp, ", true negatives:", tn, ", false positives:", fp, ", false negatives:", fn)

print("Precision of the model =", accuracy_score(actual_class, predictions, normalize=True))

Confusion matrix = True positives: 62 , true negatives: 156 , false positives: 4 , false negatives: 22
Precision of the model = 0.8934426229508197
