We are using the Breast Cancer Wisconsin (Diagnostic) dataset. First, we must import the data and format it into training and testing datasets. The structure of our imported data (numpy arrays of individual entries) is as follows [x0 = ID number, x1 = diagnosis (label attribute), x2-x31 = 30 real-valued features: the mean (x2-x11), standard error (x12-x21) and worst/largest value (x22-x31) of ten measurements, each group in the order radius, texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry, fractal dimension. For example, x2 = mean radius, x12 = radius standard error, x22 = worst radius].

In [None]:
#Packages
import numpy as np
import attributeMethods as AM
import matplotlib.pyplot as plt

import time
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#CONSTANTS
N = 569                                                         #Breast cancer dataset contains 569 entries
TEST_PROP = 0.2                                                 #Proportion of data split into test set
TRAINING_QUANT = int(np.round((1-TEST_PROP)*N))                 #Number of training samples
TEST_QUANT = N - TRAINING_QUANT                                 #Number of test samples (the remainder, so train + test always equals N)
D = 9                                                           #Number of attributes in input vector
                                                                #NOTE(review): the feature matrix built from ATTRIBUTES has 30 columns, not 9 - confirm how D is used
#Attribute names, in the column order of wdbc.data: ID, diagnosis, then the ten
#means, the ten standard errors and the ten worst (largest) values. Within each
#group the measurements appear in the same order (radius ... fractal dimension).
ATTRIBUTES = ["ID","Diagnosis",
              #Means (x2-x11)
              "radiusMean","textureMean","perimeterMean","areaMean","smoothMean",
              "compactMean","concavityMean","conpointMean","symmetryMean","fractalMean",
              #Standard errors (x12-x21)
              "radiusSE","textureSE","perimeterSE","areaSE","smoothSE",
              "compactSE","concavitySE","conpointSE","symmetrySE","fractalSE",
              #Worst / largest values (x22-x31)
              "radiusWorst","textureWorst","perimeterWorst","areaWorst","smoothWorst",
              "compactWorst","concavityWorst","conpointWorst","symmetryWorst","fractalWorst"]
np.random.seed(39217531)                                        #Set seed to student ID

#Read the cancer dataset CSV into a structured np array with one named field per attribute
#4-byte floating point for real numbers: around 6-7 decimal places, sufficient for our demonstration
#Field types: 4-byte int for the ID, 1-byte string for the diagnosis,
#then a 4-byte float for every remaining attribute
data = np.loadtxt(
    "wdbc.data",
    delimiter=",",
    dtype={
        "names": ATTRIBUTES,
        "formats": ("i4", "S1") + ("f4",) * (len(ATTRIBUTES) - 2),
    },
)

#Shuffle the records, then cut them into an 80-20 train/test split
randomise = np.random.permutation(N)                            #Random ordering of the N row indices
data = data[randomise]                                          #Reorder the records with that permutation
trainData, testData = data[:TRAINING_QUANT], data[TRAINING_QUANT:]
del data                                                        #The full shuffled array is no longer needed

#Binary labels from the diagnosis field: malignant (b'M') -> 1, everything else (benign) -> 0
Y_train, Y_test = (
    (subset['Diagnosis'] == b'M').astype(np.int8)
    for subset in (trainData, testData)
)

#Input vectors: every attribute except ID and Diagnosis, one column per attribute
feature_names = [name for name in ATTRIBUTES if name != "ID" and name != "Diagnosis"]
X_train, X_test = (
    np.column_stack([subset[name] for name in feature_names])
    for subset in (trainData, testData)
)

del trainData, testData

#Report size and class balance of the training split
print("Length of training data:", len(X_train))
m_train = int((Y_train == 1).sum())                             #Malignant count
b_train = int((Y_train == 0).sum())                             #Benign count
print(" - Of which malignant:", m_train)
print(" - Of which benign:", b_train)
if b_train > 0:
    print("M/B ratio in training data: %.2f" % (m_train / b_train))
else:
    #No benign samples: report an infinite ratio instead of dividing by zero
    print("M/B ratio in training data: %.2f" % float('inf'))

#Same report for the testing split
print("\nLength of testing data:", len(X_test))
m_test = int((Y_test == 1).sum())                               #Malignant count
b_test = int((Y_test == 0).sum())                               #Benign count
print(" - Of which malignant:", m_test)
print(" - Of which benign:", b_test)

if b_test > 0:
    print("M/B ratio in testing data: %.2f" % (m_test / b_test))
else:
    print("M/B ratio in testing data: %.2f" % float('inf'))

Length of training data: 455
 - Of which malignant: 167
 - Of which benign: 288
M/B ratio in training data: 0.58

Length of testing data: 114
 - Of which malignant: 45
 - Of which benign: 69
M/B ratio in testing data: 0.65


Decision Tree (DT): CART algorithm, Gini Impurity, stop at MAX_DEPTH

In [None]:
MAX_DEPTH = 10              #Maximum depth of decision tree (tree growth stops at this depth)

#Suggest possible thresholds using mean, median, some intervals SD away from mean for category
def possibleThreshold(xVals, *, low=-4, high=4, step=0.2):
    """Return candidate split thresholds for the values of one attribute.

    The first candidate is the median of xVals. The remaining candidates are
    mean + k * sd, where k runs over np.arange(low, high, step). That range is
    half-open, so with the defaults k goes from -4 up to (about) 3.8 and the
    returned list holds 41 values.

    Args:
        xVals: array-like of numeric values for a single attribute.
        low: smallest standard-deviation multiplier (inclusive).
        high: upper bound of the multipliers (exclusive).
        step: spacing between consecutive multipliers.

    Returns:
        A list of thresholds: [median, mean + k0*sd, mean + k1*sd, ...].
        Candidates are not restricted to the observed range of xVals.
    """
    mean = np.mean(xVals)
    sd = np.std(xVals)
    #.tolist() yields plain Python numbers, so each threshold below is
    #computed as a separate scalar operation rather than one array operation
    multipliers = np.arange(low, high, step).tolist()
    return [np.median(xVals)] + [mean + k * sd for k in multipliers]

#Demo: print the candidate thresholds for the first feature column of the training data
print(possibleThreshold(X_train[:, 0]))

[np.float32(13.24), np.float32(0.123607635), np.float32(0.81962776), np.float32(1.5156479), np.float32(2.211666), np.float32(2.9076862), np.float32(3.6037064), np.float32(4.2997265), np.float32(4.9957466), np.float32(5.691765), np.float32(6.387785), np.float32(7.083805), np.float32(7.779825), np.float32(8.475844), np.float32(9.1718645), np.float32(9.867884), np.float32(10.563904), np.float32(11.259924), np.float32(11.955943), np.float32(12.651963), np.float32(13.347982), np.float32(14.044003), np.float32(14.740023), np.float32(15.436042), np.float32(16.132061), np.float32(16.828081), np.float32(17.524101), np.float32(18.220121), np.float32(18.916142), np.float32(19.61216), np.float32(20.30818), np.float32(21.0042), np.float32(21.70022), np.float32(22.39624), np.float32(23.092258), np.float32(23.788279), np.float32(24.484299), np.float32(25.180319), np.float32(25.876339), np.float32(26.572357), np.float32(27.268377)]
