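**Setup**

- Import the required libraries
- Load the training and validation (dev) datasets (the test set is loaded later, in the testing section)
- Encode the class label `Diagnosis` (`M` → 1, anything else → 0)
- Split each dataset into features `X` and labels `Y`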

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz
import scipy.stats as stats

# Load the training and validation (dev) splits.
df = pd.read_csv('wdbc_train.csv')
dev = pd.read_csv('wdbc_dev.csv')

# Encode the label in place: 'M' becomes 1, every other value becomes 0.
df['Diagnosis'] = df['Diagnosis'].eq('M').astype(int)
dev['Diagnosis'] = dev['Diagnosis'].eq('M').astype(int)

# Every column except the last one is used as a feature.
# NOTE(review): this assumes 'Diagnosis' is the last column of the CSV - confirm.
names = list(df.columns[:-1])

# Feature matrices and (n, 1) label columns as NumPy arrays.
X = df[names].to_numpy()
Y = df['Diagnosis'].to_numpy().reshape(-1, 1)

devX = dev[names].to_numpy()
devY = dev['Diagnosis'].to_numpy().reshape(-1, 1)


**Defining the decision-tree node class and the decision-tree classifier (trained below on the binned data)**

In [2]:
class DTNode:
    """A single node of the decision tree.

    Internal nodes describe a split (``attribute``, ``threshold``, ``gain``)
    and hold the two sub-trees; leaf nodes only carry ``value``.
    """

    def __init__(self, attribute=None, threshold=None, leftsub=None, rightsub=None, gain=None, value=None):
        # Split description: feature index and the cut point it is compared to.
        self.attribute, self.threshold = attribute, threshold
        # Child sub-trees of this node.
        self.leftsub, self.rightsub = leftsub, rightsub
        # Impurity gain recorded for the split stored in this node.
        self.gain = gain
        # Predicted label; left as None on internal nodes.
        self.value = value

class DT():
    """Binary decision-tree classifier with a chi-square pre-pruning test.

    Candidate splits have the form ``feature <= threshold``. A candidate is
    eligible only when the chi-square p-value of the split is below
    ``chithres``; among eligible candidates the one with the largest impurity
    gain is chosen.

    Parameters
    ----------
    max_depth : int
        A node may split while its depth (root = 0) is ``<= max_depth``, so
        leaves can sit as deep as ``max_depth + 1``.
    mode : {"entropy", "gini"}
        Impurity measure used to score candidate splits.
    chithres : float
        Significance level applied to the chi-square test of each candidate.

    Raises
    ------
    ValueError
        If ``mode`` is not a supported criterion.
    """

    _MODES = ("entropy", "gini")

    def __init__(self, max_depth=30, mode="entropy", chithres=0.05):
        # Fail at construction time instead of deep inside fit(): an unknown
        # mode used to leave the gain unassigned while scoring the first split.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.max_depth = max_depth
        self.mode = mode
        self.chithres = chithres

    def splitdata(self, dataset, attribute, threshold):
        """Partition rows into ``(<= threshold, > threshold)`` on one column."""
        goes_left = dataset[:, attribute] <= threshold
        leftset = dataset[goes_left]
        rightset = dataset[dataset[:, attribute] > threshold]
        return leftset, rightset

    def entropy(self, y):
        """Shannon entropy (base 2) of the labels in ``y``; 0 for empty input."""
        entropy = 0
        _, counts = np.unique(y, return_counts=True)
        for count in counts:
            pl = count / len(y)
            entropy += -pl * np.log2(pl)
        return entropy

    def gini(self, y):
        """Gini impurity of the labels in ``y``; 1 for empty input."""
        gini = 1
        _, counts = np.unique(y, return_counts=True)
        for count in counts:
            pg = count / len(y)
            gini -= pg ** 2
        return gini

    def infogain(self, parent, left, right):
        """Parent entropy minus the size-weighted entropy of the two children."""
        wleft = len(left) / len(parent)
        wright = len(right) / len(parent)
        weighted_en = (wleft * self.entropy(left)) + (wright * self.entropy(right))
        return self.entropy(parent) - weighted_en

    def ginigain(self, parent, left, right):
        """Parent Gini impurity minus the size-weighted impurity of the children."""
        wleft = len(left) / len(parent)
        wright = len(right) / len(parent)
        weighted_gini = (wleft * self.gini(left)) + (wright * self.gini(right))
        return self.gini(parent) - weighted_gini

    def _gain(self, parent, left, right):
        """Impurity gain of a split under the configured ``mode``."""
        if self.mode == "entropy":
            return self.infogain(parent, left, right)
        if self.mode == "gini":
            return self.ginigain(parent, left, right)
        # Only reachable when ``mode`` was reassigned after construction.
        raise ValueError(f"mode must be one of {self._MODES}, got {self.mode!r}")

    def chisquare(self, parent, left, right):
        """Pearson chi-square statistic of the (branch x class) count table.

        Expected counts assume each branch receives every class in the same
        proportion as the parent; cells with an expected count of 0 are skipped.
        """
        unclass = np.unique(parent)
        size = len(parent)
        obleft = [np.sum(left == cls) for cls in unclass]
        obright = [np.sum(right == cls) for cls in unclass]
        totcount = [np.sum(parent == cls) for cls in unclass]
        exleft = [(count * len(left)) / size for count in totcount]
        exright = [(count * len(right)) / size for count in totcount]
        chisq = 0
        for obsl, obsr, expl, expr in zip(obleft, obright, exleft, exright):
            chisq += ((obsl - expl) ** 2 / expl) if expl > 0 else 0
            chisq += ((obsr - expr) ** 2 / expr) if expr > 0 else 0

        return chisq

    def bestsplit(self, dataset, nattributes):
        """Search every feature/threshold pair for the best admissible split.

        Parameters
        ----------
        dataset : 2-D array whose last column holds the labels.
        nattributes : number of leading columns to consider as features.

        Returns
        -------
        dict
            Always contains ``gain``, ``attribute`` and ``threshold``; when a
            split was accepted it also contains ``leftset`` and ``rightset``.
            ``gain`` stays at -1 when no candidate passed the chi-square test.
        """
        splitresult = {'gain': -1, 'attribute': None, 'threshold': None}
        parent = dataset[:, -1]
        # Degrees of freedom of the (2 branches x k classes) table.
        dof = len(np.unique(parent)) - 1
        if dof < 1:
            # Pure (or empty) node: no candidate can pass the test.
            return splitresult
        for index in range(nattributes):
            for threshold in np.unique(dataset[:, index]):
                leftdset, rightdset = self.splitdata(dataset, index, threshold)
                # A split that leaves one side empty separates nothing.
                if not (len(leftdset) and len(rightdset)):
                    continue
                lefty, righty = leftdset[:, -1], rightdset[:, -1]
                gain = self._gain(parent, lefty, righty)
                chival = self.chisquare(parent, lefty, righty)
                # Survival function == 1 - cdf, without the cancellation error.
                pval = stats.chi2.sf(chival, df=dof)
                if pval < self.chithres and gain > splitresult["gain"]:
                    splitresult["threshold"] = threshold
                    splitresult["attribute"] = index
                    splitresult["leftset"] = leftdset
                    splitresult["rightset"] = rightdset
                    splitresult["gain"] = gain
        return splitresult

    def leafval(self, y):
        """Most frequent label in ``y``; ties go to the label seen first.

        Raises ``ValueError`` when ``y`` is empty.
        """
        # Single pass tally; dict order keeps the first-seen tie-break.
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def treeBuilding(self, dataset, currdepth=0):
        """Recursively grow the sub-tree for ``dataset`` (labels in last column).

        A node is split only while ``currdepth <= max_depth`` and a split with
        strictly positive gain exists; otherwise it becomes a majority-vote leaf.
        """
        Y = dataset[:, -1]
        nattributes = dataset.shape[1] - 1
        if currdepth <= self.max_depth:
            splitresult = self.bestsplit(dataset, nattributes)
            if splitresult["gain"] > 0:
                leftnode = self.treeBuilding(splitresult["leftset"], currdepth + 1)
                rightnode = self.treeBuilding(splitresult["rightset"], currdepth + 1)
                return DTNode(splitresult["attribute"], splitresult["threshold"], leftnode, rightnode, splitresult["gain"])
        leaf = self.leafval(Y)
        return DTNode(value=leaf)

    def fit(self, X, Y):
        """Build the tree from features ``X`` and labels ``Y`` and store it in ``self.root``."""
        # Labels are appended as the last column so rows stay aligned when split.
        dataset = np.concatenate((X, Y.reshape(-1, 1)), axis=1)
        self.root = self.treeBuilding(dataset)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        predictions = [self.prediction(x, self.root) for x in X]
        return np.array(predictions)

    def prediction(self, x, node):
        """Walk from ``node`` down to a leaf for the single sample ``x``."""
        if node.value is not None:
            return node.value
        attribute = x[node.attribute]
        if attribute <= node.threshold:
            return self.prediction(x, node.leftsub)
        return self.prediction(x, node.rightsub)

    def treeDiagram(self, node=None, dot=None):
        """Render the (sub-)tree into a ``graphviz.Digraph`` and return it.

        ``node`` defaults to the fitted root; ``dot`` is the graph being
        extended during recursion and is created on the first call.
        """
        if dot is None:
            dot = graphviz.Digraph(comment='Decision Tree')
        if node is None:
            node = self.root
        # id(node) gives every tree node a unique graph identifier.
        if node.value is not None:
            dot.node(str(id(node)), f"Class: {node.value}", shape='box')
        else:
            dot.node(str(id(node)), f"Feature {node.attribute} <= {node.threshold}")
            if node.leftsub:
                dot.edge(str(id(node)), str(id(node.leftsub)), label="True")
                self.treeDiagram(node.leftsub, dot)
            if node.rightsub:
                dot.edge(str(id(node)), str(id(node.rightsub)), label="False")
                self.treeDiagram(node.rightsub, dot)
        return dot

**Random split function**

In [3]:
def randomsplit(X, Y, randomstate=41, testsize=0.2):
    """Shuffle the rows of ``X``/``Y`` and split them into train and test parts.

    Parameters
    ----------
    X, Y : arrays with the same number of rows (first axis).
    randomstate : seed for the shuffle; the same seed gives the same split.
    testsize : fraction of rows, in ``[0, 1]``, placed in the test part.
        ``int(nsamples * testsize)`` rows go to test, the rest to train.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``testsize`` is outside ``[0, 1]`` or ``X`` and ``Y`` differ in length.
    """
    if not 0 <= testsize <= 1:
        raise ValueError(f"testsize must be within [0, 1], got {testsize!r}")
    nsamples = X.shape[0]
    if Y.shape[0] != nsamples:
        raise ValueError(
            f"X and Y must have the same number of rows, got {nsamples} and {Y.shape[0]}"
        )
    # A private legacy generator yields the same permutation as seeding the
    # global one, without resetting the caller's global NumPy random state.
    rng = np.random.RandomState(randomstate)
    shuffled = rng.permutation(np.arange(nsamples))
    ts = int(nsamples * testsize)
    test = shuffled[:ts]
    train = shuffled[ts:]
    X_train, X_test = X[train], X[test]
    y_train, y_test = Y[train], Y[test]
    return X_train, X_test, y_train, y_test

**Accuracy and Informativeness Metrics**

In [4]:
def accuracy(ytrue, ypred):
    """Share of positions where the flattened predictions match the flattened labels."""
    actual, guessed = ytrue.flatten(), ypred.flatten()
    hits = np.sum(actual == guessed)
    return hits / len(actual)

def precision(TP, FP):
    """TP / (TP + FP), or 0 when there are no positive predictions at all."""
    predicted_positive = TP + FP
    return TP / predicted_positive if predicted_positive > 0 else 0

def specificity(TN, FP):
    """TN / (TN + FP), or 0 when there are no actual negatives.

    The zero-denominator guard matches ``precision`` and ``npv`` so a data set
    without negatives no longer divides by zero.
    """
    actual_negative = TN + FP
    return TN / actual_negative if actual_negative > 0 else 0

def npv(TN, FN):
    """Negative predictive value TN / (TN + FN), or 0 without negative predictions."""
    predicted_negative = TN + FN
    return TN / predicted_negative if predicted_negative > 0 else 0

def confusionMat(ytrue, ypred):
    """Print a one-vs-rest confusion matrix per label and return summary metrics.

    For every distinct label in ``ytrue`` (ascending order) a 2x2 matrix
    ``[[TP, FN], [FP, TN]]`` is printed, treating that label as the positive
    class. The returned metrics describe the LAST label processed, i.e. the
    largest one (class 1 for 0/1 labels).

    Parameters
    ----------
    ytrue, ypred : arrays of labels with the same number of elements; both are
        flattened before comparison.

    Returns
    -------
    tuple
        ``(recall, specificity, precision, f1, fpr, fnr, npv)``. Any ratio
        whose denominator is 0 is reported as 0.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    ypred = ypred.flatten()
    ytrue = ytrue.flatten()
    if len(ytrue) != len(ypred):
        raise ValueError(
            f"ytrue and ypred must have the same length, got {len(ytrue)} and {len(ypred)}"
        )
    # Use the labels actually present instead of assuming they are 0..n-1.
    labels = np.unique(ytrue)
    if len(labels) == 0:
        raise ValueError("confusionMat needs at least one sample")

    def ratio(numerator, denominator):
        # Same zero-denominator convention as precision() and npv().
        return numerator / denominator if denominator > 0 else 0

    for label in labels:
        mpred = ypred == label
        mtrue = ytrue == label
        TP = np.sum(mtrue & mpred)
        TN = np.sum(~mtrue & ~mpred)
        FP = np.sum(~mtrue & mpred)
        FN = np.sum(mtrue & ~mpred)
        matrix = np.array([[TP, FN], [FP, TN]])
        print("Confusion Matrix:")
        print(matrix)

    # Metrics for the last label; its counts are still bound after the loop.
    s = ratio(TP, TP + FN)
    p = precision(TP, FP)
    specific = specificity(TN, FP)
    f = 2 * (p * s) / (p + s) if (p + s) > 0 else 0
    fp = ratio(FP, FP + TN)
    fn = ratio(FN, FN + TP)
    n = npv(TN, FN)
    return s, specific, p, f, fp, fn, n

In [5]:
# testsize=0 sends every shuffled row to the training partition, so Xtest and
# Ytest come back empty here.
Xtrain, Xtest, Ytrain, Ytest = randomsplit(
    X,
    Y,
    randomstate=42,
    testsize=0,
)

In [6]:
# Normalise the typed answer so a stray space or capital letter is accepted,
# and reject anything else here rather than failing in the middle of fit().
mode = input("gini or entropy").strip().lower()
if mode not in ("gini", "entropy"):
    raise ValueError(f"Unsupported split criterion {mode!r}; type 'gini' or 'entropy'.")
model = DT(7, mode)
model.fit(X, Y)

gini or entropy entropy


**Decision Tree Visualization**

In [7]:
dot = model.treeDiagram(model.root)
# view=False: only write the .gv source and the rendered file. Asking graphviz
# to open an external viewer fails on machines without a PDF handler (see the
# 'no "view" rule' error recorded further down in this notebook).
dot.render('decision_treeBin.gv', view=False)

'decision_treeBin.gv.pdf'

**Validation with Dev dataset**

In [8]:
# testsize=1 routes every (shuffled) dev row into the "test" partition, so the
# whole validation set is scored and the train partition is empty.
devXtrain, devXtest, devYtrain, devYtest = randomsplit(
    devX,
    devY,
    randomstate=42,
    testsize=1,
)

prediction = model.predict(devXtest)

In [9]:
# Headline accuracy first; confusionMat() prints its own matrices when called,
# so the header has to be emitted before the call.
print(
    f"Model's Accuracy: {accuracy(devYtest, prediction)}",
    f"Model's Confusion Matrix:",
    sep="\n",
)
recall, sp, p, f1, fpr, fnr, nvp = confusionMat(devYtest, prediction)
# One metric per output line.
print(
    f"Recall: {recall}",
    f"Specificity: {sp}",
    f"Precision: {p}",
    f"F1 Score: {f1}",
    f"FPR Score: {fpr}",
    f"FNR Score: {fnr}",
    f"NVP Score: {nvp}",
    sep="\n",
)

Model's Accuracy: 0.9649122807017544
Model's Confusion Matrix:
Confusion Matrix:
[[71  0]
 [ 4 39]]
Confusion Matrix:
[[39  4]
 [ 0 71]]
Recall: 0.9069767441860465
Specificity: 1.0
Precision: 1.0
F1 Score: 0.951219512195122
FPR Score: 0.0
FNR Score: 0.09302325581395349
NVP Score: 0.9466666666666667


Error: no "view" rule for type "application/pdf" passed its test case
       (for more information, add "--debug=1" on the command line)


**Testing with Binning**

In [10]:
# Load the held-out test split and encode its label like the other splits
# ('M' becomes 1, anything else 0).
test = pd.read_csv('wdbc_test.csv')
test['Diagnosis'] = test['Diagnosis'].eq('M').astype(int)
# NOTE(review): `names`, `Xtest` and `Ytest` are re-bound here, replacing the
# values assigned in earlier cells - confirm nothing later needs the old ones.
names = list(test.columns[:-1])
Xtest = test[names].to_numpy()
Ytest = test['Diagnosis'].to_numpy().reshape(-1, 1)

# testsize=1 places every (shuffled) test row in the evaluated partition.
tXtrain, tXtest, tYtrain, tYtest = randomsplit(
    Xtest,
    Ytest,
    randomstate=42,
    testsize=1,
)
py = model.predict(tXtest)
print(py)
# NOTE(review): the second header says "Balanced Accuracy", but the values
# printed after it are the metrics returned by confusionMat().
print(
    f"Model's Accuracy: {accuracy(tYtest, py)}",
    f"Model's Balanced Accuracy:",
    sep="\n",
)
recall, sp, p, f1, fpr, fnr, nvp = confusionMat(tYtest, py)
print(
    f"Recall: {recall}",
    f"Specificity: {sp}",
    f"Precision: {p}",
    f"F1 Score: {f1}",
    f"FPR Score: {fpr}",
    f"FNR Score: {fnr}",
    f"NVP Score: {nvp}",
    sep="\n",
)

[?1l>[1 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 1            [22;38H[m[m                           [2;1H                                                                                [3;1H                                                                                [4;1H                                                                                [5;1H                                                                                [6;1H                                                                                [7;1H                                                                                [8;1H                                                                                [9;1H                                                                                [10;1H                                                                                [11;1H                                                                                [12

In [11]:
# A second, shallower tree (max_depth=5, entropy) fitted on the shuffled
# training partition and scored on the same test rows.
mt = DT(5, mode='entropy')
mt.fit(Xtrain, Ytrain)

pi = mt.predict(tXtest)
# NOTE(review): the second header says "Balanced Accuracy", but the values
# printed after it are the metrics returned by confusionMat().
print(
    f"Model's Accuracy: {accuracy(tYtest, pi)}",
    f"Model's Balanced Accuracy:",
    sep="\n",
)
recall, sp, p, f1, fpr, fnr, nvp = confusionMat(tYtest, pi)
print(
    f"Recall: {recall}",
    f"Specificity: {sp}",
    f"Precision: {p}",
    f"F1 Score: {f1}",
    f"FPR Score: {fpr}",
    f"FNR Score: {fnr}",
    f"NVP Score: {nvp}",
    sep="\n",
)

Model's Accuracy: 0.9736842105263158
Model's Balanced Accuracy:
Confusion Matrix:
[[72  0]
 [ 3 39]]
Confusion Matrix:
[[39  3]
 [ 0 72]]
Recall: 0.9285714285714286
Specificity: 1.0
Precision: 1.0
F1 Score: 0.962962962962963
FPR Score: 0.0
FNR Score: 0.07142857142857142
NVP Score: 0.96


**Creating our own binned data**

In [12]:
# Start from the raw (un-binned) training file.
data = pd.read_csv('wdbc_train_raw.csv')

# The first 30 columns are the ones that get discretised.
fcolumns = data.columns[:30]
nbins = 6

# Quantile binning: labels=False stores the integer bin index, written to a
# new "<column>_bin" column next to the original values.
for col in fcolumns:
    data[col + '_bin'] = pd.qcut(data[col], nbins, labels=False)

print(data.head())

   Radius  Texture  Perimeter    Area  Smoothness  Compactness  Concavity  \
0   20.57    17.77     132.90  1326.0     0.08474      0.07864     0.0869   
1   19.69    21.25     130.00  1203.0     0.10960      0.15990     0.1974   
2   11.42    20.38      77.58   386.1     0.14250      0.28390     0.2414   
3   18.25    19.98     119.60  1040.0     0.09463      0.10900     0.1127   
4   13.00    21.82      87.50   519.8     0.12730      0.19320     0.1859   

   ConcavePoints  Symmetry  FractalDimension  ...  worstRadius_bin  \
0        0.07017    0.1812           0.05667  ...                5   
1        0.12790    0.2069           0.05999  ...                5   
2        0.10520    0.2597           0.09744  ...                2   
3        0.07400    0.1794           0.05742  ...                5   
4        0.09353    0.2350           0.07389  ...                3   

   worstTexture_bin  worstPerimeter_bin  worstArea_bin  worstSmoothness_bin  \
0                 2                  