### Decision Tree for Classification of Breast Cancer Wisconsin Diagnostic 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#conda install -c conda-forge pygraphviz
import graphviz
from sklearn import preprocessing

colname=['ID','Diagnosis']+['feature_%s' %i for i in range(1,31)]
#print(colname)
df = pd.read_csv('wdbc.data',header=None,names=colname)


1. After importing the data, let’s check the if data successfully import.

In [2]:
print(df.head(5))

         ID Diagnosis  feature_1  feature_2  feature_3  feature_4  feature_5  \
0    842302         M      17.99      10.38     122.80     1001.0    0.11840   
1    842517         M      20.57      17.77     132.90     1326.0    0.08474   
2  84300903         M      19.69      21.25     130.00     1203.0    0.10960   
3  84348301         M      11.42      20.38      77.58      386.1    0.14250   
4  84358402         M      20.29      14.34     135.10     1297.0    0.10030   

   feature_6  feature_7  feature_8  ...  feature_21  feature_22  feature_23  \
0    0.27760     0.3001    0.14710  ...       25.38       17.33      184.60   
1    0.07864     0.0869    0.07017  ...       24.99       23.41      158.80   
2    0.15990     0.1974    0.12790  ...       23.57       25.53      152.50   
3    0.28390     0.2414    0.10520  ...       14.91       26.50       98.87   
4    0.13280     0.1980    0.10430  ...       22.54       16.67      152.20   

   feature_24  feature_25  feature_26  featu

In [3]:
df.isnull().any()

ID            False
Diagnosis     False
feature_1     False
feature_2     False
feature_3     False
feature_4     False
feature_5     False
feature_6     False
feature_7     False
feature_8     False
feature_9     False
feature_10    False
feature_11    False
feature_12    False
feature_13    False
feature_14    False
feature_15    False
feature_16    False
feature_17    False
feature_18    False
feature_19    False
feature_20    False
feature_21    False
feature_22    False
feature_23    False
feature_24    False
feature_25    False
feature_26    False
feature_27    False
feature_28    False
feature_29    False
feature_30    False
dtype: bool

2. Let's understand more about the data. We will start by getting to know the type of each column values.  We see that the width and length column are represented using float64 and the name of the species uses object or string.

In [4]:
df.dtypes

ID              int64
Diagnosis      object
feature_1     float64
feature_2     float64
feature_3     float64
feature_4     float64
feature_5     float64
feature_6     float64
feature_7     float64
feature_8     float64
feature_9     float64
feature_10    float64
feature_11    float64
feature_12    float64
feature_13    float64
feature_14    float64
feature_15    float64
feature_16    float64
feature_17    float64
feature_18    float64
feature_19    float64
feature_20    float64
feature_21    float64
feature_22    float64
feature_23    float64
feature_24    float64
feature_25    float64
feature_26    float64
feature_27    float64
feature_28    float64
feature_29    float64
feature_30    float64
dtype: object

In [5]:
len(df['ID'])

569

3. Let’s look at a quick summary of the data.

In [6]:
df.describe()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


4. Everything checks out. First, I use Information Gain to construct a DT. But choose which features?

In [7]:
#distribute M and B
dfm=df[df['Diagnosis']=='M']
dfb=df[df['Diagnosis']=='B']
dfm.describe()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
count,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,...,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0
mean,36818050.0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
std,137896500.0,3.203971,3.77947,21.854653,367.937978,0.012608,0.053987,0.075019,0.034374,0.027638,...,4.283569,5.434804,29.457055,597.967743,0.02187,0.170372,0.181507,0.046308,0.074685,0.021553
min,8670.0,10.95,10.38,71.9,361.6,0.07371,0.04605,0.02398,0.02031,0.1308,...,12.84,16.67,85.1,508.1,0.08822,0.05131,0.02398,0.02899,0.1565,0.05504
25%,861345.0,15.075,19.3275,98.745,705.3,0.09401,0.1096,0.109525,0.06462,0.17405,...,17.73,25.7825,119.325,970.3,0.130475,0.244475,0.326425,0.15275,0.2765,0.076302
50%,895366.5,17.325,21.46,114.2,932.0,0.1022,0.13235,0.15135,0.08628,0.1899,...,20.59,28.945,138.0,1303.0,0.14345,0.35635,0.4049,0.182,0.3103,0.0876
75%,8911290.0,19.59,23.765,129.925,1203.75,0.110925,0.1724,0.20305,0.103175,0.20985,...,23.8075,32.69,159.8,1712.75,0.155975,0.44785,0.556175,0.210675,0.359225,0.102625
max,911296200.0,28.11,39.28,188.5,2501.0,0.1447,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.17,0.291,0.6638,0.2075


In [8]:
dfb.describe()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
count,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,...,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0
mean,26543820.0,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442
std,116739700.0,1.780512,3.995125,11.807438,134.287118,0.013446,0.03375,0.043442,0.015909,0.024807,...,1.981368,5.493955,13.527091,163.601424,0.020013,0.09218,0.140368,0.035797,0.041745,0.013804
min,8913.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1566,0.05521
25%,874662.0,11.08,15.15,70.87,378.2,0.08306,0.05562,0.02031,0.01502,0.158,...,12.08,19.58,78.27,447.1,0.1104,0.112,0.07708,0.05104,0.2406,0.07009
50%,908916.0,12.2,17.39,78.18,458.4,0.09076,0.07529,0.03709,0.02344,0.1714,...,13.35,22.82,86.92,547.4,0.1254,0.1698,0.1412,0.07431,0.2687,0.07712
75%,8812816.0,13.37,19.76,86.1,551.1,0.1007,0.09755,0.05999,0.03251,0.189,...,14.8,26.51,96.59,670.0,0.1376,0.2302,0.2216,0.09749,0.2983,0.08541
max,911320500.0,17.85,33.81,114.6,992.1,0.1634,0.2239,0.4108,0.08534,0.2743,...,19.82,41.78,127.1,1210.0,0.2006,0.5849,1.252,0.175,0.4228,0.1486


In [9]:
#initialize 
avg_m=np.zeros(30)
avg_b=np.zeros(30)
for i_feature in range(30):
    #obtain average value of feature for M and B
    avg_m[i_feature] = np.mean(dfm[lambda dfm: dfm.columns[i_feature+2]])
    avg_b[i_feature] = np.mean(dfb[lambda dfb: dfb.columns[i_feature+2]])
# find top 10 features with average value of M and B differs most
diff=abs(avg_m-avg_b)
diff_sorted = sorted(enumerate(diff),key=lambda x:x[1])
diff_sorted_index=[m[0] for m in diff_sorted]
print(diff_sorted_index[0:10])

[18, 9, 14, 19, 17, 11, 4, 15, 29, 16]


5.construct decision tree only from the 10 features and loop for 20 times

In [10]:
X = df[['feature_18', 'feature_9', 'feature_14', 'feature_19','feature_17',
       'feature_11','feature_4','feature_15','feature_29','feature_16']].values
Y = df['Diagnosis'].values

dt1_accuracy_train=np.zeros(20)
dt1_accuracy_test=np.zeros(20)
dt1_precision=np.zeros(20)
dt1_recall=np.zeros(20)
dt2_accuracy_train=np.zeros(20)
dt2_accuracy_test=np.zeros(20)
dt2_precision=np.zeros(20)
dt2_recall=np.zeros(20)
for i_process in range(20):
    (X_train,X_test,Y_train,Y_test)=train_test_split(X, Y, random_state=2,test_size=0.3,stratify=Y)
    clf_1 = DecisionTreeClassifier(criterion='entropy')
    clf_1.fit(X_train, Y_train)
    Y_pre_train=clf_1.predict(X_train)
    Y_pre_test=clf_1.predict(X_test)
    dt1_accuracy_train[i_process]=accuracy_score(Y_train, Y_pre_train)
    dt1_accuracy_test[i_process]=accuracy_score(Y_test, Y_pre_test)
    dt1_precision[i_process]=precision_score(Y_test, Y_pre_test,pos_label='M')
    dt1_recall[i_process]=recall_score(Y_test, Y_pre_test,pos_label='M')
    
    dt2_depth=3
    last_recall=1
    while dt1_recall[i_process] > dt2_recall[i_process] or last_recall < dt2_recall[i_process]:
        last_recall=dt2_recall[i_process]
        dt2_depth+=1
        clf_2 = DecisionTreeClassifier(criterion='entropy',max_depth=dt2_depth)
        clf_2.fit(X_train, Y_train)
        Y_pre_train=clf_2.predict(X_train)
        Y_pre_test=clf_2.predict(X_test)
        dt2_accuracy_train[i_process]=accuracy_score(Y_train, Y_pre_train)
        dt2_accuracy_test[i_process]=accuracy_score(Y_test, Y_pre_test)
        dt2_precision[i_process]=precision_score(Y_test, Y_pre_test,pos_label='M')
        dt2_recall[i_process]=recall_score(Y_test, Y_pre_test,pos_label='M')

    dt2_recall[i_process]=last_recall
        
dt1_avg_accuracy_train=np.mean(dt1_accuracy_train)
dt1_avg_accuracy_test=np.mean(dt1_accuracy_test)
dt1_avg_precision=np.mean(dt1_precision)
dt1_avg_recall=np.mean(dt1_recall)
dt2_avg_accuracy_train=np.mean(dt2_accuracy_train)
dt2_avg_accuracy_test=np.mean(dt2_accuracy_test)
dt2_avg_precision=np.mean(dt2_precision)
dt2_avg_recall=np.mean(dt2_recall)
print([dt1_accuracy_train,dt2_accuracy_train])
print([dt1_accuracy_test,dt2_accuracy_test])
print([dt1_avg_accuracy_train,dt1_avg_accuracy_test,dt1_avg_precision,dt1_avg_recall])
print([dt2_avg_accuracy_train,dt2_avg_accuracy_test,dt2_avg_precision,dt2_avg_recall])

[array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.]), array([1.        , 0.99748744, 0.99497487, 1.        , 0.99497487,
       0.99748744, 0.99748744, 1.        , 0.99748744, 0.99748744,
       0.99748744, 0.99748744, 0.99748744, 0.99748744, 0.99497487,
       0.99748744, 1.        , 0.99497487, 0.99748744, 0.99748744])]
[array([0.9122807 , 0.91812865, 0.90643275, 0.9122807 , 0.9005848 ,
       0.89473684, 0.89473684, 0.90643275, 0.88888889, 0.90643275,
       0.9005848 , 0.90643275, 0.90643275, 0.9122807 , 0.9122807 ,
       0.91812865, 0.9122807 , 0.90643275, 0.90643275, 0.90643275]), array([0.9122807 , 0.9005848 , 0.90643275, 0.9005848 , 0.9005848 ,
       0.90643275, 0.90643275, 0.9122807 , 0.9122807 , 0.9122807 ,
       0.9005848 , 0.90643275, 0.90643275, 0.90643275, 0.9005848 ,
       0.91812865, 0.91812865, 0.90643275, 0.90643275, 0.9122807 ])]
[1.0, 0.9064327485380117, 0.8811484600840431, 0.8671875]
[0.9974874371859297, 0.907602339181

6.construct the decision tree from all 30 features and replace depth by minimum leaf samples

In [11]:
X = df[lambda df: df.columns[2:32]].values
Y = df['Diagnosis'].values

dt1_accuracy_train,dt1_accuracy_test,dt1_precision,dt1_recall =(np.zeros(20) for i in range(4))
dt2_accuracy_train,dt2_accuracy_test,dt2_precision,dt2_recall=(np.zeros(20) for i in range(4))
for i_process in range(20):
    (X_train,X_test,Y_train,Y_test)=train_test_split(X, Y, random_state=2,test_size=0.3,stratify=Y)
    clf_1 = DecisionTreeClassifier(criterion='entropy')
    clf_1.fit(X_train, Y_train)
    Y_pre_train=clf_1.predict(X_train)
    Y_pre_test=clf_1.predict(X_test)
    dt1_accuracy_train[i_process]=accuracy_score(Y_train, Y_pre_train)
    dt1_accuracy_test[i_process]=accuracy_score(Y_test, Y_pre_test)
    dt1_precision[i_process]=precision_score(Y_test, Y_pre_test,pos_label='M')
    dt1_recall[i_process]=recall_score(Y_test, Y_pre_test,pos_label='M')
    
    min_sample=2
    last_recall=1
    while dt1_recall[i_process] > dt2_recall[i_process] or last_recall < dt2_recall[i_process]:
        last_recall=dt2_recall[i_process]
        min_sample+=1
        clf_2 = DecisionTreeClassifier(criterion='entropy',min_samples_split=min_sample)
        clf_2.fit(X_train, Y_train)
        Y_pre_train=clf_2.predict(X_train)
        Y_pre_test=clf_2.predict(X_test)
        dt2_accuracy_train[i_process]=accuracy_score(Y_train, Y_pre_train)
        dt2_accuracy_test[i_process]=accuracy_score(Y_test, Y_pre_test)
        dt2_precision[i_process]=precision_score(Y_test, Y_pre_test,pos_label='M')
        dt2_recall[i_process]=recall_score(Y_test, Y_pre_test,pos_label='M')

    dt2_recall[i_process]=last_recall
        
dt1_avg_accuracy_train=np.mean(dt1_accuracy_train)
dt1_avg_accuracy_test=np.mean(dt1_accuracy_test)
dt1_avg_precision=np.mean(dt1_precision)
dt1_avg_recall=np.mean(dt1_recall)
dt2_avg_accuracy_train=np.mean(dt2_accuracy_train)
dt2_avg_accuracy_test=np.mean(dt2_accuracy_test)
dt2_avg_precision=np.mean(dt2_precision)
dt2_avg_recall=np.mean(dt2_recall)
print([dt1_accuracy_train,dt2_accuracy_train])
print([dt1_accuracy_test,dt2_accuracy_test])
print([dt1_avg_accuracy_train,dt1_avg_accuracy_test,dt1_avg_precision,dt1_avg_recall])
print([dt2_avg_accuracy_train,dt2_avg_accuracy_test,dt2_avg_precision,dt2_avg_recall])

[array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.]), array([0.99497487, 0.98994975, 0.98743719, 0.9798995 , 0.99497487,
       1.        , 0.98994975, 0.99497487, 1.        , 0.94472362,
       0.98994975, 0.98994975, 0.99497487, 0.99497487, 0.98994975,
       0.94472362, 0.98994975, 0.99497487, 0.9798995 , 0.98994975])]
[array([0.9005848 , 0.92982456, 0.92397661, 0.92982456, 0.91812865,
       0.9122807 , 0.93567251, 0.91812865, 0.92397661, 0.92982456,
       0.91812865, 0.92397661, 0.90643275, 0.91812865, 0.9122807 ,
       0.93567251, 0.9122807 , 0.91812865, 0.92982456, 0.9122807 ]), array([0.9122807 , 0.92982456, 0.93567251, 0.93567251, 0.93567251,
       0.92397661, 0.93567251, 0.91812865, 0.92982456, 0.88304094,
       0.92397661, 0.92982456, 0.91812865, 0.92397661, 0.92982456,
       0.88304094, 0.92397661, 0.9122807 , 0.92397661, 0.92982456])]
[1.0, 0.92046783625731, 0.9053330039099803, 0.8796875]
[0.9868090452261307, 0.92192982456140