In [1]:
import pandas as pd, numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

In [57]:
df = pd.DataFrame(load_breast_cancer().data, columns=load_breast_cancer().feature_names)
df['label'] = load_breast_cancer().target

copy = df.copy()

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [27]:
dt = DecisionTreeClassifier(random_state=50, min_samples_split=100)
dt.fit(df.iloc[:,:-1],df.iloc[:,-1])
df['prediction'] = dt.predict(df.iloc[:,:-1])
df.iloc[:5,-2:]

Unnamed: 0,label,prediction
0,0,1
1,0,0
2,0,0
3,0,0
4,0,1


In [28]:
print('Accuracy =',accuracy_score(df.label, df.prediction))

Accuracy = 0.945518453427065


In [29]:
df['weight'] = 1/len(df)
df.iloc[:5,-3:]

Unnamed: 0,label,prediction,weight
0,0,1,0.001757
1,0,0,0.001757
2,0,0,0.001757
3,0,0,0.001757
4,0,1,0.001757


In [7]:
no_of_errors = len(df[df.label != df.prediction1])
no_of_errors

31

In [8]:
total_errors = no_of_errors/len(df)
total_errors

0.054481546572934976

In [9]:
alpha = 0.5 * np.log((1-total_errors)/total_errors)
alpha

1.426935677838319

In [32]:
df['weight_updated'] = df.loc[df.label != df.prediction].weight * np.exp(alpha)
df.weight_updated = df['weight_updated'].fillna(df[df.label == df.prediction].weight * np.exp(-alpha))
df.iloc[:5,-4:]

Unnamed: 0,label,prediction,weight,weight_updated
0,0,1,0.001757,0.007321
1,0,0,0.001757,0.000422
2,0,0,0.001757,0.000422
3,0,0,0.001757,0.000422
4,0,1,0.001757,0.007321


In [33]:
df.weight_updated = df.weight_updated/df.weight_updated.sum()
df.iloc[:5,-4:]

Unnamed: 0,label,prediction,weight,weight_updated
0,0,1,0.001757,0.016129
1,0,0,0.001757,0.000929
2,0,0,0.001757,0.000929
3,0,0,0.001757,0.000929
4,0,1,0.001757,0.016129


In [34]:
p = 0
for i in range(len(df)):
    df.loc[i,'ranges'] = df.loc[i,'weight_updated'] + p
    p = df.loc[i,'ranges']
    
df.iloc[:5,-5:]

Unnamed: 0,label,prediction,weight,weight_updated,ranges
0,0,1,0.001757,0.016129,0.016129
1,0,0,0.001757,0.000929,0.017058
2,0,0,0.001757,0.000929,0.017988
3,0,0,0.001757,0.000929,0.018917
4,0,1,0.001757,0.016129,0.035046


In [35]:
resampled = pd.DataFrame(columns=df.columns[:31])
for i in range(len(df)):
    index = df[df.ranges == df[np.random.rand()<df.ranges].ranges.min()].index
    resampled.loc[i] = list(df.iloc[index,:31].values[0])
    
resampled.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,0.06404,...,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261,1.0
1,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,0.08046,...,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175,1.0
2,15.46,11.89,102.5,736.9,0.1257,0.1555,0.2032,0.1097,0.1966,0.07069,...,17.04,125.0,1102.0,0.1531,0.3583,0.583,0.1827,0.3216,0.101,0.0
3,11.76,18.14,75.0,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,...,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915,0.0
4,13.44,21.58,86.18,563.0,0.08162,0.06031,0.0311,0.02031,0.1784,0.05587,...,30.25,102.5,787.9,0.1094,0.2043,0.2085,0.1112,0.2994,0.07146,0.0


In [36]:
def adaboost(df):
    dt = DecisionTreeClassifier(random_state=50, min_samples_split=100)
    dt.fit(df.iloc[:,:30],df.iloc[:,30])
    df['prediction'] = dt.predict(df.iloc[:,:30])
    
    df['weight'] = 1/len(df)
    
    no_of_errors = len(df[df.label != df.prediction])
    
    total_errors = no_of_errors/len(df)
    
    alpha = 0.5 * np.log((1-total_errors)/total_errors)
    
    df['weight_updated'] = df.loc[df.label != df.prediction].weight * np.exp(alpha)
    df.weight_updated = df['weight_updated'].fillna(df[df.label == df.prediction].weight * np.exp(-alpha))
    
    df.weight_updated = df.weight_updated/df.weight_updated.sum()
    
    p = 0
    for i in range(len(df)):
        df.loc[i,'ranges'] = df.loc[i,'weight_updated'] + p
        p = df.loc[i,'ranges']
        
    resampled = pd.DataFrame(columns=df.columns[:31])
    for i in range(len(df)):
        index = df[df.ranges == df[np.random.rand()<df.ranges].ranges.min()].index
        resampled.loc[i] = list(df.iloc[index,:31].values[0])  
    
    df = resampled
    
    return [df, dt]

In [58]:
df = copy.copy()

models = []    
    
try:
    for iter in range(20):        
        ada = adaboost(df)
        df = ada[0]    
        models.append(ada[1])
        print('Decision stamp {0}'.format(iter+1))
    
except Exception:
    pass

Decision stamp 1
Decision stamp 2
Decision stamp 3
Decision stamp 4
Decision stamp 5
Decision stamp 6
Decision stamp 7
Decision stamp 8
Decision stamp 9
Decision stamp 10


In [59]:
models

[DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50),
 DecisionTreeClassifier(min_samples_split=100, random_state=50)]

In [60]:
pred = np.zeros(len(df))
for i in range(len(models)):    
    pred += models[i].predict(copy.iloc[:,:-1])

pred

array([ 2.,  1.,  0.,  5.,  2.,  4.,  2.,  5.,  2.,  2.,  4.,  1.,  3.,
        5.,  4.,  2.,  1.,  1.,  0.,  8.,  9., 10.,  3.,  1.,  0.,  1.,
        2.,  0.,  2.,  1.,  0.,  3.,  0.,  1.,  0.,  1.,  4.,  9.,  2.,
        4.,  6.,  8.,  2.,  3.,  4.,  0.,  9.,  3.,  9.,  9.,  7.,  8.,
        9.,  1.,  2.,  9.,  0.,  3.,  8.,  9.,  9.,  8.,  2., 10.,  3.,
        2.,  9.,  8.,  8.,  7.,  1.,  9.,  1.,  4.,  9.,  1.,  8.,  1.,
        2.,  9.,  9.,  9.,  1.,  1.,  9.,  0.,  3.,  1.,  9.,  7.,  7.,
        6.,  9.,  9.,  2.,  0.,  9.,  7.,  8.,  7.,  3.,  8.,  9., 10.,
        9.,  2., 10.,  8.,  0.,  8.,  8., 10.,  6., 10., 10.,  8.,  8.,
        1.,  1.,  3.,  9.,  0.,  2.,  9.,  9.,  8.,  3.,  2.,  5.,  1.,
        9.,  1.,  2.,  8.,  1.,  4.,  8.,  9.,  1., 10.,  9.,  1.,  9.,
        9.,  8.,  9.,  5.,  8.,  7.,  7., 10.,  8.,  6.,  9.,  8.,  9.,
        0.,  5.,  7.,  8.,  8.,  2.,  1.,  7.,  1.,  8.,  7.,  0.,  0.,
        7., 10.,  2.,  4.,  8.,  9.,  9.,  9.,  2.,  6.,  8.,  1

In [64]:
threshold = len(models)/2
vec = np.vectorize(lambda x: 1 if x>threshold else 0)
final_prediction = vec(pred)
final_prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [66]:
copy['final_prediction'] = final_prediction

print('Accuracy =',accuracy_score(copy.label, copy.final_prediction))

Accuracy = 0.9753954305799648
