## Application of Tree Based Classification Methods to Wisconsin Breast Cancer Dataset

In [1]:
import numpy as np
import pandas as pd
from time import time
from functools import reduce
from collections import deque
from sklearn import tree, ensemble
from metrix import accuracy, confusion_matrix, precision, recall, f1_score
import scipy.stats as st

from DecisionTree import DecisionTreeClassifier, DecisionTreeRegressor
from RandomForestClassifier import RandomForestClassifier
from AdaBoostClassifier import AdaBoostClassifier

In [2]:
def train_test_split(X, ratio=0.7):
    X = X.sample(frac=1).reset_index(drop=True)
    return X[:int(len(X) * ratio)], X[int(len(X) * ratio):]

# Helper functions for grid search
def cartesian_product(arr1, arr2):
    product = []
    for i in arr1:
        for j in arr2:
            if isinstance(i, tuple):
                # (1, 2) and 5 -> (1, 2, 5)
                product.append((*i,j))
            else:
                # 1 and 2 -> (1, 2)
                product.append((i,j))
    return product

def all_possible_param_combinations(params):
    return reduce(cartesian_product, map(lambda param_name: params[param_name], params))

In [3]:
def grid_search(model, params_to_optimize, X_train, y_train, X_test, y_test):
    all_possibilities = all_possible_param_combinations(params_to_optimize)
    best_accuracy = 0
    best_model = None
    for index, possibility in enumerate(all_possibilities):
        model_i = model(*possibility)
        a=time()
        model_i.fit(X_train, y_train)
        b=time()
        print('model', index + 1)
        print('trained in', b-a, 'seconds')
        accuracy = model_i.score(X_test, y_test)
        print('accuracy: ', accuracy)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model_i

    return best_accuracy, best_model

## Data Preparation

In [4]:
df = pd.read_csv('breast-cancer-wisconsin.data', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [5]:
df = df.loc[(df != '?').all(axis=1), :]

  res_values = method(rvalues)


In [6]:
df = df.astype('int')

In [7]:
train_data, test_data = train_test_split(df.iloc[:, 1:])

In [8]:
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

## Model Selection with Cross-Validation and Grid Search

We have different hyperparameters to consider: min_members is the minimum required examples to label a node as leaf. max_depth is the maximum depth of a feature. max_features is the number of features to consider when making a split. Those features are selected randomly. Now, let's search for the optimal parameters with grid search by evaluating model performance on validation set. First, we'll built single decision trees from our data and later we will compare their performance with some ensemble methods such as Random Forest, AdaBoost etc.

In [9]:
params_to_optimize = {
    'tol': [0.1],
    'max_depth': [2, 3, 4],
    'min_members': [10, 20, 50],
    'criterion': ['gini'],
    'split_method': ['binary'],
    'max_features': [1, 2]
}

In [10]:
best_dt_accuracy, best_dt_model = grid_search(DecisionTreeClassifier, params_to_optimize, X_train, y_train, X_test, y_test)

model 1
trained in 0.0030896663665771484 seconds
accuracy:  0.8975609756097561
model 2
trained in 0.004040956497192383 seconds
accuracy:  0.8975609756097561
model 3
trained in 0.0016019344329833984 seconds
accuracy:  0.7951219512195122
model 4
trained in 0.0034830570220947266 seconds
accuracy:  0.9170731707317074
model 5
trained in 0.0020902156829833984 seconds
accuracy:  0.9121951219512195
model 6
trained in 0.002891063690185547 seconds
accuracy:  0.8926829268292683
model 7
trained in 0.0034492015838623047 seconds
accuracy:  0.9512195121951219
model 8
trained in 0.005025625228881836 seconds
accuracy:  0.9170731707317074
model 9
trained in 0.0025250911712646484 seconds
accuracy:  0.9414634146341463
model 10
trained in 0.005606412887573242 seconds
accuracy:  0.9365853658536586
model 11
trained in 0.0036749839782714844 seconds
accuracy:  0.8926829268292683
model 12
trained in 0.003778219223022461 seconds
accuracy:  0.9365853658536586
model 13
trained in 0.002874135971069336 seconds
accur

In [11]:
best_dt_accuracy

0.9512195121951219

In [12]:
best_dt_model.min_members, best_dt_model.max_depth, best_dt_model.max_features

(10, 3, 1)

From the 18 different models we evaluated, the model with hyperparameters min_members={{best_dt_model.min_members}}, max_depth={{best_dt_model.max_depth}}, max_features={{best_dt_model.max_features}} seems to be the best one with the {{np.round(best_dt_accuracy*100, 2).astype('int')}}% accuracy on validation set. Let us further evaluate this model by looking at other metrics.

In [13]:
pred_dt = best_dt_model.predict(X_test)
pred_dt

array([2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2,
       2, 2, 4, 4, 4, 4, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4,
       2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2,
       4, 4, 4, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2,
       4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2,
       2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       4, 4, 4, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 4, 4, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4,
       4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4,
       2, 2, 2, 2, 2, 2, 2])

In [14]:
conf_matrix = confusion_matrix(y_test, pred_dt, classes=[2, 4])
conf_matrix

Unnamed: 0,Actual Positive,Actual Negative
Predicted Positive,63,0
Predicted Negative,10,132


In [15]:
dt_true_positive = conf_matrix.iloc[0, 0]
dt_predicted_positive = conf_matrix.iloc[0, :].sum()
dt_actual_positive = conf_matrix.iloc[:, 0].sum()

We have {{dt_true_positive}} true positives out of {{dt_predicted_positive}} positively predicted examples. So, our precision rate is {{dt_true_positive}}/{{dt_predicted_positive}}:

In [16]:
dt_precision = precision(y_test, pred_dt, classes=[2, 4])
dt_precision

1.0

The model correctly classified {{dt_true_positive}} out of {{dt_actual_positive}} positive examples. So, the *recall* is:

In [17]:
dt_recall = recall(y_test, pred_dt, classes=[2, 4])
dt_recall

0.863013698630137

Finally, we calculate *f1 score* by $ \frac{2pr}{p + r} $, where p is precision and r is recall:

In [18]:
dt_f1 = f1_score(y_test, pred_dt, classes=[2, 4])
dt_f1

0.9264705882352942

Now, let us examine how well Random Forest will perform on our data. We have an extra hyperparameter, *n_trees*, which defines the number of trees to build.

In [19]:
params_to_optimize = {
    'n_trees': [100, 150, 200],
    'tol': [0.1],
    'max_depth': [3, 4],
    'min_members': [10, 20],
    'criterion': ['gini'],
    'split_method': ['binary'],
    'max_features': [1, 2],
}

In [20]:
best_rf_accuracy, best_rf_model = grid_search(RandomForestClassifier, params_to_optimize, X_train, y_train, X_test, y_test)

model 1
trained in 0.1644904613494873 seconds
accuracy:  0.9804878048780488
model 2
trained in 0.26965975761413574 seconds
accuracy:  0.975609756097561
model 3
trained in 0.16741418838500977 seconds
accuracy:  0.9804878048780488
model 4
trained in 0.25455164909362793 seconds
accuracy:  0.9707317073170731
model 5
trained in 0.21365761756896973 seconds
accuracy:  0.9707317073170731
model 6
trained in 0.3332955837249756 seconds
accuracy:  0.975609756097561
model 7
trained in 0.26999402046203613 seconds
accuracy:  0.9853658536585366
model 8
trained in 0.3378770351409912 seconds
accuracy:  0.975609756097561
model 9
trained in 0.23338055610656738 seconds
accuracy:  0.9804878048780488
model 10
trained in 0.3521308898925781 seconds
accuracy:  0.975609756097561
model 11
trained in 0.22999072074890137 seconds
accuracy:  0.975609756097561
model 12
trained in 0.368499755859375 seconds
accuracy:  0.9804878048780488
model 13
trained in 0.297532320022583 seconds
accuracy:  0.9804878048780488
model 14

In [21]:
best_rf_accuracy

0.9853658536585366

In [22]:
best_rf_model.n_trees, best_rf_model.max_depth, best_rf_model.max_features, best_rf_model.min_members

(100, 4, 1, 20)

From the 24 different models we evaluated, the model with hyperparameters min_members={{best_rf_model.min_members}}, max_depth={{best_rf_model.max_depth}}, max_features={{best_rf_model.max_features}}, n_trees={{best_rf_model.n_trees}} performs best one with the {{np.round(best_rf_accuracy*100, 2).astype('int')}}% accuracy on validation set. Now, again, compute other metrics to see examine the performance of this model.

In [23]:
pred_rf = best_rf_model.predict(X_test)

In [24]:
confusion_matrix(y_test, pred_rf, classes=[2, 4])

Unnamed: 0,Actual Positive,Actual Negative
Predicted Positive,72,2
Predicted Negative,1,130


In [25]:
rf_precision = precision(y_test, pred_rf, classes=[2, 4])
rf_precision

0.972972972972973

In [26]:
rf_recall = recall(y_test, pred_rf, classes=[2, 4])
rf_recall

0.9863013698630136

In [27]:
rf_f1 = f1_score(y_test, pred_rf, classes=[2, 4])
rf_f1

0.979591836734694

Now, let's see how AdaBoost performs on our dataset. We have now, instead of *n_trees*, *n_learners* parameter.

In [28]:
params_to_optimize = {
    'n_learners': [50, 100, 150, 200],
    'tol': [0.1],
    'max_depth': [2],
    'min_members': [10, 20],
    'criterion': ['gini'],
    'split_method': ['binary'],
    'max_features': [1, 2, 3, 4, 5],
}

In [29]:
best_ab_accuracy, best_ab_model = grid_search(AdaBoostClassifier, params_to_optimize, X_train, y_train, X_test, y_test)

model 1
trained in 0.0632774829864502 seconds
accuracy:  0.9512195121951219
model 2
trained in 0.15097641944885254 seconds
accuracy:  0.9512195121951219
model 3
trained in 0.18727898597717285 seconds
accuracy:  0.9560975609756097
model 4
trained in 0.2193620204925537 seconds
accuracy:  0.9560975609756097
model 5
trained in 0.25381016731262207 seconds
accuracy:  0.9414634146341463
model 6
trained in 0.11494898796081543 seconds
accuracy:  0.9609756097560975
model 7
trained in 0.1512315273284912 seconds
accuracy:  0.9463414634146341
model 8
trained in 0.18474102020263672 seconds
accuracy:  0.9560975609756097
model 9
trained in 0.23239398002624512 seconds
accuracy:  0.9560975609756097
model 10
trained in 0.25071191787719727 seconds
accuracy:  0.9414634146341463
model 11
trained in 0.17055296897888184 seconds
accuracy:  0.9512195121951219
model 12
trained in 0.2450859546661377 seconds
accuracy:  0.9512195121951219
model 13
trained in 0.30675339698791504 seconds
accuracy:  0.9560975609756097

In [30]:
best_ab_accuracy

0.9658536585365853

In [31]:
best_ab_model.n_learners, best_ab_model.min_members, best_ab_model.max_features

(200, 20, 1)

From the 64 different models we evaluated, the model with hyperparameters min_members={{best_ab_model.min_members}}, max_features={{best_ab_model.max_features}}, n_learners={{best_ab_model.n_learners}} performs best one with the {{np.round(best_ab_accuracy*100, 2).astype('int')}}% accuracy on validation set. Let's compute some metrics again.

In [32]:
pred_ab = best_ab_model.predict(X_test)

In [33]:
confusion_matrix(y_test, pred_ab, classes=[2, 4])

Unnamed: 0,Actual Positive,Actual Negative
Predicted Positive,68,2
Predicted Negative,5,130


In [34]:
ab_precision = precision(y_test, pred_ab, classes=[2, 4])
ab_precision

0.9714285714285714

In [35]:
ab_recall = recall(y_test, pred_ab, classes=[2, 4])
ab_recall

0.9315068493150684

In [36]:
ab_f1 = f1_score(y_test, pred_ab, classes=[2, 4])
ab_f1

0.9510489510489512

Now, let's take a final look at the models we selected and compare their performance

In [37]:
model_eval_matrix = np.array([
    [best_dt_accuracy, best_rf_accuracy, best_ab_accuracy],
    [dt_precision, rf_precision, ab_precision],
    [dt_recall, rf_recall, ab_recall],
    [dt_f1, rf_f1, ab_f1]
])
model_eval_df = pd.DataFrame(model_eval_matrix, columns=['Decision Tree', 'Random Forest', 'AdaBoost'], index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])

In [38]:
model_eval_df

Unnamed: 0,Decision Tree,Random Forest,AdaBoost
Accuracy,0.95122,0.985366,0.965854
Precision,1.0,0.972973,0.971429
Recall,0.863014,0.986301,0.931507
F1 Score,0.926471,0.979592,0.951049


In [4]:
df_new = pd.read_csv('weatherAUS.csv')
df_new

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,2017-06-20,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,...,27.0,1024.7,1021.2,,,9.4,20.9,No,0.0,No
142189,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,24.0,1024.6,1020.3,,,10.1,22.4,No,0.0,No
142190,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,21.0,1023.5,1019.1,,,10.9,24.5,No,0.0,No
142191,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,24.0,1021.0,1016.8,,,12.5,26.1,No,0.0,No


In [5]:
df_new.dropna(inplace=True)

In [6]:
for feature in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']:
    df_new[feature] = df_new[feature].astype('category').cat.codes

In [7]:
train_data, test_data = train_test_split(df_new, ratio=0.7)

In [8]:
X_train = train_data.iloc[:, 1:-1]
y_train = train_data.iloc[:, -1]

X_test = test_data.iloc[:, 1:-1]
y_test = test_data.iloc[:, -1]
X_test

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
39494,12,13.9,35.0,0.0,8.0,13.0,11,33.0,8,11,...,73.0,34.0,1021.0,1019.6,5.0,3.0,22.3,32.2,0,0.0
39495,14,13.6,29.1,0.0,8.2,13.2,0,35.0,0,0,...,49.0,21.0,1017.5,1014.2,0.0,0.0,18.1,27.2,0,0.0
39496,15,12.0,26.3,0.0,4.4,10.0,12,26.0,0,15,...,51.0,30.0,1027.0,1023.3,1.0,1.0,18.9,25.0,0,0.0
39497,19,22.7,29.1,0.0,9.2,9.8,4,43.0,0,4,...,69.0,54.0,1008.1,1004.9,1.0,1.0,26.7,28.3,0,1.0
39498,6,24.1,34.3,0.0,5.0,11.0,5,43.0,4,4,...,73.0,43.0,1012.7,1008.7,5.0,3.0,28.1,32.9,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56415,2,19.8,28.5,0.0,3.8,9.0,0,30.0,8,2,...,72.0,59.0,1015.5,1012.3,2.0,3.0,24.3,27.2,0,0.0
56416,11,17.6,33.0,0.0,9.6,10.5,5,37.0,3,6,...,49.0,22.0,1020.1,1014.7,0.0,4.0,22.6,31.1,0,0.0
56417,16,17.1,33.6,0.0,10.8,13.3,0,56.0,0,0,...,44.0,19.0,1018.4,1014.4,0.0,1.0,23.3,33.0,0,0.0
56418,5,8.8,21.8,1.8,2.2,10.4,10,22.0,12,1,...,56.0,39.0,1010.3,1008.0,0.0,3.0,19.7,21.2,1,0.0


In [35]:
X_train.iloc[:, 2:3]

Unnamed: 0,MaxTemp
0,25.2
1,42.6
2,33.7
3,19.1
4,22.0
...,...
39489,10.2
39490,16.4
39491,17.7
39492,33.9


Unnamed: 0,MaxTemp
0,29.1
1,33.2
2,32.6
3,16.0
4,24.8
5,18.9
6,31.1
7,35.3
8,14.3
9,27.7
