<a href="https://colab.research.google.com/github/tallerzalan/Applied-Machine-Learning/blob/main/DTs/Exercise_5_gb_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise - Boosting for classification

1. Use the $\texttt{load_breast_cancer}$ data (remember to split your data into training, validation, and test sets). Using your training and validation data, optimize the parameters of your GradientBoostingClassifier. How well does your optimized model perform on the test data?
1. Implement an RF and an SVM and use these as well (**note**: you may want to standardize the features for the SVM). How well do they perform on the test data? Try to "vote" using all three models (boosting, RF, and SVM) and select the class with the most votes. How well does your ensemble of all three models perform?

**See slides for more details!**

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

# Load the breast-cancer features and labels as arrays.
X, y = load_breast_cancer(return_X_y=True)

# First split: hold out 20% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split: take 20% of the remaining training samples as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Sanity check: print the shape of every split (features first, then targets).
split_arrays = (X_train, X_val, X_test, y_train, y_val, y_test)
print(*(arr.shape for arr in split_arrays))

(364, 30) (91, 30) (114, 30) (364,) (91,) (114,)


# Exercise 1

Use the $\texttt{load_breast_cancer}$ data (remember to split your data into training, validation, and test sets). Using your training and validation data, optimize the parameters of your GradientBoostingClassifier. How well does your optimized model perform on the test data?

Let us start by ensuring we can run a GBT without any optimization.

In [None]:
# Baseline: gradient boosting with default hyperparameters, trained on the
# training split and scored on the validation split.
gbt_current = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
y_val_hat = gbt_current.predict(X_val)
acc = accuracy_score(y_val, y_val_hat)

print(f'Boosting with default settings has validation accuracy of {round(acc * 100, 2)}%.')

Boosting with default settings has validation accuracy of 95.6%.


In [None]:
# Small starter grid -- feel free to widen the ranges or tune other
# hyperparameters than these three.
n_estimators_list = list(np.arange(2, 11, 1))
min_samples_split_list = list(np.arange(2, 11, 1))
min_samples_leaf_list = list(np.arange(2, 11, 1))

# Every (n_estimators, min_samples_split, min_samples_leaf) combination, with
# n_estimators varying slowest and min_samples_leaf varying fastest.
param_grid = [
    (n_est, min_split, min_leaf)
    for n_est in n_estimators_list
    for min_split in min_samples_split_list
    for min_leaf in min_samples_leaf_list
]

# Fit one model per combination on the training split and record its
# accuracy on the validation split.
rows = []
for n_estimators, min_samples_split, min_samples_leaf in param_grid:
    gbt_current = GradientBoostingClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )
    gbt_current.fit(X_train, y_train)
    y_val_hat = gbt_current.predict(X_val)
    acc = accuracy_score(y_val, y_val_hat)
    rows.append([acc, n_estimators, min_samples_split, min_samples_leaf])

# One row per combination, in the order the grid was visited.
results = pd.DataFrame(
    rows,
    columns=['Accuracy', 'n_estimators', 'min_samples_split', 'min_samples_leaf'],
)
print(results)

     Accuracy  n_estimators  min_samples_split  min_samples_leaf
0    0.604396             2                  2                 2
1    0.604396             2                  2                 3
2    0.604396             2                  2                 4
3    0.604396             2                  2                 5
4    0.604396             2                  2                 6
..        ...           ...                ...               ...
724  0.923077            10                 10                 6
725  0.923077            10                 10                 7
726  0.923077            10                 10                 8
727  0.923077            10                 10                 9
728  0.934066            10                 10                10

[729 rows x 4 columns]


In [None]:
# Show every parameter combination that ties for the highest validation accuracy.
best_accuracy = results['Accuracy'].max()
results.loc[results['Accuracy'].eq(best_accuracy)]

Unnamed: 0,Accuracy,n_estimators,min_samples_split,min_samples_leaf
89,0.967033,3,2,10
98,0.967033,3,3,10
107,0.967033,3,4,10
116,0.967033,3,5,10
125,0.967033,3,6,10
134,0.967033,3,7,10
143,0.967033,3,8,10
152,0.967033,3,9,10
161,0.967033,3,10,10


In [None]:
# Initialize your final model with the hyperparameters selected from the grid
# search on the validation data.
gbt_optimized = GradientBoostingClassifier(
    n_estimators = 3,
    min_samples_split = 2,
    min_samples_leaf = 10,
    random_state = 42
    )

# Use both training and validation data to fit it using np.concatenate (np.concatenate "stacks" the array like rbind in R)
gbt_optimized.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

# Predict on test data
y_test_hat_optimized = gbt_optimized.predict(X_test)

# Obtain and check accuracy on test data.
# accuracy_score takes (y_true, y_pred) in that order.
accuracy_optimized = accuracy_score(y_test, y_test_hat_optimized)
print(f'Optimized GBT achieved Accuracy = {round(accuracy_optimized*100, 2)}%.')

Optimized GBT achieved Accuracy = 92.11%.


# Exercise 2

Implement an RF and an SVM and use these as well (**note**: you may want to standardize the features for the SVM). How well do they perform on the test data? Try to "vote" using all three models (boosting, RF, and SVM) and select the class with the most votes. How well does your ensemble of all three models perform?

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import svm

# Standardize the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to every split.
scaler = StandardScaler().fit(X_train)
Z_train, Z_val, Z_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Initialize your final models
# Initialize DT, RF, and GBT. A fixed random_state makes the reported
# accuracies reproducible across runs.
dt = DecisionTreeClassifier(random_state = 42)
rf = RandomForestClassifier(random_state = 42)
gbt = GradientBoostingClassifier(random_state = 42)

# Use both training and validation data to fit them using np.concatenate (np.concatenate "stacks" the array like rbind in R).
# No separate fit on the training split alone is needed: calling fit again
# would simply replace that earlier fit.
Z_train_val = np.concatenate([Z_train, Z_val])
y_train_val = np.concatenate([y_train, y_val])

dt.fit(Z_train_val, y_train_val)
rf.fit(Z_train_val, y_train_val)
gbt.fit(Z_train_val, y_train_val)

# Predict on test data
y_test_hat_dt = dt.predict(Z_test)
y_test_hat_rf = rf.predict(Z_test)
y_test_hat_gbt = gbt.predict(Z_test)

# Obtain and check accuracy on test data. How do the three models compare?
# accuracy_score takes (y_true, y_pred) in that order.
accuracy_dt = accuracy_score(y_test, y_test_hat_dt)
accuracy_rf = accuracy_score(y_test, y_test_hat_rf)
accuracy_gbt = accuracy_score(y_test, y_test_hat_gbt)

print(f'DT achieved Accuracy = {round(accuracy_dt*100, 2)}%.')
print(f'RF achieved Accuracy = {round(accuracy_rf*100, 2)}%.')
print(f'GBT achieved Accuracy = {round(accuracy_gbt*100, 2)}%.')

DT achieved Accuracy = 93.86%.
RF achieved Accuracy = 96.49%.
GBT achieved Accuracy = 95.61%.


In [None]:
# Finally combine the predictions by majority vote over boosting, RF, and SVM.

# Fit the SVM (default settings) on the standardized training + validation
# data so that the ensemble really contains the SVM named in the exercise and
# in the printed message.
svc = svm.SVC()
svc.fit(np.concatenate([Z_train, Z_val]), np.concatenate([y_train, y_val]))
y_test_hat_svm = svc.predict(Z_test)

# One column of predicted labels per model.
y_test_hat_combined = np.c_[y_test_hat_gbt, y_test_hat_rf, y_test_hat_svm]

# Majority vote: assumes the class labels are 0/1, so the rounded mean of the
# three votes is the class chosen by at least two of the models.
y_test_hat_combined = np.round(np.sum(y_test_hat_combined, axis=1) / y_test_hat_combined.shape[1]).astype(int)

acc = accuracy_score(y_test, y_test_hat_combined)

print(f'Ensemble of boosting, RF, and SVM achieved test accuracy of {round(acc * 100, 2)}%.')

Ensemble of boosting, RF, and SVM achieved test accuracy of 95.61%.
