# Decision Trees, Ensemble Methods and Hyperparameter tuning workshop:

# 3rd November 2022

![title](images/pydata_cardiff.jpg)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.datasets import load_wine, load_breast_cancer
from sklearn.metrics import accuracy_score

from palmerpenguins import load_penguins

In [None]:
# Load the Palmer penguins table, discard rows with missing values, and
# remove the island, sex and year columns.
penguins_raw = load_penguins()
penguins = penguins_raw.dropna().drop(columns=["island", "sex", "year"])

In [None]:
# Preview the first rows of the cleaned table.
penguins.head()

In [None]:
# Scatter-plot matrix of every pair of columns, coloured by species.
sns.pairplot(data=penguins, hue="species", height=2.5, plot_kws=dict(s=10))

In [None]:
# Flipper length against bill length, one colour per species.
ax = sns.jointplot(
    x="flipper_length_mm", y="bill_length_mm", hue="species", data=penguins
)

In [None]:
# Same joint plot, with a vertical red line marking a candidate split at a
# flipper length of 206.
ax = sns.jointplot(
    x="flipper_length_mm", y="bill_length_mm", hue="species", data=penguins
)
ax.ax_joint.axvline(x=206, color="r");

In [None]:
# Keep only the penguins left of the first split (flipper length below 206)
# and mark a candidate second split at a bill length of 44.
ax = sns.jointplot(
    x="flipper_length_mm",
    y="bill_length_mm",
    hue="species",
    data=penguins[penguins["flipper_length_mm"] < 206],
)
ax.ax_joint.axhline(y=44, color="r");

## Trying out some Decision Trees

In [None]:
# Shared 10-fold stratified splitter; shuffling with a fixed seed keeps the
# folds identical for every model evaluated in this notebook.
cv = StratifiedKFold(
    n_splits=10,
    random_state=42,
    shuffle=True,
)

In [None]:
# Target: the species label. Features: every remaining column.
y_penguins = penguins["species"]
X_penguins = penguins.drop(columns=["species"])

In [None]:
# Number of penguins per species.
y_penguins.value_counts()

In [None]:
# Share of each species: per-species count divided by the total count.
y_penguins.value_counts() / y_penguins.value_counts().sum()

## Fitting the models

Note that a decision tree with a depth of 1 can be called a __Decision Stump__

In [None]:
# Decision stump: a tree limited to a single split.
tree1 = DecisionTreeClassifier(max_depth=1)

In [None]:
# Accuracy of the stump on each of the cross-validation folds defined by `cv`.
tree1_scores = cross_val_score(
    estimator=tree1,
    X=X_penguins,
    y=y_penguins,
    cv=cv,
    scoring="accuracy",
)

In [None]:
# Per-fold accuracies of the stump.
tree1_scores

In [None]:
# Mean accuracy of the stump across the folds.
tree1_scores.mean()

In [None]:
# A slightly deeper tree: at most two levels of splits.
tree2 = DecisionTreeClassifier(max_depth=2)

In [None]:
# Accuracy of the depth-2 tree on the same cross-validation folds.
tree2_scores = cross_val_score(
    estimator=tree2,
    X=X_penguins,
    y=y_penguins,
    cv=cv,
    scoring="accuracy",
)

In [None]:
# Per-fold accuracies of the depth-2 tree.
tree2_scores

In [None]:
# Mean accuracy of the depth-2 tree across the folds.
tree2_scores.mean()

## Plotting the Trees

In order to plot them using the `plot_tree` function, the trees must first be fitted. Here I fit them to the whole dataset; this is done purely for visualisation and is separate from the cross-validation used above to assess the performance of the models.

In [None]:
# Fit the stump on every row so that it can be drawn (visualisation only).
tree1.fit(X_penguins, y_penguins)

In [None]:
# Draw the fitted stump; the trailing semicolon hides the returned objects.
plot_tree(tree1);

In [None]:
# Name of the feature at column index 2.
# NOTE(review): presumably the feature index shown in the plotted root node; confirm against the plot.
X_penguins.columns[2]

In [None]:
# A depth-1 tree has only two leaves, so it can predict at most two classes.
# Note that this has completely discounted the Chinstrap penguins as they are
# the minority class.
np.unique(tree1.predict(X_penguins))

## Gini Impurity

This metric tells us how mixed the categories within a node are. If a leaf node contains entries from only one category, then the probability of that category is 1 and the impurity value is 0. If a leaf node contains an equal amount of each category, the node is as uninformative as possible and the impurity reaches its maximum value of $1 - \frac{1}{n}$ for $n$ categories (0.5 for two categories).

$\Large 1 - \sum\limits_{i=1}^{n} (p_{i})^{2}$

Here we create arrays of probabilities for categories A and B. These represent the situations where a leaf node is fully populated with category A, $p(A) = 1$, or with category B, $p(B) = 1$, and all combinations in between.

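We can see that the impurity value is 0 when the node contains only one category ($p(A) = 0$ or $p(A) = 1$), and that it peaks at 0.5 when both categories are equally likely ($p(A) = p(B) = 0.5$).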

In [None]:
# 1000 evenly spaced values of p(A) from 0 to 1, with p(B) as the complement.
prob_a = np.linspace(start=0, stop=1, num=1000)
prob_b = 1 - prob_a

In [None]:
# Two-class Gini impurity: one minus the sum of squared class probabilities.
gini_results = 1 - (prob_a * prob_a + prob_b * prob_b)

In [None]:
# Plot the Gini impurity as a function of p(A).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_ylabel("Gini Impurity Value")
ax.set_xlabel("Probability of A")
ax.plot(prob_a, gini_results)
plt.suptitle("Gini Impurity for Binary Classification");

In [None]:
# Gini impurity of a node holding three equally likely classes. Exact thirds
# are used instead of hand-truncated decimals so that all three terms agree.
1 - 3 * (1 / 3) ** 2

In [None]:
# Shannon entropy (in bits) of a two-class node, evaluated at prob_a[1:] so
# that it lines up with the x-values used when plotting. p(A) and p(B) are
# taken from the SAME index so that each pair sums to 1. The p(A) = 0 point is
# skipped because log2(0) is undefined; at the p(B) = 0 endpoint the usual
# convention 0 * log2(0) = 0 is applied. np.where evaluates both branches, so
# the warnings raised by log2(0) are silenced locally.
with np.errstate(divide="ignore", invalid="ignore"):
    entropy_results = -(
        prob_a[1:] * np.log2(prob_a[1:])
        + np.where(prob_b[1:] > 0, prob_b[1:] * np.log2(prob_b[1:]), 0.0)
    )

In [None]:
# Plot the entropy as a function of p(A); the first point (p(A) = 0) is left out.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_ylabel("Entropy Value")
ax.set_xlabel("Probability of A")
ax.plot(prob_a[1:], entropy_results)
plt.suptitle("Entropy for Binary Classification");

In [None]:
# Switch the stump's split criterion to entropy; takes effect on the next fit.
tree1.criterion = "entropy"

In [None]:
# Refit the stump on every row, now using the entropy criterion.
tree1.fit(X_penguins, y_penguins)

In [None]:
# Draw the refitted stump.
plot_tree(tree1);

## Looking at a tree with a depth of 2

In [None]:
# Fit the depth-2 tree on every row so that it can be drawn.
tree2.fit(X_penguins, y_penguins)

In [None]:
# Draw the depth-2 tree on a wider canvas so that the node text stays legible.
_, ax = plt.subplots(figsize=(10, 5))
plot_tree(decision_tree=tree2, ax=ax);

### Looking at the information on the left path

This is what we saw earlier, and gets very good classification results

In [None]:
# Name of the feature at column index 0.
X_penguins.columns[0]

In [None]:
# Overlay two split thresholds on the flipper-length / bill-length plot.
ax = sns.jointplot(
    x="flipper_length_mm", y="bill_length_mm", hue="species", data=penguins
)

ax.ax_joint.axvline(x=206.5, color="r")
# xmax is an axes fraction, found by trial and error so that the horizontal
# line stops at the vertical one.
ax.ax_joint.axhline(y=43.35, color="r", xmax=0.555)

In [None]:
# Overlay two split thresholds on the bill-depth / flipper-length plot.
ax = sns.jointplot(
    x="bill_depth_mm", y="flipper_length_mm", hue="species", data=penguins
)

# ymin is an axes fraction: the vertical line is only drawn above it.
ax.ax_joint.axvline(x=17.65, color="r", ymin=0.56)
ax.ax_joint.axhline(y=206.5, color="r");

### Is this last split really necessary?

The split in the top right hand corner has only identified 7 data points. While it can be argued that it is necessary here... it could be _specific to the data used to train the model_. This is a good example of what could very likely be a case of __Overfitting__. This is when we make a model that is too tailored to the particular data points used to train the model.

#### This is a common problem that can occur when using a single decision tree

# Ensemble methods

# Other datasets

In [None]:
# Load scikit-learn's bundled breast cancer dataset.
cancer_ds = load_breast_cancer()

In [None]:
# Wrap the raw arrays in pandas objects: a labelled feature table and a
# target series named "target".
X_cancer = pd.DataFrame(data=cancer_ds["data"], columns=cancer_ds["feature_names"])

y_cancer = pd.Series(data=cancer_ds["target"], name="target")

In [None]:
# Small random forest of 10 trees, reused for several datasets below.
rf = RandomForestClassifier(n_estimators=10)

In [None]:
# Cross-validated scores of the random forest on the breast cancer data.
cross_val_score(rf, X_cancer, y_cancer, cv=cv)

In [None]:
# Download the banknotes CSV straight from GitHub (needs network access).
banknotes_raw = pd.read_csv("https://raw.githubusercontent.com/Muhammad-Taufiq-Khan/Counterfeit-Banknote-Detection-System/main/banknotes.csv")

In [None]:
# Target: the "class" column. Features: every remaining column.
y_banknotes = banknotes_raw["class"]
X_banknotes = banknotes_raw.drop(columns=["class"])

In [None]:
# Mean of the target column.
# NOTE(review): presumably labels are 0/1, making this the share of class 1; confirm against the CSV.
y_banknotes.mean()

In [None]:
# Cross-validated scores of the same random forest on the banknotes data.
cross_val_score(rf, X_banknotes, y_banknotes, cv=cv)

In [None]:
# Read the heart dataset from the local data/ directory.
heart_raw = pd.read_csv("data/heart.csv")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Two-step pipeline: standardise the features, then classify with an
# Extra-Trees ensemble of 200 trees.
scaler = StandardScaler()
pipeline = Pipeline(
    steps=[
        ("scaler", scaler),
        ("classifier", ExtraTreesClassifier(n_estimators=200)),
    ]
)

In [None]:
# Target: the "output" column. Features: every remaining column.
y_heart = heart_raw["output"]
X_heart = heart_raw.drop(columns=["output"])

In [None]:
# Display the heart feature table.
X_heart

In [None]:
# Cross-validated scores of the scaler + Extra-Trees pipeline on the heart data.
cross_val_score(pipeline, X_heart, y_heart, cv=cv)

## Now looking at the wine dataset

In [None]:
# Load scikit-learn's bundled wine dataset.
wine_raw = load_wine()

In [None]:
# List the fields available on the loaded dataset object.
wine_raw.keys()

In [None]:
# Feature table with named columns.
wine_df = pd.DataFrame(columns=wine_raw["feature_names"], data=wine_raw["data"])

# The same table with the class label appended as a "Type" column.
full_wine_df = wine_df.assign(Type=wine_raw["target"])

In [None]:
# Number of wines per class.
full_wine_df["Type"].value_counts()

In [None]:
# Pairwise correlation between the wine features.
wine_df.corr()

In [None]:
# Standardise each column: subtract its mean and divide by its standard
# deviation, so that features on different scales become comparable.
wine_df_norm = (wine_df - wine_df.mean()) / wine_df.std()

In [None]:
# Hierarchically clustered heatmap of the standardised features, using Ward linkage.
sns.clustermap(data=wine_df_norm, robust=True, method="ward")

## Fitting the tree

In [None]:
# Target: the "Type" label. Features: an independent copy of the feature table.
wine_y = full_wine_df["Type"]
wine_X = wine_df.copy()

In [None]:
# Seed NumPy's global random number generator for reproducibility.
# NOTE(review): this relies on scikit-learn estimators with random_state=None drawing from that global generator; confirm.
np.random.seed(10)
# Decision stump used on its own and as the building block of the bagging ensemble.
wine_tree = DecisionTreeClassifier(max_depth=1)

In [None]:
# Cross-validated scores of the single stump on the wine data.
cross_val_score(wine_tree, wine_X, wine_y, cv=cv)

In [None]:
# Ensemble of 100 stumps. The estimator is passed positionally because its
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# and the old name was later removed; the positional form works on both.
# bootstrap=False means the training samples are drawn without replacement.
wine_bagging = BaggingClassifier(
    wine_tree,
    n_estimators=100,
    bootstrap=False,
)

In [None]:
# Cross-validated scores of the bagged stumps on the wine data.
cross_val_score(wine_bagging, wine_X, wine_y, cv=cv)

In [None]:
# Random forest of 10 trees for the wine data.
rf_wine = RandomForestClassifier(n_estimators=10)

In [None]:
# Cross-validated scores of the random forest on the wine data.
cross_val_score(rf_wine, wine_X, wine_y, cv=cv)

In [None]:
# Extra-Trees ensemble of 100 trees for the wine data.
et_wine = ExtraTreesClassifier(n_estimators=100)

In [None]:
# Cross-validated scores of the Extra-Trees ensemble on the wine data.
cross_val_score(et_wine, wine_X, wine_y, cv=cv)

In [None]:

###############################################################################
#                          1. Importing Libraries                             #
###############################################################################
# The usual stuff
import pandas as pd
import numpy as np

# Iris dataset
from sklearn.datasets import load_iris

# Data preprocessing and machine learning
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# To measure performance
from sklearn import metrics

In [None]:
# Load the iris dataset and wrap its feature matrix in a labelled DataFrame.
iris = load_iris()
iris_df = pd.DataFrame(data=iris["data"], columns=iris["feature_names"])

In [None]:
# Class names stored on the dataset object.
iris['target_names']

In [None]:
# Translate the integer class codes 0, 1, 2 into capitalised species names.
iris_df["Species"] = pd.Series(iris["target"]).map(
    dict(enumerate(["Setosa", "Versicolor", "Virginica"]))
)

In [None]:
# Scatter-plot matrix of the iris features, coloured by species.
sns.pairplot(data=iris_df, hue="Species", height=2.5, plot_kws=dict(s=8))

In [None]:
###############################################################################
#                 2. Stump vs Ensemble of 1000 Decision Stumps                #
###############################################################################
# Split the iris table into the feature matrix and the species labels
X_iris = iris_df.drop(columns="Species")
y_iris = iris_df["Species"]

# Splitting Dataset: an integer test_size is an absolute number of samples, so
# 20 rows are held out; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris, test_size=20, random_state=100
)

# Defining the stump
stump = DecisionTreeClassifier(max_depth=1)

# Creating an ensemble of 1000 stumps. The estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old name was later removed; the positional form
# works on both. bootstrap=False draws training samples without replacement.
ensemble = BaggingClassifier(stump, n_estimators=1000, bootstrap=False)

# Training classifiers
stump.fit(X_train, y_train)
ensemble.fit(X_train, y_train)

# Making predictions
y_pred_stump = stump.predict(X_test)
y_pred_ensemble = ensemble.predict(X_test)

# Determine performance
stump_accuracy = metrics.accuracy_score(y_test, y_pred_stump)
ensemble_accuracy = metrics.accuracy_score(y_test, y_pred_ensemble)

# Print message to user
print(f"The accuracy of the stump is {stump_accuracy*100:.1f} %")
print(f"The accuracy of the ensemble is {ensemble_accuracy*100:.1f} %")

In [None]:
###############################################################################
#                          3. Stump vs Random Forest                          #
###############################################################################
# Defining the stump: a single split, chosen as the best split among a random
# subset of sqrt(n_features) features
stump = DecisionTreeClassifier(max_depth=1, splitter="best", max_features="sqrt")

# Create Random Forest: 1000 such stumps, each trained on a bootstrap sample.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the positional form works on both.
ensemble = BaggingClassifier(stump, n_estimators=1000, bootstrap=True)

# Training classifiers
stump.fit(X_train, np.ravel(y_train))
ensemble.fit(X_train, np.ravel(y_train))

# Making predictions
y_pred_tree = stump.predict(X_test)
y_pred_ensemble = ensemble.predict(X_test)

# Determine performance. The stump is scored on the predictions made just
# above (y_pred_tree), not on y_pred_stump, which still holds the predictions
# of the stump from section 2.
stump_accuracy = metrics.accuracy_score(y_test, y_pred_tree)
ensemble_accuracy = metrics.accuracy_score(y_test, y_pred_ensemble)

# Print message to user
print(f"The accuracy of the stump is {stump_accuracy*100:.1f} %")
print(f"The accuracy of the Random Forest is {ensemble_accuracy*100:.1f} %")