In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.datasets import load_wine

from palmerpenguins import load_penguins

In [None]:
# Load the Palmer penguins table, remove every row that contains a missing
# value, then discard the columns that are not used as features below.
# NOTE(review): dropna() runs before the column drop, so a row whose only
# missing value is in island/sex/year is removed as well — confirm intended.
penguins_raw = load_penguins()
penguins = penguins_raw.dropna().drop(columns=["island", "sex", "year"])

In [None]:
# Preview the first five rows of the cleaned table.
penguins.head(n=5)

In [None]:
# Pairwise scatter plots of every numeric column, coloured by species.
# The trailing semicolon suppresses the grid object's repr, matching the
# other plotting cells in this notebook.
sns.pairplot(data=penguins, hue="species", height=2.5);

In [None]:
# Joint distribution of bill length against flipper length, one colour per
# species. The object returned by sns.jointplot is kept under the name `ax`.
ax = sns.jointplot(
    x="bill_length_mm", y="flipper_length_mm", hue="species", data=penguins
)

In [None]:
# Bill length against flipper length by species, with a horizontal red guide
# line at a flipper length of 206 drawn on the central (joint) axes.
ax = sns.jointplot(
    x="bill_length_mm", y="flipper_length_mm", hue="species", data=penguins
)
ax.ax_joint.axhline(y=206, color="r");

In [None]:
# Restrict to penguins with a flipper length below 206, then mark a bill
# length of 44 with a vertical red guide line on the joint axes.
ax = sns.jointplot(
    x="bill_length_mm",
    y="flipper_length_mm",
    hue="species",
    data=penguins.query("flipper_length_mm < 206"),
)
ax.ax_joint.axvline(x=44, color="r");

## Trying out some Decision Trees

In [None]:
# Cross-validation splitter reused by every cross_val_score call below:
# ten stratified folds, shuffled with a fixed seed.
cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Target is the species label; the features are every remaining column.
y = penguins["species"]
X = penguins.drop(columns=["species"])

In [None]:
# Number of rows per species in the target.
y.value_counts()

In [None]:
# Share of rows per species. normalize=True divides the counts by their total,
# so the counts are computed once instead of twice.
y.value_counts(normalize=True)

## Fitting the models

In [None]:
# Decision stump (a single split). The seed fixes how ties between equally
# good splits are broken, so scores and the plotted tree are repeatable.
tree1 = DecisionTreeClassifier(max_depth=1, random_state=42)

In [None]:
# Per-fold accuracy of the depth-1 tree under the shared splitter.
tree1_scores = cross_val_score(
    estimator=tree1,
    X=X,
    y=y,
    cv=cv,
    scoring="accuracy",
)

In [None]:
# Show the individual fold accuracies for the depth-1 tree.
tree1_scores

In [None]:
# Average accuracy of the depth-1 tree across the folds.
np.mean(tree1_scores)

In [None]:
# Depth-2 tree (up to two levels of splits). The seed fixes how ties between
# equally good splits are broken, so scores and the plotted tree are repeatable.
tree2 = DecisionTreeClassifier(max_depth=2, random_state=42)

In [None]:
# Per-fold accuracy of the depth-2 tree under the shared splitter.
tree2_scores = cross_val_score(
    estimator=tree2,
    X=X,
    y=y,
    cv=cv,
    scoring="accuracy",
)

In [None]:
# Show the individual fold accuracies for the depth-2 tree.
tree2_scores

In [None]:
# Average accuracy of the depth-2 tree across the folds.
np.mean(tree2_scores)

## Plotting the Trees

To plot the trees with the `plot_tree` function, they must first be fitted. Here I fit them on the whole dataset; these fits are only for visualisation and are not what was used to assess the models' performance above, which was done with cross-validation.

In [None]:
# Fit the depth-1 tree on all rows so that it can be drawn.
tree1.fit(X=X, y=y)

In [None]:
# Draw the fitted depth-1 tree. Passing the column and class names makes each
# node show readable labels instead of positional feature indices.
plot_tree(
    tree1,
    feature_names=list(X.columns),
    class_names=[str(label) for label in tree1.classes_],
);

In [None]:
# Distinct labels the depth-1 tree predicts on the data it was fitted on.
# With max_depth=1 there is one split, so at most two classes can appear.
np.unique(tree1.predict(X=X))

In [None]:
# List the feature column names, in the order the trees see them.
X.columns

In [None]:
# Fit the depth-2 tree on all rows so that it can be drawn.
tree2.fit(X=X, y=y)

In [None]:
# Draw the fitted depth-2 tree. Passing the column and class names makes each
# node show readable labels instead of positional feature indices.
plot_tree(
    tree2,
    feature_names=list(X.columns),
    class_names=[str(label) for label in tree2.classes_],
);

In [None]:
# Bill depth against flipper length by species, with two red guide lines on
# the joint axes: vertical at 17.65 and horizontal at 206.5.
# NOTE(review): both values are hard-coded — presumably split thresholds read
# off the fitted tree2 plot; verify them after any re-fit.
ax = sns.jointplot(
    x="bill_depth_mm", y="flipper_length_mm", hue="species", data=penguins
)
ax.ax_joint.axvline(x=17.65, color="r")
ax.ax_joint.axhline(y=206.5, color="r");

## Now looking at the wine dataset

In [None]:
# Load scikit-learn's bundled wine dataset; the result is indexed by key below
# ("data", "feature_names", "target").
wine_raw = load_wine()

In [None]:
# Inspect which fields the loaded dataset object provides.
wine_raw.keys()

In [None]:
# Feature table: one column per wine measurement.
wine_df = pd.DataFrame(wine_raw["data"], columns=wine_raw["feature_names"])

# Same table with the class label appended as a "Type" column.
full_wine_df = wine_df.assign(Type=wine_raw["target"])

In [None]:
# Number of wines in each class.
full_wine_df.loc[:, "Type"].value_counts()

In [None]:
# Pairwise Pearson correlation between the wine features (pandas' default
# method, spelled out here).
wine_df.corr(method="pearson")

In [None]:
# Hierarchically clustered heatmap of the Spearman correlations between the
# wine features, using Ward linkage. The trailing semicolon suppresses the
# grid object's repr, matching the other plotting cells in this notebook.
sns.clustermap(
    data=wine_df.corr(method="spearman"),
    method="ward",
    robust=True,
);

## Fitting a tree and tree ensembles

In [None]:
# Target is the "Type" label; the features are an independent copy of the
# measurement table.
wine_y = full_wine_df["Type"]
wine_X = wine_df.copy(deep=True)

In [None]:
# Depth-2 tree for the wine data. The seed fixes how ties between equally good
# splits are broken, so the cross-validation scores are repeatable.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=42)

In [None]:
# Per-fold scores of the single wine tree under the shared splitter
# (the estimator's default scorer is used, since no scoring is given).
cross_val_score(estimator=wine_tree, X=wine_X, y=wine_y, cv=cv)

In [None]:
# Bagged ensemble of ten copies of the wine tree, each trained on a bootstrap
# sample of the rows and a bootstrap sample of the features.
# The base model is passed positionally: the keyword was `base_estimator` in
# older scikit-learn and is `estimator` in newer releases (the old name was
# removed), while the first positional slot is the base model in both.
# The seed makes the resampling, and therefore the scores, repeatable.
wine_bagging = BaggingClassifier(
    wine_tree,
    n_estimators=10,
    bootstrap=True,
    bootstrap_features=True,
    random_state=42,
)

In [None]:
# Per-fold scores of the bagged ensemble under the shared splitter.
cross_val_score(estimator=wine_bagging, X=wine_X, y=wine_y, cv=cv)

In [None]:
# Random forest with ten trees. The seed fixes the bootstrap samples and the
# per-split feature subsets, so the cross-validation scores are repeatable.
rf_wine = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
)

In [None]:
# Per-fold scores of the random forest under the shared splitter.
cross_val_score(estimator=rf_wine, X=wine_X, y=wine_y, cv=cv)

In [None]:
# Extra-trees ensemble with one hundred trees. The seed fixes the randomised
# split selection, so the cross-validation scores are repeatable.
et_wine = ExtraTreesClassifier(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Per-fold scores of the extra-trees ensemble under the shared splitter.
cross_val_score(estimator=et_wine, X=wine_X, y=wine_y, cv=cv)