<a href="https://colab.research.google.com/github/tallerzalan/Applied-Machine-Learning/blob/main/DTs/Exercise_2_dt_k_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise - DT for classification of 3 classes

1. Use the $\texttt{load_wine}$ data (remember to split your data into a train, validation, and test data). Find the optimal parameters (using the parameters discussed earlier). How high accuracy can you achieve on the test data?

**See slides for more details!**

In [1]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import pandas as pd
import numpy as np

# Load the wine dataset
X, y = load_wine(return_X_y = True)

# We use `train_test_split` to split our data into a train and a test set.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Now split the train data to also obtain validation data
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size = 0.2,
                                                  random_state = 42)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

(142, 13) (36, 13) (142,) (36,)
(113, 13) (29, 13) (36, 13) (113,) (29,) (36,)


In [2]:
min_samples_split_list = [min_split for min_split in np.arange(0.01, 1, 0.05)] # input values seperated by ",".
min_samples_leaf_list = [min_leaf for min_leaf in np.arange(0.01, 0.5, 0.05)] # input values seperated by ",".
max_features_list = [max_feat for max_feat in np.arange(0.01, 1, 0.05)] # input values seperated by ",".

results = []

for min_samples_split in min_samples_split_list:
    for min_samples_leaf in min_samples_leaf_list:
        for max_features in max_features_list:
            dt_current = tree.DecisionTreeClassifier(
                min_samples_split = min_samples_split,
                min_samples_leaf = min_samples_leaf,
                max_features = max_features)
            dt_current.fit(X_train, y_train)
            y_val_hat = dt_current.predict(X_val)
            accuracy = accuracy_score(y_val_hat, y_val)

            results.append([accuracy, min_samples_split, min_samples_leaf, max_features])

results = pd.DataFrame(results)
results.columns = ['Accuracy', 'min_samples_split', 'min_samples_leaf', 'max_features']
print(results.sort_values('Accuracy', ascending = False))

      Accuracy  min_samples_split  min_samples_leaf  max_features
1332  0.965517               0.31              0.31          0.61
415   0.965517               0.11              0.01          0.76
405   0.965517               0.11              0.01          0.26
644   0.965517               0.16              0.11          0.21
927   0.965517               0.21              0.31          0.36
...        ...                ...               ...           ...
3300  0.379310               0.81              0.26          0.01
3121  0.379310               0.76              0.31          0.06
3802  0.379310               0.96              0.01          0.11
1942  0.379310               0.46              0.36          0.11
1722  0.379310               0.41              0.31          0.11

[4000 rows x 4 columns]


In [3]:
# Extract best parameters.
results[results['Accuracy'] == results['Accuracy'].max()]

Unnamed: 0,Accuracy,min_samples_split,min_samples_leaf,max_features
9,0.965517,0.01,0.01,0.46
19,0.965517,0.01,0.01,0.96
45,0.965517,0.01,0.11,0.26
405,0.965517,0.11,0.01,0.26
415,0.965517,0.11,0.01,0.76
522,0.965517,0.11,0.31,0.11
533,0.965517,0.11,0.31,0.66
608,0.965517,0.16,0.01,0.41
614,0.965517,0.16,0.01,0.71
617,0.965517,0.16,0.01,0.86


In [4]:
# Initialize your final model
dt_optimized = tree.DecisionTreeClassifier(
    min_samples_split = 0.06,
    min_samples_leaf = 0.01,
    max_features = 0.26)

# Use both training and validation data to fit it (np.concatenate "stacks" the array like rbind in R)
dt_optimized.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

# Predict on test data
y_test_hat_optimized = dt_optimized.predict(X_test)

# Obtain and check accuracy on test data
accuracy_optimized = accuracy_score(y_test_hat_optimized, y_test)
print(f'Optimized DT achieved {round(accuracy_optimized * 100, 1)}% accuracy.')

Optimized DT achieved 100.0% accuracy.
