<a href="https://colab.research.google.com/github/tallerzalan/Applied-Machine-Learning/blob/main/DTs/Exercise_2_dt_k_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise - DT for classification of 3 classes

1. Use the $\texttt{load_wine}$ data (remember to split your data into train, validation, and test sets). Find the optimal hyperparameters (using the parameters discussed earlier). How high an accuracy can you achieve on the test data?

**See slides for more details!**

In [144]:
import itertools
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Fetch the wine data as a feature matrix X and a class-label vector y.
X, y = load_wine(return_X_y=True)

# Stage 1: hold out 20% of all samples as the final test set.
# The remaining 80% (X_rest / y_rest) is split again below.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_rest.shape, X_test.shape, y_rest.shape, y_test.shape)

# Stage 2: carve a validation set (20% of the remainder) out of the non-test data.
# The same seed is reused so both splits are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42
)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

(142, 13) (36, 13) (142,) (36,)
(113, 13) (29, 13) (36, 13) (113,) (29,) (36,)


In [145]:
# Candidate values for each hyperparameter: the integers 2 through 10 (inclusive).
# max_features also accepts string options such as 'sqrt' or 'log2'.
min_samples_split_list = list(np.arange(2, 11))
min_samples_leaf_list = list(np.arange(2, 11))
max_features_list = list(np.arange(2, 11))

# Exhaustive grid search: fit one tree per combination on the training data
# and score it on the validation data. itertools.product iterates in the same
# order as three nested loops (split outermost, max_features innermost).
records = []
for min_samples_split, min_samples_leaf, max_features in itertools.product(
        min_samples_split_list, min_samples_leaf_list, max_features_list):
    dt_current = DecisionTreeClassifier(
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42)
    dt_current.fit(X_train, y_train)
    y_val_hat = dt_current.predict(X_val)
    # accuracy_score expects (y_true, y_pred); accuracy is symmetric, but the
    # documented order keeps this correct if the metric is ever swapped.
    accuracy = accuracy_score(y_val, y_val_hat)

    records.append([accuracy, min_samples_split, min_samples_leaf, max_features])

# One row per hyperparameter combination, built in a single step.
results = pd.DataFrame(
    records,
    columns=['Accuracy', 'min_samples_split', 'min_samples_leaf', 'max_features'])
print(results.sort_values('Accuracy', ascending=False))

     Accuracy  min_samples_split  min_samples_leaf  max_features
651  1.000000                 10                 2             5
327  1.000000                  6                 2             5
408  1.000000                  7                 2             5
570  1.000000                  9                 2             5
246  1.000000                  5                 2             5
..        ...                ...               ...           ...
486  0.758621                  8                 2             2
414  0.758621                  7                 3             2
495  0.758621                  8                 3             2
405  0.758621                  7                 2             2
576  0.758621                  9                 3             2

[729 rows x 4 columns]


In [146]:
# Show every hyperparameter combination that ties for the highest validation accuracy.
best_accuracy = results['Accuracy'].max()
results.loc[results['Accuracy'] == best_accuracy]

Unnamed: 0,Accuracy,min_samples_split,min_samples_leaf,max_features
246,1.0,5,2,5
327,1.0,6,2,5
408,1.0,7,2,5
489,1.0,8,2,5
570,1.0,9,2,5
651,1.0,10,2,5


In [147]:
# Final model. The values below are one of several combinations that tied for
# the top validation accuracy in the grid search above.
dt_optimized = DecisionTreeClassifier(
    min_samples_split=6,
    min_samples_leaf=2,
    max_features=5,
    random_state=42)

# Refit on training + validation data so the final model sees every non-test sample
# (np.concatenate stacks the arrays row-wise, like rbind in R).
X_fit = np.concatenate([X_train, X_val])
y_fit = np.concatenate([y_train, y_val])
dt_optimized.fit(X_fit, y_fit)

# Predict on the held-out test data
y_test_hat_optimized = dt_optimized.predict(X_test)

# Score on the test data; accuracy_score expects (y_true, y_pred).
accuracy_optimized = accuracy_score(y_test, y_test_hat_optimized)
print(f'Optimized DT achieved {round(accuracy_optimized * 100, 1)}% accuracy.')

Optimized DT achieved 91.7% accuracy.
