In [30]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
import random
import math

In [31]:
# Load the wine-quality table; the path is relative to the notebook's working directory.
wine_data = pd.read_csv("winequality-red.csv")

In [32]:
# Work on a plain NumPy array so rows can be selected with integer index arrays.
data = wine_data.to_numpy()

def split_data(data, seed=42):
    """Shuffle the rows of ``data`` and split them 70% / 20% / 10%.

    Parameters
    ----------
    data : np.ndarray
        Array whose first axis indexes samples; it is indexed with integer
        index arrays, so the returned splits are copies, not views.
    seed : int, default 42
        Seed for the row permutation. The same seed always yields the same
        split.

    Returns
    -------
    tuple
        ``(train, val, test)`` holding the first 70%, the next 20% and the
        remaining rows of the shuffled data (boundaries are truncated with
        ``int``).
    """
    # Use a private generator instead of np.random.seed(): it produces the
    # same permutation for a given seed but does not reset NumPy's global
    # random state behind the caller's back.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(data))

    train_end = int(0.7 * len(data))
    val_end = int(0.9 * len(data))

    train = data[indices[:train_end]]
    val = data[indices[train_end:val_end]]
    test = data[indices[val_end:]]

    return train, val, test

def replace_nans(train, val, test):
    """Fill missing feature values in all three splits with training means.

    Every column except the last is treated as a feature. The per-column
    means are computed from ``train`` only (ignoring NaNs) and written into
    the NaN positions of ``train``, ``val`` and ``test``. The last column is
    never modified.

    The arrays are modified in place; the same three objects are returned
    for convenience.
    """
    feature_means = np.nanmean(train[:, :-1], axis=0)
    for split in (train, val, test):
        # Basic slicing gives a view, so writes land in the original array.
        features = split[:, :-1]
        _, nan_cols = nan_positions = np.nonzero(np.isnan(features))
        features[nan_positions] = feature_means[nan_cols]
    return train, val, test

# Shuffle/split the rows, then impute missing feature values using
# statistics taken from the training split only.
train, val, test = replace_nans(*split_data(data, seed=42))

# In every split the last column is the target (cast to int) and all
# preceding columns are the features.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split[:, :-1], split[:, -1].astype(int)) for split in (train, val, test)
)

In [33]:
# Sanity check: show the shapes of the training feature matrix and target vector.
X_train.shape, y_train.shape

((1119, 11), (1119,))

In [50]:
# Hyperparameter grid: every combination of split criterion and splitter
# strategy is evaluated (2 x 2 = 4 candidates).
hparams = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
}

# Fixed random_state so the "random" splitter gives repeatable trees.
decision_tree = tree.DecisionTreeClassifier(random_state=42)
grid_search_decision_tree = GridSearchCV(
    decision_tree,  # the estimator defined above; no other classifier exists in this notebook
    param_grid=hparams,
    scoring="accuracy",
    cv=4,  # number of cross-validation folds
)
# Trailing ';' suppresses the estimator repr in the cell output.
grid_search_decision_tree.fit(X_train, y_train);

In [51]:
# Keep the winning estimator for later use. With GridSearchCV's default
# refit=True this is the best configuration refitted on the data passed to fit().
best_decision_tree = grid_search_decision_tree.best_estimator_

# best_score_ is the mean cross-validated score (here: accuracy) of the best configuration.
print(f"The best hparam configuration for decision tree is {grid_search_decision_tree.best_params_} with score {grid_search_decision_tree.best_score_:.4f}")

The best hparam configuration for decision tree is {'criterion': 'gini', 'splitter': 'random'} with score 0.5424
