# Hyperparameter search for DecisionTreeClassifier

In [1]:
import src.data as data
import src.preprocessing as preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

Importing our training data:

In [5]:
# Load the training ratings through the project-local `src.data` module.
# NOTE(review): the structure of the returned object is not visible in this
# notebook — confirm against `src.data.load_train_ratings`.
ratings_train = data.load_train_ratings()

For each (user, movie) pair, select the ratings given by the 5 closest users who have rated that movie.

In [6]:
# Can take several minutes to run (5-10)
#
# The result is used below as a mapping: its values are the feature rows fed to
# the classifier, and its keys are 3-tuples whose last element is the target
# rating.
# NOTE(review): presumably the keys are (user, movie, rating) and each value
# holds the ratings of the five closest users — verify against
# `src.preprocessing.five_closest_users`.

five_closest_users = preprocessing.five_closest_users(ratings_train)

Setting the ranges of the hyperparameters to be tested

In [27]:
# Search space for the grid search: each keyword is a DecisionTreeClassifier
# constructor argument, each list holds the candidate values to try.
param_grid = dict(
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
    max_depth=[None, 5, 10, 50, 100, 200, 500],
    min_samples_split=[2, 5, 10, 20, 50, 100],
    min_samples_leaf=[1, 2, 5, 10, 20, 50, 100],
    max_features=["sqrt", "log2"],
)

Using the `GridSearchCV` class from `sklearn` to find the best hyperparameters. It uses cross-validation to evaluate each combination of hyperparameters and selects the best one based on the chosen metric.

In [None]:
# Exhaustive grid search over `param_grid` with 2-fold cross-validation,
# scored with micro-averaged F1.
# Takes approx. 2s per fit, and the full grid amounts to several thousand fits,
# so the fits are spread over all available CPU cores (n_jobs=-1).

# Fixed seed: splitter="random" and max_features="sqrt"/"log2" make tree
# construction stochastic, so without it the selected parameters and the best
# score would change from one run to the next.
RANDOM_STATE = 42

classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Pad parameter names to the longest one so the printed columns line up.
key_width = max((len(key) for key in param_grid), default=0)

print("Classifier:", classifier.__class__.__name__)
print("Parameters explored:")
for key, values in param_grid.items():
    print(f"\t{key:{key_width}}: {values}")
print("\n")

# Build the feature rows and the targets in a single pass over the mapping so
# that each row is paired with the rating taken from its own key.
features = []
labels = []
for (_, _, rating), row in five_closest_users.items():
    features.append(row)
    labels.append(rating)

grid_search = GridSearchCV(
    classifier,
    param_grid,
    cv=2,
    scoring="f1_micro",
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(features, labels)

print("Best parameters set found on the training set:")
for key, value in grid_search.best_params_.items():
    print(f"\t{key:{key_width}}: {value}")
print("")
print("Best score:")
print(grid_search.best_score_)