In [1]:
import numpy as np

In [2]:
import pandas as pd

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, PredefinedSplit
from sklearn.model_selection import GridSearchCV

# 1. Prepare Dataset

## 1.1. Make Random, Uncorrelated Dataset

In [7]:
N = 1_000

In [8]:
train_size = 0.65

In [9]:
np.random.seed(12345)

In [10]:
X_numeric = pd.DataFrame(
    data=np.random.randn(5*N).reshape(-1, 5),
    columns=[f"X{k}" for k in range(1, 6, 1)]
)

In [11]:
X_categ = pd.DataFrame(
    data={
        "X_binary": np.random.binomial(n=1, p=0.5, size=N),
        "X_ternary": np.random.binomial(n=3, p=0.5, size=N)
    }
)

In [12]:
X = pd.concat([X_numeric, X_categ], axis=1)

In [13]:
y = pd.DataFrame(data={"y": np.random.binomial(n=1, p=0.5, size=N)})

In [14]:
X.head()

Unnamed: 0,X1,X2,X3,X4,X5,X_binary,X_ternary
0,-0.204708,0.478943,-0.519439,-0.55573,1.965781,1,1
1,1.393406,0.092908,0.281746,0.769023,1.246435,1,1
2,1.007189,-1.296221,0.274992,0.228913,1.352917,1,2
3,0.886429,-2.001637,-0.371843,1.669025,-0.43857,1,2
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1,1


## 2.1. Split Data into Train, Validation and Test

In [15]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, train_size=train_size)

In [16]:
X_train_val = X_train_val.reset_index(drop=True)
y_train_val = y_train_val.reset_index(drop=True)

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, train_size=0.8)

# 2. Define Grid of Hyperparameters

In [21]:
hyperparams = {
    "criterion": ["gini", "entropy"],
    "max_depth": [1, 3, 5, 10, 20, None],
    "max_features": [5, 10, "sqrt", "log2", None]
}

# 3. Define the Single Split to be Used in Cross Validation - Enforce Using Incorrect Approach

In [22]:
ps_vec = np.zeros(X_train_val.shape[0])
ps_vec[X_train.index] = -1
ps_vec[X_val.index] = 1

In [24]:
cv_predef_split = PredefinedSplit(ps_vec)

# 4. Run the Grid Search Cross-Validation

In [25]:
gs_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=hyperparams,
    scoring="roc_auc",
    cv=cv_predef_split
)

In [26]:
gs_cv.fit(X_train_val, y_train_val)

In [31]:
gs_cv.best_score_

np.float64(0.6058925049309664)

In [29]:
cv_results = pd.DataFrame(gs_cv.cv_results_)

In [30]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005579,0.0,0.004164,0.0,gini,1.0,5,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.538462,0.538462,0.0,16
1,0.001964,0.0,0.00208,0.0,gini,1.0,10,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.538462,0.538462,0.0,16
2,0.001466,0.0,0.002557,0.0,gini,1.0,sqrt,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.509615,0.509615,0.0,32
3,0.001258,0.0,0.001652,0.0,gini,1.0,log2,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.538462,0.538462,0.0,16
4,0.001417,0.0,0.001625,0.0,gini,1.0,,"{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.538462,0.538462,0.0,16
5,0.001754,0.0,0.002099,0.0,gini,3.0,5,"{'criterion': 'gini', 'max_depth': 3, 'max_fea...",0.428254,0.428254,0.0,60
6,0.002209,0.0,0.001641,0.0,gini,3.0,10,"{'criterion': 'gini', 'max_depth': 3, 'max_fea...",0.52071,0.52071,0.0,27
7,0.001296,0.0,0.001587,0.0,gini,3.0,sqrt,"{'criterion': 'gini', 'max_depth': 3, 'max_fea...",0.577046,0.577046,0.0,3
8,0.001216,0.0,0.001958,0.0,gini,3.0,log2,"{'criterion': 'gini', 'max_depth': 3, 'max_fea...",0.449088,0.449088,0.0,56
9,0.001941,0.0,0.001511,0.0,gini,3.0,,"{'criterion': 'gini', 'max_depth': 3, 'max_fea...",0.52071,0.52071,0.0,27
