In [1]:
import numpy as np

In [2]:
import pandas as pd

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
from sklearn.compose import ColumnTransformer

In [84]:
from sklearn.pipeline import Pipeline

In [65]:
from sklearn.preprocessing import StandardScaler

## Useful References

Selected materials on the nested vs. non-nested cross-validation problem:
1. [Article at Ploomber](https://ploomber.io/blog/nested-cv/)
2. [Machine Learning Mastery article on the topic](https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/)
3. [Sklearn Example of Nested vs Non-Nested CV](https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html)

## 1. Prepare Dataset

### 1.1. Make Random, Uncorrelated Dataset

In [4]:
# Number of synthetic observations (rows) generated for the dataset.
N = 1_000

In [5]:
# Fraction of rows assigned to the training split by train_test_split.
train_size = 0.65

In [6]:
# Seed NumPy's global RNG; every np.random.* draw below consumes this stream,
# so the order in which the following cells run determines the generated data.
np.random.seed(12345)

In [55]:
# Five independent Gaussian features: feature k is drawn with mean 2*k + 3 and
# standard deviation k, then reshaped into a single column for later stacking.
arrays = [
    np.random.normal(loc=2 * k + 3, scale=k, size=N).reshape(-1, 1)
    for k in range(1, 6)
]

In [56]:
# Place the per-feature columns side by side to form one 2-D feature matrix.
array = np.concatenate(arrays, axis=1)

In [67]:
# Column labels for the numeric features: X1 .. X5.
num_cols = ["X" + str(idx) for idx in range(1, 6)]

In [68]:
# Wrap the numeric matrix in a DataFrame labelled with the X1..X5 column names.
X_numeric = pd.DataFrame(array, columns=num_cols)

In [59]:
# Integer codes for the two categorical features: a Binomial(1, 0.5) draw (0/1)
# and a Binomial(2, 0.5) draw (0/1/2). The binary column is drawn first, so the
# order of the two entries decides which random numbers each column receives.
X_categ = pd.DataFrame(
    {
        "X_binary": np.random.binomial(1, 0.5, size=N),
        "X_ternary": np.random.binomial(2, 0.5, size=N),
    }
)

In [60]:
# Lookup tables translating the integer codes into human-readable labels.
binary_dict = {
    0: "male",
    1: "female",
}
ternary_dict = {
    0: "employed",
    1: "unemployed",
    2: "not_labor_force",
}

In [61]:
# Replace the integer codes with their labels, overwriting the columns in place.
# Looking up through __getitem__ keeps the strict behaviour of a plain dict
# index: a code missing from the table raises KeyError instead of yielding NaN.
# Re-running this cell after the columns already hold labels therefore fails.
X_categ["X_binary"] = X_categ["X_binary"].map(binary_dict.__getitem__)
X_categ["X_ternary"] = X_categ["X_ternary"].map(ternary_dict.__getitem__)

In [62]:
# Full feature frame: numeric columns followed by the two categorical columns.
X = pd.concat(objs=[X_numeric, X_categ], axis="columns")

In [63]:
# Binary target drawn independently of every feature (Binomial(1, 0.5)), so no
# feature carries real signal about y.
y = pd.DataFrame({"y": np.random.binomial(1, 0.5, size=N)})

In [64]:
# Preview the first five rows of the raw feature frame.
X.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X_binary,X_ternary
0,5.697593,8.906967,12.751354,17.627573,22.539039,male,not_labor_force
1,5.638125,8.567413,11.446149,6.54399,8.48967,female,unemployed
2,4.197859,9.306004,10.027298,14.057499,17.834558,female,not_labor_force
3,5.026526,9.299339,9.278422,10.48195,16.708428,male,not_labor_force
4,6.494993,9.365091,6.260618,8.286173,12.800807,female,not_labor_force


### 1.2. Scaling Numerical and One-Hot Encoding Categorical Variables

In [66]:
# Scaler for the numeric columns; centring and unit-variance scaling are both
# enabled (spelled out explicitly here, these are the library defaults).
standard_scaler = StandardScaler(with_mean=True, with_std=True)

In [47]:
# One-hot encoders with an explicit category order. drop="first" removes the
# first listed category ("male" and "unemployed" respectively), so that category
# is represented by all-zero indicator columns.
binary_ohe = OneHotEncoder(
    drop="first",
    categories=[["male", "female"]],
)
ternary_ohe = OneHotEncoder(
    drop="first",
    categories=[["unemployed", "employed", "not_labor_force"]],
)

In [69]:
# Column transformer used by both the nested and the non-nested pipelines:
# numeric columns are standardised, each categorical column is one-hot encoded.
# Every column of X is claimed by one of the three transformers, so the
# "passthrough" remainder has nothing left to forward here.
ct = ColumnTransformer(
    [
        ("standard_scaler", standard_scaler, num_cols),
        ("binary_ohe", binary_ohe, ["X_binary"]),
        ("ternary_ohe", ternary_ohe, ["X_ternary"]),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [73]:
# Preview of the transformed features. This fits `ct` on all rows of X; the
# pipelines below fit it again on the training data they are given.
X_transformed = pd.DataFrame(ct.fit_transform(X), columns=ct.get_feature_names_out())

In [75]:
# Preview the first five transformed rows (scaled numerics + indicator columns).
X_transformed.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X_binary_female,X_ternary_employed,X_ternary_not_labor_force
0,0.758668,0.946467,1.206742,1.707264,1.884294,0.0,0.0,1.0
1,0.697924,0.776366,0.762471,-1.160488,-0.872219,1.0,0.0,0.0
2,-0.773248,1.146365,0.279516,0.783548,0.961266,1.0,0.0,1.0
3,0.073202,1.143026,0.024611,-0.141585,0.740317,0.0,0.0,1.0
4,1.573179,1.175965,-1.002601,-0.709717,-0.026365,1.0,0.0,1.0


### 1.3. Split Data into Train and Test (Validation Folds Come from Cross-Validation)

In [78]:
# Shuffled hold-out split of the raw (untransformed) features and target. No
# random_state is passed, so the shuffle draws from NumPy's global RNG that was
# seeded earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    shuffle=True,
)

## 2. Define Grid of Hyperparameters for use in Cross-Validation

In [15]:
# Search grid for the decision tree: 2 criteria x 6 depths x 5 max_features
# settings = 60 candidate configurations.
# NOTE(review): the transformed frame shown above has 8 columns, so
# max_features=10 exceeds the feature count - confirm how the installed
# scikit-learn version treats that value.
hyperparams = dict(
    criterion=["gini", "entropy"],
    max_depth=[1, 3, 5, 10, 20, None],
    max_features=[5, 10, "sqrt", "log2", None],
)

## 3. Non-Nested Cross-Validation

### 3.1. Define the Single K-Fold Used

In [138]:
# Single 10-fold splitter shared by the grid search and the scoring step.
# Shuffling is off, so the folds are contiguous blocks in row order.
non_nested_cv_fold = KFold(n_splits=10, shuffle=False)

### 3.2. Run the Non-Nested Cross-Validation

In [139]:
from sklearn.metrics import make_scorer

In [140]:
from sklearn.metrics import f1_score

In [141]:
# Wrap f1_score as a scorer object usable by GridSearchCV and cross_validate.
f1_scorer = make_scorer(score_func=f1_score)

In [142]:
# Grid search over the decision-tree hyperparameters, scored by F1 on the
# shared 10-fold splitter.
non_nested_cv = GridSearchCV(
    DecisionTreeClassifier(),
    hyperparams,
    scoring=f1_scorer,
    cv=non_nested_cv_fold,
)

### 3.3. Make Runnable Non-Nested Pipeline and Fit It

In [143]:
# Preprocessing followed by the grid search, so scaling/encoding are fitted
# together with the search whenever the pipeline is fitted.
non_nested_pipeline = Pipeline(
    [
        ("scaling_and_encoding", ct),
        ("non_nested_cv", non_nested_cv),
    ]
)

### 3.4. Cross-Validate in the Non-Nested Manner

In [144]:
# Fit preprocessing and run the grid search on the training split.
non_nested_pipeline.fit(X_train, y_train)

In [145]:
# Tabulate the per-candidate grid-search results of the fitted search step.
non_nested_cv_results_data = pd.DataFrame(
    non_nested_pipeline.named_steps["non_nested_cv"].cv_results_
)

In [146]:
# Best mean F1 across folds among all grid candidates.
non_nested_cv_results_data.loc[:, "mean_test_score"].max()

np.float64(0.6998623087606316)

In [147]:
# Score the whole pipeline with the SAME splitter object that the embedded
# GridSearchCV was configured with - this is the point of the non-nested setup.
# NOTE(review): the estimator passed here wraps a GridSearchCV, so the search is
# presumably re-run inside every training split - confirm this is the intended
# "non-nested" procedure.
non_nested_cv_results = cross_validate(
    non_nested_pipeline,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=non_nested_cv_fold,
)

In [148]:
# Mean and (population) standard deviation of the per-fold F1 scores.
non_nested_score_mean = np.mean(non_nested_cv_results["test_score"])
non_nested_score_std = np.std(non_nested_cv_results["test_score"])

In [149]:
# Report the non-nested score summary to five decimal places.
print(
    f"Non-nested Cross-Validation mean score: {non_nested_score_mean:.5f} and score std: {non_nested_score_std:.5f}"
)

Non-nested Cross-Validation mean score: 0.69986 and score std: 0.07104


In [154]:
# Check that the best grid-search score matches the cross_validate mean.
# The two numbers are floating-point means produced by separate computations,
# so they are compared with a tolerance (np.isclose) instead of exact ==, which
# can report False because of rounding alone.
print(
    "Sanity check: "
    f"{np.isclose(non_nested_cv_results_data['mean_test_score'].max(), non_nested_score_mean)}"
)

Sanity check: True


## 4. Nested Cross-Validation

### 4.1. Make Two Distinct K-Folds

In [109]:
# Two separate splitter objects for the nested procedure. Both are unshuffled
# 10-fold splitters, i.e. they are configured identically and differ only in
# which step they are handed to.
nested_outer_kfold = KFold(n_splits=10, shuffle=False)
nested_inner_kfold = KFold(n_splits=10, shuffle=False)

### 4.2. Make Cross-Validation - Using Outer KFold

In [110]:
# Grid search for the nested procedure: same estimator, grid and F1 scorer as
# the non-nested search, but using the splitter named `nested_outer_kfold`.
nested_cv = GridSearchCV(
    DecisionTreeClassifier(),
    hyperparams,
    scoring=f1_scorer,
    cv=nested_outer_kfold,
)

In [111]:
# Preprocessing followed by the NESTED grid search.
# The search step must be `nested_cv`; wiring in `non_nested_cv` here would make
# this pipeline an exact duplicate of the non-nested one, so both sections of
# the notebook would report the same score. Later cells reach the search step by
# position (nested_pipeline[1]), so the step name is free to describe it.
nested_pipeline = Pipeline(steps=[
    ("scaling_and_encoding", ct),
    ("nested_cv", nested_cv)
])

### 4.3. Fitting The Pipeline - Outer Loop

In [112]:
# Fit preprocessing and run the pipeline's grid search on the training split.
nested_pipeline.fit(X_train, y_train)

In [163]:
# Best tree found by the pipeline's final (grid-search) step, refitted by the
# search; the bare name on the last line displays it.
nested_best_estimator = nested_pipeline[-1].best_estimator_
nested_best_estimator

In [165]:
# Tabulate the per-candidate results of the pipeline's grid-search step.
nested_cv_results = pd.DataFrame(data=nested_pipeline[-1].cv_results_)

In [169]:
# Best mean F1 across folds among all grid candidates.
nested_cv_results.loc[:, "mean_test_score"].max()

np.float64(0.698648716527622)

### 4.4. Scoring - Inner Loop

In [156]:
# Score the pipeline with the OTHER splitter object (not the one handed to the
# grid search) - this is the point of the nested setup.
# NOTE(review): this rebinds `nested_cv_results`, which an earlier cell assigns
# a DataFrame of grid-search results; after this cell it is the dict returned by
# cross_validate.
nested_cv_results = cross_validate(
    nested_pipeline,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=nested_inner_kfold,
)

In [157]:
# Mean and (population) standard deviation of the per-fold F1 scores.
nested_cv_results_mean = np.mean(nested_cv_results["test_score"])
nested_cv_results_std = np.std(nested_cv_results["test_score"])

In [168]:
# Show the mean per-fold F1 from cross_validate; the "=" specifier prints the
# variable name together with its value.
print(f"{nested_cv_results_mean=}")

nested_cv_results_mean=np.float64(0.6998623087606316)


In [170]:
# Best mean F1 among the grid candidates of the fitted pipeline, for comparison
# with the cross_validate mean printed above.
# The value is read from the fitted search step rather than from
# `nested_cv_results`: when the notebook runs top to bottom that name holds the
# cross_validate dict (keys such as "test_score"), which has no
# "mean_test_score" entry and would raise KeyError. Using a plain variable also
# avoids nesting double quotes inside a double-quoted f-string, which only
# parses on Python 3.12+.
nested_grid_best_score = nested_pipeline[1].cv_results_["mean_test_score"].max()
print(f"{nested_grid_best_score=}")

nested_cv_results["mean_test_score"].max()=np.float64(0.698648716527622)
