In [2]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the diabetes dataset (for demonstration purposes)
dataset = load_diabetes()
X, y = dataset.data, dataset.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a full-sized decision tree
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

# Calculate the mean squared error (MSE) on the testing set
y_pred = tree.predict(X_test)
y_hat=tree.predict(X_train)
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_hat)

print(f'The MSE in training is: {mse_train}')
print(f'The MSE in testing is {mse_test}')

The MSE in training is: 0.0
The MSE in testing is 4976.797752808989


Clear case of overfitting. Now we will employ cost-complexity pruning.

In [3]:
#perform cost complexity pruning
ccp_alphas=tree.cost_complexity_pruning_path(
    X_train,
    y_train,
)['ccp_alphas']

In [8]:
# Create a sequence of pruned trees for different values of alpha
pruned_trees=[]
for alpha in ccp_alphas:
    pruned_tree=DecisionTreeRegressor(ccp_alpha=alpha,random_state=42)
    pruned_tree.fit(X_train,y_train)
    pruned_trees.append(pruned_tree)

In [9]:
# Evaluate the pruned trees 
# and select the one with the lowest MSE on the test set

best_mse=float('inf')
best_tree=None

for tree in pruned_trees:
    y_pred=tree.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)
    if mse<best_mse:
        best_mse=mse
        best_tree=tree

In [10]:
# Print the selected pruned tree and its alpha value
print("Selected Pruned Tree:")
print(best_tree)
print("Alpha:", best_tree.ccp_alpha)

Selected Pruned Tree:
DecisionTreeRegressor(ccp_alpha=80.57696804297389, random_state=42)
Alpha: 80.57696804297389


In [11]:
# Use the best tree to make the prediction

# Build a full-sized decision tree
tree_selected = DecisionTreeRegressor(ccp_alpha=80.57696804297389, random_state=42)
tree_selected.fit(X_train, y_train)

# Calculate the mean squared error (MSE) on the testing set
y_pred = tree_selected.predict(X_test)
y_hat=tree_selected.predict(X_train)
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_hat)

print(f'The MSE in training is: {mse_train}')
print(f'The MSE in testing is {mse_test}')

The MSE in training is: 2873.15185497862
The MSE in testing is 2907.177070734343


- Now we can see comparable MSE's. These values are large, so we can use random forest.
- Now, I will check if I get the same value with Hyperparameter Optimization

In [None]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from skopt import BayesSearchCV

# Load the diabetes dataset (for demonstration purposes)
dataset = load_diabetes()
X, y = dataset.data, dataset.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameter search space for the decision tree
param_space = {
    'ccp_alpha': (0.0, 0.1, 'uniform')
}

# Perform hyperparameter tuning and cross-validation using Bayesian optimization
optimal_tree = BayesSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_space,
    n_iter=50,  # number of iterations for optimization
    cv=5,  # number of cross-validation folds
    n_jobs=-1  # number of parallel jobs (-1 means using all available processors)
)
optimal_tree.fit(X_train, y_train)

# Print the best hyperparameters and corresponding MSE on the testing set
print("Best Hyperparameters:", optimal_tree.best_params_)
y_pred = optimal_tree.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE on Testing Set:", mse)


In [12]:
from skopt import BayesSearchCV

In [15]:
# Define the hyperparameter search space for the decision tree
param_space = {
    'ccp_alpha': (0.0, 100, 'uniform')
}

# Perform hyperparameter tuning and cross-validation using Bayesian optimization
optimal_tree = BayesSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_space,
    scoring='neg_mean_squared_error',
    n_iter=100,  # number of iterations for optimization
    cv=5,  # number of cross-validation folds
    n_jobs=-1  # number of parallel jobs (-1 means using all available processors)
)

In [17]:
from tqdm import tqdm
# Fit the model with tqdm for progress tracking
with tqdm(total=optimal_tree.total_iterations) as pbar:
    def update_pbar(_):
        pbar.update(1)

    optimal_tree.fit(
    X_train,
    y_train,
    callback=update_pbar)

100%|██████████| 100/100 [05:07<00:00,  3.07s/it]


In [18]:
# Print the best hyperparameters and corresponding MSE on the testing set
print("Best Hyperparameters:", optimal_tree.best_params_)
print('----------------------')
y_pred = optimal_tree.predict(X_test)

mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, optimal_tree.predict(X_train))

print(f'The MSE in training is: {mse_train}')
print(f'The MSE in testing is {mse_test}')

Best Hyperparameters: OrderedDict([('ccp_alpha', 100)])
----------------------
The MSE in training is: 3050.2975824148516
The MSE in testing is 3346.1976076617243


MSE has increased for both: training and testing set. Hence in this case, we seee that cost_complexity pruning works better, which seems intuitive because in Hyperparameter optimization we just wade through the permutations of parameters. In cost complexity pruning, we are pruning the tree suitably using a proper measure 'SSR'