In [6]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

## Read Data

In [2]:
# Load the sample dataset; the relative path is resolved against the working directory
data = pd.read_csv(filepath_or_buffer="test_sample.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(1000, 4)

In [4]:
# Preview the first five rows to check column names and values
data.head(n=5)

Unnamed: 0,Y,X1,X2,X3
0,0,2,3,5
1,0,2,4,7
2,0,3,2,5
3,0,5,6,3
4,0,2,1,1


In [7]:
# Prepare the data
# The frame contains an 'Unnamed: 0' column whose values (0, 1, 2, ...) follow
# the row position, i.e. a row index that was written into the CSV. It is not a
# predictor: leaving it in X would let the tree split on row order, so it is
# removed together with the target. errors='ignore' keeps this cell working if
# the CSV is later loaded without that column.
X = data.drop(columns='Y').drop(columns='Unnamed: 0', errors='ignore')  # independent variables (X1, X2, X3)
y = data['Y']  # dependent variable

## Optimal ccp_alpha value

In [8]:
# Candidate pruning strengths: 10 evenly spaced ccp_alpha values from 0.01 to 0.1
param_grid = dict(ccp_alpha=np.linspace(start=0.01, stop=0.1, num=10))
tree_regressor = DecisionTreeRegressor(random_state=0)

# 10-fold cross-validated search over the grid, scored by negated MSE
# (scikit-learn maximises scores, so the lowest MSE wins)
grid_search = GridSearchCV(
    estimator=tree_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)
grid_search.fit(X, y)

# ccp_alpha of the best-scoring candidate
optimal_ccp_alpha = grid_search.best_params_['ccp_alpha']
optimal_ccp_alpha

0.06000000000000001

## Tree regression MSE

In [9]:
# Refit a pruned tree on the full dataset using the ccp_alpha selected above
optimal_tree_regressor = DecisionTreeRegressor(
    random_state=0,
    ccp_alpha=optimal_ccp_alpha,
).fit(X, y)

# Predictions on the same rows the model was fitted on
y_pred = optimal_tree_regressor.predict(X)

# In-sample (training) mean squared error of the fitted model
tree_regression_mse = mean_squared_error(y_true=y, y_pred=y_pred)
tree_regression_mse

1.1841190000000001