In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
# Positional indices of the predictor columns. The same list is applied to
# both files so the train and test feature matrices share one column layout.
feature_columns = [1, 2, 3, 4, 5, 7, 8, 9, 10]

dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')

# Features are extracted as NumPy arrays; the target is taken from the last
# column of the training file.
X_train = dataset_train.iloc[:, feature_columns].values
y_train = dataset_train.iloc[:, -1].values
X_test = dataset_test.iloc[:, feature_columns].values

# Impute missing values
# Each imputer learns its fill value from the training matrix only and is then
# applied, unchanged, to both the training and the test matrix (in place).
mean_imputed_cols = [0]   # filled with the training-column mean
mode_imputed_cols = [5]   # filled with the most frequent training value
# NOTE(review): column 5 is mode-imputed but is not listed in
# `categorical_features` ([1, 3, 6, 7, 8]) further down -- confirm that 5 is
# the intended index.
# NOTE(review): the imputers are fitted on all training rows, i.e. before the
# train/validation split below, so validation rows contribute to the fill
# values -- confirm this is acceptable.

imputer_num = SimpleImputer(strategy='mean').fit(X_train[:, mean_imputed_cols])
imputer_cat = SimpleImputer(strategy='most_frequent').fit(X_train[:, mode_imputed_cols])

for feature_matrix in (X_train, X_test):
    feature_matrix[:, mean_imputed_cols] = imputer_num.transform(
        feature_matrix[:, mean_imputed_cols]
    )
    feature_matrix[:, mode_imputed_cols] = imputer_cat.transform(
        feature_matrix[:, mode_imputed_cols]
    )

# Encode categorical features
# Column positions (within the selected feature matrix) that are one-hot
# encoded. Every other column is passed through unchanged and placed after
# the encoded columns in the output.
categorical_features = [1, 3, 6, 7, 8]
ct = ColumnTransformer(
    transformers=[
        (
            'encoder',
            # handle_unknown='ignore': a category that appears in test.csv but
            # not in train.csv is encoded as an all-zero row for that feature
            # instead of making ct.transform(X_test) raise a ValueError.
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_features,
        )
    ],
    remainder='passthrough'
)

# The encoder's category vocabulary is learned from the training matrix only;
# the test matrix is transformed with that same fitted vocabulary.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

# Train-validation split
# Hold out 20% of the encoded training rows for validation; the fixed seed
# keeps the partition reproducible across runs.
X_train_split, X_valid, y_train_split, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Parameter grid
# Candidate values tried for each decision-tree hyper-parameter; the search
# below evaluates every combination.
param_grid_dt = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search
# 3-fold cross-validation on the training split, ranked by negative MSE
# (higher is better), with n_jobs=-1 to run candidates in parallel.
grid_search_dt = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid_dt,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search_dt.fit(X_train_split, y_train_split)

# Best model
# The winning configuration is used to predict the held-out validation rows.
best_dt = grid_search_dt.best_estimator_
y_valid_pred_dt = best_dt.predict(X_valid)

# Metrics
# Both scores are computed on the validation split, which the search never saw.
mse_valid_dt = mean_squared_error(y_true=y_valid, y_pred=y_valid_pred_dt)
r2_valid_dt = r2_score(y_true=y_valid, y_pred=y_valid_pred_dt)

print("Optimized Decision Tree - Validation MSE:", mse_valid_dt)
print("Optimized Decision Tree - Validation R² Score:", r2_valid_dt)

Optimized Decision Tree - Validation MSE: 0.2996927078801021
Optimized Decision Tree - Validation R² Score: 0.7208810408146589
