In [2]:
import pandas as pd
import numpy as np

**Loading the Dataset**  
The dataset is loaded using `pd.read_csv`, which reads the CSV file containing credit data.

In [4]:
data = pd.read_csv("DT-Credit.csv")
# data

**Encoding Binary Categorical Variables**  
This cell encodes the binary categorical variables `Own`, `Student`, and `Married` by mapping 'Yes' to 1 and 'No' to 0. This transformation makes the variables numerical.

In [6]:
data['Own'] = data['Own'].map({'Yes': 1, 'No': 0})
data['Student'] = data['Student'].map({'Yes': 1, 'No': 0})
data['Married'] = data['Married'].map({'Yes': 1, 'No': 0})
# data

**One-Hot Encoding for 'Region' Feature**  
Here, `pd.get_dummies` is used to convert the categorical variable `Region` into multiple binary columns (e.g., `Region_East`, `Region_South`, `Region_West`).

In [8]:
data = pd.get_dummies(data, columns=['Region'], prefix='Region')
data

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Balance,Region_East,Region_South,Region_West
0,14.891,3606,283,2,34,11,0,0,1,333,False,True,False
1,106.025,6645,483,3,82,15,1,1,1,903,False,False,True
2,104.593,7075,514,4,71,11,0,0,0,580,False,False,True
3,148.924,9504,681,3,36,11,1,0,0,964,False,False,True
4,55.882,4897,357,2,68,16,0,0,1,331,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.096,4100,307,3,32,13,0,0,1,560,False,True,False
396,13.364,3838,296,5,65,17,0,0,0,480,True,False,False
397,57.872,4171,321,5,67,12,1,0,1,138,False,True,False
398,37.728,2525,192,1,44,13,0,0,1,0,False,True,False


**Ensuring Consistent Data Types**  
The newly created region columns are cast to integer type, ensuring that all region-related columns are of the same data type.

In [10]:
data[['Region_East', 'Region_South', 'Region_West']] = data[['Region_East', 'Region_South', 'Region_West']].astype(int)
# data

**Feature Scaling**  
The `MinMaxScaler` is used to normalize the continuous features (`Income`, `Limit`, `Rating`, `Cards`, `Age`, `Education`) to a range between 0 and 1.

In [12]:
from sklearn.preprocessing import MinMaxScaler

features_to_scale = ['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education']
scaler = MinMaxScaler()
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])
data

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Balance,Region_East,Region_South,Region_West
0,0.025737,0.210675,0.213723,0.125,0.146667,0.400000,0,0,1,333,0,1,0
1,0.542722,0.443406,0.438695,0.250,0.786667,0.666667,1,1,1,903,0,0,1
2,0.534598,0.476336,0.473566,0.375,0.640000,0.400000,0,0,0,580,0,0,1
3,0.786079,0.662353,0.661417,0.250,0.173333,0.400000,1,0,0,964,0,0,1
4,0.258271,0.309542,0.296963,0.125,0.600000,0.733333,0,0,1,331,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.009882,0.248507,0.240720,0.250,0.120000,0.533333,0,0,1,560,0,1,0
396,0.017075,0.228442,0.228346,0.500,0.560000,0.800000,0,0,0,480,1,0,0
397,0.269560,0.253944,0.256468,0.500,0.586667,0.466667,1,0,1,138,0,1,0
398,0.155287,0.127891,0.111361,0.000,0.280000,0.533333,0,0,1,0,0,1,0


**Separating Features and Target Variable**  
The target variable (`Balance`) is separated from the feature set (`X`).

In [14]:
X = data.drop(columns=['Balance'])
y = data['Balance']
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


**Splitting the Data into Training, Validation, and Test Sets**  
The data is split into training, validation, and test sets. First, 30% of the data is held out, then split into validation and test sets, each containing 15% of the total data.

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

(280, 12)
(280,)
(60, 12)
(60,)
(60, 12)
(60,)


**Training the Decision Tree Regressor (Default Settings)**  
In this cell, a `DecisionTreeRegressor` is created and trained with default parameters. After training, predictions are made on the validation and test sets, and the model's performance is evaluated using Mean Squared Error (MSE).

In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

y_pred_val = regressor.predict(X_val)
y_pred_test = regressor.predict(X_test)

mse_val = mean_squared_error(y_val, y_pred_val)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f'Validation Mean Squared Error: {mse_val}')
print(f'Test Mean Squared Error: {mse_test}')

Validation Mean Squared Error: 17215.166666666668
Test Mean Squared Error: 25405.483333333334


**Hyperparameter Tuning for Decision Tree Regressor**  
Parameters like `max_depth`, `min_samples_split`, and `min_samples_leaf` are explored to improve model performance, aiming to reduce overfitting or underfitting. The best combination of hyperparameters is then used to retrain the model and evaluate it on validation and test sets, comparing it to the baseline MSE.

In [20]:
model1 = DecisionTreeRegressor(max_depth=10, min_samples_leaf=3, min_samples_split=2)
model2 = DecisionTreeRegressor(max_depth=8, min_samples_leaf=1, min_samples_split=3)
model3 = DecisionTreeRegressor(max_depth=12, min_samples_leaf=2, min_samples_split=2)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

y_pred_val = model1.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 1: {mse_val}')

y_pred_val = model2.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 2: {mse_val}')

y_pred_val = model3.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 3: {mse_val}')

y_pred_test = model3.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f'Test Mean Squared Error for Model 3: {mse_test}')

Validation Mean Squared Error for Model 1: 22550.41163425926
Validation Mean Squared Error for Model 2: 16938.91655002572
Validation Mean Squared Error for Model 3: 14411.27222222222
Test Mean Squared Error for Model 3: 22972.022222222222


**Training the XGBoost Regressor (Default Settings)**  
Here, the XGBoost model is initialized and trained on the training data with its default parameters. The model is evaluated on validation and test sets using MSE, providing a baseline before hyperparameter tuning.

In [22]:
import xgboost as xgb

xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42) 
xgb_regressor.fit(X_train, y_train)

y_pred_val = xgb_regressor.predict(X_val)
y_pred_test = xgb_regressor.predict(X_test)

mse_val = mean_squared_error(y_val, y_pred_val)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f"VALIDATION MSE with XGBoost: {mse_val}")
print(f"Test MSE with XGBoost: {mse_test}")

VALIDATION MSE with XGBoost: 8353.495770652122
Test MSE with XGBoost: 10506.853026500776


**Hyperparameter Tuning for XGBoost Regressor**  
In this cell, hyperparameter tuning is performed on the XGBoost model. Parameters such as `learning_rate`, `max_depth`, `n_estimators`, and `subsample` are explored to optimize the model. The best-performing hyperparameters are selected based on the validation set, and the model is re-evaluated on the test set to measure improvement from the baseline.


In [24]:
import xgboost as xgb

model1 = xgb.XGBRegressor(subsample=0.7, reg_lambda=0.1, n_estimators=140, max_depth=3, learning_rate=0.2)
model2 = xgb.XGBRegressor(subsample=0.6, reg_lambda=0.01, n_estimators=200, max_depth=4, learning_rate=0.1)
model3 = xgb.XGBRegressor(subsample=0.8, reg_lambda=0.01, n_estimators=100, max_depth=4, learning_rate=0.2)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

y_pred_val = model1.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 1: {mse_val}')

y_pred_val = model2.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 2: {mse_val}')

y_pred_val = model3.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 3: {mse_val}')

y_pred_test = model3.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f'Test Mean Squared Error for Model 3: {mse_test}')

Validation Mean Squared Error for Model 1: 5760.271266279406
Validation Mean Squared Error for Model 2: 4266.977825464567
Validation Mean Squared Error for Model 3: 3545.4052187847597
Test Mean Squared Error for Model 3: 6067.242074347191
