In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [55]:
# Read the credit dataset from the CSV file in the working directory and echo
# the resulting frame so its columns can be inspected.
credit_df = pd.read_csv(filepath_or_buffer='DT-Credit.csv')
credit_df

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,No,No,Yes,South,333
1,106.025,6645,483,3,82,15,Yes,Yes,Yes,West,903
2,104.593,7075,514,4,71,11,No,No,No,West,580
3,148.924,9504,681,3,36,11,Yes,No,No,West,964
4,55.882,4897,357,2,68,16,No,No,Yes,South,331
...,...,...,...,...,...,...,...,...,...,...,...
395,12.096,4100,307,3,32,13,No,No,Yes,South,560
396,13.364,3838,296,5,65,17,No,No,No,East,480
397,57.872,4171,321,5,67,12,Yes,No,Yes,South,138
398,37.728,2525,192,1,44,13,No,No,Yes,South,0


In [56]:
# Pairwise Pearson correlations between the numeric columns; non-numeric
# columns are excluded by numeric_only=True.
corr_matrix = credit_df.corr(method='pearson', numeric_only=True)
corr_matrix

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
Income,1.0,0.792088,0.791378,-0.018273,0.175338,-0.027692,0.463656
Limit,0.792088,1.0,0.99688,0.010231,0.100888,-0.023549,0.861697
Rating,0.791378,0.99688,1.0,0.053239,0.103165,-0.030136,0.863625
Cards,-0.018273,0.010231,0.053239,1.0,0.042948,-0.051084,0.086456
Age,0.175338,0.100888,0.103165,0.042948,1.0,0.003619,0.001835
Education,-0.027692,-0.023549,-0.030136,-0.051084,0.003619,1.0,-0.008062
Balance,0.463656,0.861697,0.863625,0.086456,0.001835,-0.008062,1.0


In [57]:
# Limit is almost perfectly correlated with Rating (0.997 in the correlation
# matrix) and Age / Education show near-zero correlation with Balance --
# presumably why these three columns are removed.
#
# errors='ignore' keeps this cell re-runnable: credit_df is reassigned here, so
# executing the cell a second time would otherwise raise KeyError because the
# columns are already gone. Trade-off: a misspelled column name is skipped
# silently, so check the preview below after editing the list.
credit_df = credit_df.drop(columns=['Limit', 'Age', 'Education'], errors='ignore')
credit_df.head()

Unnamed: 0,Income,Rating,Cards,Own,Student,Married,Region,Balance
0,14.891,283,2,No,No,Yes,South,333
1,106.025,483,3,Yes,Yes,Yes,West,903
2,104.593,514,4,No,No,No,West,580
3,148.924,681,3,Yes,No,No,West,964
4,55.882,357,2,No,No,Yes,South,331


In [58]:
from sklearn.preprocessing import LabelEncoder

# Select columns with object or category dtype (typically categorical)
cat_cols = credit_df.select_dtypes(include=['object', 'category']).columns

# Fit a separate encoder for every categorical column and keep it, keyed by
# column name. Re-fitting one shared encoder would leave only the last
# column's classes_ available, so the other columns could no longer be decoded
# (inverse_transform) or encoded consistently for new data.
# NOTE(review): the integer codes follow the sorted class labels, which gives
# multi-valued columns such as Region an arbitrary ordering; scikit-learn
# documents LabelEncoder as intended for targets, so consider OrdinalEncoder /
# one-hot encoding for features.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    credit_df[col] = encoder.fit_transform(credit_df[col])
    encoders[col] = encoder

credit_df.head()

Unnamed: 0,Income,Rating,Cards,Own,Student,Married,Region,Balance
0,14.891,283,2,0,0,1,1,333
1,106.025,483,3,1,1,1,2,903
2,104.593,514,4,0,0,0,2,580
3,148.924,681,3,1,0,0,2,964
4,55.882,357,2,0,0,1,1,331


In [59]:
from sklearn.model_selection import train_test_split

# The target is Balance; every remaining column is used as a feature.
y = credit_df['Balance']
X = credit_df.drop('Balance', axis=1)

# Hold out 30% of the rows, then split that hold-out in half to obtain the
# validation and test sets (70 / 15 / 15 overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

# Report the shape of each feature split.
for label, features in (("Train:", X_train), ("Validation:", X_val), ("Test:", X_test)):
    print(label, features.shape)

Train: (280, 7)
Validation: (60, 7)
Test: (60, 7)


In [60]:
from sklearn.tree import DecisionTreeRegressor

# Baseline decision tree with max_depth left at its default, fitted on the
# training split only.
regressor = DecisionTreeRegressor(random_state=0, criterion='squared_error')
regressor.fit(X=X_train, y=y_train)

In [61]:
def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    values are always paired by position. Subtracting the raw inputs would
    align two pandas Series by index label instead, silently pairing the
    wrong rows (or yielding NaN for labels present on only one side), and
    would raise ``TypeError`` for plain Python lists.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences. NaN inputs propagate
        to the result rather than being skipped.
    """
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(residuals ** 2)

In [62]:
# Score the fitted baseline model on the validation split.
y_val_pred = regressor.predict(X_val)
val_mse = mse(y_true=y_val, y_pred=y_val_pred)
print(f"Validation MSE: {val_mse:.4f}")

Validation MSE: 14460.4833


In [63]:
# Hyperparameter tuning: max_depth
# Fit one tree per candidate depth on the training split and record its
# validation MSE.
dt_depth_scores = {}
for depth in range(1, 20):
    model = DecisionTreeRegressor(criterion='squared_error', max_depth=depth, random_state=0)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_mse = mse(y_val, y_val_pred)
    print(f"max_depth={depth}, Validation MSE={val_mse:.4f}")
    dt_depth_scores[depth] = val_mse

# min() returns the first minimum in insertion order, so a tie goes to the
# shallowest depth.
best_depth = min(dt_depth_scores, key=dt_depth_scores.get)
best_val_mse = dt_depth_scores[best_depth]

print(f"\nBest max_depth: {best_depth} with Validation MSE: {best_val_mse:.4f}")

max_depth=1, Validation MSE=77119.4680
max_depth=2, Validation MSE=55648.8846
max_depth=3, Validation MSE=39372.1546
max_depth=4, Validation MSE=37860.6002
max_depth=5, Validation MSE=30743.1378
max_depth=6, Validation MSE=17006.5919
max_depth=7, Validation MSE=12993.7342
max_depth=8, Validation MSE=21891.5745
max_depth=9, Validation MSE=23098.3232
max_depth=10, Validation MSE=23417.4954
max_depth=11, Validation MSE=23013.7883
max_depth=12, Validation MSE=21978.0833
max_depth=13, Validation MSE=23022.4833
max_depth=14, Validation MSE=21089.5167
max_depth=15, Validation MSE=14460.4833
max_depth=16, Validation MSE=14460.4833
max_depth=17, Validation MSE=14460.4833
max_depth=18, Validation MSE=14460.4833
max_depth=19, Validation MSE=14460.4833

Best max_depth: 7 with Validation MSE: 12993.7342


In [64]:
# Refit a decision tree on the training split at the depth selected by the
# decision-tree search. best_depth is reassigned by the later XGBoost search,
# so re-run the decision-tree search first if this cell is executed out of
# order.
final_dt = DecisionTreeRegressor(max_depth=best_depth, random_state=0, criterion='squared_error')
final_dt.fit(X=X_train, y=y_train)

In [65]:
# Evaluate the tuned decision tree on the held-out test split.
y_test_pred = final_dt.predict(X_test)
test_mse = mse(y_true=y_test, y_pred=y_test_pred)
print(f"Test MSE: {test_mse:.4f}")

Test MSE: 29601.5121


In [66]:
from xgboost import XGBRegressor

# Baseline gradient-boosted model with max_depth left at its default, fitted
# on the training split. Note that this rebinds `regressor`, which previously
# held the baseline decision tree.
regressor = XGBRegressor(random_state=0, objective='reg:squarederror')
regressor.fit(X=X_train, y=y_train)

In [67]:
# Score the fitted baseline model on the validation split.
y_val_pred = regressor.predict(X_val)
val_mse = mse(y_true=y_val, y_pred=y_val_pred)
print(f"Validation MSE: {val_mse:.4f}")

Validation MSE: 10015.7756


In [68]:
# Hyperparameter tuning: max_depth
# Fit one boosted model per candidate depth on the training split and record
# its validation MSE.
xgb_depth_scores = {}
for depth in range(2, 11):
    model = XGBRegressor(objective='reg:squarederror', max_depth=depth, random_state=0)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_mse = mse(y_val, y_val_pred)
    print(f"max_depth={depth}, Validation MSE={val_mse:.4f}")
    xgb_depth_scores[depth] = val_mse

# min() returns the first minimum in insertion order, so a tie goes to the
# shallowest depth.
best_depth = min(xgb_depth_scores, key=xgb_depth_scores.get)
best_val_mse = xgb_depth_scores[best_depth]

print(f"\nBest max_depth: {best_depth} with Validation MSE: {best_val_mse:.4f}")

max_depth=2, Validation MSE=8514.9149
max_depth=3, Validation MSE=7752.8048
max_depth=4, Validation MSE=12449.2717
max_depth=5, Validation MSE=11594.1183
max_depth=6, Validation MSE=10015.7756
max_depth=7, Validation MSE=12352.7387
max_depth=8, Validation MSE=12278.5607
max_depth=9, Validation MSE=12646.7272
max_depth=10, Validation MSE=12844.4970

Best max_depth: 3 with Validation MSE: 7752.8048


In [69]:
# Refit the boosted model on the training split at the depth currently held
# in best_depth (set by the most recently executed depth search).
final_xgb = XGBRegressor(max_depth=best_depth, random_state=0, objective='reg:squarederror')
final_xgb.fit(X=X_train, y=y_train)

In [70]:
# Evaluate the tuned boosted model on the held-out test split.
y_test_pred = final_xgb.predict(X_test)
test_mse = mse(y_true=y_test, y_pred=y_test_pred)
print(f"Test MSE: {test_mse:.4f}")

Test MSE: 10300.1518
