In [2]:
import pandas as pd
import numpy as np

**Loading the Dataset**  
The dataset is loaded using `pd.read_csv`, which reads the CSV file containing wage data.

In [4]:
data = pd.read_csv("DT-Wage.csv")
# data

**Dropping Uninformative Column**  
The `region` column is dropped because all its values are identical.

In [6]:
data = data.drop('region', axis=1)
data 

Unnamed: 0,year,age,maritl,race,education,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Information,2. >=Very Good,2. No,4.255273,70.476020
2,2003,45,2. Married,1. White,3. Some College,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,2. Married,3. Asian,4. College Grad,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,4. Divorced,1. White,2. HS Grad,2. Information,1. <=Good,1. Yes,4.318063,75.043154
...,...,...,...,...,...,...,...,...,...,...
2995,2008,44,2. Married,1. White,3. Some College,1. Industrial,2. >=Very Good,1. Yes,5.041393,154.685293
2996,2007,30,2. Married,1. White,2. HS Grad,1. Industrial,2. >=Very Good,2. No,4.602060,99.689464
2997,2005,27,2. Married,2. Black,1. < HS Grad,1. Industrial,1. <=Good,2. No,4.193125,66.229408
2998,2005,27,1. Never Married,1. White,3. Some College,1. Industrial,2. >=Very Good,1. Yes,4.477121,87.981033


**One-Hot Encoding for Categorical Features**  
This cell converts categorical columns (`maritl`, `race`, `education`, `jobclass`, `health`, `health_ins`) into binary dummy variables, allowing the model to process them as numerical features.

In [8]:
data = pd.get_dummies(data, columns=['maritl', 'race', 'education', 'jobclass', 'health', 'health_ins'], prefix=['maritl', 'race', 'education', 'jobclass', 'health', 'health_ins'])

**Ensuring Consistent Data Types**  
Here, all columns in the dataset are cast to integer type to maintain uniformity across the data.

In [10]:
data = data.astype(int)

**Cleaning Column Names**  
Special characters are removed from column names to ensure compatibility with XGBoost and other libraries that may have restrictions on column naming.

In [12]:
data.columns = data.columns.str.replace(r'[^\w\s]', '', regex=True)
data

Unnamed: 0,year,age,logwage,wage,maritl_1 Never Married,maritl_2 Married,maritl_3 Widowed,maritl_4 Divorced,maritl_5 Separated,race_1 White,...,education_2 HS Grad,education_3 Some College,education_4 College Grad,education_5 Advanced Degree,jobclass_1 Industrial,jobclass_2 Information,health_1 Good,health_2 Very Good,health_ins_1 Yes,health_ins_2 No
0,2006,18,4,75,1,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,1
1,2004,24,4,70,1,0,0,0,0,1,...,0,0,1,0,0,1,0,1,0,1
2,2003,45,4,130,0,1,0,0,0,1,...,0,1,0,0,1,0,1,0,1,0
3,2003,43,5,154,0,1,0,0,0,0,...,0,0,1,0,0,1,0,1,1,0
4,2005,50,4,75,0,0,0,1,0,1,...,1,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2008,44,5,154,0,1,0,0,0,1,...,0,1,0,0,1,0,0,1,1,0
2996,2007,30,4,99,0,1,0,0,0,1,...,1,0,0,0,1,0,0,1,0,1
2997,2005,27,4,66,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
2998,2005,27,4,87,1,0,0,0,0,1,...,0,1,0,0,1,0,0,1,1,0


**Separating Features and Target Variable**  
The target variable (`wage`) is separated from the feature set (`X`). This prepares the data for model training and testing.

In [14]:
X = data.drop(columns=['wage'])
y = data['wage']
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


**Splitting the Data into Training, Validation, and Test Sets**  
The dataset is split into training, validation, and test sets, with 70% for training and the remaining 30% split equally for validation and testing.

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

(2100, 23)
(2100,)
(450, 23)
(450,)
(450, 23)
(450,)


**Training the Decision Tree Regressor (Default Settings)**  
In this cell, a `DecisionTreeRegressor` is created and trained with default parameters. Predictions are made on the validation and test sets, and model performance is evaluated using Mean Squared Error (MSE), serving as a baseline for comparison.

In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

y_pred_val = regressor.predict(X_val)
y_pred_test = regressor.predict(X_test)

mse_val = mean_squared_error(y_val, y_pred_val)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f'Validation Mean Squared Error: {mse_val}')
print(f'Test Mean Squared Error: {mse_test}')

Validation Mean Squared Error: 1087.098950617284
Test Mean Squared Error: 1239.701234567901


**Hyperparameter Tuning for Decision Tree Regressor**  
This cell fine-tunes hyperparameters for the Decision Tree Regressor. Parameters like `max_depth`, `min_samples_split`, and `min_samples_leaf` are adjusted to optimize performance. The best configuration is selected based on validation performance and used to re-evaluate the model on the test set, compared to the baseline MSE.

In [20]:
model1 = DecisionTreeRegressor(max_depth=8, min_samples_leaf=3, min_samples_split=4)
model2 = DecisionTreeRegressor(max_depth=7, min_samples_leaf=2, min_samples_split=3)
model3 = DecisionTreeRegressor(max_depth=6, min_samples_leaf=1, min_samples_split=2)
model4 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=4, min_samples_split=3)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)

y_pred_val = model1.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 1: {mse_val}')

y_pred_val = model2.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 2: {mse_val}')

y_pred_val = model3.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 3: {mse_val}')

y_pred_val = model4.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 4: {mse_val}')

y_pred_test = model4.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f'Test Mean Squared Error for Model 4: {mse_test}')

Validation Mean Squared Error for Model 1: 702.2004919983306
Validation Mean Squared Error for Model 2: 703.7715989533183
Validation Mean Squared Error for Model 3: 673.2172449291448
Validation Mean Squared Error for Model 4: 635.7949913923894
Test Mean Squared Error for Model 4: 678.1998542533339


**Training the XGBoost Regressor (Default Settings)**  
An XGBoost model is initialized and trained on the training data with default settings. The model's performance on validation and test sets is evaluated using MSE, providing a baseline prior to tuning.

In [22]:
import xgboost as xgb

xgb_regressor = xgb.XGBRegressor() 
xgb_regressor.fit(X_train, y_train)

y_pred_val = xgb_regressor.predict(X_val)
y_pred_test = xgb_regressor.predict(X_test)

mse_val = mean_squared_error(y_val, y_pred_val)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f"VALIDATION MSE with XGBoost: {mse_val}")
print(f"Test MSE with XGBoost: {mse_test}")

VALIDATION MSE with XGBoost: 752.2398262672573
Test MSE with XGBoost: 665.9098112909601


**Hyperparameter Tuning for XGBoost Regressor**  
This cell tunes hyperparameters for the XGBoost model, exploring options like `learning_rate`, `max_depth`, `n_estimators`, and `subsample`. The model is retrained with the best parameters and evaluated on the test set, comparing performance against the baseline.

In [24]:
import xgboost as xgb

model1 = xgb.XGBRegressor(subsample=0.7, reg_lambda=0.1, n_estimators=140, max_depth=3, learning_rate=0.2)
model2 = xgb.XGBRegressor(subsample=0.6, reg_lambda=0.01, n_estimators=200, max_depth=4, learning_rate=0.1)
model3 = xgb.XGBRegressor(subsample=0.8, reg_lambda=0.01, n_estimators=100, max_depth=4, learning_rate=0.2)
model4 = xgb.XGBRegressor(subsample=0.8, reg_lambda=0.5, n_estimators=95, max_depth=3, learning_rate=0.05)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)

y_pred_val = model1.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 1: {mse_val}')

y_pred_val = model2.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 2: {mse_val}')

y_pred_val = model3.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 3: {mse_val}')

y_pred_val = model4.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f'Validation Mean Squared Error for Model 4: {mse_val}')

y_pred_test = model4.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f'Test Mean Squared Error for Model 4: {mse_test}')

Validation Mean Squared Error for Model 1: 662.9258639564487
Validation Mean Squared Error for Model 2: 733.1837182378528
Validation Mean Squared Error for Model 3: 649.4374298489231
Validation Mean Squared Error for Model 4: 569.2347972814398
Test Mean Squared Error for Model 4: 583.1017755011026
