In [63]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV


In [46]:
# Load the dataset workbook into a pandas DataFrame
df = pd.read_excel(io='../data/Dataset_Number2&3a.xlsx')

# Preview the top five rows to sanity-check the columns that were read
df.head(n=5)

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,female,Group B,master's degree,standard,none,single,sometimes,yes,4,school_bus,< 5,87,93.0,91
1,male,Group C,some college,standard,none,married,sometimes,yes,0,school_bus,5-10,76,78.0,75
2,female,Group B,some college,standard,completed,widowed,never,no,1,private,5-10,85,93.0,89
3,male,Group B,some college,free/reduced,none,married,sometimes,yes,1,private,> 10,41,43.0,39
4,male,Group D,high school,free/reduced,completed,single,sometimes,no,3,private,> 10,65,64.0,68


In [47]:
# Columns holding categorical values that get replaced by integer codes
categorical_columns = [
    'Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep',
    'ParentMaritalStatus', 'PracticeSport', 'IsFirstChild', 'TransportMeans',
    'WklyStudyHours',
]

# One dedicated encoder per column, kept in a dict keyed by column name so the
# fitted encoder for any column can be looked up later
label_encoders = {col: LabelEncoder() for col in categorical_columns}

# Fit every encoder on its own column and overwrite that column in df with the
# resulting integer codes
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [48]:
# Separate features (X) and target variable (y).
# MathScore is the target; all three score columns are removed from the
# features so the model never sees an exam score as an input.
X = df.drop(columns=['MathScore', 'ReadingScore', 'WritingScore'])

# 'Unnamed: 0' appears in the df.head() preview with values equal to the row
# labels, i.e. it looks like a saved row index rather than a student attribute.
# Keeping it would hand the model a row identifier as a predictor, so it is
# removed here. errors='ignore' keeps the cell working if the column is absent
# (for example when the file is loaded with index_col=0).
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

y = df['MathScore']

In [50]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state is pinned so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [53]:
# Feature Scaling
# StandardScaler is documented under sklearn.preprocessing. The previous import
# from sklearn.discriminant_analysis only worked because that module happens to
# import the class for its own use, which is not a stable public location.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the scaling statistics from the training split only ...
X_train_scaled = scaler.fit_transform(X_train)
# ... and reuse those same statistics on the test split, so nothing is fitted
# on the held-out rows
X_test_scaled = scaler.transform(X_test)

In [54]:
# Search space for the grid search: every combination of the values below is
# evaluated (3 values for each of the 5 hyperparameters)
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1],
    colsample_bytree=[0.8, 0.9, 1],
)

In [55]:
# Base estimator: XGBoost regressor with a squared-error objective and RMSE as
# its evaluation metric
model = xgb.XGBRegressor(eval_metric='rmse', objective='reg:squarederror')

# Exhaustive search over param_grid with 3-fold cross-validation; candidates
# are scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [57]:
# Take the winning estimator straight from the finished search; nothing is
# fitted in this cell
best_model = grid_search.best_estimator_

# Report the hyperparameter combination the search selected
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Predict the target for the scaled test features
y_pred = best_model.predict(X_test_scaled)

Best parameters: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Mean Squared Error: 162.37770294433622
Root Mean Squared Error (RMSE): 12.742750995932402


In [65]:
# Compute all test-set metrics first, then report them together.
mae = mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = mean_squared_error(y_test, y_pred)    # mean squared error
rmse = np.sqrt(mse)                         # root of the MSE
r2 = r2_score(y_test, y_pred)               # coefficient of determination

# Print each metric rounded to four decimal places
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² (Coefficient of Determination): {r2:.4f}")

Mean Absolute Error (MAE): 10.2992
Mean Squared Error (MSE): 162.3777
Root Mean Squared Error (RMSE): 12.7428
R² (Coefficient of Determination): 0.3109


(3810, 11)