In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [2]:
# Load the student lifestyle dataset, using the Student_ID column as the
# row index so it is not treated as a feature downstream.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file exists at this exact location.
data_path = '/hdd/data/adp_data/student_lifestyle_dataset.csv'
df = pd.read_csv(
    data_path,
    index_col='Student_ID',
)

# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0_level_0,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,Stress_Level,Gender,Grades
Student_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,6.9,3.8,8.7,2.8,1.8,Moderate,Male,7.48
2,5.3,3.5,8.0,4.2,3.0,Low,Female,6.88
3,5.1,3.9,9.2,1.2,4.6,Low,Male,6.68
4,6.5,2.1,7.2,1.7,6.5,Moderate,Male,7.2
5,8.1,0.6,6.5,2.2,6.6,High,Male,8.78


In [3]:
# One-hot encode the two categorical columns. drop_first=True omits one
# indicator column per encoded feature, so the remaining indicators are
# interpreted relative to the dropped level.
categorical_cols = ['Stress_Level', 'Gender']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Target is the Grades column; every other column is a predictor.
y = df_encoded['Grades']
X = df_encoded.drop('Grades', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=526,
)

In [None]:
# Baseline comparison: fit each regressor as configured below (no tuning) on
# the training split and report RMSE and R^2 on the held-out test split.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=526),
    'XGBoost': XGBRegressor(random_state=526, verbosity=0),
}

divider = '-' * 30  # visual separator printed after each model's metrics

for name, model in models.items():
    # Train on the training split, then predict the unseen test rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # R^2 and RMSE are both computed from the same test-set predictions.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    print(f"{name} - RMSE: {rmse:.2f}, R^2: {r2:.2f}")
    print(divider)

Linear Regression - RMSE: 0.49, R^2: 0.54
------------------------------
Random Forest - RMSE: 0.52, R^2: 0.49
------------------------------
XGBoost - RMSE: 0.56, R^2: 0.39
------------------------------


In [7]:
# Hyperparameter search for XGBoost: exhaustive grid over 3*3*3*2*2 = 108
# combinations, each scored by 5-fold cross-validated R^2 on the training
# split only. The test split is touched once, at the end, for the final report.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 3, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# n_jobs=1 on the estimator is deliberate: GridSearchCV below already runs
# candidate fits in parallel on every core (n_jobs=-1). Leaving XGBoost at its
# default (use all available threads per fit) would stack a full thread pool
# inside every worker and oversubscribe the CPU, slowing the search down.
xgb = XGBRegressor(random_state = 526, n_jobs = 1)
grid_search = GridSearchCV(xgb, param_grid, cv = 5, scoring = 'r2', n_jobs = -1, verbose = 1)
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated R^2 of the winning combination.
print('Best Params:', grid_search.best_params_)
print('Best R2 Score (CV):', grid_search.best_score_)

# best_estimator_ is the winning configuration refit on the full training
# split (GridSearchCV's default refit=True); evaluate it on the hold-out set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'XGBoost Tuned - RMSE: {rmse:.2f}, R^2: {r2:.2f}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50, 'subsample': 1.0}
Best R2 Score (CV): 0.5286591860549876
XGBoost Tuned - RMSE: 0.49, R^2: 0.54
