In [2]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[0]

sys.path.insert(0, str(PROJECT_ROOT))

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

import lightgbm as lgb

In [4]:
DATA_TRAIN = Path("../data/processed/dataset_train.csv")
PIPELINE_PATH = Path("../models/fe_pipeline.joblib")

MODEL_BASELINE_PATH = Path("../models/ridge_baseline.joblib")
MODEL_MAIN_PATH = Path("../models/lightgbm_model.joblib")

DROP_COLS = ["age", "gender", "internet_access", "course", "exam_difficulty", "id"]

#### Load FEATURE ENGINEERING PIPELINE

In [5]:
from src.feature_engineering import add_features

pipeline = joblib.load(PIPELINE_PATH)

In [6]:
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "dataset_train.csv"
df = pd.read_csv(DATA_PATH)

print(df.shape)
df.head()

(630000, 14)


Unnamed: 0,study_hours,class_attendance,sleep_hours,study_attend_product,sleep_deficit,study_hours_sq,sleep_attend_product,sleep_quality,facility_rating,study_method_group_study,study_method_mixed,study_method_online_videos,study_method_self-study,exam_score
0,1.655875,1.538302,-1.245269,2.535425,1.245269,2.093542,-0.143196,1.0,0.0,0.0,0.0,1.0,0.0,78.3
1,0.401573,1.308814,-1.359895,0.919045,1.359895,0.148905,-0.356236,0.0,1.0,0.0,0.0,0.0,1.0,46.7
2,0.28716,1.182595,-0.729454,0.733247,0.729454,0.016075,0.149401,0.0,2.0,0.0,0.0,0.0,0.0,99.0
3,-0.848492,-1.290141,0.703367,-0.997639,-0.703367,-0.898495,-0.548004,1.0,2.0,1.0,0.0,0.0,0.0,63.9
4,1.545699,0.855575,1.448434,1.931198,-1.448434,1.886867,1.791175,2.0,2.0,0.0,0.0,0.0,1.0,100.0


In [7]:
TARGET = "exam_score"

X = df.drop(columns=[TARGET])
y = df[TARGET]

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

print(X_train.shape, X_valid.shape)

(504000, 13) (126000, 13)


# Baseline: Ridge Regression

In [9]:
ridge = Ridge(alpha=1.0)

ridge.fit(X_train, y_train)

ridge_pred = ridge.predict(X_valid)
rmse_ridge = np.sqrt(mean_squared_error(y_valid, ridge_pred))

print(f"[Baseline | Ridge] RMSE = {rmse_ridge:.4f}")

[Baseline | Ridge] RMSE = 8.8868


# Main: LightGBM

In [10]:
lgbm = lgb.LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.02,
    num_leaves=127,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_split_gain=0.01,
    random_state=42,
)

lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="rmse",
    callbacks=[
        lgb.early_stopping(100),
        lgb.log_evaluation(50),
    ],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1402
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] Start training from score 62.482335
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 11.2105	valid_0's l2: 125.676
[100]	valid_0's rmse: 9.37607	valid_0's l2: 87.9106
[150]	valid_0's rmse: 8.96569	valid_0's l2: 80.3836
[200]	valid_0's rmse: 8.85839	valid_0's l2: 78.471
[250]	valid_0's rmse: 8.82191	valid_0's l2: 77.8261
[300]	valid_0's rmse: 8.80255	valid_0's l2: 77.4849
[350]	valid_0's rmse: 8.79093	valid_0's l2: 77.2804
[400]	valid_0's rmse: 8.78163	valid_0's l2: 77.1171
[450]	valid_0's rmse: 8.77472	valid_0's l2: 76.9957
[500]	valid_0's rmse: 8.77074	valid_0's l2: 76.9258
[550]	valid_0's rmse: 8.76626	valid_0's l2: 76.8473
[600]	valid_0's rmse: 8.

0,1,2
,boosting_type,'gbdt'
,num_leaves,127
,max_depth,-1
,learning_rate,0.02
,n_estimators,3000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.01
,min_child_weight,0.001


In [13]:
import pandas as pd

importance = lgbm.feature_importances_
feature_names = X.columns  # từ dataset_train.csv

fi = pd.DataFrame({
    "feature": feature_names,
    "importance": importance
}).sort_values(by="importance", ascending=False)

fi.head(10)


Unnamed: 0,feature,importance
0,study_hours,35799
1,class_attendance,34202
6,sleep_attend_product,27629
3,study_attend_product,26810
2,sleep_hours,23909
5,study_hours_sq,10781
7,sleep_quality,9166
8,facility_rating,8862
4,sleep_deficit,5564
12,study_method_self-study,4811
