# LightGBM

## Load preprocessed data


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder


DATA_CLEAN = Path("../data/interim/data_cleaned.csv")
TEST_RAW = Path("../data/raw/test.csv")
SUBMISSION_OUT = Path("../data/processed/submission_lightgbm.csv")


DROP_COLS = ["age", "gender", "internet_access", "course", "exam_difficulty"]
TARGET = "exam_score"

NUM_BASE = ["study_hours", "class_attendance", "sleep_hours"]
CAT_ONEHOT = ["study_method"]
CAT_ORDINAL = ["sleep_quality", "facility_rating"]

train_df_raw = pd.read_csv(DATA_CLEAN)
train_df = train_df_raw.drop(columns=DROP_COLS)

train_df["study_attend_product"] = train_df["study_hours"] * train_df["class_attendance"]
train_df["sleep_deficit"] = 8 - train_df["sleep_hours"]

NUM_FEATURES = NUM_BASE + ["study_attend_product", "sleep_deficit"]

le_dict = {}
for col in CAT_ONEHOT + CAT_ORDINAL:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    le_dict[col] = le

X = train_df[NUM_FEATURES + CAT_ONEHOT + CAT_ORDINAL]
y = train_df[TARGET]

train_df[NUM_FEATURES + CAT_ONEHOT + CAT_ORDINAL].head()

Unnamed: 0,study_hours,class_attendance,sleep_hours,study_attend_product,sleep_deficit,study_method,sleep_quality,facility_rating
0,7.91,98.8,4.9,781.508,3.1,3,0,1
1,4.95,94.8,4.7,469.26,3.3,4,2,2
2,4.68,92.6,5.8,433.368,2.2,0,2,0
3,2.0,49.5,8.3,99.0,-0.3,1,0,0
4,7.65,86.9,9.6,664.785,-1.6,4,1,0


## Train/validation split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train: {X_train.shape}, Valid: {X_valid.shape}")

Train: (504000, 8), Valid: (126000, 8)


## Train LightGBM

In [6]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

lgbm = lgb.LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.02,
    num_leaves=127,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_split_gain=0.01,
    random_state=42,
)

lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="rmse",
    callbacks=[
        lgb.early_stopping(100),
        lgb.log_evaluation(50),
    ],
)

valid_pred = lgbm.predict(X_valid, num_iteration=lgbm.best_iteration_)
mse = mean_squared_error(y_valid, valid_pred)
rmse = np.sqrt(mse)
print(f"\n{'='*50}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Best iteration: {lgbm.best_iteration_}")
print(f"{'='*50}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 885
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 8
[LightGBM] [Info] Start training from score 62.482335
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 11.1268	valid_0's l2: 123.805
[100]	valid_0's rmse: 9.31772	valid_0's l2: 86.8198
[150]	valid_0's rmse: 8.92793	valid_0's l2: 79.708
[200]	valid_0's rmse: 8.83594	valid_0's l2: 78.0739
[250]	valid_0's rmse: 8.80555	valid_0's l2: 77.5378
[300]	valid_0's rmse: 8.79141	valid_0's l2: 77.2888
[350]	valid_0's rmse: 8.78197	valid_0's l2: 77.123
[400]	valid_0's rmse: 8.77525	valid_0's l2: 77.005
[450]	valid_0's rmse: 8.77	valid_0's l2: 76.9129
[500]	valid_0's rmse: 8.76583	valid_0's l2: 76.8398
[550]	valid_0's rmse: 8.76263	valid_0's l2: 76.7836
[600]	valid_0's rmse: 8.76005	v

## Generate predictions for test set

In [None]:
best_iter = lgbm.best_iteration_ or 3000

lgbm_full = lgb.LGBMRegressor(
    n_estimators=best_iter,
    learning_rate=0.02,
    num_leaves=127,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_split_gain=0.01,
    random_state=42,
)
lgbm_full.fit(X, y)

# Load and preprocess test data
test_raw = pd.read_csv(TEST_RAW)

# Drop columns
test_df = test_raw.drop(columns=DROP_COLS)

# Apply same feature engineering
test_df['study_attend_product'] = test_df['study_hours'] * test_df['class_attendance']
test_df['sleep_deficit'] = 8 - test_df['sleep_hours']

# Encode categorical features using the same encoders
for col in CAT_ONEHOT + CAT_ORDINAL:
    test_df[col] = le_dict[col].transform(test_df[col])

# Select features
X_test = test_df[NUM_FEATURES + CAT_ONEHOT + CAT_ORDINAL]

# Predict
submission = pd.DataFrame({
    "id": test_raw["id"],
    "exam_score": lgbm_full.predict(X_test),
})

SUBMISSION_OUT.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(SUBMISSION_OUT, index=False)

print(f"Submission saved to {SUBMISSION_OUT}")
submission.head()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 885
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 8
[LightGBM] [Info] Start training from score 62.506672
Submission saved to ../data/processed/submission_lightgbm.csv


Unnamed: 0,id,exam_score
0,630000,70.416394
1,630001,70.488209
2,630002,87.612391
3,630003,57.307602
4,630004,47.541229
