# Setup environment

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

try:
    import lightgbm as lgb
except:
    print("lightgbm is not installed")

DATA_DIR = "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/merged-data/by-station"
STATION_IDS = ["211", "212", "213", "214", "215", "216"]

BASE_FEATURE_COLS = [
    "NO2_quantrac",
    "PM25_quantrac",
    "O3_quantrac",
    "CO_quantrac",
    "Temperature_quantrac",
    "Humid_quantrac",
    "SO2_quantrac"
]

label = "NO2_quantrac"

# Define lag and rolling windows for feature engineering
LAG_STEPS = [1, 2, 3, 6, 12, 24, 48, 72]
ROLL_WINDOWS = [3, 6, 12, 24]

# Define horizon (= n_past = n_future)
HORIZONS = list(range(1, 73))

RANDOM_STATE = 42

In [2]:
import lightgbm
print(lightgbm.__version__)

4.6.0


In [3]:
def load_station_csv(station_id, data_dir=DATA_DIR):
    csv_path = os.path.join(data_dir, f"{station_id}.csv")
    df = pd.read_csv(csv_path)

    # Convert to datetime
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date").reset_index(drop=True)

    # Remove unncessary column
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # Add station_id column
    df["station_id"] = int(station_id)
    return df

# Load dataset
dfs = []
for sid in STATION_IDS:
    df_sid = load_station_csv(sid, DATA_DIR)
    dfs.append(df_sid)
df_all = pd.concat(dfs, ignore_index=True)

# Sort data by station then date
df_all = df_all.sort_values(["station_id", "date"]).reset_index(drop=True)

# Remove uncessary features
df_all = df_all.loc[:, ["date"] + BASE_FEATURE_COLS + ["station_id"]]

print(df_all.shape)
print(df_all.columns)
df_all

(92966, 9)
Index(['date', 'NO2_quantrac', 'PM25_quantrac', 'O3_quantrac', 'CO_quantrac',
       'Temperature_quantrac', 'Humid_quantrac', 'SO2_quantrac', 'station_id'],
      dtype='object')


Unnamed: 0,date,NO2_quantrac,PM25_quantrac,O3_quantrac,CO_quantrac,Temperature_quantrac,Humid_quantrac,SO2_quantrac,station_id
0,2021-02-23 21:00:00,112.740762,15.604762,55.431381,90.000000,28.361905,63.188095,,211
1,2021-02-23 22:00:00,112.366471,14.594118,58.197176,1200.603529,28.320588,63.773529,,211
2,2021-02-23 23:00:00,112.700433,13.436667,55.029433,1177.897000,28.336667,64.205000,,211
3,2021-02-24 00:00:00,112.480867,12.365000,54.767700,90.000000,28.305000,64.735000,,211
4,2021-02-24 01:00:00,114.331500,11.636667,53.786200,90.000000,28.300000,65.188333,,211
...,...,...,...,...,...,...,...,...,...
92961,2022-12-31 19:00:00,0.088233,41.450000,75.916667,2.087167,27.690000,67.290000,0.105667,216
92962,2022-12-31 20:00:00,0.083217,50.530000,66.116667,2.006500,27.401667,68.026667,0.104167,216
92963,2022-12-31 21:00:00,0.084300,55.055000,55.133333,2.418833,27.603333,66.965000,0.130167,216
92964,2022-12-31 22:00:00,0.066650,26.078333,46.750000,1.481500,27.036667,63.361667,0.075833,216


In [4]:
def add_time_features(df, time_col="date"):
    df = df.copy()
    df["hour"] = df[time_col].dt.hour
    df["dayofweek"] = df[time_col].dt.dayofweek
    df["month"] = df[time_col].dt.month
    return df

# Add lag features for all FEATURES including LABEL
def add_lag_features(df, group_col, target_cols, lag_steps):
    df = df.copy()
    df = df.sort_values(["station_id", "date"])

    for col in target_cols:
        for lag in lag_steps:
            df[f"{col}_lag{lag}"] = df.groupby(group_col)[col].shift(lag)
    return df

# Just add the rolling features for LABEL
def add_rolling_features(df, group_col, target_cols, windows):
    assert isinstance(target_cols, list), "target_cols should be a list"

    df = df.copy()
    df = df.sort_values(["station_id", "date"])

    for col in target_cols:
        for w in windows:
            df[f"{col}_roll{w}_mean"] = (
                df.groupby(group_col)[col]
                  .shift(1) # use the past, avoid leaking
                  .rolling(window=w, min_periods=1)
                  .mean()
                  .reset_index(level=0, drop=True)
            )
            df[f"{col}_roll{w}_std"] = (
                df.groupby(group_col)[col]
                  .shift(1)
                  .rolling(window=w, min_periods=1)
                  .std()
                  .reset_index(level=0, drop=True)
            )
    return df

# Add time features and lag feature for all gases
# df_no2 = add_time_features(df_all)
# df_no2 = add_lag_features(df_no2, group_col="station_id", target_cols=BASE_FEATURE_COLS, lag_steps=LAG_STEPS)
# df_no2 = add_rolling_features(df_no2, group_col="station_id", target_cols=[label], windows=ROLL_WINDOWS)

# df_no2

In [None]:
df_no2 = add_time_features(df_all)
df_no2 = add_lag_features(df_no2, group_col="station_id", target_cols=BASE_FEATURE_COLS, lag_steps=LAG_STEPS)
df_no2 = add_rolling_features(df_no2, group_col="station_id", target_cols=[label], windows=ROLL_WINDOWS)

df_no2

In [7]:
df_no2 = add_time_features(df_all)
df_no2 = add_lag_features(df_no2, group_col="station_id", target_cols=BASE_FEATURE_COLS, lag_steps=LAG_STEPS)
df_no2 = add_rolling_features(df_no2, group_col="station_id", target_cols=[label], windows=ROLL_WINDOWS)

df_no2.shape

(92966, 76)

In [8]:
# Reframe past future
def build_supervised_for_horizon(df, horizon_h, target_col):
    # Sort data by station_id then date
    df = df.copy()
    df = df.sort_values(["station_id", "date"])

    # Define name of target column
    target_name = f"{target_col}_t_plus_{horizon_h}h"

    # Mục tiêu: NO2 tại tương lai t+h
    df[target_name] = df.groupby("station_id")[target_col].shift(-horizon_h)

    # Define feature columns
    ignore_cols = ["date", target_col, target_name]
    feature_cols = [c for c in df.columns if c not in ignore_cols]

    # Remove duplicated columns (if possible)
    feature_cols = list(dict.fromkeys(feature_cols))

    # Do not use station_id as feature
    feature_cols_no_sid = [c for c in feature_cols if c != "station_id"]
    data = df[["date", "station_id"] + feature_cols_no_sid + [target_name]].dropna()

    X = data[feature_cols_no_sid]
    y = data[target_name]
    meta = data[["date", "station_id"]]

    return X, y, meta, feature_cols_no_sid

# Example
X_h1, y_h1, meta_h1, feat_cols = build_supervised_for_horizon(df_no2, horizon_h=72, target_col=label)
print("Horizon 1h:", X_h1.shape, y_h1.shape)
print(X_h1.columns)
display(X_h1)
display(y_h1)

Horizon 1h: (5798, 73) (5798,)
Index(['PM25_quantrac', 'O3_quantrac', 'CO_quantrac', 'Temperature_quantrac',
       'Humid_quantrac', 'SO2_quantrac', 'hour', 'dayofweek', 'month',
       'NO2_quantrac_lag1', 'NO2_quantrac_lag2', 'NO2_quantrac_lag3',
       'NO2_quantrac_lag6', 'NO2_quantrac_lag12', 'NO2_quantrac_lag24',
       'NO2_quantrac_lag48', 'NO2_quantrac_lag72', 'PM25_quantrac_lag1',
       'PM25_quantrac_lag2', 'PM25_quantrac_lag3', 'PM25_quantrac_lag6',
       'PM25_quantrac_lag12', 'PM25_quantrac_lag24', 'PM25_quantrac_lag48',
       'PM25_quantrac_lag72', 'O3_quantrac_lag1', 'O3_quantrac_lag2',
       'O3_quantrac_lag3', 'O3_quantrac_lag6', 'O3_quantrac_lag12',
       'O3_quantrac_lag24', 'O3_quantrac_lag48', 'O3_quantrac_lag72',
       'CO_quantrac_lag1', 'CO_quantrac_lag2', 'CO_quantrac_lag3',
       'CO_quantrac_lag6', 'CO_quantrac_lag12', 'CO_quantrac_lag24',
       'CO_quantrac_lag48', 'CO_quantrac_lag72', 'Temperature_quantrac_lag1',
       'Temperature_quantrac_lag2'

Unnamed: 0,PM25_quantrac,O3_quantrac,CO_quantrac,Temperature_quantrac,Humid_quantrac,SO2_quantrac,hour,dayofweek,month,NO2_quantrac_lag1,...,SO2_quantrac_lag48,SO2_quantrac_lag72,NO2_quantrac_roll3_mean,NO2_quantrac_roll3_std,NO2_quantrac_roll6_mean,NO2_quantrac_roll6_std,NO2_quantrac_roll12_mean,NO2_quantrac_roll12_std,NO2_quantrac_roll24_mean,NO2_quantrac_roll24_std
84278,15.130000,43.433333,0.624833,29.078333,62.088333,0.050500,0,1,1,0.063250,...,0.073833,0.088139,0.061544,0.001788,0.066033,0.006117,0.057625,0.012378,0.065115,0.012229
84279,12.840000,52.400000,0.642167,28.348333,85.075000,0.050667,1,1,1,0.061917,...,0.047966,0.065333,0.061617,0.001802,0.063647,0.003629,0.059007,0.011793,0.064784,0.012202
84280,9.021667,46.983333,0.363500,27.545000,80.896667,0.024000,2,1,1,0.075300,...,0.039167,0.053167,0.066822,0.007372,0.065308,0.006037,0.061811,0.011281,0.065159,0.012387
84281,7.460000,66.366667,0.322833,24.828333,93.530000,0.019167,3,1,1,0.059317,...,0.031500,0.045167,0.065511,0.008577,0.063528,0.005952,0.062094,0.011162,0.064624,0.012349
84282,6.306667,68.300000,0.512667,25.085000,95.483333,0.030000,4,1,1,0.077950,...,0.029000,0.044500,0.070856,0.010080,0.066236,0.008219,0.065560,0.008612,0.065071,0.012638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92889,32.163333,89.283333,2.196333,29.625000,58.043333,0.119667,19,2,12,0.093567,...,0.072000,0.087333,0.084544,0.011076,0.060633,0.028507,0.067747,0.029623,0.080519,0.024886
92890,37.030000,116.950000,2.206833,28.140000,71.948333,0.103333,20,2,12,0.094700,...,0.066167,0.113000,0.092050,0.003653,0.072367,0.024810,0.068039,0.029891,0.080944,0.025044
92891,40.385000,120.166667,1.735333,27.395000,75.831667,0.055833,21,2,12,0.115133,...,0.038000,0.126333,0.101133,0.012138,0.082928,0.027616,0.069067,0.031370,0.082074,0.025971
92892,38.216667,105.000000,1.451333,26.738333,79.206667,0.046500,22,2,12,0.112750,...,0.039167,0.097833,0.107528,0.011173,0.096036,0.016046,0.070396,0.032950,0.082821,0.026604


84278    0.070547
84279    0.070900
84280    0.069633
84281    0.067533
84282    0.067383
           ...   
92889    0.088233
92890    0.083217
92891    0.084300
92892    0.066650
92893    0.067783
Name: NO2_quantrac_t_plus_72h, Length: 5798, dtype: float64

In [9]:
def train_test_validation_split(X, y, meta, train_ratio=0.7, val_ratio=0.15):
    n = len(X)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]
    meta_train = meta.iloc[:train_end]

    X_val = X.iloc[train_end:val_end]
    y_val = y.iloc[train_end:val_end]
    meta_val = meta.iloc[train_end:val_end]

    X_test = X.iloc[val_end:]
    y_test = y.iloc[val_end:]
    meta_test = meta.iloc[val_end:]

    return (X_train, y_train, meta_train, X_val, y_val, meta_val, X_test, y_test, meta_test)


In [11]:
testmodel = lgb.Booster(model_file='../lightgbm_results/2025_12_02-13_08_05/NO2_quantrac_lightgbm_72h')
horizon_h = 72
target_col = "NO2_quantrac"
station_id = 216
n_points = 300

# Prepare data for reference
X, y, meta, feat_cols = build_supervised_for_horizon(df_no2, horizon_h, target_col)
(_, _, _,
 _, _, _,
 X_test, y_test, meta_test) = train_test_validation_split(X, y, meta)
print(X_test.shape, y_test.shape)

# Get the data of station
mask = (meta_test["station_id"] == station_id)
X_test_sid = X_test[mask]
y_test_sid = y_test[mask]
meta_sid = meta_test[mask]
assert len(X_test_sid) != 0, f"No test sample for station {station_id}"

# Prediction
y_pred_sid = testmodel.predict(X_test_sid, num_iteration=getattr(testmodel, "best_iteration", None))

# Plotting
X_plot = meta_sid.iloc[-n_points:]
y_true_plot = y_test_sid.iloc[-n_points:]
y_pred_plot = y_pred_sid[-n_points:]

plt.figure(figsize=(14, 5))
plt.plot(X_plot["date"], y_true_plot, label=f"Actual {target_col}", linewidth=1.5)
plt.plot(X_plot["date"], y_pred_plot, label=f"Predicted {target_col} (LightGBM)", linestyle="--")
plt.title(f"Station {station_id} - Horizon {horizon_h}h")
plt.xlabel("Time")
plt.ylabel(f"{target_col} (µg/m³)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

(870, 73) (870,)


[LightGBM] [Fatal] The number of features in data (73) is not the same as it was in training data (77).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.


LightGBMError: The number of features in data (73) is not the same as it was in training data (77).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.