In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Preprocessed_Data_Final.xls")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449833 entries, 0 to 1449832
Data columns (total 50 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Unnamed: 0                 1449833 non-null  int64  
 1   StationId                  1449833 non-null  object 
 2   StationId_enc              1449833 non-null  int64  
 3   PM2.5                      1449833 non-null  float64
 4   NO2                        1449833 non-null  float64
 5   CO                         1449833 non-null  float64
 6   SO2                        1449833 non-null  float64
 7   O3                         1449833 non-null  float64
 8   AQI                        1449833 non-null  float64
 9   Temperature                1449833 non-null  float64
 10  DewPoint                   1449833 non-null  float64
 11  WindDirection              1449833 non-null  float64
 12  WindSpeed                  1449833 non-null  float64
 13  Pressure    

In [22]:
df.sample(3)

Unnamed: 0.1,Unnamed: 0,StationId,StationId_enc,PM2.5,NO2,CO,SO2,O3,AQI,Temperature,...,Year,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,Season_PostMonsoon,Season_Summer,Season_Winter
864257,866033,TG003,75,43.5,9.38,0.24,12.32,38.2,125.0,24.0,...,2019,0,1,0,0,0,0,0,0,1
702478,704206,DL023,28,197.0,63.75,1.3,8.21,4.37,335.0,16.6,...,2018,1,0,0,0,0,0,1,0,0
1355291,1357523,TN001,79,0.22,10.37,0.74,6.37,22.16,69.0,29.0,...,2019,0,0,0,0,1,0,1,0,0


In [4]:
# STEP 12 — Time-based train/test split

# Ensure sorted (safe)
df = df.sort_values("Datetime").reset_index(drop=True)

# 85% Train, 15% Test
split_idx = int(len(df) * 0.85)

train_df = df.iloc[:split_idx]
test_df  = df.iloc[split_idx:]

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)

Train size: (1232358, 50)
Test size: (217475, 50)


In [5]:
# Define target
y_train = train_df["AQI"]
y_test  = test_df["AQI"]

# Define features (drop non-feature columns)
X_train = train_df.drop(columns=["AQI", "StationId", "Datetime"])
X_test  = test_df.drop(columns=["AQI", "StationId", "Datetime"])

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (1232358, 47)
X_test: (217475, 47)
y_train: (1232358,)
y_test: (217475,)


In [6]:
# SAVE FEATURE COLUMNS (VERY IMPORTANT)
# Ensure X_train exists and contains final training features
import pickle
feature_cols = X_train.columns.tolist()

with open("feature_cols_new.pkl", "wb") as f:
    pickle.dump(feature_cols, f)

print("Feature columns saved as feature_cols.pkl")

Feature columns saved as feature_cols.pkl


# Model 

In [29]:
import pandas as pd
import numpy as np

In [30]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
results = {}

def evaluate(name, model, X_test, y_test):
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae  = mean_absolute_error(y_test, preds)
    r2   = r2_score(y_test, preds)
    results[name] = [rmse, mae, r2]

In [31]:
from sklearn.linear_model import LinearRegression

# 1. Linear Regression (baseline)
lr = LinearRegression(n_jobs=-1)
lr.fit(X_train, y_train)
evaluate("Linear Regression", lr, X_test, y_test)

In [11]:
from sklearn.ensemble import RandomForestRegressor

# 2. Random Forest
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)
evaluate("Random Forest", rf, X_test, y_test)


In [32]:
from catboost import CatBoostRegressor

# 5. CatBoost (BEST MODEL)
cat = CatBoostRegressor(
    depth=10,
    learning_rate=0.05,
    iterations=300,
    loss_function="RMSE",
    random_state=42,
    verbose=100
)
cat.fit(X_train, y_train)
evaluate("CatBoost", cat, X_test, y_test)

0:	learn: 142.4889376	total: 528ms	remaining: 2m 37s
100:	learn: 59.2880241	total: 46.1s	remaining: 1m 30s
200:	learn: 53.9363587	total: 1m 35s	remaining: 47.1s
299:	learn: 50.7059759	total: 2m 23s	remaining: 0us


In [22]:
# Compare results
df_results = pd.DataFrame(results, index=["RMSE", "MAE", "R2"]).T
print(df_results)

                        RMSE        MAE        R2
Linear Regression  58.360781  35.484673  0.844038
CatBoost           56.144475  18.491473  0.855659


# For LightGBM

In [16]:
df = pd.read_csv("Preprocessed_Data_Final.csv")

In [21]:
# Fix LightGBM unsupported characters in column names

df.columns = (
    df.columns
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "_", regex=False)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace("\\", "_", regex=False)
    .str.replace(":", "_", regex=False)
)

print("Renamed columns (Safe for LightGBM):")
print(df.columns.tolist())


Renamed columns (Safe for LightGBM):
['Unnamed__0', 'StationId', 'StationId_enc', 'PM2_5', 'NO2', 'CO', 'SO2', 'O3', 'AQI', 'Temperature', 'DewPoint', 'WindDirection', 'WindSpeed', 'Pressure', 'RelativeHumidity', 'PM25_lag1', 'PM25_lag3', 'PM25_lag6', 'PM25_lag24', 'NO2_lag1', 'O3_lag1', 'PM25_roll3', 'PM25_roll6', 'PM25_roll12', 'PM25_roll24', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Pollution_Load', 'PM_Ratio', 'Temp_Humidity_Interaction', 'Wind_Inv', 'Datetime', 'Hour', 'DayOfWeek', 'IsWeekend', 'Month', 'Quarter', 'DayOfYear', 'Year', 'Day_Monday', 'Day_Saturday', 'Day_Sunday', 'Day_Thursday', 'Day_Tuesday', 'Day_Wednesday', 'Season_PostMonsoon', 'Season_Summer', 'Season_Winter']


In [22]:
# STEP 12 — Time-based train/test split

# Ensure sorted (safe)
df = df.sort_values("Datetime").reset_index(drop=True)

# 85% Train, 15% Test
split_idx = int(len(df) * 0.85)

train_df = df.iloc[:split_idx]
test_df  = df.iloc[split_idx:]

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)

Train size: (1232358, 50)
Test size: (217475, 50)


In [23]:
# Define target
y_train = train_df["AQI"]
y_test  = test_df["AQI"]

# Define features (drop non-feature columns)
X_train = train_df.drop(columns=["AQI", "StationId", "Datetime"])
X_test  = test_df.drop(columns=["AQI", "StationId", "Datetime"])

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (1232358, 47)
X_test: (217475, 47)
y_train: (1232358,)
y_test: (217475,)


In [24]:
from lightgbm import LGBMRegressor

# 4. LightGBM
lgbm = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
lgbm.fit(X_train, y_train)
evaluate("LightGBM", lgbm, X_test, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6907
[LightGBM] [Info] Number of data points in the train set: 1232358, number of used features: 47
[LightGBM] [Info] Start training from score 189.197461


In [25]:
# Compare results
df_results = pd.DataFrame(results, index=["RMSE", "MAE", "R2"]).T
print(df_results)

                        RMSE        MAE        R2
Linear Regression  58.360781  35.484673  0.844038
Random Forest      63.948947  36.221569  0.812741
XGBoost            53.423485  28.889954  0.869311
CatBoost           59.722744  36.524684  0.836674
LightGBM           59.551926  33.884292  0.837598


In [26]:
import joblib

feature_cols = X_train.columns.tolist()
joblib.dump(feature_cols, "feature_cols.pkl")

print("Saved feature columns:", len(feature_cols))


Saved feature columns: 47


In [27]:
import pickle

# SAVE TRAINED MODEL 
# Example: model = CatBoostRegressor() ... model.fit(...)
with open("Linear_Reg.pkl", "wb") as f:
    pickle.dump(lr, f)

with open("Random_Forest.pkl", "wb") as f:
    pickle.dump(rf, f)

with open("XGBoost.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

with open("CatBoost.pkl", "wb") as f:
    pickle.dump(cat, f)

with open("LightGBM.pkl", "wb") as f:
    pickle.dump(lgbm, f)



# Optimized CatBoost

In [9]:
import joblib
optimized_model = joblib.load("CatBoost_Optimized.pkl")


In [33]:
evaluate("O_CatBoost", optimized_model, X_test, y_test)

In [34]:
df_results = pd.DataFrame(results, index=["RMSE", "MAE", "R2"]).T
print(df_results)

                        RMSE        MAE        R2
Linear Regression  58.360781  35.484673  0.844038
CatBoost           59.722744  36.524684  0.836674
O_CatBoost         56.144475  18.491473  0.855659
