In [None]:
!pip install hopsworks[python] lightgbm requests matplotlib pandas numpy scikit-learn


Collecting hopsworks[python]
  Downloading hopsworks-4.3.2-py3-none-any.whl.metadata (11 kB)
Collecting pyhumps==1.6.1 (from hopsworks[python])
  Downloading pyhumps-1.6.1-py3-none-any.whl.metadata (3.7 kB)
Collecting furl (from hopsworks[python])
  Downloading furl-2.1.4-py2.py3-none-any.whl.metadata (25 kB)
Collecting boto3 (from hopsworks[python])
  Downloading boto3-1.40.17-py3-none-any.whl.metadata (6.7 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyjks (from hopsworks[python])
  Downloading pyjks-20.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting mock (from hopsworks[python])
  Downloading mock-5.2.0-py3-none-any.whl.metadata (3.1 kB)
Collecting avro==1.11.3 (from hopsworks[python])
  Downloading avro-1.11.3.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import joblib

import hopsworks
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# ------------------
# Configuration
# ------------------
# Feature group (name + version) read from the Hopsworks feature store.
FEATURE_GROUP_NAME = "aqi_weather_features"
FEATURE_GROUP_VER = 2

# Coordinates sent with every Open-Meteo air-quality request.
LATITUDE, LONGITUDE = 33.5973, 73.0479

# Forecast horizon and amount of history fed to the lag features, in hours.
HORIZON_H = 72
MAX_LAG_H = 120

# Timezone used when converting forecast timestamps for reporting.
TZ = "Asia/Karachi"

In [None]:
# Local output folders: artifacts at the top level, figures in a sub-folder.
ARTIFACT_DIR = "lgb_aqi_artifacts"
PLOTS_DIR = os.path.join(ARTIFACT_DIR, "plots")
for _folder in (PLOTS_DIR, ARTIFACT_DIR):
    os.makedirs(_folder, exist_ok=True)


In [None]:
# ------------------
# Functions
# ------------------
def create_lag_features(df: pd.DataFrame, feat_cols, lags=None, roll_windows=None):
    """Return a copy of ``df`` with lagged and rolling-window columns added.

    For every column ``f`` in ``feat_cols`` the result gains:

    * ``{f}_lag_{k}`` for each ``k`` in ``lags`` (``f`` shifted down ``k`` rows);
    * ``{f}_roll_mean_{w}`` and ``{f}_roll_std_{w}`` for each ``w`` in
      ``roll_windows``, computed over the trailing ``w`` rows including the
      current one. Rows with fewer than ``w`` observations are NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified. Rows are used in their existing order,
        so shifts and windows are counted in rows, not in time units.
    feat_cols : iterable of str
        Columns of ``df`` to derive features from.
    lags : iterable of int, optional
        Row offsets for the lag columns.
        Defaults to ``[1, 2, 3, 6, 12, 24, 48, 72, 96, 120]``.
    roll_windows : iterable of int, optional
        Window lengths for the rolling statistics. Defaults to ``[24, 72]``.

    Returns
    -------
    pd.DataFrame
        ``df``'s columns followed by the derived columns, grouped per feature
        (lags first, then mean/std for each window).
    """
    if lags is None:
        lags = [1, 2, 3, 6, 12, 24, 48, 72, 96, 120]
    if roll_windows is None:
        roll_windows = [24, 72]

    out = df.copy()
    for f in feat_cols:
        series = out[f]
        for lag in lags:
            out[f"{f}_lag_{lag}"] = series.shift(lag)
        for window in roll_windows:
            # min_periods == window: no statistic until the window is full.
            rolling = series.rolling(window, min_periods=window)
            out[f"{f}_roll_mean_{window}"] = rolling.mean()
            out[f"{f}_roll_std_{window}"] = rolling.std()
    return out

def ensure_utc(ts_series: pd.Series) -> pd.Series:
    """Parse timestamps and return them as a timezone-aware UTC Series.

    Timezone-naive values are interpreted as UTC; timezone-aware values are
    converted to UTC.

    Parameters
    ----------
    ts_series : pd.Series
        Timestamps (datetime values or anything ``pd.to_datetime`` can parse).
        Other array-likes such as a ``DatetimeIndex`` or a list are accepted too
        and are wrapped in a new Series.

    Returns
    -------
    pd.Series
        UTC-aware datetimes. A Series input keeps its index.
    """
    # utc=True performs both steps of the old naive/aware branching in one
    # call and works without the Series-only ``.dt`` accessor, so non-Series
    # input no longer ends in an AttributeError.
    parsed = pd.to_datetime(ts_series, utc=True)
    if isinstance(parsed, pd.Series):
        return parsed
    return pd.Series(parsed)

def utc_to_tz(ts_series: pd.Series, tz: str) -> pd.Series:
    """Normalise ``ts_series`` to UTC via ``ensure_utc``, then convert it to ``tz``."""
    return ensure_utc(ts_series).dt.tz_convert(tz)

def metrics(y_true, y_pred):
    """Compute regression scores for a set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values, passed straight to scikit-learn.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error and
        coefficient of determination.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # mean_squared_error returns the MSE; take the square root so the value
    # reported as "RMSE" is in the same units as the target.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2


In [None]:

# ------------------
# 1) Load data from Hopsworks
# ------------------
print("[1/7] Logging into Hopsworks and reading Feature Group...")
project = hopsworks.login()
fs = project.get_feature_store()
fg = fs.get_feature_group(name=FEATURE_GROUP_NAME, version=FEATURE_GROUP_VER)
df_raw = fg.read()

# Validate the schema before touching any column, so a missing "time" column
# is reported by the ValueError below rather than by a KeyError from the sort.
cols_needed = ["time", "pm_10", "pm_25", "carbon_monoxidegm", "nitrogen_dioxide",
               "sulphur_dioxide", "ozone", "us_aqi"]
missing = [c for c in cols_needed if c not in df_raw.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Lag/rolling features are computed by row position, so rows must be in
# chronological order.
df_raw = df_raw.sort_values("time", ascending=True).reset_index(drop=True)

df = df_raw[cols_needed].copy()
df["time_utc"] = ensure_utc(df["time"])

[1/7] Logging into Hopsworks and reading Feature Group...
Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1239199
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.65s) 


In [None]:
# ------------------
# 2) Feature engineering
# ------------------
print("[2/7] Creating lag & rolling features...")
features = [
    "pm_10",
    "pm_25",
    "carbon_monoxidegm",
    "nitrogen_dioxide",
    "sulphur_dioxide",
    "ozone",
]

# Index by UTC time, derive the lag/rolling columns, then discard the warm-up
# rows (and any other row) that contain a NaN.
work = df.set_index("time_utc")[features + ["us_aqi"]].copy()
work = create_lag_features(work, features)
work = work.dropna()

# Every column except the target is a model input.
all_features = [col for col in work.columns if col != "us_aqi"]
X, y = work[all_features], work["us_aqi"]

[2/7] Creating lag & rolling features...


In [None]:
# ------------------
# 3) Train/test split
# ------------------
print("[3/7] Train/test split 80/20...")
# Positional split: the first 80% of rows train the model, the rest test it.
split_idx = int(len(work) * 0.8)
X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

[3/7] Train/test split 80/20...


In [None]:
import lightgbm as lgb

# ------------------
# 4) Train LightGBM model
# ------------------
print("[4/7] Training LightGBM regressor...")

# Early stopping needs a validation set. Using the test split for that leaks
# test information into model selection and makes the test metrics reported
# afterwards optimistic, so hold out the last slice of the *training* rows
# instead and leave X_test / y_test untouched until evaluation.
VAL_FRACTION = 0.2
fit_end = int(len(X_train) * (1 - VAL_FRACTION))
X_fit, y_fit = X_train.iloc[:fit_end], y_train.iloc[:fit_end]
X_val, y_val = X_train.iloc[fit_end:], y_train.iloc[fit_end:]

lgb_train = lgb.Dataset(X_fit, y_fit)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

MAX_BOOST_ROUNDS = 800

# Pass 1: pick the number of boosting rounds by early stopping on the
# validation slice. log_evaluation(0) suppresses per-iteration printing.
tuning_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=MAX_BOOST_ROUNDS,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(0)]
)
# Fall back to the round cap if no best iteration was recorded.
best_rounds = tuning_model.best_iteration or MAX_BOOST_ROUNDS

# Pass 2: refit on the full training split with the selected round count so
# the validation rows also contribute to the final model.
model = lgb.train(
    params,
    lgb.Dataset(X_train, y_train),
    num_boost_round=best_rounds,
)
print(f"[info] Final model trained for {best_rounds} boosting rounds")

# ------------------
# Save last trained timestamp for incremental training
# ------------------
# NOTE(review): this is the newest timestamp in `work` (train + test rows),
# not the newest row the model was fitted on — confirm that is what the
# incremental-training consumer expects.
last_trained_time = work.index.max()  # since work is indexed by time_utc
last_trained_path = os.path.join(ARTIFACT_DIR, "last_trained_timestamp.pkl")
joblib.dump(last_trained_time, last_trained_path)
print(f"[info] Last trained timestamp saved: {last_trained_time} -> {last_trained_path}")


[4/7] Training LightGBM regressor...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[487]	valid_0's rmse: 5.01372
[info] Last trained timestamp saved: 2025-08-25 20:00:00+00:00 -> lgb_aqi_artifacts/last_trained_timestamp.pkl


In [None]:
# ------------------
# 5) Evaluate
# ------------------
print("[5/7] Evaluating on test set...")
# Score the held-out rows with the helper defined in the functions cell.
y_pred = model.predict(X_test)
test_scores = metrics(y_test, y_pred)
mae, rmse, r2 = test_scores
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.4f}")

[5/7] Evaluating on test set...
MAE: 2.50, RMSE: 25.14, R²: 0.9791


In [None]:
# ------------------
# 6) Save model & feature list
# ------------------
# Persist the booster and the exact feature order it was trained with.
for _fname, _obj in (("lgb_model.pkl", model), ("lgb_features.pkl", all_features)):
    joblib.dump(_obj, os.path.join(ARTIFACT_DIR, _fname))
print(f"Artifacts saved to: {ARTIFACT_DIR}")


Artifacts saved to: lgb_aqi_artifacts


In [None]:
# ------------------
# 7) Future forecast (72h)
# ------------------
print("[6/7] Preparing 72h forecast...")
last_known_utc = df["time_utc"].iloc[-1]
window_start = last_known_utc + pd.Timedelta(hours=1)
window_end = last_known_utc + pd.Timedelta(hours=HORIZON_H)

# Open-Meteo selects an hourly range with `start_hour` / `end_hour`
# (yyyy-mm-ddThh:mm). The previous `start` / `end` keys are not API
# parameters, so the requested window was not applied.
start_utc = window_start.strftime("%Y-%m-%dT%H:%M")
end_utc = window_end.strftime("%Y-%m-%dT%H:%M")

air_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
air_params = {
    "latitude": LATITUDE,
    "longitude": LONGITUDE,
    "hourly": "pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone",
    "start_hour": start_utc,
    "end_hour": end_utc,
    "timezone": "UTC",
}

# A timeout keeps the notebook from hanging forever on a stalled connection.
resp = requests.get(air_url, params=air_params, timeout=30)
resp.raise_for_status()
raw = resp.json()

df_future = pd.DataFrame({
    # The request asks for UTC, so the returned naive strings are parsed as UTC.
    "time_utc": pd.to_datetime(raw["hourly"]["time"], utc=True),
    "pm_10": raw["hourly"]["pm10"],
    "pm_25": raw["hourly"]["pm2_5"],
    "carbon_monoxidegm": raw["hourly"]["carbon_monoxide"],
    "nitrogen_dioxide": raw["hourly"]["nitrogen_dioxide"],
    "sulphur_dioxide": raw["hourly"]["sulphur_dioxide"],
    "ozone": raw["hourly"]["ozone"],
})

# Keep only the hours that directly follow the history, whatever range the
# API actually returned: the lag features below are positional and assume the
# future rows continue the history block without overlap.
in_window = df_future["time_utc"].between(window_start, window_end)
df_future = (
    df_future.loc[in_window]
    .sort_values("time_utc")
    .iloc[:HORIZON_H]
    .reset_index(drop=True)
)
if df_future.empty:
    raise ValueError(
        f"Open-Meteo returned no hourly rows between {start_utc} and {end_utc} (UTC)."
    )

# Prepend the most recent history so lags and rolling windows of the first
# future rows can be computed, then keep only the future rows.
history_block = work[features].tail(MAX_LAG_H).copy()
combined_vals = pd.concat([history_block.reset_index(drop=True),
                           df_future[features].reset_index(drop=True)], axis=0).reset_index(drop=True)
combined = create_lag_features(combined_vals, features)
future_block = combined.iloc[len(history_block): len(history_block)+len(df_future)].copy()
# Any training feature that could not be rebuilt here is filled with 0.0 so
# the column layout matches what the model was trained on.
for c in all_features:
    if c not in future_block.columns:
        future_block[c] = 0.0
future_block = future_block[all_features]

future_pred = model.predict(future_block)
future_times_utc = df_future["time_utc"]
future_times_tz  = utc_to_tz(future_times_utc, TZ)

forecast_df = pd.DataFrame({
    "datetime": future_times_tz,
    "datetime_utc": ensure_utc(future_times_utc),
    "predicted_us_aqi": future_pred,
})

forecast_path = os.path.join(ARTIFACT_DIR, "lgb_72h_forecast.csv")
forecast_df.to_csv(forecast_path, index=False)
print(f"72h forecast saved to: {forecast_path}")

[6/7] Preparing 72h forecast...
72h forecast saved to: lgb_aqi_artifacts/lgb_72h_forecast.csv


In [None]:
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
from hsml.model import ModelSchema, Schema

# ------------------
# 7) Fetch US AQI for same timestamps & align (same TZ)
# ------------------
print("[7/7] Fetching US AQI for the exact forecast window (aligned TZ)...")
start_date_local = forecast_df["datetime"].min().strftime("%Y-%m-%d")
end_date_local   = forecast_df["datetime"].max().strftime("%Y-%m-%d")

aqi_params = {
    "latitude": LATITUDE,
    "longitude": LONGITUDE,
    "hourly": "us_aqi",
    "timezone": TZ,
    "start_date": start_date_local,
    "end_date": end_date_local,
}

# A timeout keeps the notebook from hanging forever on a stalled connection.
resp2 = requests.get(air_url, params=aqi_params, timeout=30)
resp2.raise_for_status()
raw2 = resp2.json()

# The API was asked for TZ-local times, so the naive strings are localized to TZ.
actual_df = pd.DataFrame({
    "datetime": pd.to_datetime(raw2["hourly"]["time"]).tz_localize(TZ),
    "us_aqi_actual": raw2["hourly"]["us_aqi"],
})

# Inner join ensures perfect hour alignment on shared timestamps
merged = pd.merge(forecast_df[["datetime", "predicted_us_aqi"]], actual_df, on="datetime", how="inner").sort_values("datetime")
merged_path = os.path.join(ARTIFACT_DIR, "lgb_forecast_vs_us_aqi.csv")
merged.to_csv(merged_path, index=False)
print(f"[info] Saved aligned forecast-vs-actual to: {merged_path}")

# ------------------
# Plot 72h alignment
# ------------------
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(merged["datetime"], merged["predicted_us_aqi"], label="Predicted US AQI (LGB)")
ax.plot(merged["datetime"], merged["us_aqi_actual"], label="US AQI from API", linestyle="--")
ax.set_title(f"Next 72 Hours: Predicted vs US AQI (Aligned in {TZ})")
ax.set_xlabel(f"Datetime ({TZ})")
ax.set_ylabel("US AQI")
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
plot_path = os.path.join(PLOTS_DIR, "lgb_72h_pred_vs_actual.png")
fig.savefig(plot_path, dpi=140)
plt.close(fig)
print(f"[info] Plot saved to: {plot_path}")

# ----------------------
# Quick accuracy summary
# ----------------------
if not merged.empty:
    mae_f = mean_absolute_error(merged["us_aqi_actual"], merged["predicted_us_aqi"])
    # mean_squared_error returns the MSE; take the square root so the value
    # printed (and later registered) as RMSE really is one.
    rmse_f = float(np.sqrt(mean_squared_error(merged["us_aqi_actual"], merged["predicted_us_aqi"])))
    corr_f = np.corrcoef(merged["us_aqi_actual"].values, merged["predicted_us_aqi"].values)[0,1] if len(merged) > 1 else np.nan
    print(f"\n[summary] 72h window alignment metrics (API us_aqi vs LGB prediction):\n  MAE:  {mae_f:.2f}\n  RMSE: {rmse_f:.2f}\n  Corr: {corr_f:.4f}")
else:
    print("[summary] No overlapping rows after merge; check timezone and date range.")

print("\n✅ Done. Artifacts are in:", os.path.abspath(ARTIFACT_DIR))

[7/7] Fetching US AQI for the exact forecast window (aligned TZ)...
[info] Saved aligned forecast-vs-actual to: lgb_aqi_artifacts/lgb_forecast_vs_us_aqi.csv
[info] Plot saved to: lgb_aqi_artifacts/plots/lgb_72h_pred_vs_actual.png

[summary] 72h window alignment metrics (API us_aqi vs LGB prediction):
  MAE:  2.64
  RMSE: 16.87
  Corr: 0.9858

✅ Done. Artifacts are in: /content/lgb_aqi_artifacts


In [None]:
# ----------------------
# Save LightGBM model to Hopsworks
# ----------------------
# --- Create schema ---
# The schemas are derived from the training inputs and target.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# --- Model registry ---
mr = project.get_model_registry()

# mae_f / rmse_f / corr_f come from the forecast-vs-API comparison cell and
# only exist if that cell found overlapping rows.
model_meta = mr.python.create_model(
    name="lgb_aqi_forecaster",
    metrics={"mae": mae_f, "rmse": rmse_f, "corr": corr_f},
    model_schema=model_schema,
    description="LightGBM model for AQI forecasting using weather & pollutant lags"
)

# --- Create upload folder ---
# Dedicated folder for the files written below, which is what gets uploaded.
# It has its own name so the ARTIFACT_DIR config constant is not overwritten
# (reassigning it made earlier cells write elsewhere when re-run).
REGISTRY_DIR = "lgb_artifacts"
os.makedirs(REGISTRY_DIR, exist_ok=True)
# Get last timestamp from training dataset
last_trained_time = work.index.max()  # 'work' has time_utc as index
timestamp_file = os.path.join(REGISTRY_DIR, "last_trained_timestamp.pkl")
# Save LightGBM model
joblib.dump(model, os.path.join(REGISTRY_DIR, "lgb_model.pkl"))

# Save features list
joblib.dump(all_features, os.path.join(REGISTRY_DIR, "lgb_features.pkl"))

joblib.dump(last_trained_time, timestamp_file)
print(f"[info] Last trained timestamp saved: {last_trained_time} -> {timestamp_file}")

# --- Save to Model Registry ---
model_meta.save(REGISTRY_DIR)
print("[info] LightGBM model saved and registered successfully.")

[info] Last trained timestamp saved: 2025-08-25 20:00:00+00:00 -> lgb_artifacts/last_trained_timestamp.pkl


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /content/lgb_artifacts/lgb_model.pkl: 0.000%|          | 0/1383821 elapsed<00:00 remaining<?

Uploading /content/lgb_artifacts/lgb_features.pkl: 0.000%|          | 0/1933 elapsed<00:00 remaining<?

Uploading /content/lgb_artifacts/last_trained_timestamp.pkl: 0.000%|          | 0/140 elapsed<00:00 remaining<…

Uploading /content/model_schema.json: 0.000%|          | 0/7493 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1239199/models/lgb_aqi_forecaster/1
[info] LightGBM model saved and registered successfully.
