# In [None]:
# ==============================
# Linear Regression AQI Forecast
# ==============================

!pip install hopsworks[python] requests scikit-learn pandas matplotlib

import hopsworks
import pandas as pd
import numpy as np
import requests
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

# ----------------------
# 1) Connect to Hopsworks
# ----------------------
# Log in and obtain a handle on the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# Read version 1 of the "aqi_weather_features" feature group; this is the
# historical AQI + weather data the model is trained on.
fg = fs.get_feature_group(name="aqi_weather_features", version=1)
df_hist = fg.read()

# ----------------------
# 2) Prepare data
# ----------------------
# Order the history chronologically and renumber the rows 0..n-1.
df_hist = df_hist.sort_values("datetime").reset_index(drop=True)

# Model inputs read from the feature group: every column except the timestamp
# and the target. Captured *before* the lag columns are added below.
features = [col for col in df_hist.columns if col not in ["datetime", "aqi"]]

# Autoregressive inputs: the target shifted back by 1..n_lags rows.
# NOTE(review): shifting by rows equals shifting by hours only if the data is
# evenly spaced with no gaps — confirm against the feature group.
n_lags = 3
lag_cols = [f"aqi_lag{k}" for k in range(1, n_lags + 1)]
for k, lag_col in enumerate(lag_cols, start=1):
    df_hist[lag_col] = df_hist["aqi"].shift(k)

# Discard every row that contains a NaN in any column (this includes the
# first n_lags rows, whose lag values are undefined).
df_hist = df_hist.dropna()

# Hold out the final 72 rows as the test set; everything earlier is training.
test_size = 72
train_df = df_hist.iloc[:-test_size]
test_df = df_hist.iloc[-test_size:]

model_cols = features + lag_cols

X_train, y_train = train_df[model_cols], train_df["aqi"]
X_test, y_test = test_df[model_cols], test_df["aqi"]

# ----------------------
# 3) Train Linear Regression
# ----------------------
# Ordinary least squares on the training split; `fit` returns the estimator.
model = LinearRegression().fit(X_train, y_train)

# ----------------------
# 4) Evaluate
# ----------------------
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred),
    math.sqrt(mse),  # RMSE is the square root of the mean squared error
    r2_score(y_test, y_pred),
)

for report_line in (f"MAE:  {mae:.2f}", f"RMSE: {rmse:.2f}", f"R²:   {r2:.4f}"):
    print(report_line)

# ----------------------
# 5) Fetch forecasted pollutant data (Open-Meteo)
# ----------------------
lat, lon = 33.6844, 73.0479  # Islamabad example

# Hourly variables to request. These pollutant variables are served by
# Open-Meteo's Air Quality API, not by the weather endpoint
# (api.open-meteo.com/v1/forecast).
hourly_vars = [
    "pm10",
    "pm2_5",
    "carbon_monoxide",
    "nitrogen_dioxide",
    "sulphur_dioxide",
    "ozone",
]
url = (
    "https://air-quality-api.open-meteo.com/v1/air-quality"
    f"?latitude={lat}&longitude={lon}"
    f"&hourly={','.join(hourly_vars)}"
    "&forecast_days=3"
)

# A timeout prevents the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of letting an error
# payload fail later with an opaque KeyError.
response = requests.get(url, timeout=30)
response.raise_for_status()
resp = response.json()
if "hourly" not in resp:
    raise ValueError(f"Open-Meteo response has no 'hourly' section: {resp}")

# One row per forecast hour; replace the API's "time" strings with a parsed
# "datetime" column matching the historical frame's column name.
# NOTE(review): no `timezone` parameter is sent, so timestamps use the API
# default — confirm this matches the timezone of the historical data.
forecast_df = pd.DataFrame(resp["hourly"])
forecast_df["datetime"] = pd.to_datetime(forecast_df["time"])
forecast_df = forecast_df.drop(columns=["time"])

# ----------------------
# 6) Match lag features for forecast
# 7) Predict future AQI
# ----------------------
# The lag inputs of a future hour are the AQI values of the hours just before
# it. Beyond the first step those values are themselves forecasts, so the
# prediction has to be made one step at a time, feeding each result back in as
# the next step's lag (recursive forecasting). Building all lag columns up
# front cannot work because no predictions exist yet at that point.
lag_cols = [f"aqi_lag{i}" for i in range(1, n_lags + 1)]
model_cols = features + lag_cols

# Fail early with a readable message if the forecast source does not provide
# every column the model was trained on.
missing_cols = [c for c in features if c not in forecast_df.columns]
if missing_cols:
    raise KeyError(
        f"Forecast data is missing model feature columns: {missing_cols}"
    )

# Get last known AQI values for lag creation
last_known = df_hist.iloc[-n_lags:][["datetime", "aqi"]].copy()
if len(last_known) < n_lags:
    raise ValueError(
        f"Need at least {n_lags} historical AQI rows to seed the lag features"
    )

# Rolling window of the n_lags most recent AQI values, oldest first.
aqi_history = last_known["aqi"].astype(float).tolist()

# Pre-create the columns filled in by the loop so their dtype is float.
forecast_df = forecast_df.reset_index(drop=True)
for new_col in lag_cols + ["aqi_pred"]:
    forecast_df[new_col] = np.nan

for step in forecast_df.index:
    # Reversed window: most recent value -> aqi_lag1, next -> aqi_lag2, ...
    forecast_df.loc[step, lag_cols] = aqi_history[::-1]
    step_pred = float(model.predict(forecast_df.loc[[step], model_cols])[0])
    forecast_df.loc[step, "aqi_pred"] = step_pred
    # Slide the window forward: drop the oldest value, append the prediction.
    aqi_history = aqi_history[1:] + [step_pred]

# Full design matrix actually used for the forecast, kept for inspection.
X_forecast = forecast_df[model_cols]

print("\nForecasted AQI (next hours):")
print(forecast_df[["datetime", "aqi_pred"]])

# ----------------------
# 8) Plot results
# ----------------------
import matplotlib.pyplot as plt

# One figure with three series: test-set actuals, test-set predictions, and
# the forward forecast.
fig, ax = plt.subplots(figsize=(12, 6))

test_times = test_df["datetime"]
ax.plot(test_times, y_test, label="Actual AQI")
ax.plot(test_times, y_pred, label="Predicted AQI (Test)", linestyle="--")
ax.plot(
    forecast_df["datetime"],
    forecast_df["aqi_pred"],
    label="Forecasted AQI",
    linestyle=":",
)

ax.legend()
ax.set_xlabel("Datetime")
ax.set_ylabel("AQI")
ax.set_title("Linear Regression AQI Prediction")
ax.grid(True)
plt.show()
