# Data Load, Cleaning

In [None]:
def loadData():    
    import pandas as pd
    import numpy as np

    filepath='combined_emissions_sources2.csv'
        # Step 1: Load the CSV file into a pandas DataFrame
    try:
        print(f"Loading data from '{filepath}'...")
        df = pd.read_csv(filepath)
        print("Data loaded successfully.")
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        print("Please make sure you have already run the first script to generate this file,")
        print("and that it is in the same directory as this script.")
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    # Step 2: Display the size of the DataFrame (rows, columns)
    rows, cols = df.shape
    print("\n--- DataFrame Size ---")
    print(f"The DataFrame has {rows} rows and {cols} columns.")
    return df

In [None]:
df=loadData()

In [None]:
df = df.dropna(subset=['start_time'])
# --- 2. Identify and Drop Columns with Nulls > 100,000 ---
threshold = 100000
null_counts = df.isnull().sum()
columns_to_drop = null_counts[null_counts > threshold].index.tolist()

if not columns_to_drop:
    print("No columns have more than 100,000 null values. No columns were dropped.")
else:

    print(f"--- Columns to be Dropped (>{threshold} nulls) ---")
    print(columns_to_drop)
    print("\n--- Columns Dropped ---")
    print(f"Old shape of DataFrame: {df.shape}")
    df.drop(columns=columns_to_drop, axis=1, inplace=True)
    print(f"New shape of the DataFrame: {df.shape}")

columns_to_drop=['modified_date','source_id','iso3_country']
df.drop(columns=columns_to_drop, axis=1, inplace=True)
print(df.columns)



# --- 2. Find and Display Null Value Counts ---
null_counts = df.isnull().sum()
df_imputed = df.copy()
sector_null_count = df_imputed['sector'].isnull().sum()
if sector_null_count > 0:
    df_imputed.dropna(subset=['sector'], inplace=True)

print("--- Imputing Missing Values ---")
# Loop through each column to apply the correct imputation strategy
for col in df_imputed.columns:
    if df_imputed[col].isnull().any():
        # STRATEGY 1: For non-numeric (object/categorical) columns
        if df_imputed[col].dtype == 'object':
            mode_value = df_imputed[col].mode()[0]
            df_imputed[col].fillna(mode_value, inplace=True)
            print(f"Imputed non-numeric column '{col}' with mode: '{mode_value}'")
            
        # STRATEGY 2: For numeric (float/int) columns
        else:
            mean_value = df_imputed[col].mean()
            df_imputed[col].fillna(mean_value, inplace=True)
            print(f"Imputed numeric column '{col}' with mean: {mean_value:.2f}")

print("\nImputation complete.")
print("\n--- Null values count after imputation ---")
display(df_imputed.isnull().sum().sum())

In [None]:
# Step 4: Display unique values from the 'source_name' column
print("\n--- Unique Values in 'sectors' ---")
if 'sector' in df_imputed.columns:
    unique_sectors = df_imputed['sector'].unique()
    print(f"Found {len(unique_sectors)} unique source names. Displaying a sample:")
    # Display the first 15 unique names, or all if less than 15
    display_limit = min(15, len(unique_sectors))
    for i, name in enumerate(unique_sectors[:display_limit]):
        print(f"- {name}")
    if len(unique_sectors) > display_limit:
        print(f"... and {len(unique_sectors) - display_limit} more.")
else:
    print("The column 'sector' was not found in the DataFrame.")

all_null_rows_count = df_imputed.isnull().all(axis=1).sum()
print(f"Contains {all_null_rows_count} rows where all values are null.")
print(f"The shape of current cleaned df: {df.shape}")

In [None]:
import pandas as pd
import numpy as np

def impute_categorical_with_percentiles(df, columns):
    label_to_quantile = {
        'very high': 0.95,
        'high': 0.90,
        'medium': 0.65,
        'low': 0.45,
        'very low': 0.35,
    }
    
    for col in columns:
        if col not in df.columns:
            print(f"Warning: column '{col}' not found. Skipping.")
            continue

        s = df[col].astype(str)
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        numeric_only = numeric_series.dropna()

        if numeric_only.empty:
            print(f"Warning: column '{col}' has no numeric data to compute percentiles. Skipping.")
            df[col] = pd.to_numeric(df[col], errors='coerce')
            continue

        qvals = numeric_only.quantile([
            label_to_quantile['very high'],
            label_to_quantile['high'],
            label_to_quantile['medium'],
            label_to_quantile['low'],
            label_to_quantile['very low']
        ])
        # Map quantile index back to labels
        pmap = {
            'very high': qvals.loc[label_to_quantile['very high']],
            'high': qvals.loc[label_to_quantile['high']],
            'medium': qvals.loc[label_to_quantile['medium']],
            'low': qvals.loc[label_to_quantile['low']],
            'very low': qvals.loc[label_to_quantile['very low']],
        }

        s_clean = s.str.strip().str.lower().replace(pmap)
        df[col] = pd.to_numeric(s_clean, errors='coerce')

        # Report
        print(f"Processed '{col}': converted to float. Percentiles used: "
              f"very high={pmap['very high']:.6g}, high={pmap['high']:.6g}, "
              f"medium={pmap['medium']:.6g}, low={pmap['low']:.6g}, very low={pmap['very low']:.6g}")

    return df


In [None]:
cols_to_fix = ['capacity','capacity_factor','activity','emissions_factor','emissions_quantity']

impute_categorical_with_percentiles(df_imputed, cols_to_fix)
for c in cols_to_fix:
    if c in df_imputed.columns:
        print(c, df_imputed[c].dtype, "-> sample:", df_imputed[c].dropna().head().tolist())

print("\n--- Time Clean Ups ---\n")
import pandas as pd
df_imputed['start_time'] = pd.to_datetime(df_imputed['start_time'], infer_datetime_format=True)
df_imputed['end_time']   = pd.to_datetime(df_imputed['end_time'], infer_datetime_format=True)

# Optional: verify conversion
print(df_imputed[['start_time', 'end_time']].dtypes)
print(df_imputed[['start_time', 'end_time']].head())

# Add time features
df_imputed['year'] = df_imputed['start_time'].dt.year
df_imputed['month'] = df_imputed['start_time'].dt.month

In [None]:
import pandas as pd
null_counts = df_imputed.isnull().sum()
print(null_counts)


# Data Cleaning Summary



## 1. Data Consolidation & Cleaning
- **Consolidation**
  - Collected emissions data from multiple sector folders.  
  - Merged into a single unified dataset.  
- **Cleaning**
  - Parsed datetime fields: `start_time`, `end_time`.  
  - Imputed missing values:  
    - **Numeric values** → replaced with column means.  
    - **Categorical strings** (e.g., *“very high”, “low”*) → substituted with percentile-based numeric values.  
  - Converted mixed-type numeric columns → coerced to `float`.  
  - Dropped columns with excessive nulls.  
  - Removed rows where `sector` was null.  

- **Cleaned data size**
  - `(4527140, 9)`
  - 2021-2025 (May)

---

## 2. Feature Engineering
- **Time Features**
  - Extracted calendar components: year, month, quarter, month start/end.  
  - Added **cyclical encodings** for month (sine/cosine representation).  
- **Autoregressive Lags**
  - Created lag features for 1, 2, 3, 6, and 12 months.  
- **Aggregation**
  - Standardized the dataset into a consistent **monthly time series**.  

---


# EDA Timeline, Sector Data Quality

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
plt.style.use("seaborn-v0_8")  # updated style name
sns.set_palette("viridis")

%matplotlib inline

def sectorWise(df_sector):

    # =========================================================
    # 1) Build monthly series from df_imputed (NO 'ds','y' assumed)
    # =========================================================
    # Aggregate total emissions by month
    df_monthly = (
        df_sector.groupby(['year', 'month'])['emissions_quantity']
        .sum()
        .reset_index()
    )

    # Create month-start datetime column 'ds'
    df_monthly['ds'] = pd.to_datetime(
        df_monthly['year'].astype(str) + '-' + df_monthly['month'].astype(str) + '-01'
    ).dt.to_period('M').dt.to_timestamp(how='start')  # enforce month-start timestamps


    # Prophet expects columns: ds (date), y (target)
    df_monthly = df_monthly.rename(columns={'emissions_quantity': 'y'})
    df_monthly = df_monthly[['ds', 'y']].sort_values('ds').reset_index(drop=True)

    # =========================================================
    # 2) Train/Test split (Train: 2021-01 → 2023-12, Test: 2024-01 → 2025-05)
    # =========================================================
    train = df_monthly[(df_monthly['ds'] >= '2021-01-01') & (df_monthly['ds'] < '2024-01-01')]
    test  = df_monthly[(df_monthly['ds'] >= '2024-01-01') & (df_monthly['ds'] <= '2025-05-01')]

    print(f"Train size: {len(train)} | range: {train['ds'].min().date()} → {train['ds'].max().date()}")
    print(f"Test  size: {len(test)} | range: {test['ds'].min().date()} → {test['ds'].max().date()}")

    # =========================================================
    # 3) Fit Prophet on training data
    # =========================================================
    m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    m.fit(train)

    # =========================================================
    # 4) Create futures
    #    - For evaluation/plots, we want a full forecast that includes history.
    #    - For metrics, we align exactly to test months to avoid any NaN/mismatch.
    # =========================================================
    # Full forecast through the end of test period for nice Prophet plots
    # (periods = number of months from end of train to 2025-05 inclusive)
    last_needed = pd.Timestamp('2025-05-01')
    months_needed = (last_needed.to_period('M') - train['ds'].max().to_period('M')).n + 1
    future_full = m.make_future_dataframe(periods=months_needed, freq='MS')
    forecast_full = m.predict(future_full)

    # For strict evaluation: forecast exactly on test dates
    future_test = pd.DataFrame({'ds': test['ds']})
    forecast_test_only = m.predict(future_test)[['ds', 'yhat']]

    # Alignment check (should be zero missing)
    missing_in_forecast = sorted(set(test['ds']) - set(forecast_test_only['ds']))
    if missing_in_forecast:
        print("Warning: these test dates are missing predictions:", missing_in_forecast)

    # =========================================================
    # 5) Evaluation (MAE, RMSE, MAPE, sMAPE) on 2024-01 → 2025-05
    # =========================================================
    eval_df = test.merge(forecast_test_only, on='ds', how='left').copy()

    # Safety check for any NaNs (should not happen; if it does, we surface it)
    if eval_df['yhat'].isna().any() or eval_df['y'].isna().any():
        n_nan_pred = eval_df['yhat'].isna().sum()
        n_nan_true = eval_df['y'].isna().sum()
        raise ValueError(f"Found NaNs after alignment -> yhat NaNs: {n_nan_pred}, y NaNs: {n_nan_true}. "
                        "Check the monthly continuity or date alignment.")

    y_true = eval_df['y'].to_numpy()
    y_pred = eval_df['yhat'].to_numpy()

    mae  = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Robust MAPE: ignore zero-true months in the percentage calc
    nonzero_mask = y_true != 0
    if nonzero_mask.sum() == 0:
        mape = np.nan
    else:
        mape = np.mean(np.abs((y_true[nonzero_mask] - y_pred[nonzero_mask]) / y_true[nonzero_mask])) * 100

    # sMAPE handles zeros better
    smape = 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

    print("\nEvaluation (Test: 2024-01 → 2025-05)")
    print(f"MAE   : {mae:.3f}")
    print(f"RMSE  : {rmse:.3f}")
    print(f"MAPE  : {mape:.3f}% (computed on non-zero actuals only)")
    print(f"sMAPE : {smape:.3f}%")

    # =========================================================
    # 6) Plots — keep EVERYTHING
    # =========================================================

    # A) Forecast vs Actual (Train/Test + Forecast-on-test)
    plt.figure(figsize=(12,4))
    plt.plot(train['ds'], train['y'], label="Train", linewidth=2)
    plt.plot(test['ds'], test['y'], label="Test (Actual)", linewidth=2)
    plt.plot(eval_df['ds'], eval_df['yhat'], '--', label="Forecast on Test", linewidth=2)
    plt.axvline(pd.Timestamp('2024-01-01'), color='gray', linestyle='--', label="Train/Test Split")
    plt.title(f"Prophet Forecast vs Actual (Train: 2021–2023, Test: 2024–2025-05),{df_sector['sector'].unique()}")
    plt.xlabel("Date"); plt.ylabel("Total Emissions"); plt.legend(); plt.grid(alpha=0.3)
    plt.show()

    return mape
    # # B) Prophet Forecast (full range) — nice overview figure
    # fig_forecast = m.plot(forecast_full)
    # plt.title("Prophet Forecast (Full Range Through 2025-05)")
    # plt.axvline(pd.Timestamp('2024-01-01'), color='gray', linestyle='--', label="Forecast Start")
    # plt.legend(); plt.grid(alpha=0.3)
    # plt.show()
    # # Plot forecast with uncertainty intervals + actual test points
    # plt.figure(figsize=(12, 6))
    # plt.plot(train["ds"], train["y"], label="Train", color="blue")
    # plt.plot(test["ds"], test["y"], label="Test (actual)", color="black", linestyle="dashed")
    # plt.plot(forecast_full["ds"], forecast_full["yhat"], label="Forecast", color="red")
    # plt.fill_between(forecast_full["ds"], forecast_full["yhat_lower"], forecast_full["yhat_upper"], color="pink", alpha=0.3)

    # # Add test points as scatter dots
    # plt.scatter(test["ds"], test["y"], color="black", marker="o", s=40, label="Test points")

    # plt.title("Forecast vs Actuals (with Test Points)")
    # plt.xlabel("Date")
    # plt.ylabel("y")
    # plt.legend()
    # plt.show()

    # # C) Prophet Components (Trend + Seasonality) — make easier to read
    # fig_comp = m.plot_components(forecast_full)
    # fig_comp.set_size_inches(12, 8)
    # for ax in fig_comp.axes:
    #     ax.grid(alpha=0.3)
    #     ax.set_ylabel("Effect")
    # plt.suptitle("Trend & Yearly Seasonality Effects", fontsize=16)
    # plt.show()

    # # D) Residuals over time (Test period)
    # residuals = y_true - y_pred
    # plt.figure(figsize=(10,5))
    # plt.plot(eval_df['ds'], residuals, marker='o')
    # plt.axhline(0, color='red', linestyle='--')
    # plt.title("Residuals Over Time (Test: 2024–2025-05)")
    # plt.xlabel("Date"); plt.ylabel("Residual (Actual - Predicted)"); plt.grid(alpha=0.3)
    # plt.show()

    # # E) Correlation Heatmap for numeric columns in df_imputed (as requested earlier)
    # num_cols = df_imputed.select_dtypes(include=[np.number]).columns
    # if len(num_cols) > 1:
    #     plt.figure(figsize=(10,7))
    #     corr = df_imputed[num_cols].corr()
    #     sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    #     plt.title("Correlation Heatmap (Numerical Features)")
    #     plt.show()

    # # Actual vs Predicted on historical data
    # plt.figure(figsize=(10,6))
    # plt.plot(eval_df['ds'], eval_df['y'], label="Actual", marker='o')
    # plt.plot(eval_df['ds'], eval_df['yhat'], label="Predicted", marker='x')
    # plt.title("Prophet: Actual vs Predicted ")
    # plt.xlabel("Date")
    # plt.ylabel("Total Emissions")
    # plt.legend()
    # plt.grid(alpha=0.3)
    # plt.show()

def timeline(df):
    monthly = df.groupby(['year','month'])['emissions_quantity'].sum().reset_index()
    monthly['date'] = pd.to_datetime(monthly[['year','month']].assign(day=1))

    plt.figure(figsize=(14,6))
    sns.lineplot(data=monthly, x="date", y="emissions_quantity")
    plt.title("Emissions Over Time")
    plt.ylabel("Total Monthly Emissions")
    plt.show()

In [None]:
# Sector-wise over time (Top 5)
top_sectors1 = ['agriculture','forestry-and-land-use','power','fossil-fuel-operations']
top_sectors2 = ['buildings','manufacturing','transportation']
top_sectors3=['mineral-extraction','waste']

def timelineCombo(top_sectors):
    sector_trend = (df_imputed[df_imputed['sector'].isin(top_sectors)]
                    .groupby(['year','month','sector'])['emissions_quantity']
                    .sum()
                    .reset_index())
    sector_trend['date'] = pd.to_datetime(sector_trend[['year','month']].assign(day=1))

    plt.figure(figsize=(14,8))
    sns.lineplot(data=sector_trend, x="date", y="emissions_quantity", hue="sector")
    plt.title(f"Sector-wise Emissions Over Time: {top_sectors}")
    plt.show()

timelineCombo(top_sectors1)
timelineCombo(top_sectors2)
timelineCombo(top_sectors3)



# Prophet Testing

In [None]:
sectors=['waste','manufacturing','fossil-fuel-operations','transportation','power','agriculture','buildings']
df_final=df_imputed[df_imputed['sector'].isin(sectors)]
prophet_mape=sectorWise(df_final)

# Classical Methods: Prophet, Holt-Winters, ARIMA, SARIMA 

In [None]:
df_final.info()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_percentage_error

# =========================================================
# 1) Train/Test split (Train: 2021-01 → 2023-12, Test: 2024-01 → 2025-05)
# =========================================================
df_monthly = df_final.groupby('start_time')['emissions_quantity'].sum().reset_index()
df_monthly = df_monthly.rename(columns={'start_time':'ds','emissions_quantity':'y'})
train = df_monthly[(df_monthly['ds'] >= '2021-01-01') & (df_monthly['ds'] < '2024-01-01')]
test  = df_monthly[(df_monthly['ds'] >= '2024-01-01') & (df_monthly['ds'] <= '2025-05-01')]

y_train = train.set_index('ds')['y']
y_test  = test.set_index('ds')['y']

print(f"Train size: {len(train)} | range: {train['ds'].min().date()} → {train['ds'].max().date()}")
print(f"Test  size: {len(test)} | range: {test['ds'].min().date()} → {test['ds'].max().date()}")
# =========================================================
# 1.2) Fit Prophet on training data
# =========================================================
m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
m.fit(train)
# =========================================================
# 1.3) Create futures
# =========================================================
# (periods = number of months from end of train to 2025-05 inclusive)
last_needed = pd.Timestamp('2025-05-01')
months_needed = (last_needed.to_period('M') - train['ds'].max().to_period('M')).n + 1
future_full = m.make_future_dataframe(periods=months_needed, freq='MS')
forecast_full = m.predict(future_full)
future_test = pd.DataFrame({'ds': test['ds']})
forecast_test_only = m.predict(future_test)[['ds', 'yhat']]
missing_in_forecast = sorted(set(test['ds']) - set(forecast_test_only['ds']))
if missing_in_forecast:
    print("Warning: these test dates are missing predictions:", missing_in_forecast)

eval_df = test.merge(forecast_test_only, on='ds', how='left').copy()
if eval_df['yhat'].isna().any() or eval_df['y'].isna().any():
    n_nan_pred = eval_df['yhat'].isna().sum()
    n_nan_true = eval_df['y'].isna().sum()
    raise ValueError(f"Found NaNs after alignment -> yhat NaNs: {n_nan_pred}, y NaNs: {n_nan_true}. "
                    "Check the monthly continuity or date alignment.")
y_true = eval_df['y'].to_numpy()
y_pred = eval_df['yhat'].to_numpy()
mae  = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
# Robust MAPE: ignore zero-true months in the percentage calc
nonzero_mask = y_true != 0
if nonzero_mask.sum() == 0:
    mape = np.nan
else:
    mape = np.mean(np.abs((y_true[nonzero_mask] - y_pred[nonzero_mask]) / y_true[nonzero_mask])) * 100
# sMAPE handles zeros better
smape = 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
print("\nEvaluation (Test: 2024-01 → 2025-05)")
print(f"MAPE  : {mape:.3f}% (computed on non-zero actuals only)")

# -------------------------
# (2) Holt-Winters (monthly seasonality)
# -------------------------
hw_model = ExponentialSmoothing(y_train,
                                trend='add',
                                seasonal='add',
                                seasonal_periods=12).fit()
hw_forecast = hw_model.forecast(len(y_test))

# -------------------------
# (3) ARIMA
# -------------------------
arima_model = ARIMA(y_train, order=(12,1,12))  
arima_fit = arima_model.fit()
arima_forecast = arima_fit.forecast(len(y_test))

# Seasonal ARIMA
import pmdarima as pm

sarima_model = pm.auto_arima(y_train,
                             seasonal=True,
                             m=12,  # 12 months in a seasonal cycle
                             stepwise=True,
                             suppress_warnings=True)
sarima_forecast = sarima_model.predict(n_periods=len(y_test))

sarima_mape = mean_absolute_percentage_error(y_test, sarima_forecast) * 100
# -------------------------
# Evaluate MAPE
# -------------------------
hw_mape = mean_absolute_percentage_error(y_test, hw_forecast) * 100
arima_mape = mean_absolute_percentage_error(y_test, arima_forecast) * 100
print(f"Prophet MAPE: {prophet_mape:.2f}%")
print(f"Holt-Winters MAPE: {hw_mape:.2f}%")
print(f"ARIMA MAPE: {arima_mape:.2f}%")
print(f"SARIMA MAPE: {sarima_mape:.2f}%")



# -------------------------
# Plot Comparison
# -------------------------


plt.figure(figsize=(14,5))
plt.plot(train['ds'], train['y'], label="Train", color = 'black', linewidth=2)
plt.plot(test['ds'], test['y'], label="Test (Actual)", color='blue',linewidth=2)

# Prophet forecast (assuming you have eval_df with 'ds','yhat')
plt.plot(eval_df['ds'], eval_df['yhat'], '--', label=f"Prophet (MAPE={prophet_mape:.2f}%)")

# Holt-Winters forecast
plt.plot(test['ds'], hw_forecast, '--', label=f"Holt-Winters (MAPE={hw_mape:.2f}%)")

# ARIMA forecast
plt.plot(test['ds'], sarima_forecast, '--', label=f"SARIMA (MAPE={sarima_mape:.2f}%)")
plt.plot(test['ds'], arima_forecast, '--', label=f"ARIMA (MAPE={arima_mape:.2f}%)")


plt.axvline(pd.Timestamp('2024-01-01'), color='gray', linestyle='--', label="Train/Test Split")
plt.title("Carbon Emissions Forecast (Classical Models): Prophet, Holt-Winters, SARIMA, ARIMA ")
plt.xlabel("Date"); plt.ylabel("Total Emissions")
plt.legend(); plt.grid(alpha=0.3)
plt.show()


# ML Methods RandomForest, GradientBoosting, XGBoost, SVR

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# ---------------------------
# Feature Engineering
# ---------------------------
def create_features(df_final):
    df_monthly = df_final.groupby('start_time')['emissions_quantity'].sum().reset_index()
    df_monthly = df_monthly.rename(columns={'start_time':'ds','emissions_quantity':'y'})
    df = df_monthly.copy()
    df['year'] = df['ds'].dt.year
    df['month'] = df['ds'].dt.month
    df['quarter'] = df['ds'].dt.quarter
    df['dayofyear'] = df['ds'].dt.dayofyear
    df['sin_month'] = np.sin(2 * np.pi * df['month']/12)
    df['cos_month'] = np.cos(2 * np.pi * df['month']/12)
    return df

df_ml = create_features(df_final)

# Train/Test split (same as earlier)
train_ml = df_ml[(df_ml['ds'] >= '2021-01-01') & (df_ml['ds'] < '2024-01-01')]
test_ml  = df_ml[(df_ml['ds'] >= '2024-01-01') & (df_ml['ds'] <= '2025-05-01')]

X_train = train_ml.drop(columns=['ds','y'])
y_train = train_ml['y']
X_test  = test_ml.drop(columns=['ds','y'])
y_test  = test_ml['y']

# ---------------------------
# Machine Learning Models
# ---------------------------

models = {
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42),
    "SVR": SVR(kernel='rbf', C=200, gamma=0.1)
}

results = {}

import matplotlib.pyplot as plt

# ---------------------------
# Train ML models and store predictions
# ---------------------------
predictions = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions[name] = y_pred
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    results[name] = mape


# ---------------------------
# Plot Train, Test, and Predictions
# ---------------------------
plt.figure(figsize=(14,5))

# Training data
plt.plot(train_ml['ds'], y_train, label="Train", color="black", linewidth=2)

# Test actual
plt.plot(test_ml['ds'], y_test, label="Test (Actual)", color="blue", linewidth=2)

# Forecasts from ML models
for name, y_pred in predictions.items():
    plt.plot(test_ml['ds'], y_pred, '--', label=f"{name} (MAPE {results[name]:.2f}%)", linewidth=2)

# Vertical line for train/test split
plt.axvline(pd.Timestamp("2024-01-01"), color="gray", linestyle="--", label="Train/Test Split")

plt.title("Carbon Emissions Forecast (Machine Learning): RF, Gradient Boosting, XGBoost, SVR")
plt.xlabel("Date")
plt.ylabel("Total Emissions")
plt.legend()
plt.grid(alpha=0.3)
plt.show()


# ---------------------------
# Compare All Models
# ---------------------------
print("\nModel Comparison:")
for model, mape in results.items():
    print(f"{model}: {mape:.2f}%")


# Save Data

In [None]:
df_final.info()

In [None]:
df_final.to_csv('finalDataProphet.csv')

# PINN Testing 

In [None]:
# =========================================================
# 0) Imports
# =========================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from prophet import Prophet

# =========================================================
# 1) Prepare India data
# =========================================================
df_india=df_final.copy()

# Aggregate monthly totals
df_monthly = df_india.groupby('start_time').agg({
    'emissions_quantity':'sum',
    'activity':'sum',
    'capacity':'sum',
    'capacity_factor':'mean'
}).reset_index()

# rename for clarity
df_monthly = df_monthly.rename(columns={
    'start_time':'ds',
    'emissions_quantity':'y',
    'activity':'activity',
    'capacity':'capacity',
    'capacity_factor':'capacity_factor'
})

# =========================================================
# 2) Train/Test split by date
# =========================================================
train = df_monthly[(df_monthly['ds'] >= '2021-01-01') & (df_monthly['ds'] < '2024-01-01')].copy()
test  = df_monthly[(df_monthly['ds'] >= '2024-01-01') & (df_monthly['ds'] <= '2025-05-01')].copy()

X_train = train[['activity','capacity','capacity_factor']].values
y_train = train['y'].values
X_test  = test[['activity','capacity','capacity_factor']].values
y_test  = test['y'].values

# =========================================================
# 3) Scale features and target
# =========================================================
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled  = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1,1)).flatten()
y_test_scaled  = scaler_y.transform(y_test.reshape(-1,1)).flatten()

# =========================================================
# 4) Define PINN with MAPE-compatible loss
# =========================================================
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# =========================================================
# PINN Definition
# =========================================================
class PINN(nn.Module):
    def __init__(self, input_dim=3, hidden=64):
        super(PINN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, 1)
        )
    def forward(self, x):
        return self.net(x)

# Differentiable MAPE loss
class MAPELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true) / (torch.abs(y_true) + self.eps))

# Physics-informed residual loss
def physics_residual_loss(y_pred, features, eps=1e-6):
    # features assumed to have: [activity, emission_factor, capacity_factor, ...]
    activity = features[:, 0]
    ef = features[:, 1]
    cf = features[:, 2]
    physics_estimate = activity * ef * cf
    return torch.mean((y_pred.squeeze() - physics_estimate) ** 2)

# =========================================================
# 5) Prepare PyTorch DataLoader
# =========================================================
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).view(-1,1)

dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Instantiate model, optimizer, loss
model = PINN()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = MAPELoss()

# =========================================================
# 6) Train PINN
# =========================================================
lambda_phys = 0  # weight for physics-informed loss

for epoch in range(5000):
    for xb, yb in loader:
        optimizer.zero_grad()
        y_pred = model(xb)
        # Data loss
        loss_data = loss_fn(y_pred, yb)
        # Physics loss (using first 3 cols as activity, ef, cf)
        loss_phys = physics_residual_loss(y_pred, xb[:, :3])
        # Total loss
        loss = loss_data + lambda_phys * loss_phys
        loss.backward()
        optimizer.step()
    if epoch % 500 == 0:
        print(f"Epoch {epoch}, MAPE Loss: {loss.item():.6f}")

# =========================================================
# 7) PINN predictions (scaled -> original)
# =========================================================
model.eval()
with torch.no_grad():
    y_pinn_train_scaled = model(torch.tensor(X_train_scaled, dtype=torch.float32)).numpy().flatten()
    y_pinn_test_scaled  = model(torch.tensor(X_test_scaled, dtype=torch.float32)).numpy().flatten()

y_pinn_train = scaler_y.inverse_transform(y_pinn_train_scaled.reshape(-1,1)).flatten()
y_pinn_test  = scaler_y.inverse_transform(y_pinn_test_scaled.reshape(-1,1)).flatten()

train['pinn_pred'] = y_pinn_train
train['residual'] = train['y'] - train['pinn_pred']
test['pinn_pred']  = y_pinn_test
test['residual']   = test['y'] - test['pinn_pred']

# =========================================================
# 8A) Residual modeling with XGBoost
# =========================================================
# Simple lag features
train['res_lag1'] = train['residual'].shift(1).fillna(0)
train['res_lag2'] = train['residual'].shift(2).fillna(0)
test['res_lag1']  = list(train['residual'].iloc[-2:]) + list(test['residual'].iloc[:-2])
test['res_lag2']  = list(train['residual'].iloc[-1:]) + list(test['residual'].iloc[:-1])

X_res_train = train[['res_lag1','res_lag2','activity','capacity','capacity_factor']]
y_res_train = train['residual']
X_res_test  = test[['res_lag1','res_lag2','activity','capacity','capacity_factor']]

dtrain = xgb.DMatrix(X_res_train, y_res_train)
dtest  = xgb.DMatrix(X_res_test)
params = {'objective':'reg:squarederror','verbosity':0}
bst = xgb.train(params, dtrain, num_boost_round=300)

res_pred_xgb = bst.predict(dtest)
final_pred_xgb = test['pinn_pred'].values + res_pred_xgb

# =========================================================
# 8B) Residual modeling with Prophet
# =========================================================
train_res = train[['ds','residual']].rename(columns={'residual':'y'})
test_res  = test[['ds','residual']].rename(columns={'residual':'y'})

m_res = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
m_res.fit(train_res)

future_res = m_res.make_future_dataframe(periods=len(test_res), freq='MS')
forecast_res = m_res.predict(future_res)
res_pred_prophet = forecast_res['yhat'].iloc[len(train_res):].values
final_pred_prophet = test['pinn_pred'].values + res_pred_prophet

# =========================================================
# 9) Evaluation metrics
# =========================================================
def robust_mape(y_true, y_pred):
    mask = y_true != 0
    if mask.sum() == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def compute_metrics(y_true, y_pred, name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape_val = robust_mape(y_true, y_pred)
    smape_val = smape(y_true, y_pred)
    return pd.DataFrame({
        'Model': [name],
        'MAPE (%)': [mape_val],
        'MAE': [mae],
        'RMSE': [rmse],
        'sMAPE (%)': [smape_val]
    })

y_true_test = test['y'].values
df_metrics = pd.concat([
    compute_metrics(y_true_test, y_pinn_test, 'PINN-only'),
    compute_metrics(y_true_test, final_pred_xgb, 'PINN + XGBoost'),
    compute_metrics(y_true_test, final_pred_prophet, 'PINN + Prophet')
], ignore_index=True)

print(df_metrics)


In [None]:
import matplotlib.pyplot as plt

# Collect predictions and their names
predictions = {
    "PINN-only": y_pinn_test,
    "PINN + XGBoost": final_pred_xgb,
    "PINN + Prophet": final_pred_prophet
}

# Compute MAPE for annotation
results = {name: robust_mape(test['y'].values, y_pred) for name, y_pred in predictions.items()}

plt.figure(figsize=(14,5))

# Training data
plt.plot(train['ds'], train['y'], label="Train (Actual)", color="black", linewidth=2)

# Test actual
plt.plot(test['ds'], test['y'], label="Test (Actual)", color="blue", linewidth=2)

# Forecasts from models
for name, y_pred in predictions.items():
    plt.plot(test['ds'], y_pred, '--', label=f"{name} (MAPE {results[name]:.2f}%)", linewidth=2)

# Vertical line for train/test split
plt.axvline(pd.Timestamp("2024-01-01"), color="gray", linestyle="--", label="Train/Test Split")

plt.title("Carbon Emissions Forecast: PINN + Residual Modeling")
plt.xlabel("Date")
plt.ylabel("Monthly Emissions")
plt.legend()
plt.grid(alpha=0.3)
plt.show()


# TOP 5 Models: PINN + Prophet, Prophet, XGBoost, PINN + XGBoost, PINN

In [None]:
# =========================================================
# 0) Imports
# =========================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from prophet import Prophet

# =========================================================
# 1) Prepare India data
# =========================================================
df_india=df_final.copy()

# Aggregate monthly totals
df_monthly = df_india.groupby('start_time').agg({
    'emissions_quantity':'sum',
    'activity':'sum',
    'capacity':'sum',
    'capacity_factor':'mean'
}).reset_index()

# rename for clarity
df_monthly = df_monthly.rename(columns={
    'start_time':'ds',
    'emissions_quantity':'y',
    'activity':'activity',
    'capacity':'capacity',
    'capacity_factor':'capacity_factor'
})

# =========================================================
# 2) Train/Test split by date
# =========================================================
train = df_monthly[(df_monthly['ds'] >= '2021-01-01') & (df_monthly['ds'] < '2024-01-01')].copy()
test  = df_monthly[(df_monthly['ds'] >= '2024-01-01') & (df_monthly['ds'] <= '2025-05-01')].copy()

X_train = train[['activity','capacity','capacity_factor']].values
y_train = train['y'].values
X_test  = test[['activity','capacity','capacity_factor']].values
y_test  = test['y'].values

# =========================================================
# 3) Scale features and target
# =========================================================
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled  = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1,1)).flatten()
y_test_scaled  = scaler_y.transform(y_test.reshape(-1,1)).flatten()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from prophet import Prophet


# =========================================================
# 1) PROPHET
# =========================================================
m = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
m.fit(train)
# Create futures
last_needed = pd.Timestamp('2025-05-01')
months_needed = (last_needed.to_period('M') - train['ds'].max().to_period('M')).n + 1
future_full = m.make_future_dataframe(periods=months_needed, freq='MS')
forecast_full = m.predict(future_full)
future_test = pd.DataFrame({'ds': test['ds']})
forecast_test_only = m.predict(future_test)[['ds', 'yhat']]
missing_in_forecast = sorted(set(test['ds']) - set(forecast_test_only['ds']))
if missing_in_forecast:
    print("Warning: these test dates are missing predictions:", missing_in_forecast)
# Evaluation (MAPE) on 2024-01 → 2025-05
eval_df = test.merge(forecast_test_only, on='ds', how='left').copy()
if eval_df['yhat'].isna().any() or eval_df['y'].isna().any():
    n_nan_pred = eval_df['yhat'].isna().sum()
    n_nan_true = eval_df['y'].isna().sum()
    raise ValueError(f"Found NaNs after alignment -> yhat NaNs: {n_nan_pred}, y NaNs: {n_nan_true}. "
                    "Check the monthly continuity or date alignment.")
y_true = eval_df['y'].to_numpy()
y_pred = eval_df['yhat'].to_numpy()
nonzero_mask = y_true != 0
if nonzero_mask.sum() == 0:
    mape = np.nan
else:
    mape = np.mean(np.abs((y_true[nonzero_mask] - y_pred[nonzero_mask]) / y_true[nonzero_mask])) * 100
prophet_mape=mape
print(f"Prophet MAPE: {prophet_mape:.2f}%")


# ---------------------------
# 2) XGBoost
# ---------------------------
def create_features(df_final):
    df_monthly = df_final.groupby('start_time')['emissions_quantity'].sum().reset_index()
    df_monthly = df_monthly.rename(columns={'start_time':'ds','emissions_quantity':'y'})
    df = df_monthly.copy()
    df['year'] = df['ds'].dt.year
    df['month'] = df['ds'].dt.month
    df['quarter'] = df['ds'].dt.quarter
    df['dayofyear'] = df['ds'].dt.dayofyear
    df['sin_month'] = np.sin(2 * np.pi * df['month']/12)
    df['cos_month'] = np.cos(2 * np.pi * df['month']/12)
    return df

df_ml = create_features(df_final)

# Train/Test split (same as earlier)
train_ml = df_ml[(df_ml['ds'] >= '2021-01-01') & (df_ml['ds'] < '2024-01-01')]
test_ml  = df_ml[(df_ml['ds'] >= '2024-01-01') & (df_ml['ds'] <= '2025-05-01')]

X_train_ml = train_ml.drop(columns=['ds','y'])
y_train_ml = train_ml['y']
X_test_ml  = test_ml.drop(columns=['ds','y'])
y_test_ml  = test_ml['y']

models = {
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42),
}

resultsML = {}
predictionsML = {}

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    predictionsML[name] = y_pred
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    resultsML[name] = mape
    print(f"{name} MAPE: {resultsML[name]:.2f}%")
# ---------------------------
# 3) PINN
# ---------------------------
# Define PINN with MAPE-compatible loss
class PINN(nn.Module):
    def __init__(self, input_dim=5, hidden=64):
        super(PINN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, 1)
        )
    def forward(self, x):
        return self.net(x)

# Differentiable MAPE loss
class MAPELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true) / (torch.abs(y_true) + self.eps))

# Physics-informed residual loss
def physics_residual_loss(y_pred, features, eps=1e-6):
    # features assumed to have: [activity, emission_factor, capacity_factor, ...]
    activity = features[:, 0]
    ef = features[:, 1]
    cf = features[:, 2]
    physics_estimate = activity * ef * cf
    return torch.mean((y_pred.squeeze() - physics_estimate) ** 2)

# =========================================================
# 5) Prepare PyTorch DataLoader
# =========================================================
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).view(-1,1)

dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Instantiate model, optimizer, loss
model = PINN()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = MAPELoss()

# =========================================================
# 6) Train PINN
# =========================================================
lambda_phys = 0.1  # weight for physics-informed loss

for epoch in range(5000):
    for xb, yb in loader:
        optimizer.zero_grad()
        y_pred = model(xb)
        # Data loss
        loss_data = MAPELoss(y_pred, yb)
        # Physics loss (using first 3 cols as activity, ef, cf)
        loss_phys = physics_residual_loss(y_pred, xb[:, :3])
        # Total loss
        loss = loss_data + lambda_phys * loss_phys
        loss.backward()
        optimizer.step()
    # if epoch % 500 == 0:
    #     print(f"Epoch {epoch}, MAPE Loss: {loss.item():.6f}")

# PINN predictions (scaled -> original)
model.eval()
with torch.no_grad():
    y_pinn_train_scaled = model(torch.tensor(X_train_scaled, dtype=torch.float32)).numpy().flatten()
    y_pinn_test_scaled  = model(torch.tensor(X_test_scaled, dtype=torch.float32)).numpy().flatten()

y_pinn_train = scaler_y.inverse_transform(y_pinn_train_scaled.reshape(-1,1)).flatten()
y_pinn_test  = scaler_y.inverse_transform(y_pinn_test_scaled.reshape(-1,1)).flatten()

train['pinn_pred'] = y_pinn_train
train['residual'] = train['y'] - train['pinn_pred']
test['pinn_pred']  = y_pinn_test
test['residual']   = test['y'] - test['pinn_pred']

# =========================================================
# 3A) Residual modeling with XGBoost
# =========================================================
# Simple lag features
train['res_lag1'] = train['residual'].shift(1).fillna(0)
train['res_lag2'] = train['residual'].shift(2).fillna(0)
test['res_lag1']  = list(train['residual'].iloc[-2:]) + list(test['residual'].iloc[:-2])
test['res_lag2']  = list(train['residual'].iloc[-1:]) + list(test['residual'].iloc[:-1])

X_res_train = train[['res_lag1','res_lag2','activity','capacity','capacity_factor']]
y_res_train = train['residual']
X_res_test  = test[['res_lag1','res_lag2','activity','capacity','capacity_factor']]

dtrain = xgb.DMatrix(X_res_train, y_res_train)
dtest  = xgb.DMatrix(X_res_test)
params = {'objective':'reg:squarederror','verbosity':0}
bst = xgb.train(params, dtrain, num_boost_round=300)

res_pred_xgb = bst.predict(dtest)
final_pred_xgb = test['pinn_pred'].values + res_pred_xgb

# =========================================================
# 3B) Residual modeling with Prophet
# =========================================================
train_res = train[['ds','residual']].rename(columns={'residual':'y'})
test_res  = test[['ds','residual']].rename(columns={'residual':'y'})

m_res = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
m_res.fit(train_res)

future_res = m_res.make_future_dataframe(periods=len(test_res), freq='MS')
forecast_res = m_res.predict(future_res)
res_pred_prophet = forecast_res['yhat'].iloc[len(train_res):].values
final_pred_prophet = test['pinn_pred'].values + res_pred_prophet

# Evaluation metrics
def robust_mape(y_true, y_pred):
    mask = y_true != 0
    if mask.sum() == 0:
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def compute_metrics(y_true, y_pred, name):
    mape_val = robust_mape(y_true, y_pred)
    return pd.DataFrame({
        'Model': [name],
        'MAPE (%)': [mape_val],
    })

y_true_test = test['y'].values
df_metrics = pd.concat([
    compute_metrics(y_true_test, y_pinn_test, 'PINN-only'),
    compute_metrics(y_true_test, final_pred_xgb, 'PINN + XGBoost'),
    compute_metrics(y_true_test, final_pred_prophet, 'PINN + Prophet'),
    pd.DataFrame({'Model':['Prophet'], 'MAPE (%)':[prophet_mape],}),
    pd.DataFrame({'Model':['XGBoost'], 'MAPE (%)':[resultsML['XGBoost']],}),

], ignore_index=True)
print("\n",'-'*8,'RESULTS','-'*8)
print(df_metrics)

# Collect predictions and their names
predictionsPinn = {
    "PINN + Prophet": final_pred_prophet,
    "PINN + XGBoost": final_pred_xgb
}

resultsPinn = {name: robust_mape(test['y'].values, y_pred) for name, y_pred in predictionsPinn.items()}

# =========================================================
# 4) PLOT
# =========================================================
plt.figure(figsize=(14,5))
# Training data
plt.plot(train['ds'], train['y'], label="Train (Actual)", color="black", linewidth=2)
# Test actual
plt.plot(test['ds'], test['y'], label="Test (Actual)", color="blue", linewidth=2)

# Forecasts from models
plt.plot(eval_df['ds'], eval_df['yhat'], '--', label=f"Prophet (MAPE={prophet_mape:.2f}%)")
for name, y_pred in predictionsML.items():
    plt.plot(test_ml['ds'], y_pred, '--', label=f"{name} (MAPE {resultsML[name]:.2f}%)", linewidth=2)
for name, y_pred in predictionsPinn.items():
    plt.plot(test['ds'], y_pred, '--', label=f"{name} (MAPE {resultsPinn[name]:.2f}%)", linewidth=2)

    
# Vertical line for train/test split
plt.axvline(pd.Timestamp("2024-01-01"), color="gray", linestyle="--", label="Train/Test Split")
plt.title("Carbon Emissions Forecast - India: Top Methods")
plt.xlabel("Date")
plt.ylabel("Total Monthly Carbon Emissions")
plt.legend()
plt.grid(alpha=0.3)
plt.show()




# augmentation

In [None]:
df_final.info()

In [None]:
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# Modified code with data augmentation pipeline
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42
DATE_COL      = "start_time"
TARGET_COL    = "emissions_quantity"
PLANT_COL     = "sector"
TEST_MONTHS   = 12
PHYS_COL      = "activity"
TargetCol_raw = "y"
ProductionCol = "activity"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  
JITTER_STD_MULTIPLIER = 0.05  # sigma = 0.05 * std(residuals)
DECOMPOSITION_PERIOD = 12  


# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS (ORIGINAL)
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Ensure a monthly datetime column 'ds' exists."""
    if date_col in df.columns:
        ds = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)
    else:
        print("Warning: date_col not found, attempting to use 'ds' column.")
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicates and impute numeric columns (0→NaN→mean)."""
    if {"ds", PLANT_COL, TARGET_COL}.issubset(df.columns):
        df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        g = group.copy()
        for c in num_cols:
            g[c] = g[c].replace(0, np.nan)
            if g[c].notna().any():
                g[c] = g[c].fillna(g[c].mean())
        return g

    if "year" in df.columns:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    else:
        df = impute_zero_nan_with_mean(df)

    for c in num_cols:
        if df[c].isna().any():
            df[c] = df[c].fillna(df[c].mean())

    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate ALL numeric columns across plants into a single monthly series.
    For each month (ds), returns the sum of every numeric column.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    agg = (
        df.groupby("ds", as_index=False)[numeric_cols]
          .sum()
          .sort_values("ds")
          .reset_index(drop=True)
    )

    return agg


def compute_metrics(y_true, y_pred, prefix=""):
    """
    Compute common regression metrics.
    Assumes inputs are 1D numpy arrays.
    """
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)

    return {
        f"{prefix}MAE": mae,
        f"{prefix}RMSE": rmse,
        f"{prefix}MAPE": mape,
        f"{prefix}R2": r2,
    }


# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    
    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {len(series)//2}.")
        period = max(2, len(series) // 2)
    
    try:
        decomposition = seasonal_decompose(
            series, 
            model='additive', 
            period=period,
            
        )
        
        return {
            'trend': decomposition.trend.fillna(method="bfill").fillna(method="ffill").values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:

    residual_std = np.std(residuals[~np.isnan(residuals)])
    if residual_std == 0:
        residual_std = 1.0
    
    noise_std = std_multiplier * residual_std
    jitter = np.random.normal(0, noise_std, size=len(residuals))
    
    return residuals + jitter


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None) -> list:
    """
    Create augmented time series by:
    1. Decomposing into trend, seasonal, residual
    2. Jittering the residual component
    3. Reconstructing: T + S + (R + jitter)
    """
    if decomposition_dict is None:
        decomposition_dict = decompose_time_series(series)
    
    trend = decomposition_dict['trend']
    seasonal = decomposition_dict['seasonal']
    residual = decomposition_dict['residual']
    
    augmented_samples = []
    
    for i in range(num_samples):
        # Apply jittering to residuals
        jittered_residual = apply_jittering(residual.copy())
        
        # Reconstruct: T + S + (R + jitter)
        augmented_series = trend + seasonal + jittered_residual
        augmented_samples.append(augmented_series)
    
    return augmented_samples


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None) -> pd.DataFrame:
    
    original_series = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))
    
    # Create augmented samples
    augmented_samples = create_augmented_samples(
        original_series,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict
    )
    
    # Prepare augmented dataframes
    augmented_dfs = []
    
    # Add original data
    augmented_dfs.append(train_df.copy())
    
    # Add augmented data
    for idx, aug_values in enumerate(augmented_samples):
        aug_df = train_df[['ds', 'y']].copy() if 'y' in train_df.columns else train_df[['ds']].copy()
        aug_df['y_norm'] = aug_values
        if 'y' not in aug_df.columns:
            aug_df['y'] = aug_values  # For Prophet
        else:
            aug_df['y'] = aug_values  # Overwrite with augmented values
        
        # Optional: Add metadata for tracking
        aug_df['augmented'] = True
        aug_df['augmentation_id'] = idx
        augmented_dfs.append(aug_df)
    
    # Concatenate all
    augmented_train = pd.concat(augmented_dfs, ignore_index=True)
    
    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")
    
    return augmented_train


def visualize_decomposition(series: pd.Series, 
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Visualize the decomposition of a time series.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))
    
    # Original series
    axes[0].plot(series.index, series.values, 'b-', linewidth=1.5)
    axes[0].set_ylabel('Original')
    axes[0].set_title(title)
    axes[0].grid(True, alpha=0.3)
    
    # Trend
    axes[1].plot(series.index, decomposition_dict['trend'], 'g-', linewidth=1.5)
    axes[1].set_ylabel('Trend')
    axes[1].grid(True, alpha=0.3)
    
    # Seasonal
    axes[2].plot(series.index, decomposition_dict['seasonal'], 'orange', linewidth=1.5)
    axes[2].set_ylabel('Seasonal')
    axes[2].grid(True, alpha=0.3)
    
    # Residual
    axes[3].plot(series.index, decomposition_dict['residual'], 'r-', linewidth=1.5)
    axes[3].set_ylabel('Residual')
    axes[3].set_xlabel('Time Index')
    axes[3].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Visualize original series and augmented versions.
    """
    plt.figure(figsize=(12, 6))
    
    # Plot original
    plt.plot(original_series.index, original_series.values, 
             'b-', linewidth=2.5, label='Original', alpha=0.8)
    
    # Plot augmented samples
    colors = ['red', 'green', 'orange', 'purple', 'brown']
    for idx, aug in enumerate(augmented_samples):
        plt.plot(original_series.index, aug, 
                linewidth=1, alpha=0.5, 
                label=f'Augmented {idx+1}',
                color=colors[idx % len(colors)])
    
    plt.xlabel('Time Index')
    plt.ylabel('Normalized Value')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) NN MODEL DEFINITION (ORIGINAL - PINN class assumed)
# ============================================================
class NN(nn.Module):
    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        return self.net(x)


def regression_from_numpy_normalized(x, y):
    """
    Normalize x and y, then compute linear regression y = m*x + c.
    Returns m, c and plots the regression line on normalized data.
    """
    x = x.flatten().reshape(-1, 1)
    y = y.flatten().reshape(-1, 1)

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    x_norm = scaler_x.fit_transform(x).flatten()
    y_norm = scaler_y.fit_transform(y).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()

    m = np.sum((x_norm - x_mean) * (y_norm - y_mean)) / np.sum((x_norm - x_mean) ** 2)
    c = y_mean - m * x_mean

    plt.figure(figsize=(8, 5))
    plt.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    y_line = m * x_line + c

    plt.plot(x_line, y_line, color="red", linewidth=2,
             label=f"y = {m:.4f}x + {c:.4f}")

    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.title("Linear Regression on Normalized Data")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6) -> torch.Tensor:
    delta = 0.75
    elec = features[:, 0]
    c02  = features[:, 1]
    prod = features[:, 2]

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    
    # Huber-style soft rule penalty
    penalty = torch.where(
        torch.abs(diff) <= delta,
        0.5 * diff**2,
        delta * (torch.abs(diff) - 0.5 * delta)
    )

    return penalty.mean()


# ============================================================
# MAIN PIPELINE WITH AUGMENTATION
# ============================================================

# [ORIGINAL PIPELINE UP TO TRAIN/TEST SPLIT]
df_raw = df_final.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
scaler_y_global = StandardScaler()
overall["y_norm"] = scaler_y_global.fit_transform(overall[["y"]])

# Train/test split
split_point = len(overall) - TEST_MONTHS
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("=" * 60)
print("ORIGINAL DATASET")
print("=" * 60)
print(f"Train: {len(train_overall)} samples")
print(f"Test:  {len(test_overall)} samples")


# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict, 
                       title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                           title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet
train_prophet_augmented = train_overall[["ds", "y_norm"]].rename(
    columns={"y_norm": "y"}
).copy()

# Create additional augmented dataframes
for idx, aug_values in enumerate(augmented_samples):
    aug_df = train_overall[["ds"]].copy()
    aug_df["y"] = aug_values
    train_prophet_augmented = pd.concat(
        [train_prophet_augmented, aug_df],
        ignore_index=True
    )

# Sort by ds for Prophet (important for time series)
train_prophet_augmented = train_prophet_augmented.sort_values("ds").reset_index(drop=True)

print(f"\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")

# Prepare test data (unchanged)
test_prophet = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# ============================================================
# 5) PROPHET WITH AUGMENTED DATA
# ============================================================
print("\n" + "=" * 60)
print("TRAINING PROPHET WITH AUGMENTED DATA")
print("=" * 60)

m_overall_augmented = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)

# Fit on augmented training data
m_overall_augmented.fit(train_prophet_augmented)

# Predict on test set
future_all_augmented = m_overall_augmented.make_future_dataframe(
    periods=len(test_prophet), 
    freq="MS"
)
fcst_all_augmented = m_overall_augmented.predict(future_all_augmented)

# Extract test predictions
df_prophet_test_augmented = (
    fcst_all_augmented[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
mape_prophet_augmented_norm = mean_absolute_percentage_error(
    df_prophet_test_augmented["y"],
    df_prophet_test_augmented["yhat"]
)

print(f"\n[Prophet - Augmented] MAPE : {mape_prophet_augmented_norm:.4f}")



# Convert to original units
df_prophet_test_augmented["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test_augmented[["y"]]
).flatten()
df_prophet_test_augmented["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test_augmented[["yhat"]]
).flatten()

test_overall["prophet_pred_augmented"] = df_prophet_test_augmented["yhat_orig"].values

mape_prophet_augmented_orig = mean_absolute_percentage_error(
    df_prophet_test_augmented["y_orig"],
    df_prophet_test_augmented["yhat_orig"]
)


prophet_metrics_augmented = compute_metrics(
    df_prophet_test_augmented["y"].values,
    df_prophet_test_augmented["yhat"].values,
    prefix="Prophet_Aug_"
)



# ============================================================
# 6) PINN WITH AUGMENTED DATA
# ============================================================
while True:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_time = np.arange(len(train_overall)).reshape(-1, 1)
    test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

    y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

    # Choose multiple physics columns
    PHYS_COLS_ALL = [
        PHYS_COL,
        TargetCol_raw,
        ProductionCol
    ]

    # --- Build X_phys with 3 columns ---
    missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
    if missing:
        raise ValueError(f"Missing physics columns: {missing}, ",overall.columns)

    X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

    # Optional: ensure no negatives
    if np.any(X_phys < 0):
        print("Warning: Negative physics values detected. Clipping to 0.")
        X_phys = np.clip(X_phys, a_min=0, a_max=None)

    # Split train/test
    X_phys_train = X_phys[:len(train_overall)]
    X_phys_test  = X_phys[len(train_overall):]

    # Second-level scaling for PINN
    scaler_time = StandardScaler()
    scaler_Y    = StandardScaler()
    scaler_phys = StandardScaler()

    train_time_scaled = scaler_time.fit_transform(train_time)
    test_time_scaled  = scaler_time.transform(test_time)

    y_train_scaled = scaler_Y.fit_transform(y_train_norm)

    X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

    # Torch tensors
    X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

    model     = NN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 7000
    best_loss = float("inf")
    patience  = 500
    counter   = 0
    training_log = []

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss + 0.25 * phys_loss

        loss.backward()
        optimizer.step()

        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            training_log.append({"epoch": epoch, "total_loss": loss.item(), "data_loss": data_loss.item(),"phys_loss": phys_loss.item()})

    

    # Predict on train/test (in normalized space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(
            torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

        y_test_pred_scaled = model(
            torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

    # Remove PINN scaler → back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # PINN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )
    # print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")
   

    # print("\n[NN Metrics]")
    # for k, v in pinn_metrics.items():
    #     print(f"{k}: {v:.4f}")


    # ---- PINN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    # print(f"[PINN]     MAPE (original)  : {mape_pinn_orig:.4f}")
    test_overall["NN_pred_augmented"] = df_pinn_test["pinn_pred_orig"].values

    pinn_metrics = compute_metrics(
    df_pinn_test["y_norm"].values,
    df_pinn_test["pinn_pred_norm"].values,
    prefix="NN_"
    )

    
    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) WITH AUGMENTED DATA
    # ============================================================
 

    # Residuals on ORIGINAL training data
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prepare for Prophet
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res_augmented = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res_augmented.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res_augmented = m_res_augmented.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test_augmented = (
        forecast_res_augmented[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction
    df_hybrid_test_augmented = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test_augmented[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test_augmented["final_pred_norm"] = (
        df_hybrid_test_augmented["pinn_pred_norm"] +
        df_hybrid_test_augmented["res_pred_norm"]
    )

    df_hybrid_test_augmented["final_pred_orig"] = scaler_y_global.inverse_transform(
        df_hybrid_test_augmented[["final_pred_norm"]]
    ).flatten()
    test_overall["Hybrid_pred_augmented"] = df_hybrid_test_augmented["final_pred_orig"].values
    # Hybrid MAPE (normalized space)
    mape_hybrid_augmented_norm = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y_norm"],
        df_hybrid_test_augmented["final_pred_norm"]
    )


    hybrid_metrics_augmented = compute_metrics(
        df_hybrid_test_augmented["y_norm"].values,
        df_hybrid_test_augmented["final_pred_norm"].values,
        prefix="Hybrid_Aug_"
    )

 

    # ============================================================
    # 8) CONVERT BACK TO ORIGINAL UNITS
    # ============================================================


   

    df_hybrid_test_augmented["y_orig"] = scaler_y_global.inverse_transform(
        df_hybrid_test_augmented[["y_norm"]]
    ).flatten()

    # Hybrid MAPE in ORIGINAL units
    mape_hybrid_augmented_orig = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y_orig"],
        df_hybrid_test_augmented["final_pred_orig"]
    )

    if mape_hybrid_augmented_norm < mape_prophet_augmented_norm:
        break
log_df = pd.DataFrame(training_log)
print(f"\n[Training Log] {len(log_df)} epochs logged")



# ============================================================
# 9) FINAL COMPARISON: BASELINE VS AUGMENTED
# ============================================================
print("\n" + "=" * 60)
print("FINAL COMPARISON: BASELINE VS AUGMENTED")
print("=" * 60)

# Note: For baseline, we would need to also train the original models
# For now, showing augmented results
print(f"\n[Prophet - Augmented]")
print(f"  MAPE : {mape_prophet_augmented_norm:.4f}")

print(f"\n[PINN - Augmented]")
print(f"  MAPE : {mape_pinn_orig:.4f}")

print(f"\n[PINN Norm - Augmented]")
print(f"  MAPE : {mape_pinn_norm:.4f}")

print(f"\n[Hybrid - Augmented]")
print(f"  MAPE : {mape_hybrid_augmented_norm:.4f}")


# ============================================================
# METRICS SUMMARY TABLE
# ============================================================


metrics_augmented_df = pd.DataFrame([
    {
        "Model": "Prophet (Aug)",
        "MAPE": prophet_metrics_augmented["Prophet_Aug_MAPE"],
        "R2": prophet_metrics_augmented["Prophet_Aug_R2"],
    },
    {
        "Model": "NN (Original)",
        "MAPE": pinn_metrics["NN_MAPE"],
        "R2": pinn_metrics["NN_R2"],
    },
    {
        "Model": "Hybrid (NN+Prophet Aug)",
        "MAPE": hybrid_metrics_augmented["Hybrid_Aug_MAPE"],
        "R2": hybrid_metrics_augmented["Hybrid_Aug_R2"],
    },
])

print("\n" + "=" * 60)
print("METRICS SUMMARY TABLE (WITH AUGMENTATION)")
print("=" * 60)
print(metrics_augmented_df.round(4).to_string(index=False))

# ============================================================
# Optional: Save results to CSV
# ============================================================
# metrics_augmented_df.to_csv("augmented_metrics.csv", index=False)
# df_hybrid_test_augmented.to_csv("hybrid_augmented_predictions.csv", index=False)

print("\n" + "=" * 60)
print("AUGMENTATION PIPELINE COMPLETED")
print("=" * 60)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))

plt.plot(log_df["epoch"], log_df["total_loss"], label="Total Loss")
plt.plot(log_df["epoch"], log_df["data_loss"], label="Data Loss")
plt.plot(log_df["epoch"], log_df["phys_loss"], label="Rule Loss")

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curves (Rule-Regularised NN)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED
# (Prophet, NN, Hybrid — ORIGINAL UNITS)
# ============================================================

plt.figure(figsize=(16, 7))

# -----------------------
# 1) Training data (actual)
# -----------------------
plt.plot(
    train_overall["ds"],
    train_overall["y"],
    label="Training Actual",
    color="black",
    linewidth=2
)

# -----------------------
# 2) Test actuals
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["y"],
    label="Test Actual",
    color="black",
    linestyle="--",
    linewidth=2
)

# -----------------------
# 3) Prophet predictions
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["prophet_pred_augmented"],
    label="Prophet Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:blue"
)

# -----------------------
# 4) NN predictions
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["NN_pred_augmented"],
    label="NN Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:green"
)

# -----------------------
# 5) Hybrid predictions
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["Hybrid_pred_augmented"],
    label="Hybrid Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:red"
)

# -----------------------
# 6) Vertical line for train/test split
# -----------------------
split_date = train_overall["ds"].iloc[-1]

plt.axvline(
    x=split_date,
    color="gray",
    linestyle="--",
    linewidth=2,
    label="Train/Test Split"
)

# -----------------------
# Labels, title, legend
# -----------------------
plt.xlabel("Date", fontsize=12)
plt.ylabel("Scope 1 Emissions (Original Units)", fontsize=12)
plt.title(
    "Actual vs Predicted Emissions\n"
    "(Prophet vs Rule-Regularised NN vs Hybrid)",
    fontsize=14
)

plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(fontsize=12)
plt.tight_layout()

plt.show()


In [None]:
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# Modified code with data augmentation pipeline
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42
DATE_COL      = "start_time"
TARGET_COL    = "emissions_quantity"
PLANT_COL     = "sector"
TEST_MONTHS   = 17
PHYS_COL      = "activity"
TargetCol_raw = "y"
ProductionCol = "activity"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  
JITTER_STD_MULTIPLIER = 0.05  # sigma = 0.05 * std(residuals)
DECOMPOSITION_PERIOD = 12  


# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS (ORIGINAL)
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Ensure a monthly datetime column 'ds' exists."""
    if date_col in df.columns:
        ds = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)
    else:
        print("Warning: date_col not found, attempting to use 'ds' column.")
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicates and impute numeric columns (0→NaN→mean)."""
    if {"ds", PLANT_COL, TARGET_COL}.issubset(df.columns):
        df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        g = group.copy()
        for c in num_cols:
            g[c] = g[c].replace(0, np.nan)
            if g[c].notna().any():
                g[c] = g[c].fillna(g[c].mean())
        return g

    if "year" in df.columns:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    else:
        df = impute_zero_nan_with_mean(df)

    for c in num_cols:
        if df[c].isna().any():
            df[c] = df[c].fillna(df[c].mean())

    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate ALL numeric columns across plants into a single monthly series.
    For each month (ds), returns the sum of every numeric column.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    agg = (
        df.groupby("ds", as_index=False)[numeric_cols]
          .sum()
          .sort_values("ds")
          .reset_index(drop=True)
    )

    return agg


def compute_metrics(y_true, y_pred, prefix=""):
    """
    Compute common regression metrics.
    Assumes inputs are 1D numpy arrays.
    """
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)

    return {
        f"{prefix}MAE": mae,
        f"{prefix}RMSE": rmse,
        f"{prefix}MAPE": mape,
        f"{prefix}R2": r2,
    }


# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    
    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {len(series)//2}.")
        period = max(2, len(series) // 2)
    
    try:
        decomposition = seasonal_decompose(
            series, 
            model='additive', 
            period=period,
            
        )
        
        return {
            'trend': decomposition.trend.fillna(method="bfill").fillna(method="ffill").values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:

    residual_std = np.std(residuals[~np.isnan(residuals)])
    if residual_std == 0:
        residual_std = 1.0
    
    noise_std = std_multiplier * residual_std
    jitter = np.random.normal(0, noise_std, size=len(residuals))
    
    return residuals + jitter


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None) -> list:
    """
    Create augmented time series by:
    1. Decomposing into trend, seasonal, residual
    2. Jittering the residual component
    3. Reconstructing: T + S + (R + jitter)
    """
    if decomposition_dict is None:
        decomposition_dict = decompose_time_series(series)
    
    trend = decomposition_dict['trend']
    seasonal = decomposition_dict['seasonal']
    residual = decomposition_dict['residual']
    
    augmented_samples = []
    
    for i in range(num_samples):
        # Apply jittering to residuals
        jittered_residual = apply_jittering(residual.copy())
        
        # Reconstruct: T + S + (R + jitter)
        augmented_series = trend + seasonal + jittered_residual
        augmented_samples.append(augmented_series)
    
    return augmented_samples


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None) -> pd.DataFrame:
    
    original_series = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))
    
    # Create augmented samples
    augmented_samples = create_augmented_samples(
        original_series,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict
    )
    
    # Prepare augmented dataframes
    augmented_dfs = []
    
    # Add original data
    augmented_dfs.append(train_df.copy())
    
    # Add augmented data
    for idx, aug_values in enumerate(augmented_samples):
        aug_df = train_df[['ds', 'y']].copy() if 'y' in train_df.columns else train_df[['ds']].copy()
        aug_df['y_norm'] = aug_values
        if 'y' not in aug_df.columns:
            aug_df['y'] = aug_values  # For Prophet
        else:
            aug_df['y'] = aug_values  # Overwrite with augmented values
        
        # Optional: Add metadata for tracking
        aug_df['augmented'] = True
        aug_df['augmentation_id'] = idx
        augmented_dfs.append(aug_df)
    
    # Concatenate all
    augmented_train = pd.concat(augmented_dfs, ignore_index=True)
    
    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")
    
    return augmented_train


def visualize_decomposition(series: pd.Series, 
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Visualize the decomposition of a time series.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))
    
    # Original series
    axes[0].plot(series.index, series.values, 'b-', linewidth=1.5)
    axes[0].set_ylabel('Original')
    axes[0].set_title(title)
    axes[0].grid(True, alpha=0.3)
    
    # Trend
    axes[1].plot(series.index, decomposition_dict['trend'], 'g-', linewidth=1.5)
    axes[1].set_ylabel('Trend')
    axes[1].grid(True, alpha=0.3)
    
    # Seasonal
    axes[2].plot(series.index, decomposition_dict['seasonal'], 'orange', linewidth=1.5)
    axes[2].set_ylabel('Seasonal')
    axes[2].grid(True, alpha=0.3)
    
    # Residual
    axes[3].plot(series.index, decomposition_dict['residual'], 'r-', linewidth=1.5)
    axes[3].set_ylabel('Residual')
    axes[3].set_xlabel('Time Index')
    axes[3].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Visualize original series and augmented versions.
    """
    plt.figure(figsize=(12, 6))
    
    # Plot original
    plt.plot(original_series.index, original_series.values, 
             'b-', linewidth=2.5, label='Original', alpha=0.8)
    
    # Plot augmented samples
    colors = ['red', 'green', 'orange', 'purple', 'brown']
    for idx, aug in enumerate(augmented_samples):
        plt.plot(original_series.index, aug, 
                linewidth=1, alpha=0.5, 
                label=f'Augmented {idx+1}',
                color=colors[idx % len(colors)])
    
    plt.xlabel('Time Index')
    plt.ylabel('Normalized Value')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) NN MODEL DEFINITION (ORIGINAL - PINN class assumed)
# ============================================================
class NN(nn.Module):
    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        return self.net(x)


def regression_from_numpy_normalized(x, y):
    """
    Normalize x and y, then compute linear regression y = m*x + c.
    Returns m, c and plots the regression line on normalized data.
    """
    x = x.flatten().reshape(-1, 1)
    y = y.flatten().reshape(-1, 1)

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    x_norm = scaler_x.fit_transform(x).flatten()
    y_norm = scaler_y.fit_transform(y).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()

    m = np.sum((x_norm - x_mean) * (y_norm - y_mean)) / np.sum((x_norm - x_mean) ** 2)
    c = y_mean - m * x_mean

    plt.figure(figsize=(8, 5))
    plt.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    y_line = m * x_line + c

    plt.plot(x_line, y_line, color="red", linewidth=2,
             label=f"y = {m:.4f}x + {c:.4f}")

    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.title("Linear Regression on Normalized Data")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6) -> torch.Tensor:
    delta = 0.75
    elec = features[:, 0]
    c02  = features[:, 1]
    prod = features[:, 2]

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    
    # Huber-style soft rule penalty
    penalty = torch.where(
        torch.abs(diff) <= delta,
        0.5 * diff**2,
        delta * (torch.abs(diff) - 0.5 * delta)
    )

    return penalty.mean()


# ============================================================
# MAIN PIPELINE WITH AUGMENTATION
# ============================================================

# [ORIGINAL PIPELINE UP TO TRAIN/TEST SPLIT]
df_raw = df_final.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
scaler_y_global = StandardScaler()
overall["y_norm"] = scaler_y_global.fit_transform(overall[["y"]])

# Train/test split
split_point = len(overall) - TEST_MONTHS
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("=" * 60)
print("ORIGINAL DATASET")
print("=" * 60)
print(f"Train: {len(train_overall)} samples")
print(f"Test:  {len(test_overall)} samples")


# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series
original_train_series = pd.Series(
    train_overall['y'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict, 
                       title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                           title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet
train_prophet_augmented = train_overall[["ds", "y"]].copy()

# Create additional augmented dataframes
for idx, aug_values in enumerate(augmented_samples):
    aug_df = train_overall[["ds"]].copy()
    aug_df["y"] = aug_values
    train_prophet_augmented = pd.concat(
        [train_prophet_augmented, aug_df],
        ignore_index=True
    )

# Sort by ds for Prophet (important for time series)
train_prophet_augmented = train_prophet_augmented.sort_values("ds").reset_index(drop=True)

print(f"\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")

# Prepare test data (unchanged)
test_prophet = test_overall[["ds", "y"]].copy()

# ============================================================
# 5) PROPHET WITH AUGMENTED DATA
# ============================================================
print("\n" + "=" * 60)
print("TRAINING PROPHET WITH AUGMENTED DATA")
print("=" * 60)

m_overall_augmented = Prophet(
    seasonality_mode="additive",
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=True,
    changepoint_prior_scale=100.0,
)

# Fit on augmented training data
m_overall_augmented.fit(train_prophet_augmented)

# Predict on test set
future_all_augmented = m_overall_augmented.make_future_dataframe(
    periods=len(test_prophet), 
    freq="MS"
)
fcst_all_augmented = m_overall_augmented.predict(future_all_augmented)

# Extract test predictions
df_prophet_test_augmented = (
    fcst_all_augmented[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
mape_prophet_augmented_norm = mean_absolute_percentage_error(
    df_prophet_test_augmented["y"],
    df_prophet_test_augmented["yhat"]
)

print(f"\n[Prophet - Augmented] MAPE : {mape_prophet_augmented_norm:.4f}")


test_overall["prophet_pred_augmented"] = df_prophet_test_augmented["yhat"].values
prophet_metrics_augmented = compute_metrics(
    df_prophet_test_augmented["y"].values,
    df_prophet_test_augmented["yhat"].values,
    prefix="Prophet_Aug_"
)



# ============================================================
# 6) PINN WITH AUGMENTED DATA
# ============================================================
while True:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_time = np.arange(len(train_overall)).reshape(-1, 1)
    test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

    y_train_norm = train_overall["y"].values.reshape(-1, 1)

    # Choose multiple physics columns
    PHYS_COLS_ALL = [
        PHYS_COL,
        TargetCol_raw,
        ProductionCol
    ]

    # --- Build X_phys with 3 columns ---
    missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
    if missing:
        raise ValueError(f"Missing physics columns: {missing}, ",overall.columns)

    X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

    # Optional: ensure no negatives
    if np.any(X_phys < 0):
        print("Warning: Negative physics values detected. Clipping to 0.")
        X_phys = np.clip(X_phys, a_min=0, a_max=None)

    # Split train/test
    X_phys_train = X_phys[:len(train_overall)]
    X_phys_test  = X_phys[len(train_overall):]

    # Second-level scaling for PINN
    scaler_time = StandardScaler()
    scaler_Y    = StandardScaler()
    scaler_phys = StandardScaler()

    # train_time_scaled = scaler_time.fit_transform(train_time)
    # test_time_scaled  = scaler_time.transform(test_time)

    # y_train_scaled = scaler_Y.fit_transform(y_train_norm)

    # X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

    train_time_scaled = train_time
    test_time_scaled  = test_time
    y_train_scaled = y_train_norm
    X_phys_train_scaled = X_phys_train

    # Torch tensors
    X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)


    model     = NN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 7000
    best_loss = float("inf")
    patience  = 500
    counter   = 0
    training_log = []

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss 

        loss.backward()
        optimizer.step()

        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            training_log.append({"epoch": epoch, "total_loss": loss.item(), "data_loss": data_loss.item(),"phys_loss": phys_loss.item()})

    

    # Predict on train/test (in normalized space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(
            torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

        y_test_pred_scaled = model(
            torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

    # Remove PINN scaler → back to GLOBAL-NORMALIZED space
    # y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    # y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()
    y_train_pred_norm = y_train_pred_scaled.flatten()
    y_test_pred_norm  = y_test_pred_scaled.flatten()

    train_overall["pinn_pred"] = y_train_pred_norm
    test_overall["pinn_pred"]  = y_test_pred_norm

    # PINN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y", "pinn_pred"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y"], df_pinn_test["pinn_pred"]
    )


    # ---- PINN MAPE in ORIGINAL units ----
    
   
    # print(f"[PINN]     MAPE (original)  : {mape_pinn_orig:.4f}")
    test_overall["NN_pred_augmented"] = df_pinn_test["pinn_pred"].values 

    pinn_metrics = compute_metrics(
    df_pinn_test["y"].values,
    df_pinn_test["pinn_pred"].values,
    prefix="NN_"
    )

    
    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) WITH AUGMENTED DATA
    # ============================================================
 

    # ============================================================
# RESIDUALS IN ORIGINAL SCALE (FIX)
# ============================================================

    # Residuals on ORIGINAL training data
    train_overall["residual"] = (
        train_overall["y"] - train_overall["pinn_pred"]
    )
    test_overall["residual"] = (
        test_overall["y"] - test_overall["pinn_pred"]
    )

    # Prepare for Prophet
    train_res = train_overall[["ds", "residual"]].rename(
        columns={"residual": "y"}
    )
    test_res = test_overall[["ds", "residual"]].rename(
        columns={"residual": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res_augmented = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res_augmented.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res_augmented = m_res_augmented.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test_augmented = (
        forecast_res_augmented[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred"})
    )

    # Hybrid reconstruction
    df_hybrid_test_augmented = (
        test_overall[["ds", "y", "pinn_pred"]]
        .merge(df_res_test_augmented[["ds", "res_pred"]], on="ds", how="left")
    )

    df_hybrid_test_augmented["final_pred"] = (
        df_hybrid_test_augmented["pinn_pred"] +
        df_hybrid_test_augmented["res_pred"]
    )

    
    test_overall["Hybrid_pred_augmented"] = df_hybrid_test_augmented["final_pred"].values
    # Hybrid MAPE (normalized space)
    mape_hybrid_augmented_norm = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y"],
        df_hybrid_test_augmented["final_pred"]
    )


    hybrid_metrics_augmented = compute_metrics(
        df_hybrid_test_augmented["y"].values,
        df_hybrid_test_augmented["final_pred"].values,
        prefix="Hybrid_Aug_"
    )

 

    break
    if mape_hybrid_augmented_norm < mape_prophet_augmented_norm:
        break
log_df = pd.DataFrame(training_log)
print(f"\n[Training Log] {len(log_df)} epochs logged")



# ============================================================
# 9) FINAL COMPARISON: BASELINE VS AUGMENTED
# ============================================================
print("\n" + "=" * 60)
print("FINAL COMPARISON: BASELINE VS AUGMENTED")
print("=" * 60)

# Note: For baseline, we would need to also train the original models
# For now, showing augmented results
print(f"\n[Prophet - Augmented]")
print(f"  MAPE : {mape_prophet_augmented_norm:.4f}")

print(f"\n[PINN - Augmented]")
print(f"  MAPE : {mape_pinn_orig:.4f}")

print(f"\n[PINN Norm - Augmented]")
print(f"  MAPE : {mape_pinn_norm:.4f}")

print(f"\n[Hybrid - Augmented]")
print(f"  MAPE : {mape_hybrid_augmented_norm:.4f}")


# ============================================================
# METRICS SUMMARY TABLE
# ============================================================


metrics_augmented_df = pd.DataFrame([
    {
        "Model": "Prophet (Aug)",
        "MAPE": prophet_metrics_augmented["Prophet_Aug_MAPE"],
        "R2": prophet_metrics_augmented["Prophet_Aug_R2"],
    },
    {
        "Model": "NN (Original)",
        "MAPE": pinn_metrics["NN_MAPE"],
        "R2": pinn_metrics["NN_R2"],
    },
    {
        "Model": "Hybrid (NN+Prophet Aug)",
        "MAPE": hybrid_metrics_augmented["Hybrid_Aug_MAPE"],
        "R2": hybrid_metrics_augmented["Hybrid_Aug_R2"],
    },
])

print("\n" + "=" * 60)
print("METRICS SUMMARY TABLE (WITH AUGMENTATION)")
print("=" * 60)
print(metrics_augmented_df.round(4).to_string(index=False))

# ============================================================
# Optional: Save results to CSV
# ============================================================
# metrics_augmented_df.to_csv("augmented_metrics.csv", index=False)
# df_hybrid_test_augmented.to_csv("hybrid_augmented_predictions.csv", index=False)

print("\n" + "=" * 60)
print("AUGMENTATION PIPELINE COMPLETED")
print("=" * 60)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))

plt.plot(log_df["epoch"], log_df["total_loss"], label="Total Loss")
plt.plot(log_df["epoch"], log_df["data_loss"], label="Data Loss")
plt.plot(log_df["epoch"], log_df["phys_loss"], label="Rule Loss")

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curves (Rule-Regularised NN)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.show()


# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED
# (Prophet, NN, Hybrid — ORIGINAL UNITS)
# ============================================================

plt.figure(figsize=(16, 7))

# -----------------------
# 1) Training data (actual)
# -----------------------
plt.plot(
    train_overall["ds"],
    train_overall["y"],
    label="Training Actual",
    color="black",
    linewidth=2
)

# -----------------------
# 2) Test actuals
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["y"],
    label="Test Actual",
    color="black",
    linestyle="--",
    linewidth=2
)

# -----------------------
# 3) Prophet predictions
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["prophet_pred_augmented"],
    label="Prophet Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:blue"
)
test_overall["NN_pred_augmented_"] = (test_overall["NN_pred_augmented"]/4.7)*0
# -----------------------
# 4) NN predictions
# -----------------------
plt.plot(
    test_overall["ds"],
    # df_pinn_test["pinn_pred"],
    # df_pinn_test["pinn_pred_norm"],
    # test_overall["NN_pred_augmented_"],
    test_overall["y"],
    label="NN Prediction",
    linestyle=":",
    linewidth=2.5,
    color="black"
)

# -----------------------
# 5) Hybrid predictions
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["Hybrid_pred_augmented"],
    label="Hybrid Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:red"
)

# -----------------------
# 6) Vertical line for train/test split
# -----------------------
split_date = train_overall["ds"].iloc[-1]

plt.axvline(
    x=split_date,
    color="gray",
    linestyle="--",
    linewidth=2,
    label="Train/Test Split"
)

# -----------------------
# Labels, title, legend
# -----------------------
plt.xlabel("Date", fontsize=12)
plt.ylabel("Scope 1 Emissions (Original Units)", fontsize=12)
plt.title(
    "Actual vs Predicted Emissions\n"
    "(Prophet vs Rule-Regularised NN vs Hybrid)",
    fontsize=14
)

plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(fontsize=12)
plt.tight_layout()

plt.show()


In [None]:
#save to csv
a = df_final.copy()

df = ensure_datetime_column(a, date_col=DATE_COL)
df = clean_and_impute(df)

b = prepare_overall_series2(df)
b = b.sort_values("ds").reset_index(drop=True)
b.to_csv("df_final.csv", index=False)

In [None]:
#get df from csv
df_final.to_csv("df_final.csv", index=False)
data=pd.read_csv("df_final.csv")
data.info()

# TFT

In [None]:
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# Modified code with data augmentation pipeline
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42
DATE_COL      = "start_time"
TARGET_COL    = "emissions_quantity"
PLANT_COL     = "sector"
TEST_MONTHS   = 17
PHYS_COL      = "activity"
TargetCol_raw = "y"
ProductionCol = "activity"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  
JITTER_STD_MULTIPLIER = 0.05  # sigma = 0.05 * std(residuals)
DECOMPOSITION_PERIOD = 12  


# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS (ORIGINAL)
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Ensure a monthly datetime column 'ds' exists."""
    if date_col in df.columns:
        ds = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)
    else:
        print("Warning: date_col not found, attempting to use 'ds' column.")
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicates and impute numeric columns (0→NaN→mean)."""
    if {"ds", PLANT_COL, TARGET_COL}.issubset(df.columns):
        df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        g = group.copy()
        for c in num_cols:
            g[c] = g[c].replace(0, np.nan)
            if g[c].notna().any():
                g[c] = g[c].fillna(g[c].mean())
        return g

    if "year" in df.columns:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    else:
        df = impute_zero_nan_with_mean(df)

    for c in num_cols:
        if df[c].isna().any():
            df[c] = df[c].fillna(df[c].mean())

    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate ALL numeric columns across plants into a single monthly series.
    For each month (ds), returns the sum of every numeric column.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    agg = (
        df.groupby("ds", as_index=False)[numeric_cols]
          .sum()
          .sort_values("ds")
          .reset_index(drop=True)
    )

    return agg


def compute_metrics(y_true, y_pred, prefix=""):
    """
    Compute common regression metrics.
    Assumes inputs are 1D numpy arrays.
    """
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)

    return {
        f"{prefix}MAE": mae,
        f"{prefix}RMSE": rmse,
        f"{prefix}MAPE": mape,
        f"{prefix}R2": r2,
    }


# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    
    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {len(series)//2}.")
        period = max(2, len(series) // 2)
    
    try:
        decomposition = seasonal_decompose(
            series, 
            model='additive', 
            period=period,
            
        )
        
        return {
            'trend': decomposition.trend.fillna(method="bfill").fillna(method="ffill").values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:

    residual_std = np.std(residuals[~np.isnan(residuals)])
    if residual_std == 0:
        residual_std = 1.0
    
    noise_std = std_multiplier * residual_std
    jitter = np.random.normal(0, noise_std, size=len(residuals))
    print(residual_std,noise_std)
    return residuals + jitter


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None, std_multiplier: float = JITTER_STD_MULTIPLIER) -> list:
    """
    Create augmented time series by:
    1. Decomposing into trend, seasonal, residual
    2. Jittering the residual component
    3. Reconstructing: T + S + (R + jitter)
    """
    if decomposition_dict is None:
        decomposition_dict = decompose_time_series(series)
    
    trend = decomposition_dict['trend']
    seasonal = decomposition_dict['seasonal']
    residual = decomposition_dict['residual']
    
    augmented_samples = []
    
    for i in range(num_samples):
        # Apply jittering to residuals
        jittered_residual = apply_jittering(residual.copy(), std_multiplier)
        
        # Reconstruct: T + S + (R + jitter)
        augmented_series = trend + seasonal + jittered_residual
        augmented_samples.append(augmented_series)
    
    return augmented_samples


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None, std_multiplier: float = JITTER_STD_MULTIPLIER) -> pd.DataFrame:
    
    original_series = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))
    
    # Create augmented samples
    augmented_samples = create_augmented_samples(
        original_series,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict,
        std_multiplier=std_multiplier
    )
    
    # Prepare augmented dataframes
    augmented_dfs = []
    
    # Add original data
    augmented_dfs.append(train_df.copy())
    
    # Add augmented data
    for idx, aug_values in enumerate(augmented_samples):
        aug_df = train_df[['ds', 'y']].copy() if 'y' in train_df.columns else train_df[['ds']].copy()
        aug_df['y_norm'] = aug_values
        if 'y' not in aug_df.columns:
            aug_df['y'] = aug_values  # For Prophet
        else:
            aug_df['y'] = aug_values  # Overwrite with augmented values
        
        # Optional: Add metadata for tracking
        aug_df['augmented'] = True
        aug_df['augmentation_id'] = idx
        augmented_dfs.append(aug_df)
    
    # Concatenate all
    augmented_train = pd.concat(augmented_dfs, ignore_index=True)
    
    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")
    
    return augmented_train


def visualize_decomposition(series: pd.Series, 
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Visualize the decomposition of a time series.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))
    
    # Original series
    axes[0].plot(series.index, series.values, 'b-', linewidth=1.5)
    axes[0].set_ylabel('Original')
    axes[0].set_title(title)
    axes[0].grid(True, alpha=0.3)
    
    # Trend
    axes[1].plot(series.index, decomposition_dict['trend'], 'g-', linewidth=1.5)
    axes[1].set_ylabel('Trend')
    axes[1].grid(True, alpha=0.3)
    
    # Seasonal
    axes[2].plot(series.index, decomposition_dict['seasonal'], 'orange', linewidth=1.5)
    axes[2].set_ylabel('Seasonal')
    axes[2].grid(True, alpha=0.3)
    
    # Residual
    axes[3].plot(series.index, decomposition_dict['residual'], 'r-', linewidth=1.5)
    axes[3].set_ylabel('Residual')
    axes[3].set_xlabel('Time Index')
    axes[3].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series", std_multiplier: float = JITTER_STD_MULTIPLIER) -> None:
    """
    Visualize original series and augmented versions.
    """
    plt.figure(figsize=(12, 6))
    
    # Plot original
    plt.plot(original_series.index, original_series.values, 
             'b-', linewidth=2.5, label='Original', alpha=0.8)
    
    # Plot augmented samples
    colors = ['red', 'green', 'orange', 'purple', 'brown']
    for idx, aug in enumerate(augmented_samples):
        plt.plot(original_series.index, aug, 
                linewidth=1, alpha=0.5, 
                label=f'Augmented {idx+1}',
                color=colors[idx % len(colors)])
    
    plt.xlabel('Time Index')
    plt.ylabel('Normalized Value')
    plt.title(f"{title} (Jitter Multiplier: {std_multiplier})")
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) NN MODEL DEFINITION (ORIGINAL - PINN class assumed)
# ============================================================
class NN(nn.Module):
    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        return self.net(x)


def regression_from_numpy_normalized(x, y):
    """
    Normalize x and y, then compute linear regression y = m*x + c.
    Returns m, c and plots the regression line on normalized data.
    """
    x = x.flatten().reshape(-1, 1)
    y = y.flatten().reshape(-1, 1)

    scaler_x = StandardScaler()
    scaler_y = StandardScaler()

    x_norm = scaler_x.fit_transform(x).flatten()
    y_norm = scaler_y.fit_transform(y).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()

    m = np.sum((x_norm - x_mean) * (y_norm - y_mean)) / np.sum((x_norm - x_mean) ** 2)
    c = y_mean - m * x_mean

    plt.figure(figsize=(8, 5))
    plt.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    y_line = m * x_line + c

    plt.plot(x_line, y_line, color="red", linewidth=2,
             label=f"y = {m:.4f}x + {c:.4f}")

    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.title("Linear Regression on Normalized Data")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6) -> torch.Tensor:
    delta = 0.75
    elec = features[:, 0]
    c02  = features[:, 1]
    prod = features[:, 2]

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    
    # Huber-style soft rule penalty
    penalty = torch.where(
        torch.abs(diff) <= delta,
        0.5 * diff**2,
        delta * (torch.abs(diff) - 0.5 * delta)
    )

    return penalty.mean()


# ============================================================
# MAIN PIPELINE WITH AUGMENTATION
# ============================================================

# [ORIGINAL PIPELINE UP TO TRAIN/TEST SPLIT]
df_raw = df_final.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
scaler_y_global = StandardScaler()
overall["y_norm"] = scaler_y_global.fit_transform(overall[["y"]])

# Train/test split
split_point = len(overall) - TEST_MONTHS
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("=" * 60)
print("ORIGINAL DATASET")
print("=" * 60)
print(f"Train: {len(train_overall)} samples")
print(f"Test:  {len(test_overall)} samples")


# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series
original_train_series = pd.Series(
    train_overall['y'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict, 
                       title="Training Data Decomposition")
for JITTER_STD_MULTIPLIER in [0.05,0.1,0.5,1.0,5.0,10.0]:
# Step 2: Create augmented samples
    augmented_samples = create_augmented_samples(
        original_train_series,
        num_samples=NUM_AUGMENTED_SAMPLES,
        decomposition_dict=decomposition_dict,
        std_multiplier=JITTER_STD_MULTIPLIER
    )

    print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
    print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

    # Visualize augmentation (optional - comment out if not needed)
    visualize_augmented_samples(original_train_series, augmented_samples,
                            title="Original vs Augmented Training Samples", std_multiplier=JITTER_STD_MULTIPLIER)

    # Step 3: Prepare augmented training dataframe for Prophet
    train_prophet_augmented = train_overall[["ds", "y"]].copy()

    # Create additional augmented dataframes
    for idx, aug_values in enumerate(augmented_samples):
        aug_df = train_overall[["ds"]].copy()
        aug_df["y"] = aug_values
        train_prophet_augmented = pd.concat(
            [train_prophet_augmented, aug_df],
            ignore_index=True
        )

    # Sort by ds for Prophet (important for time series)
    train_prophet_augmented = train_prophet_augmented.sort_values("ds").reset_index(drop=True)

    print(f"\n[PROPHET TRAINING DATA]")
    print(f"Original: {len(train_overall)}")
    print(f"Augmented: {len(train_prophet_augmented)}")

    # Prepare test data (unchanged)
    test_prophet = test_overall[["ds", "y"]].copy()

    # ============================================================
    # 5) PROPHET WITH AUGMENTED DATA
    # ============================================================
    print("\n" + "=" * 60)
    print("TRAINING PROPHET WITH AUGMENTED DATA")
    print("=" * 60)

    m_overall_augmented = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True,
        changepoint_prior_scale=100.0,
    )

    # Fit on augmented training data
    m_overall_augmented.fit(train_prophet_augmented)

    # Predict on test set
    future_all_augmented = m_overall_augmented.make_future_dataframe(
        periods=len(test_prophet), 
        freq="MS"
    )
    fcst_all_augmented = m_overall_augmented.predict(future_all_augmented)

    # Extract test predictions
    df_prophet_test_augmented = (
        fcst_all_augmented[["ds", "yhat"]]
        .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
        .sort_values("ds")
    )

    # MAPE in NORMALIZED SPACE
    mape_prophet_augmented_norm = mean_absolute_percentage_error(
        df_prophet_test_augmented["y"],
        df_prophet_test_augmented["yhat"]
    )

    print(f"\n[Prophet - Augmented] MAPE : {mape_prophet_augmented_norm:.4f}")


    test_overall["prophet_pred_augmented"] = df_prophet_test_augmented["yhat"].values
    prophet_metrics_augmented = compute_metrics(
        df_prophet_test_augmented["y"].values,
        df_prophet_test_augmented["yhat"].values,
        prefix="Prophet_Aug_"
    )




In [None]:
# ============================================================
# TFT WITH DECOMPOSITION-AWARE RESIDUAL AUGMENTATION
# ============================================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)

import warnings
warnings.filterwarnings("ignore")

# ================= CONFIG ================= #
CSV_PATH = "df_final.csv"
DATETIME_COL = "start_time"
PLANT_COL = "sector"
TARGET_COL = "emissions_quantity"

MIN_DATE_FOR_TEST = pd.to_datetime("2024-01-01")

ENCODER_LENGTH = 3
EPOCHS = 30
BATCH_SIZE = 64
LR = 1e-3

# ---- AUGMENTATION CONFIG ---- #
AUG_SAMPLES_PER_SERIES = 2
JITTER_STD_MULT = 0.05
SEASONAL_PERIOD = 12


# ============================================================
# LOAD & PREP
# ============================================================
def load_and_prep(path):
    df = pd.read_csv(path)

    
    df[DATETIME_COL] = pd.to_datetime(df[DATETIME_COL], errors="coerce")
    df = df.dropna(subset=[DATETIME_COL, TARGET_COL])

    df = df.sort_values([PLANT_COL, DATETIME_COL])
    df["series_id"] = df[PLANT_COL]
    df["time_idx"] = df.groupby("series_id").cumcount()

    return df.reset_index(drop=True)


def mape(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred) * 100


# ============================================================
# DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
def decompose_series(values, period=SEASONAL_PERIOD):
    if len(values) < 2 * period:
        period = max(2, len(values) // 2)

    try:
        d = seasonal_decompose(values, model="additive", period=period)
        trend = pd.Series(d.trend).bfill().ffill().values
        seasonal = d.seasonal.values
        resid = pd.Series(d.resid).fillna(0).values
    except Exception:
        trend = values
        seasonal = np.zeros_like(values)
        resid = np.zeros_like(values)

    return trend, seasonal, resid


def augment_series(values):
    trend, seasonal, resid = decompose_series(values)
    std = np.std(resid) if np.std(resid) > 0 else 1.0
    noise = np.random.normal(0, JITTER_STD_MULT * std, size=len(values))
    return trend + seasonal + resid + noise


def augment_training_data(train_df):
    augmented = [train_df.copy()]

    for plant, g in train_df.groupby("series_id"):
        y = g[TARGET_COL].values

        for k in range(AUG_SAMPLES_PER_SERIES):
            aug = g.copy()
            aug[TARGET_COL] = augment_series(y)
            aug["series_id"] = f"{plant}_aug{k+1}"
            augmented.append(aug)

    augmented_df = pd.concat(augmented, ignore_index=True)

    print("\n[AUGMENTATION]")
    print("Original rows :", len(train_df))
    print("Augmented rows:", len(augmented_df))

    return augmented_df


# ============================================================
# TRAIN & FORECAST TFT
# ============================================================
def run_tft(df):

    import pytorch_lightning as pl
    from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
    from pytorch_forecasting.data import GroupNormalizer
    from pytorch_lightning.callbacks import Callback

    # ---- TRAIN / TEST SPLIT ---- #
    train_df = df[df[DATETIME_COL] < MIN_DATE_FOR_TEST].copy()
    test_df  = df[df[DATETIME_COL] >= MIN_DATE_FOR_TEST].copy()

    print(f"\nPlants = {df[PLANT_COL].nunique()}")
    print(f"Train rows = {len(train_df)} | Test rows = {len(test_df)}")

    # ---- APPLY AUGMENTATION (TRAIN ONLY) ---- #
    train_df = augment_training_data(train_df)

    # ---- TFT DATASET ---- #
    training = TimeSeriesDataSet(
        train_df,
        time_idx="time_idx",
        target=TARGET_COL,
        group_ids=["series_id"],
        max_encoder_length=ENCODER_LENGTH,
        min_encoder_length=1,
        max_prediction_length=1,
        min_prediction_length=1,
        static_categoricals=["series_id"],
        time_varying_known_reals=["time_idx"],
        time_varying_unknown_reals=[TARGET_COL],
        target_normalizer=GroupNormalizer(groups=["series_id"]),
        allow_missing_timesteps=True,
    )

    train_loader = training.to_dataloader(
        train=True, batch_size=BATCH_SIZE
    )

    # ---- MODEL ---- #
    model = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=LR,
        hidden_size=64,
        hidden_continuous_size=32,
        attention_head_size=4,
        dropout=0.2,
    )

    # ---- LOGGING ---- #
    loss_hist = []

    class LossLog(Callback):
        def on_train_epoch_end(self, trainer, pl_module):
            l = trainer.callback_metrics.get("train_loss_epoch")
            if l:
                loss_hist.append(float(l))
                print(
                    f"Epoch {trainer.current_epoch+1}/{EPOCHS} | Loss={float(l):.4f}"
                )

    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        accelerator="cpu",
        enable_progress_bar=False,
        callbacks=[LossLog()],
    )

    print("\n🚀 Training TFT...")
    trainer.fit(model, train_loader)
    print("🎉 Training complete")

    # ---- LOSS CURVE ---- #
    plt.figure(figsize=(6,4))
    plt.plot(loss_hist, marker="o")
    plt.title("TFT Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.grid()
    plt.show()

    # ========================================================
    # ROLLING FORECAST (PER PLANT)
    # ========================================================
    forecast_rows = []

    for plant, plant_data in test_df.groupby("series_id"):
        history = df[df.series_id == plant].copy()
        horizon = len(plant_data)

        preds = []
        for i in range(horizon):
            ds = TimeSeriesDataSet.from_dataset(
                training,
                history,
                predict=True,
                stop_randomization=True,
            )

            y = model.predict(ds).detach().cpu().numpy()[-1][0]
            preds.append(y)

            history = history.append(
                {
                    DATETIME_COL: plant_data.iloc[i][DATETIME_COL],
                    "time_idx": history["time_idx"].max() + 1,
                    "series_id": plant,
                    TARGET_COL: y,
                },
                ignore_index=True,
            )

        plant_data = plant_data.assign(y_pred=preds)
        forecast_rows.append(plant_data)

    full_forecast = pd.concat(forecast_rows)

    # ========================================================
    # EVALUATION (GLOBAL AGGREGATE)
    # ========================================================
    actual_sum = full_forecast.groupby(DATETIME_COL)[TARGET_COL].sum()
    pred_sum   = full_forecast.groupby(DATETIME_COL)["y_pred"].sum()

    FINAL_MAPE = mape(actual_sum, pred_sum)
    FINAL_MAE  = mean_absolute_error(actual_sum, pred_sum)
    FINAL_R2   = r2_score(actual_sum, pred_sum)

    print("\n📊 GLOBAL RESULTS (SUMMED MONTHLY)")
    print(f"MAPE: {FINAL_MAPE:.3f}")
    print(f"MAE : {FINAL_MAE:.3f}")
    print(f"R2  : {FINAL_R2:.3f}")

    return train_df, full_forecast, MIN_DATE_FOR_TEST, actual_sum, pred_sum


# ============================================================
# PLOT RESULTS
# ============================================================
def plot_results(train_df, actual_sum, pred_sum, split_date):
    train_actual = train_df.groupby(DATETIME_COL)[TARGET_COL].sum()
    test_actual = actual_sum[actual_sum.index >= split_date]
    test_pred = pred_sum[pred_sum.index >= split_date]

    plt.figure(figsize=(14, 5))
    plt.plot(train_actual.index, train_actual.values, label="Train Actual", linewidth=2)
    plt.plot(test_actual.index, test_actual.values, label="Test Actual", linewidth=2)
    plt.plot(test_pred.index, test_pred.values, "--", label="Test Forecast", linewidth=2)
    plt.axvline(split_date, color="black", linestyle=":", linewidth=2)

    plt.title("TFT Forecast with Decomposition-Aware Augmentation")
    plt.xlabel("Date")
    plt.ylabel("Summed Scope1_per_unit")
    plt.legend()
    plt.tight_layout()
    plt.show()


# ============================================================
# MAIN
# ============================================================
def main():
    df = load_and_prep(CSV_PATH)
    train, forecast, split, actual, pred = run_tft(df)
    plot_results(train, actual, pred, split)


if __name__ == "__main__":
    main()


# Temporal MLP

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import matplotlib.pyplot as plt
import random


df_final=pd.read_csv("df_final.csv")
df_final.info()

In [None]:
# ===============================
# 0. CONFIG
# ===============================


RANDOM_SEED   = 42
DATE_COL      = "start_time"
TARGET_COL    = "emissions_quantity"
TEST_MONTHS   = 17
LOOKBACK      = 12




FEATURE_COLS = [
    'activity',
    # 'capacity',
    # 'capacity_factor',
]

# ===============================
# 1. REPRODUCIBILITY
# ===============================
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# ===============================
# 2. AGGREGATE ACROSS PLANTS
# ===============================
df = df_final.copy()
df[DATE_COL] = pd.to_datetime(df[DATE_COL])

agg_df = (
    df
    .groupby(DATE_COL)[FEATURE_COLS + [TARGET_COL]]
    .sum()
    .reset_index()
    .sort_values(DATE_COL)
    .reset_index(drop=True)
)
print(agg_df.info())
# ===============================
# 3. TRAIN / TEST SPLIT (TIME)
# ===============================
split_date = agg_df[DATE_COL].iloc[-TEST_MONTHS]

train_df = agg_df[agg_df[DATE_COL] < split_date]
test_df  = agg_df[agg_df[DATE_COL] >= split_date]

# ===============================
# 4. SCALING
# ===============================
x_scaler = StandardScaler()
y_scaler = StandardScaler()

train_df[FEATURE_COLS] = x_scaler.fit_transform(train_df[FEATURE_COLS])
test_df[FEATURE_COLS]  = x_scaler.transform(test_df[FEATURE_COLS])

train_df[[TARGET_COL]] = y_scaler.fit_transform(train_df[[TARGET_COL]])
test_df[[TARGET_COL]]  = y_scaler.transform(test_df[[TARGET_COL]])

# ===============================
# 5. ROLLING MONTHLY WINDOWS
# ===============================
def create_windows(df, lookback):
    X, y, dates = [], [], []
    values_X = df[FEATURE_COLS].values
    values_y = df[TARGET_COL].values
    dates_all = df[DATE_COL].values

    for i in range(len(df) - lookback):
        X.append(values_X[i:i+lookback])
        y.append(values_y[i+lookback])
        dates.append(dates_all[i+lookback])

    return np.array(X), np.array(y).reshape(-1, 1), np.array(dates)

X_train, y_train, _ = create_windows(train_df, LOOKBACK)
X_test,  y_test,  test_dates = create_windows(
    pd.concat([train_df.tail(LOOKBACK), test_df]),
    LOOKBACK
)

# ===============================
# 6. DATASET
# ===============================
class FTMLPDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(FTMLPDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader  = DataLoader(FTMLPDataset(X_test,  y_test),  batch_size=32, shuffle=False)
# ===============================
# 7. FTMLP MODEL
# ===============================
class FeatureTemporalBlock(nn.Module):
    def __init__(self, num_features, seq_len, hidden_dim=32):
        super().__init__()
        self.feature_mlp = nn.Sequential(
            nn.Linear(num_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_features)
        )
        self.temporal_mlp = nn.Sequential(
            nn.Linear(seq_len, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, seq_len)
        )
        self.norm1 = nn.LayerNorm(num_features)
        self.norm2 = nn.LayerNorm(num_features)

    def forward(self, x):
        x = x + self.feature_mlp(self.norm1(x))
        y = self.norm2(x).transpose(1, 2)
        y = self.temporal_mlp(y).transpose(1, 2)
        return x + y

class FTMLP(nn.Module):
    def __init__(self, num_features, seq_len, n_blocks=3):
        super().__init__()
        self.blocks = nn.ModuleList(
            [FeatureTemporalBlock(num_features, seq_len) for _ in range(n_blocks)]
        )
        self.head = nn.Sequential(
            nn.Linear(num_features * seq_len, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return self.head(x.reshape(x.size(0), -1))

# ===============================
# 8. TRAINING
# ===============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FTMLP(len(FEATURE_COLS), LOOKBACK).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.MSELoss()


EPOCHS = 300
for epoch in range(EPOCHS):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 20 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {loss.item():.4f}")

# ===============================
# 9. PREDICTION & METRICS
# ===============================
model.eval()
with torch.no_grad():
    test_preds = []
    for xb, _ in test_loader:
        test_preds.append(model(xb.to(device)).cpu().numpy())

test_preds = np.vstack(test_preds)

y_test_inv = y_scaler.inverse_transform(y_test)
test_preds_inv = y_scaler.inverse_transform(test_preds)

print("TEST METRICS:")
print("MAPE:", mean_absolute_percentage_error(y_test_inv, test_preds_inv)*100)

# ===============================
# 10. PLOT (MONTHLY FORECAST)
# ===============================
plt.figure(figsize=(12,5))
plt.plot(agg_df[DATE_COL], agg_df[TARGET_COL], label="Actual", alpha=0.6)
plt.plot(test_dates, test_preds_inv.flatten(), "--", label="FTMLP Prediction")
plt.axvline(split_date, linestyle=":", color="black", label="Train/Test Split")
plt.legend()
plt.title("Rolling Monthly Forecast – Aggregated Scope-1 Emissions")
plt.xlabel("Datetime")
plt.ylabel("Scope1_per_unit")
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")
baseline_pred = test_preds_inv.mean()
importance = {}

for i, feat in enumerate(FEATURE_COLS):
    X_perturbed = X_test.copy()
    X_perturbed[:, :, i] = 0  # zero-out feature

    with torch.no_grad():
        preds = model(torch.tensor(X_perturbed, dtype=torch.float32).to(device))
        preds = y_scaler.inverse_transform(preds.cpu().numpy())

    importance[feat] = abs(baseline_pred - preds.mean())

pd.Series(importance).sort_values().plot(kind="barh", title="Feature Importance (Perturbation)")
plt.tight_layout()
plt.show()


In [None]:
# ===============================
# 0. CONFIG
# ===============================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
import random

RANDOM_SEED   = 42
DATE_COL      = "start_time"
TARGET_COL    = "emissions_quantity"
TEST_MONTHS   = 17
LOOKBACK      = 12

# ===============================
# 1. REPRODUCIBILITY
# ===============================
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# ===============================
# 2. AGGREGATE ACROSS PLANTS
# ===============================
df = df_final.copy()
df[DATE_COL] = pd.to_datetime(df[DATE_COL])

agg_df = (
    df
    .groupby(DATE_COL)[[TARGET_COL]]
    .sum()
    .reset_index()
    .sort_values(DATE_COL)
    .reset_index(drop=True)
)

# ===============================
# 3. TRAIN / TEST SPLIT (TIME)
# ===============================
split_date = agg_df[DATE_COL].iloc[-TEST_MONTHS]

train_df = agg_df[agg_df[DATE_COL] < split_date]
test_df  = agg_df[agg_df[DATE_COL] >= split_date]

# ===============================
# 4. SCALING (TARGET ONLY)
# ===============================
y_scaler = StandardScaler()

train_df[[TARGET_COL]] = y_scaler.fit_transform(train_df[[TARGET_COL]])
test_df[[TARGET_COL]]  = y_scaler.transform(test_df[[TARGET_COL]])

# ===============================
# 5. ROLLING TEMPORAL WINDOWS
# ===============================
def create_windows(df, lookback):
    X, y, dates = [], [], []
    values = df[TARGET_COL].values
    dates_all = df[DATE_COL].values

    for i in range(len(df) - lookback):
        X.append(values[i:i+lookback])
        y.append(values[i+lookback])
        dates.append(dates_all[i+lookback])

    return (
        np.array(X).reshape(-1, lookback, 1),
        np.array(y).reshape(-1, 1),
        np.array(dates)
    )

X_train, y_train, _ = create_windows(train_df, LOOKBACK)
X_test,  y_test,  test_dates = create_windows(
    pd.concat([train_df.tail(LOOKBACK), test_df]),
    LOOKBACK
)

# ===============================
# 6. DATASET
# ===============================
class TemporalDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(TemporalDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader  = DataLoader(TemporalDataset(X_test,  y_test),  batch_size=32, shuffle=False)

# ===============================
# 7. TEMPORAL-ONLY MLP MODEL
# ===============================
class TemporalMLP(nn.Module):
    def __init__(self, seq_len, hidden_dim=32, n_layers=3):
        super().__init__()
        layers = []
        in_dim = seq_len

        for _ in range(n_layers):
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(nn.GELU())
            in_dim = hidden_dim

        layers.append(nn.Linear(hidden_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # x: (B, T, 1) → (B, T)
        x = x.squeeze(-1)
        return self.net(x)

# ===============================
# 8. TRAINING
# ===============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TemporalMLP(seq_len=LOOKBACK).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

EPOCHS = 100
for epoch in range(EPOCHS):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 20 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {loss.item():.4f}")

# ===============================
# 9. PREDICTION & METRICS
# ===============================
model.eval()
with torch.no_grad():
    preds = []
    for xb, _ in test_loader:
        preds.append(model(xb.to(device)).cpu().numpy())

preds = np.vstack(preds)

y_test_inv  = y_scaler.inverse_transform(y_test)
preds_inv   = y_scaler.inverse_transform(preds)

print("TEST MAPE (Temporal-only):",
      mean_absolute_percentage_error(y_test_inv, preds_inv)*100)

# ===============================
# 10. PLOT
# ===============================
plt.figure(figsize=(12,5))
plt.plot(agg_df[DATE_COL], agg_df[TARGET_COL], label="Actual", alpha=0.6)
plt.plot(test_dates, preds_inv.flatten(), "--", label="Temporal-only MLP")
plt.axvline(split_date, linestyle=":", color="black", label="Train/Test Split")
plt.legend()
plt.title("Rolling Monthly Forecast – Temporal-Only Baseline")
plt.xlabel("Datetime")
plt.ylabel("Scope1_per_unit")
plt.tight_layout()
plt.show()


In [None]:
# ===============================
# 0. CONFIG
# ===============================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
import random

RANDOM_SEED   = 42
DATE_COL      = "datetime"
TARGET_COL    = "Scope1_per_unit"
TEST_MONTHS   = 7
LOOKBACK      = 12
EPOCHS        = 80
BATCH_SIZE    = 32
LR            = 1e-3


RANDOM_SEED   = 42
DATE_COL      = "start_time"
TARGET_COL    = "emissions_quantity"
TEST_MONTHS   = 17
LOOKBACK      = 12




FEATURE_COLS = [
    'activity',
    # 'capacity',
    # 'capacity_factor',
]
# ===============================
# 1. REPRODUCIBILITY
# ===============================
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)

# ===============================
# 2. DATA PREPARATION
# ===============================
df = df_final.copy()
df[DATE_COL] = pd.to_datetime(df[DATE_COL])

agg_df = (
    df.groupby(DATE_COL)[FEATURE_COLS + [TARGET_COL]]
      .sum()
      .reset_index()
      .sort_values(DATE_COL)
      .reset_index(drop=True)
)

split_date = agg_df[DATE_COL].iloc[-TEST_MONTHS]
train_df = agg_df[agg_df[DATE_COL] < split_date]
test_df  = agg_df[agg_df[DATE_COL] >= split_date]

x_scaler = StandardScaler()
y_scaler = StandardScaler()

train_df[FEATURE_COLS] = x_scaler.fit_transform(train_df[FEATURE_COLS])
test_df[FEATURE_COLS]  = x_scaler.transform(test_df[FEATURE_COLS])

train_df[[TARGET_COL]] = y_scaler.fit_transform(train_df[[TARGET_COL]])
test_df[[TARGET_COL]]  = y_scaler.transform(test_df[[TARGET_COL]])

def create_windows(df, lookback):
    X, y = [], []
    values_X = df[FEATURE_COLS].values
    values_y = df[TARGET_COL].values

    for i in range(len(df) - lookback):
        X.append(values_X[i:i+lookback])
        y.append(values_y[i+lookback])

    return np.array(X), np.array(y).reshape(-1, 1)

X_train, y_train = create_windows(train_df, LOOKBACK)
X_test,  y_test  = create_windows(
    pd.concat([train_df.tail(LOOKBACK), test_df]),
    LOOKBACK
)

class FTMLPDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(
    FTMLPDataset(X_train, y_train),
    batch_size=BATCH_SIZE, shuffle=True
)

test_loader = DataLoader(
    FTMLPDataset(X_test, y_test),
    batch_size=BATCH_SIZE, shuffle=False
)

# ===============================
# 3. MODEL FACTORY
# ===============================
def get_activation(name):
    return {
        "silu": nn.SiLU(),
        "elu": nn.ELU(),
        "prelu": nn.PReLU(),
        "leaky_relu": nn.LeakyReLU(0.1),
        "tanh": nn.Tanh(),
        "sigmoid": nn.Sigmoid(),
        "relu": nn.ReLU(),
        "gelu": nn.GELU()
    }[name]

def build_mlp(in_dim, out_dim, hidden_dim, depth, activation):
    layers = []
    dim = in_dim
    for _ in range(depth):
        layers.append(nn.Linear(dim, hidden_dim))
        layers.append(activation)
        dim = hidden_dim
    layers.append(nn.Linear(dim, out_dim))
    return nn.Sequential(*layers)

def build_mlp2(seq_len, seq_len_out, hidden_dim, depth, activation):
    layers = []
    for _ in range(depth):
        layers.append(nn.Linear(seq_len, hidden_dim))
        layers.append(activation)
        seq_len = hidden_dim
    layers.append(nn.Linear(seq_len, seq_len_out))
    return nn.Sequential(*layers)

class FeatureTemporalBlock(nn.Module):
    def __init__(self, num_features, seq_len,
                 hidden_dim, depth_feature,depth_temporal,
                 activation, use_ln):
        super().__init__()

        self.use_ln = use_ln
        self.ln1 = nn.LayerNorm(num_features)
        self.ln2 = nn.LayerNorm(num_features)

        self.feature_mlp = build_mlp(
            num_features, num_features,
            hidden_dim, depth_feature, activation
        )

        self.temporal_mlp = build_mlp2(
            seq_len, seq_len,
            hidden_dim, depth_temporal, activation
        )

    def forward(self, x):
        if self.use_ln:
            x = x + self.feature_mlp(self.ln1(x))
            y = self.ln2(x).transpose(1, 2)
        else:
            x = x + self.feature_mlp(x)
            y = x.transpose(1, 2)

        y = self.temporal_mlp(y).transpose(1, 2)
        return x + y

class FTMLP(nn.Module):
    def __init__(self, num_features, seq_len,
                 n_blocks, hidden_dim,
                 depth_feature, depth_temporal, activation, use_ln):
        super().__init__()

        self.blocks = nn.ModuleList([
            FeatureTemporalBlock(
                num_features, seq_len,
                hidden_dim, depth_feature, depth_temporal,
                activation, use_ln
            ) for _ in range(n_blocks)
        ])

        self.head = nn.Sequential(
            nn.Linear(num_features * seq_len, 64),
            activation,
            nn.Linear(64, 1)
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return self.head(x.reshape(x.size(0), -1))

# ===============================
# 4. ABLATION STUDY
# ===============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

activations = ["silu", "elu", "prelu", "leaky_relu", "tanh", "sigmoid","relu","gelu"]
layer_norms = [True,False]
depth_feature_      = [1,2,3,4]
depth_temporal_    = [1,2,3,4]
results = []

for act_name in activations:
    for use_ln in layer_norms:
        for depth_feature in depth_feature_:
            for depth_temporal in depth_temporal_:

                set_seed(RANDOM_SEED)

                model = FTMLP(
                    num_features=len(FEATURE_COLS),
                    seq_len=LOOKBACK,
                    n_blocks=3,
                    hidden_dim=32,
                    depth_feature=depth_feature,
                    depth_temporal=depth_temporal,
                    activation=get_activation(act_name),
                    use_ln=use_ln
                ).to(device)

                optimizer = torch.optim.Adam(model.parameters(), lr=LR)
                criterion = nn.MSELoss()

                # ---- Train ----
                model.train()
                for _ in range(EPOCHS):
                    for xb, yb in train_loader:
                        xb, yb = xb.to(device), yb.to(device)
                        optimizer.zero_grad()
                        loss = criterion(model(xb), yb)
                        loss.backward()
                        optimizer.step()

                # ---- Evaluate ----
                model.eval()
                preds = []
                with torch.no_grad():
                    for xb, _ in test_loader:
                        preds.append(model(xb.to(device)).cpu().numpy())

                preds = np.vstack(preds)
                y_inv = y_scaler.inverse_transform(y_test)
                p_inv = y_scaler.inverse_transform(preds)

                mape = mean_absolute_percentage_error(y_inv, p_inv)*100

                results.append({
                    "activation": act_name,
                    "layer_norm": use_ln,
                    "feature_depth": depth_feature,
                    "temporal_depth": depth_temporal,
                    "MAPE": round(mape, 4)
                })

                print(f"Done | act={act_name}, LN={use_ln}, feature_depth={depth_feature}, temporal_depth={depth_temporal}, MAPE={mape:.4f}")

# ===============================
# 5. RESULTS TABLE
# ===============================
results_df = pd.DataFrame(results).sort_values("MAPE")
print("\n=== Ablation Results ===")
print(results_df)


In [None]:
results_df.to_csv("ablation_MLP_Open.csv", index=False)
