In [None]:
# !pip uninstall keras -y
# !pip uninstall pytorch-lightning -y
# !pip install numpy==1.26.4
# !pip install pandas==2.1.4
# !pip install matplotlib==3.7.2
# !pip install -U scikit-learn
# !pip install torch --upgrade
# !pip install torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# !pip install lightning==2.1.0
# !pip install pytorch-forecasting==0.10.3


In [None]:
# !pip list

# Import Data

In [None]:
import pandas as pd

# Source workbook and the columns that are discarded right after loading.
file_path = "Sustainable Data.xlsx"   # replace with your file path
# NOTE(review): the first name has no closing parenthesis — confirm it matches the sheet header exactly.
cols_to_drop = ['FoodwastePermittedLimit(MT','Percentage difference']

df = pd.read_excel(file_path)

# Build one timestamp per row from the year and month columns; the day is pinned to the 7th.
df["datetime"] = pd.to_datetime(
    df["year"].astype(str).str.cat(df["month"].astype(str), sep="-") + "-07"
)

df = df.drop(cols_to_drop, axis=1)
df.info()


In [None]:
def nullCols(df,x):
    """Report the columns of ``df`` that contain more than ``x`` missing values.

    Prints a header, displays the per-column missing-value counts (largest
    first) for the columns above the threshold, then prints the index holding
    those column names. Returns ``None``.
    """
    print("--- Count of Null Values in Each Column ---")
    missing_per_column = df.isna().sum()
    above_threshold = missing_per_column.loc[missing_per_column.gt(x)]
    # `display` is the notebook's rich-output helper (not imported in this file).
    display(above_threshold.sort_values(ascending=False))
    print(above_threshold.index)

def nullColsVis(df):
    """Print and plot, per year, the columns that are entirely null or entirely zero.

    ``df`` is grouped by its ``"year"`` column. For each year the function
    prints how many columns contain only missing values or only zeros (and
    which ones), then draws a bar chart of that count per year.
    """
    import pandas as pd
    import matplotlib.pyplot as plt

    def get_null_or_zero_cols(group):
        # A column is flagged when every value in the year is NaN, or every value equals 0.
        entirely_null = group.isna().all()
        entirely_zero = (group == 0).all()
        flagged = group.columns[entirely_null | entirely_zero].tolist()
        return pd.Series({"num_columns": len(flagged), "columns": flagged})

    # One summary row per year.
    col_info_by_year = df.groupby("year").apply(get_null_or_zero_cols).reset_index()

    # Text report.
    for _, row in col_info_by_year.iterrows():
        print(f"\nYear: {row['year']}")
        print(f"Number of completely null/zero columns: {row['num_columns']}")
        print(f"Columns: {row['columns']}")

    # Bar chart of the flagged-column count per year.
    plt.figure(figsize=(8,5))
    plt.bar(col_info_by_year["year"], col_info_by_year["num_columns"])
    plt.xlabel("Year")
    plt.ylabel("Columns completely null or 0")
    plt.title("Completely null/zero columns per year")
    plt.show()

def nullZero(df):
    """Print the names of the columns of ``df`` that contain at least one zero.

    A two-column summary (missing-value count, zero count) is built per column;
    only the index of the rows whose zero count is positive is printed.
    Returns ``None``.
    """
    summary = pd.DataFrame({
        "Null Count": df.isna().sum(),
        "Zero Count": (df == 0).sum(),
    })
    with_zeros = summary.loc[summary["Zero Count"].gt(0)]
    print(with_zeros.index)

In [None]:
# Columns kept for the plant-level analysis that follows.
col=['month', 'Current Date', 'Plant Name', 'Plant Name: organisation',
       'quarter', 'year', 'financialYear', 'Scope1', 'Scope_2',
       'totalEnergyPerUnit(GJ)', 'TotalCO2emission(MT)',
       'Electricity Grid Energy Per Unit (GJ)',
       'Electricity Grid TCO2 Emission',
       'Production Actual Quantity (MT/Month)', 'totalWaterConsumption',
       'Attachments', 'Plant Location', 'gridEmissionTesting',
       'Plant Name: category', 'datetime']
# Take an explicit copy: later cells assign columns on dfAll, and assigning into a
# bare column selection of df triggers pandas' SettingWithCopyWarning.
dfAll=df[col].copy()

In [None]:
# Fill missing production figures with the mean of the column's non-missing values.
# Note: this rebinds `col` from the column list defined earlier to a single column name.
col = "Production Actual Quantity (MT/Month)"
dfAll[col] = dfAll[col].fillna(dfAll[col].mean())

In [None]:
# Diagnostics: columns with at least one missing value, then columns containing zeros.
nullCols(dfAll,0)
nullZero(dfAll)

In [None]:
import numpy as np
# Columns in which a zero is treated as a missing reading.
col2=['Scope1', 'Scope_2', 'totalEnergyPerUnit(GJ)', 'TotalCO2emission(MT)',
       'Electricity Grid Energy Per Unit (GJ)',
       'Electricity Grid TCO2 Emission',
       'Production Actual Quantity (MT/Month)', 'totalWaterConsumption',
       'Attachments', 'gridEmissionTesting']
for col in col2:
    # Turn zeros into NaN first, then take the mean, so the fill value is the mean of
    # the real readings only. Taking the mean before the replacement lets the zeros
    # being imputed drag the fill value down.
    without_zeros = dfAll[col].replace(0, np.nan)
    dfAll[col] = without_zeros.fillna(without_zeros.mean())

In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

# ===== 1) CONFIG =====
DATE_COL = "datetime"                       # timestamp column; unparseable rows are rebuilt from year & month
TARGET_COL = "TotalCO2emission(MT)"             # the series to forecast
PLANT_COL = "Plant Name"                        # per-plant option
FORECAST_MONTHS = 6                            # horizon
TEST_MONTHS = 12                                 # last N months as test

# ===== 2) START FROM dfAll =====
# Work on a copy so the cleaning below does not modify dfAll.
df = dfAll.copy()

# ----- 2a) Ensure a proper datetime column 'ds' -----
if DATE_COL in df.columns:
    # Values that cannot be parsed become NaT and are rebuilt below.
    # (`infer_datetime_format` is deprecated in pandas 2.x and has no effect there, so it is not passed.)
    ds = pd.to_datetime(df[DATE_COL], errors="coerce", dayfirst=True)
else:
    ds = pd.Series(pd.NaT, index=df.index)

# Rebuild any missing timestamps from year + month (day set to 1)
needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(
        dict(year=df.loc[needs_rebuild, "year"],
             month=df.loc[needs_rebuild, "month"],
             day=1)
    )
    ds.loc[needs_rebuild] = ds_rebuilt

df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()  # normalize to month-start

# ----- 2b) Basic cleaning: drop rows repeating the same month, plant and target value -----
df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

# ----- 2c) Numeric columns that the imputation step below operates on -----
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with zeros and NaNs in the numeric columns mean-filled.

    For every column named in the module-level ``num_cols`` list, zeros are
    turned into NaN and the NaNs are filled with the mean of the remaining
    values of that column within ``group``. A column with no remaining values
    is left as all-NaN. ``group`` itself is not modified.
    """
    filled = group.copy()
    for name in num_cols:
        values = filled[name].replace(0, np.nan)
        if values.notna().any():
            values = values.fillna(values.mean())
        filled[name] = values
    return filled

# Impute year by year when a "year" column exists; otherwise treat the whole frame as one group.
if "year" not in df.columns:
    df = impute_zero_nan_with_mean(df)
else:
    df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)

# A column that was all zero/NaN within a year is still NaN there: fall back to the overall mean.
for c in num_cols:
    if not df[c].isna().any():
        continue
    df[c] = df[c].fillna(df[c].mean())

# ===== 3) TWO PATHS: (A) OVERALL monthly forecast across all plants, (B) PER-PLANT forecasts =====

# ---------- A) OVERALL SERIES ----------
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Collapse ``df_in`` to one row per ``ds`` by summing the target column.

    Returns a frame with columns ``ds`` and ``y`` (the summed module-level
    ``TARGET_COL``), ordered by ``ds``.
    """
    monthly_totals = df_in.groupby("ds", as_index=False)[TARGET_COL].sum()
    monthly_totals = monthly_totals.sort_values("ds")
    return monthly_totals.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df)

# Train / Test split: the last TEST_MONTHS rows form the test set.
overall = overall.sort_values("ds").reset_index(drop=True)
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()  # empty when there are not more than TEST_MONTHS rows

# Baseline Prophet settings (no extra regressors), shared by the evaluation and final models.
overall_model_kwargs = dict(seasonality_mode="additive", yearly_seasonality=True,
                            weekly_seasonality=False, daily_seasonality=False)

# Evaluation model: fitted on the training rows only.
m_overall = Prophet(**overall_model_kwargs)
m_overall.fit(train_overall)

# Fitted values over the training dates plus predictions across the test window.
future_cutoff = train_overall["ds"].max()
future_all = m_overall.make_future_dataframe(periods=max(TEST_MONTHS, 0), freq="MS")
fcst_all = m_overall.predict(future_all)

# Evaluate on the test rows whose dates also appear in the prediction frame.
if not test_overall.empty:
    y_pred = (fcst_all[["ds", "yhat"]]
              .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    # No overlapping dates means there is nothing to score; the metric rejects empty input.
    if y_pred.empty:
        mape_overall = np.nan
    else:
        mape_overall = mean_absolute_percentage_error(y_pred["y"], y_pred["yhat"])
else:
    mape_overall = np.nan

# Forward forecast beyond the full available data.
# The evaluation model's future frame starts right after the last TRAINING month, so keeping
# only dates after the last OBSERVED month left nothing whenever FORECAST_MONTHS <= TEST_MONTHS.
# Fit a second model on the complete series instead. A Prophet instance can be fitted only once.
if test_overall.empty:
    m_overall_full = m_overall  # the training set already is the complete series
else:
    m_overall_full = Prophet(**overall_model_kwargs)
    m_overall_full.fit(overall)

future_12 = m_overall_full.make_future_dataframe(periods=FORECAST_MONTHS, freq="MS")
forecast_12 = m_overall_full.predict(future_12).loc[:, ["ds", "yhat", "yhat_lower", "yhat_upper"]]
forecast_12 = forecast_12[forecast_12["ds"] > overall["ds"].max()].reset_index(drop=True)

print("=== OVERALL SERIES ===")
print(f"Training months: {len(train_overall)} | Test months: {len(test_overall)}")
print(f"Overall Test MAPE: {mape_overall:.3f}" if not np.isnan(mape_overall) else "Overall Test MAPE: N/A (not enough test months)")
print(f"\nNext {FORECAST_MONTHS} months forecast (overall):")
print(forecast_12)

# ---------- B) PER-PLANT SERIES (optional) ----------
# Build one model per plant; returns a dict of {plant: (mape, forecast_df)}
def forecast_per_plant(df_in: pd.DataFrame, test_months: int = 3, horizon: int = 12):
    """Evaluate and forecast the target series separately for every plant.

    ``df_in`` is summed per (plant, ``ds``) using the module-level ``PLANT_COL``
    and ``TARGET_COL``. For each plant with at least 6 monthly rows:

    * the last ``test_months`` rows are held out, a Prophet model is fitted on
      the rest and scored with MAPE on the held-out dates;
    * a second model is fitted on all of the plant's rows and used to forecast
      ``horizon`` months past the plant's last observed month. (Forecasting
      from the evaluation model started right after the last *training* month,
      so the returned frame was short by the test window, or empty.)

    Args:
        df_in: frame containing ``PLANT_COL``, ``ds`` and ``TARGET_COL``.
        test_months: number of trailing rows per plant held out for scoring.
        horizon: number of future months to forecast.

    Returns:
        dict mapping plant -> ``(mape, forecast_df)``. ``mape`` is NaN when the
        plant has no held-out rows, fewer than 2 training rows, or no date
        overlap between predictions and held-out rows. Plants with fewer than
        6 rows map to ``(nan, empty DataFrame)``.
    """
    def _new_model():
        # A Prophet instance can be fitted only once, so every fit gets a fresh one.
        return Prophet(seasonality_mode="additive", yearly_seasonality=True,
                       weekly_seasonality=False, daily_seasonality=False)

    out = {}
    g = (df_in.groupby([PLANT_COL, "ds"], as_index=False)[TARGET_COL]
         .sum()
         .rename(columns={TARGET_COL: "y"}))

    for plant, gdf in g.groupby(PLANT_COL):
        gdf = gdf.sort_values("ds").reset_index(drop=True)
        if len(gdf) < 6:
            out[plant] = (np.nan, pd.DataFrame())
            continue

        split = len(gdf) - test_months if len(gdf) > test_months else len(gdf)
        train = gdf.iloc[:split].copy()
        test  = gdf.iloc[split:].copy()

        # --- Hold-out evaluation ---
        mape = np.nan
        # Prophet refuses to fit fewer than 2 rows; skip scoring instead of aborting the whole run.
        if not test.empty and len(train) >= 2:
            m = _new_model()
            m.fit(train)

            future_all = m.make_future_dataframe(periods=max(test_months, 0), freq="MS")
            fcst_all  = m.predict(future_all)

            # Score only the dates present in both predictions and held-out rows.
            y_pred = (fcst_all[["ds", "yhat"]]
                      .merge(test[["ds", "y"]], on="ds", how="inner"))
            if not y_pred.empty:
                mape = mean_absolute_percentage_error(y_pred["y"], y_pred["yhat"])

        # --- Forward forecast from a model fitted on every observed month ---
        m_full = _new_model()
        m_full.fit(gdf)
        future_h = m_full.make_future_dataframe(periods=horizon, freq="MS")
        fcst_h = m_full.predict(future_h)[["ds", "yhat", "yhat_lower", "yhat_upper"]]
        fcst_h = fcst_h[fcst_h["ds"] > gdf["ds"].max()].reset_index(drop=True)

        out[plant] = (mape, fcst_h)

    return out


# Per-plant models (optional; comment this block out if only the overall forecast is needed).
per_plant_results = forecast_per_plant(df, test_months=TEST_MONTHS, horizon=FORECAST_MONTHS)

# One row per plant with its hold-out MAPE, best (lowest) first and NaN at the end.
mape_rows = [(name, result[0]) for name, result in per_plant_results.items()]
per_plant_mape = pd.DataFrame(mape_rows, columns=[PLANT_COL, "Test MAPE"])
per_plant_mape = per_plant_mape.sort_values("Test MAPE", na_position="last")
print("\n=== PER-PLANT MAPE (lower is better) ===")
print(per_plant_mape.head(20))  # top 20; adjust as needed




In [None]:
import matplotlib.pyplot as plt

# Actual values only: the training span and the held-out test span of the overall series.
# (The chart with the Prophet forecast overlaid is drawn in the following cell.)
plt.figure(figsize=(12,6))

# A) Train actuals
plt.plot(train_overall['ds'], train_overall['y'],
         label="Train (Actual)", linewidth=2, color="blue")

# B) Test actuals
plt.plot(test_overall['ds'], test_overall['y'],
         label="Test (Actual)", linewidth=2, color="green")

# Titles and labels (no forecast is drawn here, so the title does not mention one)
plt.title("Actual Train vs Test (Overall Series)", fontsize=14)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Total CO2 Emissions (MT)", fontsize=12)

plt.legend()
plt.grid(alpha=0.3)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Pair every predicted date with the observed value for that date (NaN where none exists).
eval_df = fcst_all[["ds", "yhat"]].merge(overall[["ds", "y"]], on="ds", how="left")

fig, ax = plt.subplots(figsize=(12, 6))

# Observed values: training span, then held-out span.
ax.plot(train_overall['ds'], train_overall['y'],
        label="Train (Actual)", linewidth=2, color="blue")
ax.plot(test_overall['ds'], test_overall['y'],
        label="Test (Actual)", linewidth=2, color="green")

# Prophet output over the training dates and the test window.
ax.plot(eval_df['ds'], eval_df['yhat'],
        '--', label="Forecast (Prophet)", linewidth=2, color="red")

# Vertical marker at the first held-out month.
if not test_overall.empty:
    split_date = test_overall['ds'].min()
    ax.axvline(split_date, color='gray', linestyle='--',
               linewidth=2, label="Train/Test Split")

ax.set_title("Prophet Forecast vs Actual (Overall Series)", fontsize=14)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Total CO2 Emissions (MT)", fontsize=12)

ax.legend()
ax.grid(alpha=0.3)
plt.show()


# 2021

In [None]:
# Write the current `df` to CSV without the index column.
# NOTE(review): `df` is whatever the last executed cell assigned. After the forecasting cell
# above, that is the imputed copy of dfAll with the extra `ds` column; confirm this is the
# frame meant to be exported.
df.to_csv("Client_Oct_ProcessedFULL.csv", index=False)

In [None]:

# Keep the rows from 2021 onwards (note: despite its name, df_2022 includes 2021).
# Filter first and copy the result, so df_2022 is an independent frame: later cells assign
# columns on it, which on a derived slice raises SettingWithCopyWarning. This also avoids
# deep-copying rows that are discarded immediately.
df_2022 = df.loc[df['year'] >= 2021].copy()
print(df_2022.columns)

In [None]:
# Missing-value report restricted to rows from 2022 onwards.
temp = df_2022.loc[df_2022['year'] >= 2022]
print(temp.shape)
print("--- Count of Null Values in Each Column ---")

null_counts = temp.isna().sum()
columns_with_nulls = null_counts[null_counts > 0]
display(columns_with_nulls.sort_values(ascending=False))
print(columns_with_nulls.index.tolist())

In [None]:
# Fill missing production figures in df_2022 with the mean of the column's non-missing values.
col = "Production Actual Quantity (MT/Month)"
df_2022[col] = df_2022[col].fillna(df_2022[col].mean())

In [None]:
def nullZero(df):
    """Print the names of the columns of ``df`` that contain at least one zero.

    A per-column summary of missing-value and zero counts is built from the
    frame that is passed in, and the index of the columns whose zero count is
    positive is printed. Returns ``None``.

    The function must read its ``df`` argument: reading the global ``df_2022``
    instead reports on the wrong frame for any other argument and fails when
    that global does not exist.
    """
    null_counts = df.isna().sum()
    zero_counts = (df == 0).sum()
    counts = pd.DataFrame({
        "Null Count": null_counts,
        "Zero Count": zero_counts,
    })

    print(counts[counts['Zero Count']>0].index)

In [None]:
import numpy as np
# Columns in which a zero is treated as a missing reading.
col2=['Scope1', 'Scope_2', 'totalEnergyPerUnit(GJ)', 'TotalCO2emission(MT)',
       'Electricity Grid Energy Per Unit (GJ)',
       'Electricity Grid TCO2 Emission',
       'Production Actual Quantity (MT/Month)', 'totalWaterConsumption',
       'Attachments', 'gridEmissionTesting']
for col in col2:
    # Turn zeros into NaN first, then take the mean, so the fill value is the mean of
    # the real readings only rather than being pulled down by the zeros being replaced.
    without_zeros = df_2022[col].replace(0, np.nan)
    df_2022[col] = without_zeros.fillna(without_zeros.mean())


In [None]:
# Intensity features: emissions divided by the month's production quantity.
production = df_2022["Production Actual Quantity (MT/Month)"]
df_2022["Scope1_per_unit"] = df_2022["Scope1"].div(production)
df_2022["CO2_per_unit"] = df_2022["TotalCO2emission(MT)"].div(production)


df_2022.info()

# EDA

In [None]:
# Show the column names of df_2022, now including the per-unit features added above.
df_2022.columns

In [None]:
# Columns used for exploratory analysis: identifiers, location, raw emissions and per-unit ratios.
eda_columns = ['datetime','Plant Name: category','Plant Name: organisation','Plant Name','Plant Location',
               'Scope1','Scope_2','Production Actual Quantity (MT/Month)','TotalCO2emission(MT)',
               'Scope1_per_unit','CO2_per_unit']
eda = df_2022[eda_columns].copy(deep=True)
eda.info()

In [None]:
# For every object-dtype column: its name, its distinct values, and how many there are.
for col in eda.select_dtypes('object').columns:
    distinct_values = eda[col].unique()
    print(col)
    print(distinct_values)
    print(len(distinct_values))

In [None]:
# ===== 0) Imports =====
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Rebind `df` to a copy of the EDA subset. This replaces the frame the earlier
# forecasting cells stored under the same name.
df = eda.copy()  # your dataframe

# ===== 1) Basic Info =====
# Shape, column names, the first row, and summary statistics over all columns.
print("Data shape:", df.shape)
print("\nColumns:\n", df.columns)
print("\nSample data:\n", df.head(1))
print("\nSummary stats:\n", df.describe(include="all"))

In [None]:
# Show the column names of the EDA frame.
df.columns

In [None]:
# Number of rows per plant location.
df['Plant Location'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (18, 6)


def _annotate_bar_heights(ax):
    """Write each bar's integer height just above the bar on ``ax``."""
    for p in ax.patches:
        ax.annotate(int(p.get_height()),
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='bottom', fontsize=11, color='black', xytext=(0, 5),
                    textcoords='offset points')


# ===== 1) Unique plants: India vs outside India =====
df["India_Flag"] = df["Plant Location"].apply(lambda x: "India" if x=="India" else "Outside India")
plant_counts = (df.groupby("India_Flag")["Plant Name"].nunique()
                  .reset_index()
                  .rename(columns={"Plant Name": "Unique Plants"}))

ax = sns.barplot(data=plant_counts, x="India_Flag", y="Unique Plants", palette="viridis")
_annotate_bar_heights(ax)
plt.title("Number of Unique Plants: India vs Outside")
plt.show()

# ===== 2) Unique plants per location =====
plant_counts = (df.groupby("Plant Location")["Plant Name"].nunique()
                  .reset_index()
                  .rename(columns={"Plant Name": "Unique Plants"}))

ax = sns.barplot(data=plant_counts, x="Plant Location", y="Unique Plants", palette="viridis")
_annotate_bar_heights(ax)
plt.title("Number of Unique Plants in Different Locations")
plt.show()

# ===== 3) Scope1 per unit over time =====
# The plotted series is the per-row Scope1_per_unit ratio summed over all rows of a date,
# not total Scope1 emissions, so the title and axis label say so.
scope1_ts = df.groupby("datetime")["Scope1_per_unit"].sum().reset_index()
sns.lineplot(data=scope1_ts, x="datetime", y="Scope1_per_unit", marker="o")
plt.title("Scope1 per Unit Over Time (summed across plants)")
plt.xlabel("Date")
plt.ylabel("Scope1 per unit (summed across plants)")
plt.show()

# ===== 4) Scope1 per unit vs production vs total CO2 =====
scope1_prod_ts = df.groupby("datetime")[[
    "Scope1_per_unit",
    "TotalCO2emission(MT)",
    "Production Actual Quantity (MT/Month)"
]].sum().reset_index()

fig, ax1 = plt.subplots(figsize=(18, 6))

# --- Left y-axis: only the Scope1-per-unit series is drawn here ---
ax1.set_xlabel("Date")
ax1.set_ylabel("Scope1 per unit", color="tab:blue")
ax1.plot(scope1_prod_ts["datetime"], scope1_prod_ts["Scope1_per_unit"],
         color="tab:blue", marker="o", label="Scope1 per Unit")
ax1.tick_params(axis="y", labelcolor="tab:blue")

# --- Right y-axis: production and total CO2 share a second scale ---
ax2 = ax1.twinx()
ax2.set_ylabel("Production & Total CO2", color="tab:green")
ax2.plot(scope1_prod_ts["datetime"], scope1_prod_ts["Production Actual Quantity (MT/Month)"],
         color="tab:green", marker="s", linestyle="--", label="Production")
ax2.plot(scope1_prod_ts["datetime"], scope1_prod_ts["TotalCO2emission(MT)"],
         color="tab:red", marker="d", linestyle="-.", label="Total CO2")
ax2.tick_params(axis="y", labelcolor="tab:green")

# --- Title & legend ---
fig.suptitle("Scope1 Emissions, CO2 and Production Over Time", fontsize=14)

# One legend covering the lines of both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
fig.legend(lines + lines2, labels + labels2, loc="upper left", bbox_to_anchor=(0.1, 0.9))

plt.show()


# ===== 5) Scope1 per unit per plant (India vs Outside India) =====
# Number of distinct plants in each region
plant_counts = df.groupby("India_Flag")["Plant Name"].nunique().to_dict()

# Sum Scope1 per unit by date and region
scope1_india_ts = (
    df.groupby(["datetime", "India_Flag"])["Scope1_per_unit"]
      .sum()
      .reset_index()
)

# Divide each regional sum by that region's plant count (vectorised lookup instead of row-wise apply)
scope1_india_ts["Scope1_per_unit_per_plant"] = (
    scope1_india_ts["Scope1_per_unit"] / scope1_india_ts["India_Flag"].map(plant_counts)
)

# Split into two series
india_ts = scope1_india_ts[scope1_india_ts["India_Flag"] == "India"]
outside_ts = scope1_india_ts[scope1_india_ts["India_Flag"] == "Outside India"]

# ---- Plot with dual y-axes ----
fig, ax1 = plt.subplots(figsize=(18, 6))

# Left y-axis = India
ax1.set_xlabel("Date")
ax1.set_ylabel("Scope1 per Unit per Plant (India)", color="tab:blue")
ax1.plot(india_ts["datetime"], india_ts["Scope1_per_unit_per_plant"],
         color="tab:blue", marker="o", label="India")
ax1.tick_params(axis="y", labelcolor="tab:blue")

# Right y-axis = Outside India
ax2 = ax1.twinx()
ax2.set_ylabel("Scope1 per Unit per Plant (Outside India)", color="tab:orange")
ax2.plot(outside_ts["datetime"], outside_ts["Scope1_per_unit_per_plant"],
         color="tab:orange", marker="s", linestyle="--", label="Outside India")
ax2.tick_params(axis="y", labelcolor="tab:orange")

# Title & Legend
fig.suptitle("Scope1 Emissions per Unit per Plant: India vs Outside India", fontsize=14)

lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
fig.legend(lines + lines2, labels + labels2, loc="upper left", bbox_to_anchor=(0.1, 0.9))

plt.show()


In [None]:
# Show the column names of df_2022.
df_2022.columns

In [None]:
# Small square figures from here on (this changes the global matplotlib default).
plt.rcParams["figure.figsize"] = (4, 4)

# Distinct plants per location, keeping the five locations with the most plants.
plant_counts = (
    df.groupby("Plant Location")["Plant Name"]
      .nunique()
      .reset_index()
      .rename(columns={"Plant Name": "Unique Plants"})
)
top5_plants = plant_counts.sort_values("Unique Plants", ascending=False).head(5)

ax = sns.barplot(data=top5_plants, x="Plant Location", y="Unique Plants", palette="viridis")
# Put each bar's integer height just above the bar.
for p in ax.patches:
    bar_height = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2.
    ax.annotate(int(bar_height),
                (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=11, color='black', xytext=(0, 5),
                textcoords='offset points')
plt.title("Number of Unique Plants in Different Locations")
plt.show()

In [None]:
# Monthly totals of every metric to chart. Each column is listed once: repeating a name in the
# groupby selection repeats it in the result, which in turn repeats its chart.
metric_cols = [
    "Scope1_per_unit",
    "TotalCO2emission(MT)",
    "Production Actual Quantity (MT/Month)",
    'Scope1', 'Scope_2',
    'totalEnergyPerUnit(GJ)',
    'Electricity Grid Energy Per Unit (GJ)',
    'Electricity Grid TCO2 Emission',
    'totalWaterConsumption'
]
scope1_prod_ts = df_2022.groupby("datetime")[metric_cols].sum().reset_index()

def chart(col):
    """Draw one line chart of ``scope1_prod_ts[col]`` against ``scope1_prod_ts["datetime"]``.

    ``col`` is used as the y-axis label, the legend entry and the title prefix.
    Reads the module-level ``scope1_prod_ts`` frame; returns ``None``.
    """
    fig, ax1 = plt.subplots(figsize=(18, 6))
    ax1.set_xlabel("Date")
    ax1.set_ylabel(f"{col}", color="tab:blue")
    ax1.plot(scope1_prod_ts["datetime"], scope1_prod_ts[col],
            color="tab:blue", marker="o", label=f"{col}")
    ax1.tick_params(axis="y", labelcolor="tab:blue")
    fig.suptitle(f"{col} Over Time", fontsize=14)

    # Figure-level legend for the single plotted line
    lines, labels = ax1.get_legend_handles_labels()
    fig.legend(lines, labels, loc="upper left", bbox_to_anchor=(0.1, 0.9))

    plt.show()

# One chart per aggregated metric.
for col in scope1_prod_ts.columns:
    if col != 'datetime':
        chart(col)

In [None]:
# Write df_2022 (rows from 2021 onwards, including the per-unit feature columns) to CSV without the index.
df_2022.to_csv("Client_Oct_Processed.csv", index=False)

# Models

In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

# ===== 1) CONFIG =====
DATE_COL = "datetime"                       # timestamp column; unparseable rows are rebuilt from year & month
TARGET_COL = "Scope1_per_unit"             # the series to forecast
PLANT_COL = "Plant Name"                        # per-plant option
FORECAST_MONTHS = 7                           # horizon
TEST_MONTHS = 7                                 # last N months as test

# ===== 2) START FROM df_2022 =====
# Work on a copy so the cleaning below does not modify df_2022.
df = df_2022.copy()

# ----- 2a) Ensure a proper datetime column 'ds' -----
if DATE_COL in df.columns:
    # Values that cannot be parsed become NaT and are rebuilt below.
    # (`infer_datetime_format` is deprecated in pandas 2.x and has no effect there, so it is not passed.)
    ds = pd.to_datetime(df[DATE_COL], errors="coerce", dayfirst=True)
else:
    ds = pd.Series(pd.NaT, index=df.index)

# Rebuild any missing timestamps from year + month (day set to 1)
needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(
        dict(year=df.loc[needs_rebuild, "year"],
             month=df.loc[needs_rebuild, "month"],
             day=1)
    )
    ds.loc[needs_rebuild] = ds_rebuilt

df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()  # normalize to month-start

# ----- 2b) Basic cleaning: drop rows repeating the same month, plant and target value -----
df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

# ----- 2c) Numeric columns that the imputation step below operates on -----
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with zeros and NaNs in the numeric columns mean-filled.

    For every column named in the module-level ``num_cols`` list, zeros are
    turned into NaN and the NaNs are filled with the mean of the remaining
    values of that column within ``group``. A column with no remaining values
    is left as all-NaN. ``group`` itself is not modified.
    """
    filled = group.copy()
    for name in num_cols:
        values = filled[name].replace(0, np.nan)
        if values.notna().any():
            values = values.fillna(values.mean())
        filled[name] = values
    return filled

# Impute year by year when a "year" column exists; otherwise treat the whole frame as one group.
if "year" not in df.columns:
    df = impute_zero_nan_with_mean(df)
else:
    df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)

# A column that was all zero/NaN within a year is still NaN there: fall back to the overall mean.
for c in num_cols:
    if not df[c].isna().any():
        continue
    df[c] = df[c].fillna(df[c].mean())

# ===== 3) TWO PATHS: (A) OVERALL monthly forecast across all plants, (B) PER-PLANT forecasts =====

# ---------- A) OVERALL SERIES ----------
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Collapse ``df_in`` to one row per ``ds`` by summing the target column.

    Returns a frame with columns ``ds`` and ``y`` (the summed module-level
    ``TARGET_COL``), ordered by ``ds``.
    """
    monthly_totals = df_in.groupby("ds", as_index=False)[TARGET_COL].sum()
    monthly_totals = monthly_totals.sort_values("ds")
    return monthly_totals.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df)

# Train / Test split: the last TEST_MONTHS rows form the test set.
overall = overall.sort_values("ds").reset_index(drop=True)
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()  # empty when there are not more than TEST_MONTHS rows

# Baseline Prophet settings (no extra regressors), shared by the evaluation and final models.
overall_model_kwargs = dict(seasonality_mode="additive", yearly_seasonality=True,
                            weekly_seasonality=False, daily_seasonality=False)

# Evaluation model: fitted on the training rows only.
m_overall = Prophet(**overall_model_kwargs)
m_overall.fit(train_overall)

# Fitted values over the training dates plus predictions across the test window.
future_cutoff = train_overall["ds"].max()
future_all = m_overall.make_future_dataframe(periods=max(TEST_MONTHS, 0), freq="MS")
fcst_all = m_overall.predict(future_all)

# Evaluate on the test rows whose dates also appear in the prediction frame.
if not test_overall.empty:
    y_pred = (fcst_all[["ds", "yhat"]]
              .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    # No overlapping dates means there is nothing to score; the metric rejects empty input.
    if y_pred.empty:
        mape_overall = np.nan
    else:
        mape_overall = mean_absolute_percentage_error(y_pred["y"], y_pred["yhat"])
else:
    mape_overall = np.nan

# Forward forecast beyond the full available data.
# The evaluation model's future frame starts right after the last TRAINING month, so keeping
# only dates after the last OBSERVED month left nothing whenever FORECAST_MONTHS <= TEST_MONTHS.
# Fit a second model on the complete series instead. A Prophet instance can be fitted only once.
if test_overall.empty:
    m_overall_full = m_overall  # the training set already is the complete series
else:
    m_overall_full = Prophet(**overall_model_kwargs)
    m_overall_full.fit(overall)

future_12 = m_overall_full.make_future_dataframe(periods=FORECAST_MONTHS, freq="MS")
forecast_12 = m_overall_full.predict(future_12).loc[:, ["ds", "yhat", "yhat_lower", "yhat_upper"]]
forecast_12 = forecast_12[forecast_12["ds"] > overall["ds"].max()].reset_index(drop=True)

print("=== OVERALL SERIES ===")
print(f"Training months: {len(train_overall)} | Test months: {len(test_overall)}")
print(f"Overall Test MAPE: {mape_overall:.3f}" if not np.isnan(mape_overall) else "Overall Test MAPE: N/A (not enough test months)")
print(f"\nNext {FORECAST_MONTHS} months forecast (overall):")
print(forecast_12)

# ---------- B) PER-PLANT SERIES (optional) ----------
# Build one model per plant; returns a dict of {plant: (mape, forecast_df)}
def forecast_per_plant(df_in: pd.DataFrame, test_months: int = 3, horizon: int = 12):
    """Evaluate and forecast the target series separately for every plant.

    ``df_in`` is summed per (plant, ``ds``) using the module-level ``PLANT_COL``
    and ``TARGET_COL``. For each plant with at least 6 monthly rows:

    * the last ``test_months`` rows are held out, a Prophet model is fitted on
      the rest and scored with MAPE on the held-out dates;
    * a second model is fitted on all of the plant's rows and used to forecast
      ``horizon`` months past the plant's last observed month. (Forecasting
      from the evaluation model started right after the last *training* month,
      so the returned frame was short by the test window, or empty.)

    Args:
        df_in: frame containing ``PLANT_COL``, ``ds`` and ``TARGET_COL``.
        test_months: number of trailing rows per plant held out for scoring.
        horizon: number of future months to forecast.

    Returns:
        dict mapping plant -> ``(mape, forecast_df)``. ``mape`` is NaN when the
        plant has no held-out rows, fewer than 2 training rows, or no date
        overlap between predictions and held-out rows. Plants with fewer than
        6 rows map to ``(nan, empty DataFrame)``.
    """
    def _new_model():
        # A Prophet instance can be fitted only once, so every fit gets a fresh one.
        return Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                       weekly_seasonality=False, daily_seasonality=False,
                       changepoint_prior_scale=0.1)

    out = {}
    g = (df_in.groupby([PLANT_COL, "ds"], as_index=False)[TARGET_COL]
         .sum()
         .rename(columns={TARGET_COL: "y"}))

    for plant, gdf in g.groupby(PLANT_COL):
        gdf = gdf.sort_values("ds").reset_index(drop=True)
        if len(gdf) < 6:
            out[plant] = (np.nan, pd.DataFrame())
            continue

        split = len(gdf) - test_months if len(gdf) > test_months else len(gdf)
        train = gdf.iloc[:split].copy()
        test  = gdf.iloc[split:].copy()

        # --- Hold-out evaluation ---
        mape = np.nan
        # Prophet refuses to fit fewer than 2 rows; skip scoring instead of aborting the whole run.
        if not test.empty and len(train) >= 2:
            m = _new_model()
            m.fit(train)

            future_all = m.make_future_dataframe(periods=max(test_months, 0), freq="MS")
            fcst_all  = m.predict(future_all)

            # Score only the dates present in both predictions and held-out rows.
            y_pred = (fcst_all[["ds", "yhat"]]
                      .merge(test[["ds", "y"]], on="ds", how="inner"))
            if not y_pred.empty:
                mape = mean_absolute_percentage_error(y_pred["y"], y_pred["yhat"])

        # --- Forward forecast from a model fitted on every observed month ---
        m_full = _new_model()
        m_full.fit(gdf)
        future_h = m_full.make_future_dataframe(periods=horizon, freq="MS")
        fcst_h = m_full.predict(future_h)[["ds", "yhat", "yhat_lower", "yhat_upper"]]
        fcst_h = fcst_h[fcst_h["ds"] > gdf["ds"].max()].reset_index(drop=True)

        out[plant] = (mape, fcst_h)

    return out


# Per-plant models (optional; comment this block out if only the overall forecast is needed).
per_plant_results = forecast_per_plant(df, test_months=TEST_MONTHS, horizon=FORECAST_MONTHS)

# One row per plant with its hold-out MAPE, best (lowest) first and NaN at the end.
mape_rows = [(name, result[0]) for name, result in per_plant_results.items()]
per_plant_mape = pd.DataFrame(mape_rows, columns=[PLANT_COL, "Test MAPE"])
per_plant_mape = per_plant_mape.sort_values("Test MAPE", na_position="last")
print("\n=== PER-PLANT MAPE (lower is better) ===")
print(per_plant_mape.head(20))  # top 20; adjust as needed




In [None]:
import matplotlib.pyplot as plt

# Pair every predicted date with the observed value for that date (NaN where none exists).
eval_df = fcst_all[["ds", "yhat"]].merge(overall[["ds", "y"]], on="ds", how="left")

fig, ax = plt.subplots(figsize=(12, 6))

# Observed values: training span, then held-out span.
ax.plot(train_overall['ds'], train_overall['y'],
        label="Train (Actual)", linewidth=2, color="blue")
ax.plot(test_overall['ds'], test_overall['y'],
        label="Test (Actual)", linewidth=2, color="green")

# Prophet output over the training dates and the test window.
ax.plot(eval_df['ds'], eval_df['yhat'],
        '--', label="Forecast (Prophet)", linewidth=2, color="red")

# Vertical marker at the first held-out month.
if not test_overall.empty:
    split_date = test_overall['ds'].min()
    ax.axvline(split_date, color='gray', linestyle='--',
               linewidth=2, label="Train/Test Split")

ax.set_title("Prophet Forecast vs Actual (Overall Series)", fontsize=14)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Total Scope 1 per unit production", fontsize=12)

ax.legend()
ax.grid(alpha=0.3)
plt.show()


In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
import logging
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)

# ===== 1) CONFIG =====
DATE_COL = "datetime"                       # timestamp column; unparseable rows are rebuilt from year & month
TARGET_COL = "Scope1_per_unit"             # the series to forecast
PLANT_COL = "Plant Name"                        # per-plant option
FORECAST_MONTHS = 7                             # horizon
TEST_MONTHS = 7                                 # last N months as test

# ===== 2) START FROM df_2022 =====
# Work on a copy so the cleaning below does not modify df_2022.
df = df_2022.copy()
# ----- 2a) Ensure a proper datetime column 'ds' -----
if DATE_COL in df.columns:
    # Values that cannot be parsed become NaT and are rebuilt below.
    # (`infer_datetime_format` is deprecated in pandas 2.x and has no effect there, so it is not passed.)
    ds = pd.to_datetime(df[DATE_COL], errors="coerce")
else:
    ds = pd.Series(pd.NaT, index=df.index)

# Rebuild any missing timestamps from year + month (day set to 1)
needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(
        dict(year=df.loc[needs_rebuild, "year"],
             month=df.loc[needs_rebuild, "month"],
             day=1)
    )
    ds.loc[needs_rebuild] = ds_rebuilt

df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()  # normalize to month-start

# ----- 2b) Basic cleaning: drop rows repeating the same month, plant and target value -----
df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

# ----- 2c) Numeric columns that the imputation step below operates on -----
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    # Replace 0 with NaN, then fill with column mean (within group)
    g = group.copy()
    for c in num_cols:
        g[c] = g[c].replace(0, np.nan)
        if g[c].notna().any():
            g[c] = g[c].fillna(g[c].mean())
    return g

# First try imputation within each calendar year (if present)
if "year" in df.columns:
    df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
else:
    df = impute_zero_nan_with_mean(df)

# Global fallback (in case an entire year's column was all zeros/NaN)
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].mean())

# ===== 3) OVERALL monthly forecast across all plants =====
# ---------- A) OVERALL SERIES ----------
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Sum TARGET_COL over all rows of each month and return a Prophet-style frame.

    Returns a DataFrame with columns 'ds' (month) and 'y' (monthly sum), sorted by 'ds'.
    """
    s = (df_in
         .groupby("ds", as_index=False)[TARGET_COL]
         .sum()
         .sort_values("ds"))
    s = s.rename(columns={TARGET_COL: "y"})
    return s

overall = prepare_overall_series(df)
# Train / Test split: the last TEST_MONTHS rows are held out (no test set if the series is too short)
overall = overall.sort_values("ds").reset_index(drop=True)
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()  # may be empty if not enough months
# Fit Prophet on the training window only (no extra regressors for the baseline)
m_overall = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False,changepoint_prior_scale=0.1 )
m_overall.fit(train_overall)

# Fitted values on the training window plus predictions over the test window
future_cutoff = train_overall["ds"].max()
future_all = m_overall.make_future_dataframe(periods=max(TEST_MONTHS, 0), freq="MS")
fcst_all = m_overall.predict(future_all)

# Evaluate on test if available
if not test_overall.empty:
    # Align predictions with actual test months
    y_pred = (fcst_all[["ds", "yhat"]]
              .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    mape_overall = mean_absolute_percentage_error(y_pred["y"], y_pred["yhat"])
else:
    mape_overall = np.nan

# Final FORECAST_MONTHS-month forecast beyond the last observed month.
# make_future_dataframe() extends from the last *training* date, so the horizon
# must first span the held-out months and then add FORECAST_MONTHS; otherwise
# the filter below removes every row and forecast_12 comes back empty.
months_to_data_end = (overall["ds"].max().to_period("M") - future_cutoff.to_period("M")).n
future_12 = m_overall.make_future_dataframe(periods=months_to_data_end + FORECAST_MONTHS, freq="MS")
forecast_12 = m_overall.predict(future_12).loc[:, ["ds", "yhat", "yhat_lower", "yhat_upper"]]
forecast_12 = forecast_12[forecast_12["ds"] > overall["ds"].max()].reset_index(drop=True)
print("=== OVERALL SERIES ===")
print(f"Training months: {len(train_overall)} | Test months: {len(test_overall)}")
print(f"Overall Test MAPE: {mape_overall:.3f}" if not np.isnan(mape_overall) else "Overall Test MAPE: N/A (not enough test months)")
# print("\nNext months forecast (overall):")
# print(forecast_12)

import matplotlib.pyplot as plt

# Join the Prophet predictions with every available actual value (train + test)
eval_df = (fcst_all[["ds", "yhat"]]
           .merge(overall[["ds", "y"]], on="ds", how="left"))

plt.figure(figsize=(12,6))

# A) Train actuals
plt.plot(train_overall['ds'], train_overall['y'],
         label="Train (Actual)", linewidth=2, color="blue")

# B) Test actuals
plt.plot(test_overall['ds'], test_overall['y'],
         label="Test (Actual)", linewidth=2, color="green")

# C) Prophet forecast (fitted + on test window)
plt.plot(eval_df['ds'], eval_df['yhat'],
         '--', label="Forecast (Prophet)", linewidth=2, color="red")

# D) Train/Test split marker
if not test_overall.empty:
    split_date = test_overall['ds'].min()
    plt.axvline(split_date, color='gray', linestyle='--',
                linewidth=2, label="Train/Test Split")

# Titles and labels
plt.title("Prophet Forecast vs Actual (Overall Series)", fontsize=14)
plt.xlabel("Date", fontsize=12)
# The plotted series is TARGET_COL summed over plants per month, so label it
# as such rather than as total CO2 emissions in MT.
plt.ylabel(f"{TARGET_COL} (summed across plants)", fontsize=12)

plt.legend()
plt.grid(alpha=0.3)
plt.show()


In [None]:
# Show the column labels of df_2022 (inspection only; nothing is assigned).
df_2022.columns

In [None]:
# Take an explicit copy before adding a column: assigning into a bare column
# selection of df_2022 is chained assignment and triggers SettingWithCopyWarning.
df_compare = df_2022[['datetime','Electricity Grid Energy Per Unit (GJ)', 'Scope1_per_unit','Production Actual Quantity (MT/Month)']].copy()
# Electricity column divided by the production quantity.
# NOTE(review): the source column is already named "Per Unit" — confirm a second
# division by production is intended.
df_compare['elecPerUnit'] = (
    df_compare['Electricity Grid Energy Per Unit (GJ)']
    / df_compare['Production Actual Quantity (MT/Month)']
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# ===== Per-date totals of the two series being compared =====
energy_scope1_ts = (
    df_compare
    .groupby("datetime")[["elecPerUnit", "Scope1_per_unit"]]
    .sum()
    .reset_index()
)

# ===== One figure, two independent y-scales sharing the date axis =====
fig, ax1 = plt.subplots(figsize=(18, 6))
ax2 = ax1.twinx()

# --- Left scale: electricity per unit ---
ax1.plot(
    energy_scope1_ts["datetime"],
    energy_scope1_ts["elecPerUnit"],
    color="tab:blue",
    marker="o",
    label="Electricity Per Unit",
)
ax1.set_xlabel("Date")
ax1.set_ylabel("Electricity Per Unit", color="tab:blue")
ax1.tick_params(axis="y", labelcolor="tab:blue")

# --- Right scale: Scope1 per unit ---
ax2.plot(
    energy_scope1_ts["datetime"],
    energy_scope1_ts["Scope1_per_unit"],
    color="tab:red",
    marker="s",
    label="Scope1 per Unit",
)
ax2.set_ylabel("Scope1 per Unit", color="tab:red")
ax2.tick_params(axis="y", labelcolor="tab:red")

fig.suptitle("Electricity Grid Energy vs Scope1 Emissions (Summed Over Time)", fontsize=14)

# A single legend combining the handles of both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
fig.legend(
    lines + lines2,
    labels + labels2,
    loc="upper left",
    bbox_to_anchor=(0.1, 0.9),
)

plt.show()


# Physics Loss

In [None]:
# List df_2022's column labels before selecting the columns used below.
df_2022.columns

In [None]:

# Columns compared over time: grid electricity, Scope1 and Scope1 per unit.
df_compare = df_2022[['Electricity Grid Energy Per Unit (GJ)', 'Scope1','Scope1_per_unit','datetime']].copy()

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Group by datetime and aggregate (sum over all rows sharing a date)
energy_scope1_ts = df_compare.groupby("datetime")[[
    "Electricity Grid Energy Per Unit (GJ)",
    "Scope1",
    "Scope1_per_unit"
]].sum().reset_index()

# Three series with different scales: one left axis and two right axes.
fig, ax1 = plt.subplots(figsize=(12,6))

# Left y-axis (Electricity Grid Energy)
color = "tab:blue"
ax1.set_xlabel("Datetime")
ax1.set_ylabel("Electricity Grid Energy Per Unit (GJ)", color=color)
ax1.plot(energy_scope1_ts["datetime"],
         energy_scope1_ts["Electricity Grid Energy Per Unit (GJ)"],
         color=color, label="Electricity Grid Energy Per Unit (GJ)")
ax1.tick_params(axis="y", labelcolor=color)

# First right y-axis (Scope1). Kept in its own variable: overwriting ax1 with
# its twin loses the handle to the left axis.
ax2 = ax1.twinx()
color = "tab:green"
ax2.set_ylabel("Scope1", color=color)
ax2.plot(energy_scope1_ts["datetime"],
         energy_scope1_ts["Scope1"],
         color=color, label="Scope1")
ax2.tick_params(axis="y", labelcolor=color)

# Second right y-axis (Scope1_per_unit). Its spine is pushed outward so its
# ticks and label do not sit on top of the Scope1 axis.
ax3 = ax1.twinx()
ax3.spines["right"].set_position(("axes", 1.12))
color = "tab:red"
ax3.set_ylabel("Scope1_per_unit", color=color)
ax3.plot(energy_scope1_ts["datetime"],
         energy_scope1_ts["Scope1_per_unit"],
         color=color, label="Scope1_per_unit")
ax3.tick_params(axis="y", labelcolor=color)

# Title and grid (grid drawn on the primary axes)
ax1.set_title("Electricity Grid Energy vs Scope1 Emissions per Unit Over Time")
ax1.grid(True)
fig.tight_layout()
plt.show()




# Single-column design matrix and response vector taken from the aggregated frame
X = energy_scope1_ts[["Electricity Grid Energy Per Unit (GJ)"]].values
y = energy_scope1_ts["Scope1"].values

# Least-squares line: Scope1 as a function of grid electricity
model = LinearRegression()
model.fit(X, y)

slope, intercept = model.coef_[0], model.intercept_
r2 = model.score(X, y)  # R² on the same data the line was fitted to

# Fitted line sampled on 100 evenly spaced points across the observed x-range
x_vals = np.linspace(X.min(), X.max(), 100).reshape(-1,1)
y_vals = model.predict(x_vals)

# Equation text shown on the plot and printed afterwards
eq_text = f"y = {slope:.3f}x + {intercept:.3f}\nR² = {r2:.3f}"

plt.figure(figsize=(8,6))
sns.scatterplot(
    data=energy_scope1_ts,
    x="Electricity Grid Energy Per Unit (GJ)",
    y="Scope1",
    color="purple",
    alpha=0.6,
)
plt.plot(x_vals, y_vals, color="red", label="Regression Line")
plt.text(
    0.05, 0.95, eq_text,
    transform=plt.gca().transAxes,  # position in axes-fraction coordinates
    fontsize=12,
    verticalalignment="top",
    bbox=dict(facecolor="white", alpha=0.7),
)

plt.xlabel("Electricity Grid Energy Per Unit (GJ)")
plt.ylabel("Scope1")
plt.title("Scatter Plot with Regression Line")
plt.legend()
plt.grid(True)
plt.show()
print(eq_text)

In [None]:

# Inputs for the regression: electricity (as a 2-D column) vs. Scope1
X = energy_scope1_ts[["Electricity Grid Energy Per Unit (GJ)"]].values
y = energy_scope1_ts["Scope1"].values

# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X, y)

slope = model.coef_[0]
intercept = model.intercept_
r2 = model.score(X, y)

# Wide, short figure: scatter of the monthly points ...
plt.figure(figsize=(10,4))
sns.scatterplot(
    x="Electricity Grid Energy Per Unit (GJ)",
    y="Scope1",
    alpha=0.6,
    color="purple",
    data=energy_scope1_ts,
)

# ... overlaid with the fitted line between the smallest and largest x
x_vals = np.linspace(X.min(), X.max(), 100).reshape(-1,1)
y_vals = model.predict(x_vals)
plt.plot(x_vals, y_vals, label="Regression Line", color="red")

# Annotate the fitted equation and R² in the upper-left corner of the axes
eq_text = f"y = {slope:.3f}x + {intercept:.3f}\nR² = {r2:.3f}"
plt.text(0.05, 0.95, eq_text,
         bbox=dict(facecolor="white", alpha=0.7),
         verticalalignment="top",
         fontsize=12,
         transform=plt.gca().transAxes)

plt.title("Scatter Plot with Regression Line")
plt.xlabel("Electricity Grid Energy Per Unit (GJ)")
plt.ylabel("Scope1")
plt.grid(True)
plt.legend()
plt.show()
print(eq_text)

In [None]:
import matplotlib.pyplot as plt

# Working copy with three derived columns, added in order so that later ones
# can read the earlier ones:
#   scope1_pred      linear estimate of Scope1 from grid electricity
#   scope1Unit_pred  that estimate divided by production
#   scope1Unit       actual Scope1 divided by production
df = (
    df_2022[['Electricity Grid Energy Per Unit (GJ)',
             'Scope1',
             'Scope1_per_unit',
             'Production Actual Quantity (MT/Month)',
             'datetime']]
    .copy()
    .assign(
        # commented-out alternative: 0.378 * df["Electricity Grid Energy Per Unit (GJ)"] + 350.31051
        scope1_pred=lambda d: 0.5 * d["Electricity Grid Energy Per Unit (GJ)"] + 400,
        scope1Unit_pred=lambda d: d["scope1_pred"] / d["Production Actual Quantity (MT/Month)"],
        scope1Unit=lambda d: d["Scope1"] / d["Production Actual Quantity (MT/Month)"],
    )
)

# === Sum every series per datetime ===
energy_scope1_ts = (
    df.groupby("datetime")[[
        "Electricity Grid Energy Per Unit (GJ)",
        "Scope1",
        "Scope1_per_unit",
        "Production Actual Quantity (MT/Month)",
        "scope1_pred",
        "scope1Unit_pred",
        "scope1Unit",
    ]]
    .sum()
    .reset_index()
)

# === Actual vs predicted: total Scope1 ===
plt.figure(figsize=(12,5))
plt.plot(energy_scope1_ts["datetime"], energy_scope1_ts["Scope1"],
         marker="o", label="Actual Scope1")
plt.plot(energy_scope1_ts["datetime"], energy_scope1_ts["scope1_pred"],
         linestyle="--", label="Predicted Scope1")
plt.title("Scope1: Actual vs Predicted (Aggregated by datetime)")
plt.xlabel("Datetime")
plt.ylabel("Scope1")
plt.legend()
plt.grid(True)
plt.show()

# === Actual vs predicted: Scope1 per unit ===
plt.figure(figsize=(12,5))
plt.plot(energy_scope1_ts["datetime"], energy_scope1_ts["Scope1_per_unit"],
         marker="o", label="Actual Scope1 per unit")
plt.plot(energy_scope1_ts["datetime"], energy_scope1_ts["scope1Unit_pred"],
         linestyle="--", label="Predicted Scope1 per unit")
plt.title("Scope1 per Unit: Actual vs Predicted (Aggregated by datetime)")
plt.xlabel("Datetime")
plt.ylabel("Scope1 per unit")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Preview the first rows of df_2022.
df_2022.head()

In [None]:

# Sum of Scope1_per_unit for each value of 'ds'.
# NOTE(review): this needs a 'ds' column on df_2022 itself; the neighbouring
# cells add 'ds' to copies of df_2022 — confirm it exists here on a fresh run.
df_2022.groupby("ds")["Scope1_per_unit"].sum()

In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns

# Plot styling for the figures produced below.
# NOTE(review): `plt` is used here before this cell's own matplotlib import
# (which appears further down), so this line relies on an earlier cell having
# imported it.
plt.style.use("seaborn-v0_8")  # updated style name
sns.set_palette("viridis")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# ===== 1) CONFIG =====
DATE_COL = "datetime"           # raw date column
TARGET_COL = "Scope1_per_unit"  # series to forecast
PLANT_COL = "Plant Name"        # plant identifier (part of the de-duplication key)
FORECAST_MONTHS = 7
TEST_MONTHS = 7                 # last N months held out for testing

# ===== 2) START FROM df_2022 =====
df = df_2022.copy()
# ----- 2a) Ensure datetime -----
if DATE_COL in df.columns:
    # Unparseable values become NaT and are rebuilt below. The deprecated
    # `infer_datetime_format` flag (pandas >= 2.0) is intentionally not passed.
    ds = pd.to_datetime(df[DATE_COL], errors="coerce")
else:
    ds = pd.Series(pd.NaT, index=df.index)

# Rebuild missing dates from year + month (day fixed to 1)
needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(dict(year=df.loc[needs_rebuild, "year"],
                                     month=df.loc[needs_rebuild, "month"],
                                     day=1))
    ds.loc[needs_rebuild] = ds_rebuilt

df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()  # month-start timestamps

# ----- 2b) Cleaning: drop repeated (month, plant, target) rows -----
df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

# ----- 2c) Imputation: zeros/NaNs -> column mean (per year, then global fallback) -----
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `group` with zeros/NaNs in `num_cols` filled by the column mean.

    Columns with no valid value in the group stay NaN.
    """
    g = group.copy()
    for c in num_cols:
        g[c] = g[c].replace(0, np.nan)
        if g[c].notna().any():
            g[c] = g[c].fillna(g[c].mean())
    return g

if "year" in df.columns:
    df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
else:
    df = impute_zero_nan_with_mean(df)

# Global fallback for columns that were all zero/NaN within a year
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].mean())

# ===== 3) OVERALL SERIES =====
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Collapse the frame to one row per 'ds' by summing TARGET_COL; result has columns 'ds' and 'y'."""
    monthly = df_in.groupby("ds", as_index=False)[TARGET_COL].sum()
    monthly = monthly.sort_values("ds")
    return monthly.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df).sort_values("ds").reset_index(drop=True)

# Train/test split: hold out the final TEST_MONTHS rows when the series is long enough
if len(overall) > TEST_MONTHS:
    split_point = len(overall) - TEST_MONTHS
else:
    split_point = len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()

# ===== 3A) PROPHET =====
m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_overall)
# Predict the training months plus one month-start step per held-out row
future_all = m_overall.make_future_dataframe(periods=len(test_overall), freq="MS")
fcst_all = m_overall.predict(future_all)

# Baseline test error: predictions matched to the held-out months on 'ds'
if test_overall.empty:
    mape_prophet = np.nan
else:
    y_pred_prophet = fcst_all[["ds", "yhat"]].merge(
        test_overall[["ds", "y"]], on="ds", how="inner"
    )
    mape_prophet = mean_absolute_percentage_error(
        y_pred_prophet["y"], y_pred_prophet["yhat"]
    )
print(mape_prophet)
# ===== 3B) PINN FORECAST (+ 3C residual stack), retried until the hybrid beats Prophet =====
from sklearn.preprocessing import StandardScaler

# Upper bound on retraining attempts. Each attempt trains for 10000 epochs and
# fits a Prophet model, and nothing guarantees the hybrid ever beats the
# baseline, so an unbounded `while` could run forever.
MAX_PINN_ATTEMPTS = 25


class PINN(nn.Module):
    """Three-layer fully connected network (Linear-ReLU-Linear-ReLU-Linear)."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.net(x)


class MAPELoss(nn.Module):
    """Mean absolute percentage error; `eps` keeps the denominator non-zero."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true) / (torch.abs(y_true) + self.eps))


def physics_residual_loss_mape(y_pred, features, eps=1e-6):
    """One-sided relative penalty for predictions below a linear electricity-based estimate.

    Only the first feature column is used.
    """
    elec = features[:, 0]
    # physics_estimate = elec/8
    # NOTE(review): `features` is the standardized electricity column and `y_pred`
    # is in standardized target space, while the intercept 35031.051 is a large
    # constant, so the estimate is dominated by the intercept and this term is
    # close to constant. Confirm the intended units/scaling.
    physics_estimate = 0.378*elec + 35031.051
    violation = torch.relu(physics_estimate - y_pred.squeeze())
    return torch.mean(violation / (torch.abs(physics_estimate) + eps))


# ----- Loop-invariant inputs (identical on every attempt, so built once) -----
# Time index as feature
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time = np.arange(len(train_overall), len(overall)).reshape(-1, 1)
y_train = train_overall["y"].values.reshape(-1, 1)
# Physics features (optional)
if set(['Electricity Grid Energy Per Unit (GJ)']).issubset(df.columns):
    agg = df.groupby("ds")[['Electricity Grid Energy Per Unit (GJ)']].sum().reset_index()
    agg = agg.sort_values("ds").reset_index(drop=True)
    X_phys = (agg[['Electricity Grid Energy Per Unit (GJ)']]/1000).values
else:
    X_phys = np.ones((len(overall), 3))

X_phys_train = X_phys[:len(train_overall)]
X_phys_test = X_phys[len(train_overall):]

# ==== Standardize features & target (scalers fitted on training data only) ====
scaler_X = StandardScaler()
scaler_Y = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_X.fit_transform(train_time)
test_time_scaled = scaler_X.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)
X_phys_test_scaled = scaler_phys.transform(X_phys_test)

# Torch tensors (scaled data)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_t = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
Y_t = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

mape_loss_fn = MAPELoss()

# NOTE(review): the loop stops as soon as the hybrid's MAPE on the *test* window
# is lower than Prophet's, i.e. the reported model is selected using test data;
# the resulting hybrid MAPE is therefore optimistically biased.
flag=True
itr=0
while flag and itr < MAX_PINN_ATTEMPTS:
    # Fresh, randomly initialised network and optimizer for every attempt
    model = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train PINN: MSE data term plus weighted physics penalty
    for epoch in range(10000):
        model.train()
        optimizer.zero_grad()
        y_pred = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss_mape(y_pred, X_phys_t)
        loss =  data_loss  + 0.5*phys_loss
        loss.backward()
        optimizer.step()

    # Predictions
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(torch.tensor(train_time_scaled, dtype=torch.float32).to(device)).cpu().numpy()
        y_test_pred_scaled = model(torch.tensor(test_time_scaled, dtype=torch.float32).to(device)).cpu().numpy()

    # Inverse transform to original units
    y_train_pred = scaler_Y.inverse_transform(y_train_pred_scaled)
    y_test_pred = scaler_Y.inverse_transform(y_test_pred_scaled)

    train_overall["pinn_pred"] = y_train_pred.flatten()
    test_overall["pinn_pred"] = y_test_pred.flatten()

    if not test_overall.empty:
        mape_pinn = mean_absolute_percentage_error(test_overall["y"], test_overall["pinn_pred"])
    else:
        mape_pinn = np.nan

    # ===== 3C) PINN + PROPHET RESIDUAL STACK =====
    # Prophet models what the PINN left unexplained; the final prediction is PINN + residual forecast
    train_overall["residual"] = train_overall["y"] - train_overall["pinn_pred"]
    test_overall["residual"]  = test_overall["y"] - test_overall["pinn_pred"]

    train_res = train_overall[["ds","residual"]].rename(columns={"residual":"y"})
    test_res  = test_overall[["ds","residual"]].rename(columns={"residual":"y"})

    m_res = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False,changepoint_prior_scale=0.1 )
    m_res.fit(train_res)

    future_res = m_res.make_future_dataframe(periods=len(test_res), freq="MS")
    forecast_res = m_res.predict(future_res)

    # Rows after the training months are the residual forecast for the test window
    res_pred = forecast_res["yhat"].iloc[len(train_res):].values
    test_overall["final_pred"] = test_overall["pinn_pred"].values + res_pred

    if not test_overall.empty:
        mape_pinn_prophet = mean_absolute_percentage_error(test_overall["y"], test_overall["final_pred"])
    else:
        mape_pinn_prophet = np.nan

    itr+=1
    if mape_pinn_prophet<mape_prophet:
        print(mape_pinn_prophet,mape_prophet)
        flag=False
    else:
        print(f'itr: {itr}, mape_pinn_prophet: {mape_pinn_prophet}, mape_prophet: {mape_prophet}')

if flag:
    # Cap reached: results below come from the last attempt, which did not beat Prophet
    print(f"Stopped after {itr} attempts without beating the Prophet baseline.")


print("Iterations:",itr)
import matplotlib.pyplot as plt

# Residuals (actual minus PINN prediction) for both windows on one chart
plt.figure(figsize=(14, 6))

plt.plot(
    train_res["ds"], train_res["y"],
    color="blue", linestyle="-", marker="o",
    label="Train Residuals",
)
plt.plot(
    test_res["ds"], test_res["y"],
    color="red", linestyle="--", marker="s",
    label="Test Residuals",
)

# Horizontal guide at zero: points on it are perfectly predicted
plt.axhline(0, color="black", linestyle="--", linewidth=1)

plt.xlabel("Date")
plt.ylabel("Residual (y - y_pred)")
plt.title("Residuals: Train vs Test (PINN Predictions)")
plt.grid(True)
plt.legend()
plt.show()

# ===== 4) RESULTS =====
# Test-window MAPE of the three approaches, as returned by sklearn (two decimals)
print("\n--- RESULTS ---")
print(f"Prophet-only MAPE: {mape_prophet:.2f}")
print(f"PINN-only MAPE: {mape_pinn:.2f}")
print(f"PINN + Prophet MAPE: {mape_pinn_prophet:.2f}")


In [None]:
import matplotlib.pyplot as plt

# Collect test-window predictions per model (empty when there is no test window)
predictions = {
    "Prophet-only": y_pred_prophet["yhat"].values if not test_overall.empty else [],
    "PINN-only": test_overall["pinn_pred"].values if not test_overall.empty else [],
    "PINN + Prophet": test_overall["final_pred"].values if not test_overall.empty else []
}

# Dates matching each prediction array. Prophet's come from its own merged
# frame, so the x/y lengths agree even if a test month was missing from the forecast.
prediction_dates = {
    "Prophet-only": y_pred_prophet["ds"] if not test_overall.empty else [],
    "PINN-only": test_overall["ds"],
    "PINN + Prophet": test_overall["ds"]
}

# MAPE per model (fractions as returned by sklearn, e.g. 0.12 == 12%)
results = {
    "Prophet-only": mape_prophet,
    "PINN-only": mape_pinn,
    "PINN + Prophet": mape_pinn_prophet
}

plt.figure(figsize=(14, 6))

# Training data (actuals)
plt.plot(train_overall["ds"], train_overall["y"],
         label="Train (Actual)", color="black", linewidth=2)

# Test data (actuals)
plt.plot(test_overall["ds"], test_overall["y"],
         label="Test (Actual)", color="blue", linewidth=2)
# Forecasts
for name, y_pred in predictions.items():
    if len(y_pred) > 0:  # only plot if available
        # `.2%` scales the fraction by 100; formatting it with `.2f` and a
        # literal "%" would show 0.12 as "0.12%".
        plt.plot(prediction_dates[name], y_pred, '--',
                 label=f"{name} (MAPE {results[name]:.2%})", linewidth=2)

# Vertical line for train/test split
if not test_overall.empty:
    split_date = test_overall["ds"].iloc[0]
    plt.axvline(split_date, color="gray", linestyle="--", label="Train/Test Split")

plt.title("Scope1 Emissions Forecast: Prophet vs PINN vs Hybrid")
plt.xlabel("Date")
plt.ylabel("Monthly Emissions (Scope 1 per unit of production) MT")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


# Phase 2 trials

In [None]:
# Display df_2022's column labels (the constants in the next cell refer to columns by name).
df_2022.columns

In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

# Matplotlib style sheet applied to every figure created after this line.
plt.style.use("seaborn-v0_8")

# ---- Run configuration ----
# Seed handed to set_seed() below.
RANDOM_SEED   = 42
# Raw date column; ensure_datetime_column() derives the monthly 'ds' column from it.
DATE_COL      = "datetime"
# Column that is forecast (renamed to 'y' once aggregated).
TARGET_COL    = "Scope1_per_unit"
# Plant identifier, used in the de-duplication key.
PLANT_COL     = "Plant Name"
# Number of trailing months held out as the test window.
TEST_MONTHS   = 7
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # optional physics feature
# Raw Scope1 column and production-quantity column; both are passed to the
# physics loss as extra feature columns.
TargetCol_raw= "Scope1"
ProductionCol= "Production Actual Quantity (MT/Month)"

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in `random`, NumPy's legacy global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    import random  # local import: the stdlib generator was previously left unseeded

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed the generators once, before any model is created.
set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start datetime column 'ds' to `df` and return the frame.

    `date_col` is parsed with errors coerced to NaT; rows that are NaT (or all
    rows, when `date_col` is absent) are rebuilt from the 'year' and 'month'
    columns with the day set to 1. All dates are then normalised to the first
    day of their month.

    Note: 'ds' is assigned on the frame that was passed in, i.e. `df` is
    modified in place and the same object is returned.

    Raises
    ------
    ValueError
        If some dates need rebuilding and 'year'/'month' are not both present.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated since pandas 2.0 (inference is
        # the default behaviour), so it is not passed any more.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip snaps every timestamp to the start of its month
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows and mean-impute zeros/NaNs in all numeric columns.

    Duplicates on (ds, PLANT_COL, TARGET_COL) are dropped (last kept) when all
    three columns exist. Zeros are treated as missing; missing values are
    filled with the column mean within each 'year' group (or over the whole
    frame when there is no 'year' column), then with the overall column mean
    for anything still missing.
    """
    dedup_key = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_key).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_key, keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        # Works on a copy; columns with no valid value in the group stay NaN.
        filled = group.copy()
        for col in num_cols:
            values = filled[col].replace(0, np.nan)
            filled[col] = values.fillna(values.mean()) if values.notna().any() else values
        return filled

    if "year" not in df.columns:
        df = impute_zero_nan_with_mean(df)
    else:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)

    # Global fallback for columns that had no valid value inside some group
    for col in num_cols:
        if not df[col].isna().any():
            continue
        df[col] = df[col].fillna(df[col].mean())

    return df


def prepare_overall_series(df: pd.DataFrame,
                           target_col: str = TARGET_COL) -> pd.DataFrame:
    """Sum `target_col` per 'ds' and return a two-column frame ('ds', 'y') ordered by 'ds'."""
    monthly = df.groupby("ds", as_index=False)[target_col].sum()
    monthly = monthly.sort_values("ds").reset_index(drop=True)
    monthly = monthly.rename(columns={target_col: "y"})
    return monthly

def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """Sum every numeric column per 'ds'.

    Returns one row per distinct 'ds' value, sorted by 'ds' with a fresh
    0..n-1 index, holding 'ds' plus the per-'ds' sum of each numeric column.
    Non-numeric columns are left out.

    Raises
    ------
    ValueError
        If `df` has no 'ds' column.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    # Only numeric dtypes are aggregated, so string columns are never summed
    value_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly = df.groupby("ds", as_index=False)[value_cols].sum()
    monthly = monthly.sort_values("ds")
    return monthly.reset_index(drop=True)



# ============================================================
# 3) PINN MODEL DEFINITION
# ============================================================
class PINN(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dim : int
        Width of both hidden layers.
    out_dim : int
        Number of outputs.
    """

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        widths = [in_dim, hidden_dim, hidden_dim, out_dim]
        layers = []
        for position, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position:
                # ReLU sits between consecutive linear layers (none after the last)
                layers.append(nn.ReLU())
            layers.append(nn.Linear(n_in, n_out))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to `x`."""
        return self.net(x)

from sklearn.preprocessing import StandardScaler

def regression_from_numpy_normalized(x, y):
    """Fit y = m*x + c on standardized copies of `x` and `y`, plot it, and return (m, c).

    Both inputs are flattened, standardized independently with StandardScaler,
    and the least-squares slope and intercept are computed in that normalized
    space. A scatter plot with the fitted line is shown as a side effect.

    Returns
    -------
    tuple
        (m, c): slope and intercept in normalized units.
    """
    # StandardScaler expects 2-D input: flatten, then shape as a single column
    x_col = x.flatten().reshape(-1, 1)
    y_col = y.flatten().reshape(-1, 1)

    # Independent scalers for predictor and response
    scaler_x, scaler_y = StandardScaler(), StandardScaler()
    x_norm = scaler_x.fit_transform(x_col).flatten()
    y_norm = scaler_y.fit_transform(y_col).flatten()

    # Closed-form least squares: slope = sum(dx*dy) / sum(dx^2)
    x_mean, y_mean = x_norm.mean(), y_norm.mean()
    dx = x_norm - x_mean
    dy = y_norm - y_mean
    m = np.sum(dx * dy) / np.sum(dx ** 2)
    c = y_mean - m * x_mean

    # Fitted line across the observed (normalized) x-range
    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    y_line = m * x_line + c

    plt.figure(figsize=(8, 5))
    plt.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")
    plt.plot(
        x_line,
        y_line,
        color="red",
        linewidth=2,
        label=f"y = {m:.4f}x + {c:.4f}",
    )
    plt.title("Linear Regression on Normalized Data")
    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c



def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6) -> torch.Tensor:
    """One-sided relative penalty for predictions below a physics-based estimate.

    The estimate is ``0.4727 * features[:, 0] / features[:, 2]``. Predictions at
    or above the estimate cost nothing; predictions below it are penalised by
    the shortfall relative to ``|estimate| + eps``, averaged over samples.

    Parameters
    ----------
    y_pred : torch.Tensor
        Model output; squeezed before being compared with the estimate.
    features : torch.Tensor
        2-D tensor with at least three columns. Column 0 is the numerator and
        column 2 the denominator of the estimate; column 1 is not used.
        # presumably electricity, Scope1 and production, in that order — verify against the caller
    eps : float
        Added to the final denominator only. It does not protect the division
        by ``features[:, 2]``, so zeros in that column yield inf/NaN.

    Returns
    -------
    torch.Tensor
        Scalar loss.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    physics_estimate = (0.4727 * elec) / prod
    # relu keeps only the shortfall (estimate above prediction)
    violation = torch.relu(physics_estimate - y_pred.squeeze())
    return torch.mean(violation / (torch.abs(physics_estimate) + eps))


# ============================================================
# 4) MAIN PIPELINE
# ============================================================
# Work on a copy: ensure_datetime_column() assigns 'ds' on the frame it receives.
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# Monthly sums of every numeric column (target plus the physics columns).
# The former prepare_overall_series(df, TARGET_COL) call was removed: its
# result was overwritten by this line before being used.
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# A non-empty training window is required: with len(overall) <= TEST_MONTHS the
# slice below would produce an empty or wrongly sized train/test split.
if len(overall) <= TEST_MONTHS:
    raise ValueError(
        f"Need more than {TEST_MONTHS} monthly rows to hold out a test window; got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# NOTE(review): the scaler is fitted on the full series, test months included,
# so test-window statistics influence the normalized training target.
scaler_y_global = StandardScaler()
overall["y_norm"] = scaler_y_global.fit_transform(overall[["y"]])

# Train/test split: last TEST_MONTHS rows are the test window
split_point = len(overall) - TEST_MONTHS
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))
# m, c = regression_from_numpy_normalized(
#     train_overall[PHYS_COL].values,
#     train_overall[TargetCol_raw].values
# )
# print(f"Regression: y = {m:.4f} * {PHYS_COL} + {c:.4f}")
from sklearn.preprocessing import StandardScaler

# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects the columns `ds` and `y`; here `y` is the normalised target.
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_prophet)

# Forecast on the dates that are actually present (history + held-out months)
# instead of a synthetic "MS" calendar: if the monthly series has a gap, a
# generated calendar would not line up with the test dates and the inner merge
# below would silently drop those rows from the evaluation.
future_all = pd.concat(
    [train_prophet[["ds"]], test_prophet[["ds"]]], ignore_index=True
)
fcst_all   = m_overall.predict(future_all)

df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
# NOTE(review): the normalised target is centred near zero, so a percentage
# error divides by values close to 0 and can be dominated by a few months.
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Prepare the network inputs: a time index as the only model input, the
# normalised target as the regression label, and three aggregated columns
# that are handed to physics_residual_loss during training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Time is encoded as the row position in the monthly series; the test
# positions continue where the training positions stop.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Choose multiple physics columns
# (order matters: physics_residual_loss reads them by position 0, 1, 2)
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

# --- Build X_phys with 3 columns ---
missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)
print("Physics columns used:", PHYS_COLS_ALL)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
X_phys_test  = X_phys[len(train_overall):]  # not used further in this section



# Second-level scaling for PINN
# (all three scalers are fitted on training rows only)
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled    = scaler_Y.fit_transform(y_train_norm)

# NOTE(review): the physics columns are standardised here, so the ratio
# computed inside physics_residual_loss divides by a zero-centred production
# value that can be negative or close to 0 - confirm this is intended.
X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)



# Full-batch training: every epoch does one Adam step on the whole training
# series, minimising MSE on the scaled target plus 0.25 x the physics penalty.
model     = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

n_epochs  = 1000
best_loss = float("inf")
patience  = 500   # epochs without a new best training loss before stopping
counter   = 0     # epochs since the last improvement

for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()

    y_pred    = model(X_t)
    data_loss = torch.mean((y_pred - Y_t) ** 2)
    phys_loss = physics_residual_loss(y_pred, X_phys_t)
    loss      = data_loss + 0.25 * phys_loss

    loss.backward()
    optimizer.step()

    # Early stopping is driven by the training loss itself.  The weights at
    # the best loss are not saved: the model keeps whatever the last executed
    # step produced.
    if loss.item() < best_loss:
        best_loss = loss.item()
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        break
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.6f}, Data Loss: {data_loss.item():.6f}, Phys Loss: {phys_loss.item():.6f}")

# Predict on train/test (in normalized space)
# Inference only: eval mode and no gradient tracking.  The test inputs are
# time positions beyond the training range, so the network extrapolates.
model.eval()
with torch.no_grad():
    y_train_pred_scaled = model(
        torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    ).cpu().numpy()

    y_test_pred_scaled = model(
        torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
    ).cpu().numpy()

# Remove PINN scaler → back to GLOBAL-NORMALIZED space
y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

# Stored on the split frames; the hybrid section and the plot read these columns.
train_overall["pinn_pred_norm"] = y_train_pred_norm
test_overall["pinn_pred_norm"]  = y_test_pred_norm


# PINN MAPE in NORMALIZED SPACE
df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

# NOTE(review): y_norm is a standardised series, so this percentage error
# divides by values that may be close to zero.
mape_pinn_norm = mean_absolute_percentage_error(
    df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
)
print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")


# ============================================================
# 7) HYBRID (PINN + PROPHET RESIDUAL) — NORMALIZED
# ============================================================
# Residuals = the part of the normalised target the network did not explain.
train_overall["residual_norm"] = train_overall["y_norm"] - train_overall["pinn_pred_norm"]
test_overall["residual_norm"]  = test_overall["y_norm"] - test_overall["pinn_pred_norm"]

# Prophet expects the columns `ds` and `y`.
train_res = train_overall[["ds", "residual_norm"]].rename(columns={"residual_norm": "y"})
test_res  = test_overall[["ds", "residual_norm"]].rename(columns={"residual_norm": "y"})

# Residuals are differences that fluctuate around zero, so seasonality is
# modelled additively; a multiplicative term scales with the trend level and
# has nothing stable to scale when that level sits near 0.
m_res = Prophet(
    seasonality_mode="additive",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)

m_res.fit(train_res)

# Predict exactly on the held-out dates.  A generated "MS" calendar may not
# match them if a month is missing, which would leave NaNs after the left
# merge below and make the MAPE computation fail.
future_res = test_res[["ds"]]
forecast_res = m_res.predict(future_res)

df_res_test = (
    forecast_res[["ds", "yhat"]]
    .merge(test_res[["ds", "y"]], on="ds", how="inner")
    .rename(columns={"yhat": "res_pred_norm"})
)

# Hybrid forecast = network prediction + predicted residual.
df_hybrid_test = (
    test_overall[["ds", "y_norm", "pinn_pred_norm"]]
    .merge(df_res_test[["ds", "res_pred_norm"]], on="ds", how="left")
)

df_hybrid_test["final_pred_norm"] = (
    df_hybrid_test["pinn_pred_norm"] + df_hybrid_test["res_pred_norm"]
)

# Hybrid MAPE in normalized space
mape_hybrid_norm = mean_absolute_percentage_error(
    df_hybrid_test["y_norm"], df_hybrid_test["final_pred_norm"]
)

print(f"[Hybrid]  MAPE (normalized): {mape_hybrid_norm:.4f}")


# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
# Undo the global target scaling so predictions and actuals are expressed in
# the units of the aggregated target column again.  inverse_transform returns
# an (N, 1) array, hence the flatten before assigning to a column.
test_overall["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    test_overall[["pinn_pred_norm"]]
).flatten()

df_hybrid_test["final_pred_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["final_pred_norm"]]
).flatten()

df_hybrid_test["y_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["y_norm"]]
).flatten()


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
# All three figures are MAPEs computed on the normalised test target.
print("\n==================== FINAL RESULTS ====================")
print(f"Prophet-only MAPE (normalized) : {mape_prophet_norm:.4f}")
print(f"PINN-only MAPE (normalized)    : {mape_pinn_norm:.4f}")
print(f"Hybrid MAPE (normalized)       : {mape_hybrid_norm:.4f}")
print("=======================================================\n")


In [None]:
# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED (Prophet, PINN, Hybrid)
# ============================================================

# One figure, one axes: observed series plus the three test-period forecasts,
# all in normalised space.
fig, ax = plt.subplots(figsize=(16, 7))

# Observed target: solid over the training window, dashed over the held-out months.
ax.plot(train_overall["ds"], train_overall["y_norm"],
        color="black", linewidth=2, label="Training Actual")
ax.plot(test_overall["ds"], test_overall["y_norm"],
        color="black", linewidth=2, linestyle="--", label="Test Actual")

# Forecasts share one dotted style and differ only in colour.
forecast_style = dict(linestyle=":", linewidth=2.5)
ax.plot(df_prophet_test["ds"], df_prophet_test["yhat"],
        color="tab:blue", label="Prophet Prediction", **forecast_style)
ax.plot(test_overall["ds"], test_overall["pinn_pred_norm"],
        color="tab:green", label="PINN Prediction", **forecast_style)
ax.plot(df_hybrid_test["ds"], df_hybrid_test["final_pred_norm"],
        color="tab:red", label="Hybrid Prediction", **forecast_style)

# Vertical marker at the last training date.
split_date = train_overall["ds"].iloc[-1]
ax.axvline(x=split_date, color="gray", linestyle="--", linewidth=2,
           label="Train/Test Split")

# Labels, title, legend
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Normalized Emissions (y_norm)", fontsize=12)
ax.set_title("Actual vs Predicted Emissions (Normalized Space)", fontsize=14)
ax.grid(True, linestyle="--", alpha=0.4)
ax.legend(fontsize=12)
fig.tight_layout()

plt.show()


In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

# Plot style and the constants every later section of this cell reads.
plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42                 # passed to set_seed()
DATE_COL      = "datetime"         # source date column used to build `ds`
TARGET_COL    = "Scope1_per_unit"  # forecast target; renamed to "y" after aggregation
PLANT_COL     = "Plant Name"       # part of the de-duplication key
TEST_MONTHS   = 7                  # number of trailing rows held out as the test set
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # rule feature 0; the pipeline raises if it is missing
TargetCol_raw = "Scope1"           # rule feature 1 (not read by physics_residual_loss)
ProductionCol = "Production Actual Quantity (MT/Month)"  # rule feature 2 (denominator of the rule)

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed the NumPy and PyTorch random generators (and CUDA when available)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start timestamp column ``ds`` to ``df`` and return it.

    ``date_col`` is parsed with unparseable entries coerced to ``NaT``.  Rows
    that end up without a date are rebuilt from the ``year`` and ``month``
    columns (day fixed to 1).  Every date is then snapped to the first day of
    its month.

    Args:
        df: Input frame.  The ``ds`` column is written onto this object, which
            is also the return value.
        date_col: Name of the column holding dates.  It may be absent, in
            which case every date is rebuilt from ``year``/``month``.

    Returns:
        ``df`` with the ``ds`` column set.

    Raises:
        ValueError: If some dates are missing and ``df`` lacks the
            ``year``/``month`` columns needed to rebuild them.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated in pandas 2.x (format
        # inference is the default behaviour), so it is not passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip normalises any day-of-month to the month start.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows, then fill zeros/NaNs in numeric columns with means.

    1. When ``ds``, ``PLANT_COL`` and ``TARGET_COL`` are all present, rows
       duplicated on those three keys are dropped (last occurrence kept).
    2. In every numeric column 0 is treated as missing and missing values are
       filled with the column mean - per ``year`` group when a ``year`` column
       exists, otherwise over the whole frame.  A column without any valid
       value in a group stays NaN at this stage.
    3. Whatever is still NaN is filled with the column mean of the whole frame.
    """
    dedup_keys = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_keys).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_keys, keep="last")

    num_cols = list(df.select_dtypes(include=[np.number]).columns)

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        filled = group.copy()
        for col in num_cols:
            values = filled[col].replace(0, np.nan)
            has_valid = values.notna().any()
            filled[col] = values.fillna(values.mean()) if has_valid else values
        return filled

    if "year" not in df.columns:
        df = impute_zero_nan_with_mean(df)
    else:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)

    # Second pass: columns that were entirely empty within some group.
    for col in num_cols:
        column = df[col]
        if column.isna().any():
            df[col] = column.fillna(column.mean())

    return df


def prepare_overall_series(df: pd.DataFrame,
                           target_col: str = TARGET_COL) -> pd.DataFrame:
    """Sum ``target_col`` per ``ds`` and return a two-column frame (``ds``, ``y``)."""
    monthly_total = df.groupby("ds", as_index=False)[target_col].sum()
    monthly_total = monthly_total.sort_values("ds")
    monthly_total = monthly_total.reset_index(drop=True)
    return monthly_total.rename(columns={target_col: "y"})


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per ``ds`` by summing every numeric column.

    Non-numeric columns are dropped.  The result is sorted by ``ds`` and has a
    fresh 0..n-1 index.  Raises ValueError when ``ds`` is missing.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly_sums = df.groupby("ds", as_index=False)[numeric_cols].sum()
    monthly_sums = monthly_sums.sort_values("ds")
    return monthly_sums.reset_index(drop=True)


# ============================================================
# 3) NN MODEL DEFINITION
# ============================================================
class NNModel(nn.Module):
    """Fully connected network: two ReLU hidden layers and a linear output layer."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        # Layers are created in forward order, so parameter initialisation
        # consumes the RNG in the same sequence as the stack is applied.
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out


def regression_from_numpy_normalized(x, y):
    """
    Standardise x and y, fit the least-squares line y = m*x + c on the
    standardised values, plot the scatter with the fitted line and return (m, c).
    """
    x_norm = StandardScaler().fit_transform(x.flatten().reshape(-1, 1)).flatten()
    y_norm = StandardScaler().fit_transform(y.flatten().reshape(-1, 1)).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()
    x_dev = x_norm - x_mean
    y_dev = y_norm - y_mean

    # Ordinary least squares: slope = covariance / variance, then the intercept.
    m = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    c = y_mean - m * x_mean

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    ax.plot(x_line, m * x_line + c, color="red", linewidth=2,
            label=f"y = {m:.4f}x + {c:.4f}")

    ax.set_xlabel("x (normalized)")
    ax.set_ylabel("y (normalized)")
    ax.set_title("Linear Regression on Normalized Data")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend()
    fig.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.2) -> torch.Tensor:
    """Huber-style penalty on the relative deviation from a rule-based reference.

    The reference for every row is ``0.4727 * elec / prod`` (``elec`` =
    column 0, ``prod`` = column 2 of ``features``).  The relative deviation
    ``(prediction - rule) / (|rule| + eps)`` is penalised quadratically while
    its magnitude is at most ``delta`` and linearly beyond that.

    Args:
        y_pred: Model output; squeezed so that it broadcasts against the
            per-row rule value (e.g. shape ``(N, 1)`` or ``(N,)``).
        features: 2-D tensor with at least three columns.  Column 1 is not
            used by this loss.
        eps: Added to ``|rule|`` in the denominator to avoid division by zero.
        delta: Switch point between the quadratic and the linear regime.

    Returns:
        Scalar tensor: mean penalty over all rows.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # Rule-based reference (not physics)
    # NOTE(review): `prod` is not eps-guarded; a zero entry yields inf/NaN.
    rule = (0.4727 * elec) / prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Huber-style soft rule penalty
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()



# ============================================================
# 4) MAIN PIPELINE
# ============================================================
df_raw = df_2022.copy()

# Build the month-start `ds` column, then de-duplicate and impute.
df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# Monthly totals of every numeric column; the target column becomes "y".
# (A target-only aggregate from prepare_overall_series would be overwritten
# by this frame straight away, so it is not computed.)
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})
if "y" not in overall.columns:
    raise KeyError(
        f"Target column {TARGET_COL!r} not found among the aggregated numeric columns."
    )

# Train/test split: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"TEST_MONTHS={TEST_MONTHS} leaves no training rows "
        f"(series length {len(overall)})."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# Fit on the training rows only so that the mean/std of the held-out months
# do not leak into the normalised target; the same transform is then applied
# to the whole series.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall[["y"]].iloc[:split_point])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))


# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects the columns `ds` and `y`; here `y` is the normalised target.
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_prophet)

# Forecast on the dates that are actually present (history + held-out months)
# instead of a synthetic "MS" calendar: if the monthly series has a gap, a
# generated calendar would not line up with the test dates and the inner merge
# below would silently drop those rows from the evaluation.
future_all = pd.concat(
    [train_prophet[["ds"]], test_prophet[["ds"]]], ignore_index=True
)
fcst_all   = m_overall.predict(future_all)

df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
# NOTE(review): the normalised target is centred near zero, so this figure
# divides by values close to 0; the original-unit MAPE below is the more
# interpretable one.
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")

# ---- Prophet MAPE in ORIGINAL units ----
df_prophet_test["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["y"]]
).flatten()
df_prophet_test["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["yhat"]]
).flatten()
mape_prophet_orig = mean_absolute_percentage_error(
    df_prophet_test["y_orig"], df_prophet_test["yhat_orig"]
)
print(f"[Prophet]  MAPE (original)  : {mape_prophet_orig:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Prepare the network inputs: a time index as the only model input, the
# normalised target as the regression label, and three aggregated columns
# that are handed to physics_residual_loss during training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Time is encoded as the row position in the monthly series; the test
# positions continue where the training positions stop.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Choose multiple physics columns
# (order matters: physics_residual_loss reads them by position 0, 1, 2)
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

# --- Build X_phys with 3 columns ---
missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)
print("Physics columns used:", PHYS_COLS_ALL)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
X_phys_test  = X_phys[len(train_overall):]  # not used further in this section

# Second-level scaling for PINN
# (all three scalers are fitted on training rows only)
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train_norm)

# NOTE(review): the rule columns are standardised here, so the ratio computed
# inside physics_residual_loss divides by a zero-centred production value
# that can be negative or close to 0 - confirm this is intended.
X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

# Use the network class defined in this cell.  `PINN` is not defined here and
# would only resolve if another cell happened to leave it in the kernel.
model     = NNModel(in_dim=1, hidden_dim=32, out_dim=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

n_epochs  = 10000
best_loss = float("inf")
patience  = 500   # epochs without a new best training loss before stopping
counter   = 0     # epochs since the last improvement

# Full-batch training: one Adam step per epoch on the whole training series,
# minimising MSE on the scaled target plus 0.25 x the rule penalty.
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()

    y_pred    = model(X_t)
    data_loss = torch.mean((y_pred - Y_t) ** 2)
    phys_loss = physics_residual_loss(y_pred, X_phys_t)
    loss      = data_loss + 0.25 * phys_loss

    loss.backward()
    optimizer.step()

    # Early stopping is driven by the training loss itself.  The weights at
    # the best loss are not saved: the model keeps whatever the last executed
    # step produced.
    loss_value = loss.item()
    if loss_value < best_loss:
        best_loss = loss_value
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        break

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss_value:.6f}, "
              f"Data Loss: {data_loss.item():.6f}, Phys Loss: {phys_loss.item():.6f}")

# Predict on train/test (in normalized space)
# Inference only: eval mode and no gradient tracking.  The test inputs are
# time positions beyond the training range, so the network extrapolates.
model.eval()
with torch.no_grad():
    y_train_pred_scaled = model(
        torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    ).cpu().numpy()

    y_test_pred_scaled = model(
        torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
    ).cpu().numpy()

# Remove PINN scaler → back to GLOBAL-NORMALIZED space
y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

# Stored on the split frames; the hybrid section reads these columns.
train_overall["pinn_pred_norm"] = y_train_pred_norm
test_overall["pinn_pred_norm"]  = y_test_pred_norm

# PINN MAPE in NORMALIZED SPACE
df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

# NOTE(review): y_norm is a standardised series, so this percentage error
# divides by values that may be close to zero.
mape_pinn_norm = mean_absolute_percentage_error(
    df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
)
print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")

# ---- PINN MAPE in ORIGINAL units ----
# Both actuals and predictions are mapped back through the global target scaler.
df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
    df_pinn_test[["y_norm"]]
).flatten()
df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    df_pinn_test[["pinn_pred_norm"]]
).flatten()
mape_pinn_orig = mean_absolute_percentage_error(
    df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
)
print(f"[PINN]     MAPE (original)  : {mape_pinn_orig:.4f}")


# ============================================================
# 7) HYBRID (RULE-NN + PROPHET RESIDUAL) — NORMALIZED
# ============================================================

# Residuals = what NN did NOT explain
# Residual = normalised target minus the network prediction, on both splits.
# Only the training residuals are used to fit the residual model below.
train_overall["residual_norm"] = (
    train_overall["y_norm"] - train_overall["pinn_pred_norm"]
)
test_overall["residual_norm"] = (
    test_overall["y_norm"] - test_overall["pinn_pred_norm"]
)

# Prophet expects columns: ds, y
train_res = train_overall[["ds", "residual_norm"]].rename(
    columns={"residual_norm": "y"}
)
test_res = test_overall[["ds", "residual_norm"]].rename(
    columns={"residual_norm": "y"}
)

# Prophet on residuals (additive, zero-centered)
m_res = Prophet(
    seasonality_mode="additive",   # IMPORTANT: residuals are not multiplicative
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)

m_res.fit(train_res)

# Predict residuals exactly on test dates
forecast_res = m_res.predict(test_res[["ds"]])

# Merge residual predictions
# (`yhat` becomes `res_pred_norm`; the actual residual `y` rides along)
df_res_test = (
    forecast_res[["ds", "yhat"]]
    .merge(test_res[["ds", "y"]], on="ds", how="inner")
    .rename(columns={"yhat": "res_pred_norm"})
)

# Hybrid reconstruction
# (left merge keeps every test row; a date without a residual forecast
# would show up as NaN in `res_pred_norm`)
df_hybrid_test = (
    test_overall[["ds", "y_norm", "pinn_pred_norm"]]
    .merge(df_res_test[["ds", "res_pred_norm"]], on="ds", how="left")
)

# Final forecast = network prediction + predicted residual.
df_hybrid_test["final_pred_norm"] = (
    df_hybrid_test["pinn_pred_norm"] +
    df_hybrid_test["res_pred_norm"]
)

# Hybrid MAPE (normalized space — for comparison only)
mape_hybrid_norm = mean_absolute_percentage_error(
    df_hybrid_test["y_norm"],
    df_hybrid_test["final_pred_norm"]
)

print(f"[Hybrid] MAPE (normalized): {mape_hybrid_norm:.4f}")

# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
# Undo the global target scaling so predictions and actuals are expressed in
# the units of the aggregated target column again.  inverse_transform returns
# an (N, 1) array, hence the flatten before assigning to a column.
test_overall["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    test_overall[["pinn_pred_norm"]]
).flatten()

df_hybrid_test["final_pred_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["final_pred_norm"]]
).flatten()

df_hybrid_test["y_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["y_norm"]]
).flatten()

# ---- Hybrid MAPE in ORIGINAL units ----
mape_hybrid_orig = mean_absolute_percentage_error(
    df_hybrid_test["y_orig"], df_hybrid_test["final_pred_orig"]
)


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
# Test-set MAPE of each model, in normalised space and in original units.
print("\n==================== FINAL RESULTS ====================")
print(f"Prophet-only MAPE (normalized) : {mape_prophet_norm:.4f}")
print(f"Prophet-only MAPE (original)   : {mape_prophet_orig:.4f}")
print(f"PINN-only MAPE (normalized)    : {mape_pinn_norm:.4f}")
print(f"PINN-only MAPE (original)      : {mape_pinn_orig:.4f}")
print(f"Hybrid MAPE (normalized)       : {mape_hybrid_norm:.4f}")
print(f"Hybrid MAPE (original)         : {mape_hybrid_orig:.4f}")
print("=======================================================\n")


In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

# Plot style and the constants every later section of this cell reads.
plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42                 # passed to set_seed()
DATE_COL      = "datetime"         # source date column used to build `ds`
TARGET_COL    = "Scope1_per_unit"  # forecast target; renamed to "y" after aggregation
PLANT_COL     = "Plant Name"       # part of the de-duplication key
TEST_MONTHS   = 7                  # number of trailing rows held out as the test set
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # read as column 0 (`elec`) by physics_residual_loss
TargetCol_raw = "Scope1"           # NOTE(review): not referenced in the visible part of this cell
ProductionCol = "Production Actual Quantity (MT/Month)"  # NOTE(review): presumably the rule denominator - confirm

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed the NumPy and PyTorch random generators (and CUDA when available)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start timestamp column ``ds`` to ``df`` and return it.

    ``date_col`` is parsed with unparseable entries coerced to ``NaT``.  Rows
    that end up without a date are rebuilt from the ``year`` and ``month``
    columns (day fixed to 1).  Every date is then snapped to the first day of
    its month.

    Args:
        df: Input frame.  The ``ds`` column is written onto this object, which
            is also the return value.
        date_col: Name of the column holding dates.  It may be absent, in
            which case every date is rebuilt from ``year``/``month``.

    Returns:
        ``df`` with the ``ds`` column set.

    Raises:
        ValueError: If some dates are missing and ``df`` lacks the
            ``year``/``month`` columns needed to rebuild them.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated in pandas 2.x (format
        # inference is the default behaviour), so it is not passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip normalises any day-of-month to the month start.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows, then fill zeros/NaNs in numeric columns with means.

    1. When ``ds``, ``PLANT_COL`` and ``TARGET_COL`` are all present, rows
       duplicated on those three keys are dropped (last occurrence kept).
    2. In every numeric column 0 is treated as missing and missing values are
       filled with the column mean - per ``year`` group when a ``year`` column
       exists, otherwise over the whole frame.  A column without any valid
       value in a group stays NaN at this stage.
    3. Whatever is still NaN is filled with the column mean of the whole frame.
    """
    dedup_keys = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_keys).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_keys, keep="last")

    num_cols = list(df.select_dtypes(include=[np.number]).columns)

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        filled = group.copy()
        for col in num_cols:
            values = filled[col].replace(0, np.nan)
            has_valid = values.notna().any()
            filled[col] = values.fillna(values.mean()) if has_valid else values
        return filled

    if "year" not in df.columns:
        df = impute_zero_nan_with_mean(df)
    else:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)

    # Second pass: columns that were entirely empty within some group.
    for col in num_cols:
        column = df[col]
        if column.isna().any():
            df[col] = column.fillna(column.mean())

    return df



def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per ``ds`` by summing every numeric column.

    Non-numeric columns are dropped.  The result is sorted by ``ds`` and has a
    fresh 0..n-1 index.  Raises ValueError when ``ds`` is missing.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly_sums = df.groupby("ds", as_index=False)[numeric_cols].sum()
    monthly_sums = monthly_sums.sort_values("ds")
    return monthly_sums.reset_index(drop=True)


# ============================================================
# 3) NN MODEL DEFINITION
# ============================================================
class NNModel(nn.Module):
    """Fully connected network: two ReLU hidden layers and a linear output layer."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        # Layers are created in forward order, so parameter initialisation
        # consumes the RNG in the same sequence as the stack is applied.
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out


def regression_from_numpy_normalized(x, y):
    """
    Standardise x and y, fit the least-squares line y = m*x + c on the
    standardised values, plot the scatter with the fitted line and return (m, c).
    """
    x_norm = StandardScaler().fit_transform(x.flatten().reshape(-1, 1)).flatten()
    y_norm = StandardScaler().fit_transform(y.flatten().reshape(-1, 1)).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()
    x_dev = x_norm - x_mean
    y_dev = y_norm - y_mean

    # Ordinary least squares: slope = covariance / variance, then the intercept.
    m = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    c = y_mean - m * x_mean

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    ax.plot(x_line, m * x_line + c, color="red", linewidth=2,
            label=f"y = {m:.4f}x + {c:.4f}")

    ax.set_xlabel("x (normalized)")
    ax.set_ylabel("y (normalized)")
    ax.set_title("Linear Regression on Normalized Data")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend()
    fig.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.2) -> torch.Tensor:
    """Huber-style penalty on the relative deviation of predictions from a rule.

    The reference value is ``0.4727 * elec / prod`` where ``elec`` is
    ``features[:, 0]`` and ``prod`` is ``features[:, 2]``; column 1 is accepted
    but not used by the rule.

    Args:
        y_pred: Model predictions, one value per row of ``features``
            (e.g. shape ``(N, 1)`` or ``(N,)``).
        features: 2-D tensor with at least three columns.
        eps: Small constant that keeps both divisions finite.
        delta: Relative deviation at which the penalty switches from quadratic
            to linear.

    Returns:
        Scalar tensor: mean penalty over all rows.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # Guard the division: a production value of (almost) zero would otherwise
    # produce inf/NaN and poison the whole loss. The sign is preserved.
    signed_eps = torch.where(prod < 0,
                             torch.full_like(prod, -eps),
                             torch.full_like(prod, eps))
    safe_prod = torch.where(prod.abs() < eps, signed_eps, prod)

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.reshape(-1) - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Huber-style soft rule penalty: quadratic near the rule, linear far away.
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()



# ============================================================
# 4) MAIN PIPELINE
# ============================================================
# Work on a copy so the frame produced by the earlier cell is left untouched.
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# One row per month ("ds"), numeric columns summed across plants.
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split: the last TEST_MONTHS rows are held out.
if not 0 < TEST_MONTHS < len(overall):
    raise ValueError(
        f"TEST_MONTHS={TEST_MONTHS} must be between 1 and {len(overall) - 1} "
        f"for a series of {len(overall)} months."
    )
split_point = len(overall) - TEST_MONTHS

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# Fit on the training rows only: fitting on the full series would leak the
# held-out months' mean/variance into every model trained on y_norm.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))


# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects the columns "ds" and "y"; here "y" is the globally
# standardized target (y_norm), not the raw series.
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_prophet)

# Forecast frame = training dates plus len(test_prophet) further month-start dates.
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)

# Inner merge keeps only forecast rows whose "ds" matches a test date; any
# test month without a matching forecast date is dropped from the comparison.
df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
# NOTE(review): standardized targets can sit close to zero, which inflates
# MAPE; treat this figure as a relative comparison between models only.
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")

# ---- Prophet MAPE in ORIGINAL units ----
# Undo the global standardization for both actuals and forecasts.
df_prophet_test["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["y"]]
).flatten()
df_prophet_test["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["yhat"]]
).flatten()
mape_prophet_orig = mean_absolute_percentage_error(
    df_prophet_test["y_orig"], df_prophet_test["yhat_orig"]
)
print(f"[Prophet]  MAPE (original)  : {mape_prophet_orig:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Upper bound on retraining attempts so this cell always terminates; the
# previous unbounded loop would spin forever if the hybrid never beat Prophet.
MAX_PINN_ATTEMPTS = 50

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ---- Attempt-invariant preparation (identical on every retry, so done once) ----
# The network's only input is an integer time index that continues into the test window.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Columns handed to physics_residual_loss, in the order it indexes them:
# [0] electricity, [1] raw target, [2] production.
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)
print("Physics columns used:", PHYS_COLS_ALL)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
X_phys_test  = X_phys[len(train_overall):]

# Second-level scaling for the network; every scaler is fitted on training rows only.
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train_norm)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

# NOTE(review): an attempt is accepted only when the hybrid beats Prophet on the
# TEST window, i.e. the test set is used for model selection. The reported
# hybrid score is therefore optimistic; a separate validation window would fix that.
for attempt in range(1, MAX_PINN_ATTEMPTS + 1):
    # NNModel is the network defined in this cell. The loop previously
    # instantiated `PINN`, a name this cell never defines, so it only worked
    # when an earlier cell had left that class in the kernel.
    model     = NNModel(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 10000
    best_loss = float("inf")
    patience  = 500
    counter   = 0

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        # NOTE(review): the rule is evaluated on standardized features and
        # compared with the standardized target — confirm this is intended.
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss + 0.25 * phys_loss

        loss.backward()
        optimizer.step()

        # Early stopping on the training loss: stop after `patience` epochs
        # without a new best value (the final weights are kept, not the best).
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.6f}, "
                f"Data Loss: {data_loss.item():.6f}, Phys Loss: {phys_loss.item():.6f}")

    # Predict on train/test (in the network's scaled space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled  = model(X_test_t).cpu().numpy()

    # Remove PINN scaler → back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # PINN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )
    print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")

    # ---- PINN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    print(f"[PINN]     MAPE (original)  : {mape_pinn_orig:.4f}")

    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) — NORMALIZED
    # ============================================================

    # Residuals = what NN did NOT explain
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prophet expects columns: ds, y
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res = m_res.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test = (
        forecast_res[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction: NN prediction + Prophet's forecast of the NN residual
    df_hybrid_test = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test["final_pred_norm"] = (
        df_hybrid_test["pinn_pred_norm"] +
        df_hybrid_test["res_pred_norm"]
    )

    # Hybrid MAPE (normalized space — for comparison only)
    mape_hybrid_norm = mean_absolute_percentage_error(
        df_hybrid_test["y_norm"],
        df_hybrid_test["final_pred_norm"]
    )
    if mape_hybrid_norm < mape_prophet_norm:
        print(f"[Hybrid] MAPE (normalized): {mape_hybrid_norm:.4f}")
        break
    print(f"[Hybrid] MAPE (normalized): {mape_hybrid_norm:.4f} (not better than Prophet, retraining PINN...)")
else:
    # Every attempt was used up; results of the last attempt stay in scope.
    print(
        f"Warning: hybrid did not beat Prophet after {MAX_PINN_ATTEMPTS} attempts; "
        "keeping the last attempt."
    )
# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
# Undo the global standardization so predictions and actuals can be reported
# and plotted in the units of the original target column.
test_overall["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    test_overall[["pinn_pred_norm"]]
).flatten()

df_hybrid_test["final_pred_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["final_pred_norm"]]
).flatten()

df_hybrid_test["y_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["y_norm"]]
).flatten()

# ---- Hybrid MAPE in ORIGINAL units ----
mape_hybrid_orig = mean_absolute_percentage_error(
    df_hybrid_test["y_orig"], df_hybrid_test["final_pred_orig"]
)


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
# Each model is reported twice: in standardized space and in original units.
# The PINN/hybrid figures come from the last iteration of the retry loop above.
print("\n==================== FINAL RESULTS ====================")
print(f"Prophet-only MAPE (normalized) : {mape_prophet_norm:.4f}")
print(f"Prophet-only MAPE (original)   : {mape_prophet_orig:.4f}")
print(f"PINN-only MAPE (normalized)    : {mape_pinn_norm:.4f}")
print(f"PINN-only MAPE (original)      : {mape_pinn_orig:.4f}")
print(f"Hybrid MAPE (normalized)       : {mape_hybrid_norm:.4f}")
print(f"Hybrid MAPE (original)         : {mape_hybrid_orig:.4f}")
print("=======================================================\n")




In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Pipeline configuration: everything a reader might want to tune lives here.
RANDOM_SEED   = 42                  # passed to set_seed() below
DATE_COL      = "datetime"          # source column parsed into the monthly "ds" column
TARGET_COL    = "Scope1_per_unit"   # forecast target; renamed to "y" after aggregation
PLANT_COL     = "Plant Name"        # part of the de-duplication key in clean_and_impute
TEST_MONTHS   = 7                   # number of trailing months held out as the test set
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # rule-loss column 0 (electricity); required, the pipeline raises if it is missing
TargetCol_raw = "Scope1"            # rule-loss column 1; read but not used by the rule itself
ProductionCol = "Production Actual Quantity (MT/Month)"  # rule-loss column 2 (production, the rule's divisor)

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local import: `random` is not part of this notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Ensure a monthly datetime column 'ds' exists.

    ``date_col`` is parsed when present; rows whose date is missing or
    unparseable are rebuilt from the ``year`` and ``month`` columns (day 1).
    Every date is then snapped to the first day of its month.

    Args:
        df: Input frame. It is modified in place (a ``ds`` column is added or
            overwritten) and also returned.
        date_col: Name of the column holding the raw dates.

    Returns:
        The same frame with the ``ds`` column set.

    Raises:
        ValueError: If some dates need rebuilding but ``year``/``month`` are
            not both available.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated in pandas 2.x (strict format
        # inference is now the default), so it is no longer passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Snap to month start so rows from the same month share one key.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicates and impute numeric columns (0→NaN→mean).

    Steps:
      1. If "ds", PLANT_COL and TARGET_COL are all present, drop rows that
         duplicate that key, keeping the last occurrence.
      2. In every numeric column, treat 0 as missing and fill missing values
         with the column mean — per "year" group when a "year" column exists,
         otherwise over the whole frame.
      3. Fill anything still missing with the column's overall mean.

    Note: zeros are replaced in *all* numeric columns, so a genuine zero
    reading is overwritten by a mean as well.

    Returns:
        A frame with the same columns; numeric columns that were entirely
        zero/NaN remain NaN.
    """
    if {"ds", PLANT_COL, TARGET_COL}.issubset(df.columns):
        df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        # Works on a copy so the caller's group/frame is not modified.
        g = group.copy()
        for c in num_cols:
            g[c] = g[c].replace(0, np.nan)
            # Skip the fill when the whole column is missing in this group;
            # the global pass below gets a second chance at it.
            if g[c].notna().any():
                g[c] = g[c].fillna(g[c].mean())
        return g

    if "year" in df.columns:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    else:
        df = impute_zero_nan_with_mean(df)

    # Fallback: columns that were fully missing within some year get the
    # mean over all remaining values.
    for c in num_cols:
        if df[c].isna().any():
            df[c] = df[c].fillna(df[c].mean())

    return df



def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the per-plant rows into one row per month.

    Every numeric column is summed within each ``ds`` value; non-numeric
    columns are dropped. The result is sorted by ``ds`` with a fresh index.

    Raises:
        ValueError: If the frame has no ``ds`` column.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    value_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly = df.groupby("ds", as_index=False)[value_cols].sum()
    monthly = monthly.sort_values("ds")
    return monthly.reset_index(drop=True)

def compute_metrics(y_true, y_pred, prefix=""):
    """
    Score predictions with MAE, RMSE, MAPE and R2.

    Args:
        y_true: Observed values (1D array-like).
        y_pred: Predicted values, aligned with ``y_true``.
        prefix: Text prepended to every key of the returned dict.

    Returns:
        dict: ``{prefix + "MAE", prefix + "RMSE", prefix + "MAPE", prefix + "R2"}``
        mapped to their scores, in that order.
    """
    scores = {
        "MAE": mean_absolute_error(y_true, y_pred),
        # RMSE is derived from MSE by taking the square root.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }
    return {f"{prefix}{metric}": value for metric, value in scores.items()}

# ============================================================
# 3) NN MODEL DEFINITION
# ============================================================
class NNModel(nn.Module):
    """Small fully connected regressor: two hidden ReLU layers and a linear head.

    Args:
        in_dim: Size of the last dimension of the input tensor.
        hidden_dim: Width of both hidden layers.
        out_dim: Size of the last dimension of the output tensor.
    """

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        # Layers are created in input -> output order so parameter names
        # (net.0, net.2, net.4) and weight initialisation order are unchanged.
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the prediction."""
        prediction = self.net(x)
        return prediction


def regression_from_numpy_normalized(x, y):
    """
    Fit a least-squares line ``y = m*x + c`` on standardized copies of x and y.

    Both inputs are flattened and standardized independently with
    StandardScaler; slope and intercept are then computed in closed form on
    the standardized values. A scatter plot with the fitted line is shown as
    a side effect.

    Returns:
        tuple: ``(m, c)`` — slope and intercept in standardized units.
    """
    x_std = StandardScaler().fit_transform(x.flatten().reshape(-1, 1)).flatten()
    y_std = StandardScaler().fit_transform(y.flatten().reshape(-1, 1)).flatten()

    x_center = x_std.mean()
    y_center = y_std.mean()
    dx = x_std - x_center
    dy = y_std - y_center

    # Closed-form simple linear regression: slope = sum(dx*dy) / sum(dx^2).
    m = np.sum(dx * dy) / np.sum(dx ** 2)
    c = y_center - m * x_center

    # Fitted line sampled across the observed (standardized) x range.
    line_x = np.linspace(x_std.min(), x_std.max(), 100)
    line_y = m * line_x + c

    plt.figure(figsize=(8, 5))
    plt.scatter(x_std, y_std, alpha=0.7, label="Normalized Data")
    plt.plot(
        line_x,
        line_y,
        color="red",
        linewidth=2,
        label=f"y = {m:.4f}x + {c:.4f}",
    )

    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.title("Linear Regression on Normalized Data")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.75) -> torch.Tensor:
    """Huber-style penalty on the relative deviation of predictions from a rule.

    The reference value is ``0.4727 * elec / prod`` where ``elec`` is
    ``features[:, 0]`` and ``prod`` is ``features[:, 2]``; column 1 is accepted
    but not used by the rule.

    Args:
        y_pred: Model predictions, one value per row of ``features``
            (e.g. shape ``(N, 1)`` or ``(N,)``).
        features: 2-D tensor with at least three columns.
        eps: Small constant that keeps both divisions finite.
        delta: Relative deviation at which the penalty switches from quadratic
            to linear.

    Returns:
        Scalar tensor: mean penalty over all rows.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # Guard the division: a production value of (almost) zero would otherwise
    # produce inf/NaN and poison the whole loss. The sign is preserved.
    signed_eps = torch.where(prod < 0,
                             torch.full_like(prod, -eps),
                             torch.full_like(prod, eps))
    safe_prod = torch.where(prod.abs() < eps, signed_eps, prod)

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.reshape(-1) - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Huber-style soft rule penalty: quadratic near the rule, linear far away.
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()



# ============================================================
# 4) MAIN PIPELINE
# ============================================================
# Work on a copy so the frame produced by the earlier cell is left untouched.
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# One row per month ("ds"), numeric columns summed across plants.
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split: the last TEST_MONTHS rows are held out.
if not 0 < TEST_MONTHS < len(overall):
    raise ValueError(
        f"TEST_MONTHS={TEST_MONTHS} must be between 1 and {len(overall) - 1} "
        f"for a series of {len(overall)} months."
    )
split_point = len(overall) - TEST_MONTHS

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# Fit on the training rows only: fitting on the full series would leak the
# held-out months' mean/variance into every model trained on y_norm.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))


# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects the columns "ds" and "y"; here "y" is the globally
# standardized target (y_norm), not the raw series.
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_prophet)

# Forecast frame = training dates plus len(test_prophet) further month-start dates.
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)

# Inner merge keeps only forecast rows whose "ds" matches a test date; any
# test month without a matching forecast date is dropped from the comparison.
df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
# NOTE(review): standardized targets can sit close to zero, which inflates
# MAPE; treat this figure as a relative comparison between models only.
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")

# ---- Prophet MAPE in ORIGINAL units ----
# Undo the global standardization for both actuals and forecasts.
df_prophet_test["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["y"]]
).flatten()
df_prophet_test["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["yhat"]]
).flatten()
mape_prophet_orig = mean_absolute_percentage_error(
    df_prophet_test["y_orig"], df_prophet_test["yhat_orig"]
)

# Full metric set (MAE/RMSE/MAPE/R2) for the summary table, computed on the
# normalized values so it is comparable with the NN and hybrid rows.
prophet_metrics = compute_metrics(
    df_prophet_test["y"].values,
    df_prophet_test["yhat"].values,
    prefix="Prophet_"
)




# print(f"[Prophet]  MAPE (original)  : {mape_prophet_orig:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Upper bound on retraining attempts so this cell always terminates; the
# previous unbounded loop would spin forever if the hybrid never beat Prophet.
MAX_PINN_ATTEMPTS = 50

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Attempt-invariant preparation (identical on every retry, so done once) ----
# The network's only input is an integer time index that continues into the test window.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Columns handed to physics_residual_loss, in the order it indexes them:
# [0] electricity, [1] raw target, [2] production.
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
X_phys_test  = X_phys[len(train_overall):]

# Second-level scaling for the network; every scaler is fitted on training rows only.
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train_norm)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

# NOTE(review): an attempt is accepted only when the hybrid beats Prophet on the
# TEST window, i.e. the test set is used for model selection. The reported
# hybrid score is therefore optimistic; a separate validation window would fix that.
for attempt in range(1, MAX_PINN_ATTEMPTS + 1):
    # NNModel is the network defined in this cell. The loop previously
    # instantiated `PINN`, a name this cell never defines, so it only worked
    # when an earlier cell had left that class in the kernel.
    model     = NNModel(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 10000
    best_loss = float("inf")
    patience  = 500
    counter   = 0
    training_log = []   # one record every 50 epochs, for the loss-curve plot

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        # NOTE(review): the rule is evaluated on standardized features and
        # compared with the standardized target — confirm this is intended.
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss + 0.125 * phys_loss

        loss.backward()
        optimizer.step()

        # Early stopping on the training loss: stop after `patience` epochs
        # without a new best value (the final weights are kept, not the best).
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            training_log.append({
                "epoch": epoch,
                "total_loss": loss.item(),
                "data_loss": data_loss.item(),
                "phys_loss": phys_loss.item(),
            })

    # Predict on train/test (in the network's scaled space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled  = model(X_test_t).cpu().numpy()

    # Remove PINN scaler → back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # PINN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )

    # ---- PINN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    pinn_metrics = compute_metrics(
        df_pinn_test["y_norm"].values,
        df_pinn_test["pinn_pred_norm"].values,
        prefix="NN_"
    )

    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) — NORMALIZED
    # ============================================================

    # Residuals = what NN did NOT explain
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prophet expects columns: ds, y
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res = m_res.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test = (
        forecast_res[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction: NN prediction + Prophet's forecast of the NN residual
    df_hybrid_test = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test["final_pred_norm"] = (
        df_hybrid_test["pinn_pred_norm"] +
        df_hybrid_test["res_pred_norm"]
    )

    # Hybrid MAPE (normalized space — for comparison only)
    mape_hybrid_norm = mean_absolute_percentage_error(
        df_hybrid_test["y_norm"],
        df_hybrid_test["final_pred_norm"]
    )
    hybrid_metrics = compute_metrics(
        df_hybrid_test["y_norm"].values,
        df_hybrid_test["final_pred_norm"].values,
        prefix="Hybrid_"
    )

    if mape_hybrid_norm < mape_prophet_norm:
        break
else:
    # Every attempt was used up; results of the last attempt stay in scope.
    print(
        f"Warning: hybrid did not beat Prophet after {MAX_PINN_ATTEMPTS} attempts; "
        "keeping the last attempt."
    )
# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
# Loss history of the last training attempt (one row every 50 epochs).
log_df = pd.DataFrame(training_log)
print(log_df)

# Undo the global standardization so predictions and actuals can be plotted
# in the units of the original target column.
test_overall["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    test_overall[["pinn_pred_norm"]]
).flatten()

df_hybrid_test["final_pred_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["final_pred_norm"]]
).flatten()

df_hybrid_test["y_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["y_norm"]]
).flatten()

# ---- Hybrid MAPE in ORIGINAL units ----
mape_hybrid_orig = mean_absolute_percentage_error(
    df_hybrid_test["y_orig"], df_hybrid_test["final_pred_orig"]
)


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
# All three figures are reported in normalized space so they are comparable
# with each other and with the metrics table below. (The NN line previously
# printed the original-units MAPE next to two normalized ones.)
print("\n==================== FINAL RESULTS ====================")
print(f"Prophet-only MAPE : {mape_prophet_norm:.4f}")
print(f"NN-only MAPE      : {mape_pinn_norm:.4f}")
print(f"Hybrid MAPE       : {mape_hybrid_norm:.4f}")
print("=======================================================\n")

# One row per model; each metrics dict uses "<prefix><metric>" keys.
metrics_df = pd.DataFrame([
    {
        "Model": model_label,
        "MAE": model_scores[f"{key_prefix}MAE"],
        "RMSE": model_scores[f"{key_prefix}RMSE"],
        "MAPE": model_scores[f"{key_prefix}MAPE"],
        "R2": model_scores[f"{key_prefix}R2"],
    }
    for model_label, key_prefix, model_scores in (
        ("Prophet", "Prophet_", prophet_metrics),
        ("NN", "NN_", pinn_metrics),
        ("Hybrid", "Hybrid_", hybrid_metrics),
    )
])

print("\n=== METRICS SUMMARY TABLE ===")
print(metrics_df.round(4))






In [None]:
import matplotlib.pyplot as plt

# Loss curves from the training log of the previous cell: total loss and its
# two components (data fit and rule penalty), one line each.
plt.figure(figsize=(8, 5))

for loss_column, curve_label in (
    ("total_loss", "Total Loss"),
    ("data_loss", "Data Loss"),
    ("phys_loss", "Rule (Physics) Loss"),
):
    plt.plot(log_df["epoch"], log_df[loss_column], label=curve_label)

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curves (Rule-Regularised NN)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED (Prophet, PINN, Hybrid)
# ============================================================

# Every series drawn below is in ORIGINAL units ("y" and the "*_orig"
# columns), so the comments, axis label and title say so; they previously
# claimed the plot was in normalized space.
plt.figure(figsize=(16, 7))

# -----------------------
# 1) Training data (actual)
# -----------------------
plt.plot(
    train_overall["ds"],
    train_overall["y"],
    label="Training Actual",
    color="black",
    linewidth=2
)

# -----------------------
# 2) Test actuals
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["y"],
    label="Test Actual",
    color="black",
    linestyle="--",
    linewidth=2
)

# -----------------------
# 3) Prophet predictions (original units)
# -----------------------
plt.plot(
    df_prophet_test["ds"],
    df_prophet_test["yhat_orig"],
    label="Prophet Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:blue"
)

# -----------------------
# 4) PINN predictions (original units)
# -----------------------
# df_pinn_test was sliced from test_overall, so its rows line up with these dates.
plt.plot(
    test_overall["ds"],
    df_pinn_test["pinn_pred_orig"],
    label="PINN Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:green"
)

# -----------------------
# 5) Hybrid predictions (original units)
# -----------------------
plt.plot(
    df_hybrid_test["ds"],
    df_hybrid_test["final_pred_orig"],
    label="Hybrid Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:red"
)

# -----------------------
# 6) Vertical line for train/test split
# -----------------------
# Drawn at the last training date.
split_date = train_overall["ds"].iloc[-1]

plt.axvline(
    x=split_date,
    color="gray",
    linestyle="--",
    linewidth=2,
    label="Train/Test Split"
)

# -----------------------
# Labels, title, legend
# -----------------------
plt.xlabel("Date", fontsize=12)
plt.ylabel("Emissions (original units)", fontsize=12)
plt.title("Actual vs Predicted Emissions (Original Units)", fontsize=14)
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(fontsize=12)
plt.tight_layout()

plt.show()


In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score
)
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Pipeline configuration: everything a reader might want to tune lives here.
RANDOM_SEED   = 42                  # passed to set_seed() below
DATE_COL      = "datetime"          # default source column for ensure_datetime_column
TARGET_COL    = "Scope1_per_unit"   # forecast target; renamed to "y" after aggregation
PLANT_COL     = "Plant Name"        # presumably the plant identifier column — not referenced in the code shown here
TEST_MONTHS   = 7                   # number of trailing months held out from training
# The three names below presumably select the columns passed to
# physics_residual_loss (electricity, raw target, production) — TODO confirm
# against the training code further down.
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"
TargetCol_raw = "Scope1"
ProductionCol = "Production Actual Quantity (MT/Month)"

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed=42):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local import: `random` is not part of this notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)

# ============================================================
# 2) DATA PREP
# ============================================================
def ensure_datetime_column(df, date_col=DATE_COL):
    """Add a month-start ``ds`` column parsed from ``date_col``.

    Unparseable dates become NaT. The frame is modified in place and returned.
    """
    parsed_dates = pd.to_datetime(df[date_col], errors="coerce")
    # Snap each date to the first day of its month.
    month_periods = parsed_dates.dt.to_period("M")
    df["ds"] = month_periods.dt.to_timestamp()
    return df

def clean_and_impute(df):
    """Treat zeros in numeric columns as missing and fill gaps with the column mean.

    Every numeric column is processed independently: 0 -> NaN, then NaN -> mean
    of the remaining values. The frame is modified in place and returned.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        without_zeros = df[name].replace(0, np.nan)
        df[name] = without_zeros
        df[name] = df[name].fillna(df[name].mean())
    return df

def prepare_overall_series(df):
    """Sum every numeric column per 'ds' value and return the rows sorted by 'ds'.

    Non-numeric columns are dropped; the result has a fresh 0..n-1 index.
    """
    value_cols = df.select_dtypes(include=[np.number]).columns
    monthly = df.groupby("ds", as_index=False)[value_cols].sum()
    monthly = monthly.sort_values("ds")
    return monthly.reset_index(drop=True)

def compute_metrics(y_true, y_pred):
    """Return MAE, RMSE, MAPE and R2 for the given targets and predictions.

    MAPE is whatever ``mean_absolute_percentage_error`` returns (a fraction,
    not multiplied by 100).
    """
    scores = {}
    scores["MAE"] = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    scores["RMSE"] = np.sqrt(mse)
    scores["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
    scores["R2"] = r2_score(y_true, y_pred)
    return scores

# ============================================================
# 3) NN MODEL
# ============================================================
class NNModel(nn.Module):
    """Small MLP mapping one input feature to one output: 1 -> hidden -> hidden -> 1."""

    def __init__(self, hidden_dim=32):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as before.
        stack = [
            nn.Linear(1, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        out = self.net(x)
        return out

def physics_residual_loss(y_pred, features, eps=1e-6, delta=0.75):
    """Huber-style penalty on the relative gap between predictions and a rule value.

    The rule value is ``0.4727 * features[:, 0] / features[:, 2]``. The relative
    deviation ``(y_pred - rule) / (|rule| + eps)`` is penalised quadratically up
    to ``delta`` and linearly beyond it; the mean penalty is returned.

    Args:
        y_pred: predictions, one per row of ``features`` (e.g. shape (N, 1)).
        features: 2-D tensor with at least three columns; column 1 is not used.
        eps: stabiliser for the relative deviation, and the magnitude below
            which the divisor (column 2) is clamped.
        delta: switch point between the quadratic and linear regimes.

    Returns:
        Scalar tensor with the mean penalty.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # A zero divisor used to produce an infinite rule value and a NaN loss.
    # Clamp only the near-zero entries (keeping their sign) so ordinary inputs
    # give exactly the same result as before.
    floor = torch.copysign(torch.full_like(prod, eps), prod)
    safe_prod = torch.where(prod.abs() < eps, floor, prod)

    rule = (0.4727 * elec) / safe_prod
    diff = (y_pred.reshape(-1) - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )
    return penalty.mean()

# ============================================================
# 4) LOAD & SPLIT DATA
# ============================================================
# Work on a copy of df_2022 (defined outside this cell).
# NOTE(review): this rebinds the notebook-wide name `df`.
df = ensure_datetime_column(df_2022.copy())
df = clean_and_impute(df)

# One row per month with every numeric column summed; the target becomes "y".
overall = prepare_overall_series(df)
overall = overall.rename(columns={TARGET_COL: "y"})

# Chronological split: the last TEST_MONTHS rows are the test set.
split_idx = len(overall) - TEST_MONTHS
train = overall.iloc[:split_idx].copy()
test  = overall.iloc[split_idx:].copy()

# ============================================================
# 5) PROPHET (ORIGINAL SCALE)
# ============================================================
m_prophet = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    seasonality_mode="multiplicative",
    changepoint_prior_scale=0.1
)

# Fit on the training months only, then forecast exactly the test dates.
m_prophet.fit(train[["ds", "y"]])
prophet_fcst = m_prophet.predict(test[["ds"]])

# .values assigns positionally, so the forecast rows must be in test order.
test["prophet_pred"] = prophet_fcst["yhat"].values
# prophet_metrics["MAPE"] is the bar the hybrid model is compared against below.
prophet_metrics = compute_metrics(test["y"], test["prophet_pred"])

# ============================================================
# 6) NEURAL NETWORK (LOCAL NORMALIZATION ONLY)
# ============================================================
# Train the NN, fit Prophet on its residuals, and build the hybrid forecast.
#
# NOTE(review): every pass retrains the NN from a fresh random initialisation
# and the loop stops as soon as the hybrid's TEST MAPE beats Prophet's, i.e. the
# run that is kept is selected on the hold-out months. The original
# `while True` never terminated if that did not happen; the number of restarts
# is now capped and the last attempt is kept with a warning.
MAX_NN_RESTARTS = 25
for nn_attempt in range(1, MAX_NN_RESTARTS + 1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Time index
    t_train = np.arange(len(train)).reshape(-1, 1)
    t_test  = np.arange(len(train), len(overall)).reshape(-1, 1)

    # Local scalers (NN only), all fitted on training rows
    scaler_t = StandardScaler()
    scaler_y = StandardScaler()
    scaler_x = StandardScaler()

    t_train_s = scaler_t.fit_transform(t_train)
    t_test_s  = scaler_t.transform(t_test)

    y_train_s = scaler_y.fit_transform(train[["y"]])

    # NOTE(review): the rule features are standardised before they reach
    # physics_residual_loss, so its divisor column is centred on zero —
    # confirm the rule is meant to be evaluated on standardised values.
    X_phys = overall[[PHYS_COL, TargetCol_raw, ProductionCol]].values
    X_phys_train_s = scaler_x.fit_transform(X_phys[:len(train)])

    X_t = torch.tensor(t_train_s, dtype=torch.float32).to(device)
    Y_t = torch.tensor(y_train_s, dtype=torch.float32).to(device)
    X_p = torch.tensor(X_phys_train_s, dtype=torch.float32).to(device)

    model = NNModel().to(device)
    opt = optim.Adam(model.parameters(), lr=0.01)

    # Full-batch training: MSE on the scaled target plus the weighted rule penalty
    for epoch in range(5000):
        opt.zero_grad()
        y_hat = model(X_t)
        loss = torch.mean((y_hat - Y_t)**2) + 0.125 * physics_residual_loss(y_hat, X_p)
        loss.backward()
        opt.step()

    # NN predictions → IMMEDIATELY inverse scale
    with torch.no_grad():
        train["nn_pred"] = scaler_y.inverse_transform(
            model(torch.tensor(t_train_s, dtype=torch.float32).to(device)).cpu().numpy()
        ).flatten()
        test["nn_pred"] = scaler_y.inverse_transform(
            model(torch.tensor(t_test_s, dtype=torch.float32).to(device)).cpu().numpy()
        ).flatten()

    nn_metrics = compute_metrics(test["y"], test["nn_pred"])

    # ============================================================
    # 7) HYBRID (NN + PROPHET RESIDUALS) — ORIGINAL SCALE
    # ============================================================
    train["residual"] = train["y"] - train["nn_pred"]
    test["residual"]  = test["y"] - test["nn_pred"]

    # Prophet models what the NN left unexplained on the training months
    m_res = Prophet(
        yearly_seasonality=True,
        seasonality_mode="additive",
        changepoint_prior_scale=0.1
    )
    m_res.fit(train[["ds", "residual"]].rename(columns={"residual": "y"}))

    res_fcst = m_res.predict(test[["ds"]])
    test["residual_pred"] = res_fcst["yhat"].values

    test["hybrid_pred"] = test["nn_pred"] + test["residual_pred"]
    hybrid_metrics = compute_metrics(test["y"], test["hybrid_pred"])
    if hybrid_metrics["MAPE"] < prophet_metrics["MAPE"]:
        break
else:
    # Runs only when no attempt satisfied the stopping rule.
    print(
        f"[warn] Hybrid did not beat Prophet on test MAPE within "
        f"{MAX_NN_RESTARTS} restarts; keeping the last attempt."
    )

# ============================================================
# 8) FINAL METRICS TABLE
# ============================================================
# One row per model, columns taken from each model's metrics dict.
named_results = (
    ("Prophet", prophet_metrics),
    ("NN", nn_metrics),
    ("Hybrid", hybrid_metrics),
)
metrics_df = pd.DataFrame(
    [dict(Model=label, **scores) for label, scores in named_results]
)

print("\n=== FINAL METRICS (ORIGINAL UNITS) ===")
print(metrics_df.round(4))


In [None]:
# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED
# (Prophet, NN, Hybrid — ORIGINAL UNITS)
# ============================================================

# Single figure: training history, held-out actuals and the three forecasts,
# all read from the `train` / `test` frames.
plt.figure(figsize=(16, 7))

# (frame, column, line style) for every curve, in drawing / legend order
curve_specs = [
    (train, "y", dict(label="Training Actual", color="black", linewidth=2)),
    (test, "y", dict(label="Test Actual", color="black", linestyle="--", linewidth=2)),
    (test, "prophet_pred", dict(label="Prophet Prediction", linestyle=":", linewidth=2.5, color="tab:blue")),
    (test, "nn_pred", dict(label="NN Prediction", linestyle=":", linewidth=2.5, color="tab:green")),
    (test, "hybrid_pred", dict(label="Hybrid Prediction", linestyle=":", linewidth=2.5, color="tab:red")),
]
for curve_frame, curve_col, curve_style in curve_specs:
    plt.plot(curve_frame["ds"], curve_frame[curve_col], **curve_style)

# Vertical marker at the last training date
split_date = train["ds"].iloc[-1]
plt.axvline(x=split_date, color="gray", linestyle="--", linewidth=2, label="Train/Test Split")

# Labels, title, legend
plt.xlabel("Date", fontsize=12)
plt.ylabel("Scope 1 Emissions (Original Units)", fontsize=12)
plot_title = (
    "Actual vs Predicted Emissions\n"
    "(Prophet vs Rule-Regularised NN vs Hybrid)"
)
plt.title(plot_title, fontsize=14)

plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(fontsize=12)
plt.tight_layout()

plt.show()


In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42  # passed to set_seed() once, right after it is defined
DATE_COL      = "datetime"  # default source column for ensure_datetime_column()
TARGET_COL    = "Scope1_per_unit"  # part of the de-duplication key; renamed to "y" after aggregation
PLANT_COL     = "Plant Name"  # part of the de-duplication key in clean_and_impute()
TEST_MONTHS   = 7  # number of trailing rows of the aggregated series held out as test
# PHYS_COL, TargetCol_raw and ProductionCol are stacked, in this order, into the
# feature matrix given to physics_residual_loss.
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # optional physics feature
TargetCol_raw = "Scope1"
ProductionCol = "Production Actual Quantity (MT/Month)"

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed NumPy's global RNG and PyTorch (CPU, plus every CUDA device if present)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start 'ds' timestamp column to ``df`` and return it.

    Values in ``date_col`` are parsed with ``errors="coerce"``. Rows that fail
    to parse (or every row, when ``date_col`` is absent) are rebuilt from the
    'year' and 'month' columns using day 1. ``df`` is modified in place.

    Raises:
        ValueError: if some dates are missing/unparseable and the frame has no
            'year' and 'month' columns to rebuild them from.
    """
    if date_col in df.columns:
        # `infer_datetime_format=True` was dropped: it is deprecated and has no
        # effect in pandas >= 2.0 (strict format inference is the default).
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Assemble first-of-month dates only for the rows that need them
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Snap every date to the first day of its month
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows, then fill zero/missing numeric values with means.

    Steps, in order:
      1. If 'ds', PLANT_COL and TARGET_COL are all present, drop rows that
         repeat that key, keeping the last occurrence.
      2. In every numeric column turn 0 into NaN and fill NaN with the column
         mean, computed per 'year' group when a 'year' column exists
         (otherwise over the whole frame).
      3. Fill whatever is still NaN with the column mean over the whole frame.
    """
    dedup_key = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_key).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_key, keep="last")

    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    def _fill_block(block: pd.DataFrame) -> pd.DataFrame:
        filled = block.copy()
        for name in numeric_names:
            filled[name] = filled[name].replace(0, np.nan)
            if not filled[name].notna().any():
                # nothing to average in this block; leave for the global pass
                continue
            filled[name] = filled[name].fillna(filled[name].mean())
        return filled

    df = (
        df.groupby("year", group_keys=False).apply(_fill_block)
        if "year" in df.columns
        else _fill_block(df)
    )

    # Global fallback for columns that were empty inside a whole block
    for name in numeric_names:
        if not df[name].isna().any():
            continue
        df[name] = df[name].fillna(df[name].mean())

    return df



def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the frame to one row per 'ds' by summing every numeric column.

    Non-numeric columns are dropped. Rows come back sorted by 'ds' with a
    fresh 0..n-1 index.

    Raises:
        ValueError: if ``df`` has no 'ds' column.
    """
    value_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    monthly_totals = df.groupby("ds", as_index=False)[value_cols].sum()
    return monthly_totals.sort_values("ds").reset_index(drop=True)

def compute_metrics(y_true, y_pred, prefix=""):
    """Return MAE, RMSE, MAPE and R2, each keyed as ``f"{prefix}<NAME>"``.

    MAPE is whatever ``mean_absolute_percentage_error`` returns (a fraction,
    not multiplied by 100).
    """
    scorers = (
        ("MAE", mean_absolute_error),
        ("RMSE", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ("MAPE", mean_absolute_percentage_error),
        ("R2", r2_score),
    )
    return {f"{prefix}{name}": scorer(y_true, y_pred) for name, scorer in scorers}

# ============================================================
# 3) NN MODEL DEFINITION
# ============================================================
class NNModel(nn.Module):
    """Two-hidden-layer ReLU MLP: in_dim -> hidden_dim -> hidden_dim -> out_dim."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as before.
        stack = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        out = self.net(x)
        return out


def regression_from_numpy_normalized(x, y):
    """Fit y = m*x + c by least squares on standardised copies of x and y.

    Both arrays are flattened and passed through their own StandardScaler.
    The slope/intercept are computed in that standardised space, the data and
    the fitted line are plotted, and ``(m, c)`` is returned.
    """
    x_std = StandardScaler().fit_transform(x.flatten().reshape(-1, 1)).flatten()
    y_std = StandardScaler().fit_transform(y.flatten().reshape(-1, 1)).flatten()

    # Closed-form simple linear regression
    x_centre, y_centre = x_std.mean(), y_std.mean()
    x_dev = x_std - x_centre
    y_dev = y_std - y_centre
    m = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    c = y_centre - m * x_centre

    plt.figure(figsize=(8, 5))
    plt.scatter(x_std, y_std, alpha=0.7, label="Normalized Data")

    # Fitted line across the observed x range
    grid = np.linspace(x_std.min(), x_std.max(), 100)
    plt.plot(grid, m * grid + c, color="red", linewidth=2,
             label=f"y = {m:.4f}x + {c:.4f}")

    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.title("Linear Regression on Normalized Data")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.5) -> torch.Tensor:
    """Huber-style penalty on the relative gap between predictions and a rule value.

    The rule value is ``0.4727 * features[:, 0] / features[:, 2]`` (a rule-based
    reference, not physics). The relative deviation
    ``(y_pred - rule) / (|rule| + eps)`` is penalised quadratically up to
    ``delta`` and linearly beyond it; the mean penalty is returned.

    Args:
        y_pred: predictions, one per row of ``features`` (e.g. shape (N, 1)).
        features: 2-D tensor with at least three columns; column 1 is not used.
        eps: stabiliser for the relative deviation, and the magnitude below
            which the divisor (column 2) is clamped.
        delta: switch point between the quadratic and linear regimes
            (previously fixed at 0.5 inside the body).

    Returns:
        Scalar tensor with the mean penalty.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # A zero divisor used to produce an infinite rule value and a NaN loss.
    # Clamp only the near-zero entries (keeping their sign) so ordinary inputs
    # give exactly the same result as before.
    floor = torch.copysign(torch.full_like(prod, eps), prod)
    safe_prod = torch.where(prod.abs() < eps, floor, prod)

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.reshape(-1) - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Huber-style soft rule penalty
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()



# ============================================================
# 4) MAIN PIPELINE
# ============================================================
# Work on a copy of df_2022 (defined outside this cell).
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# One row per month with every numeric column summed; the target becomes "y".
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test boundary: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Need more than {TEST_MONTHS} monthly rows to split; got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# The scaler is fitted on the training rows only, so the held-out months no
# longer influence the normalisation (it used to be fitted on the full series),
# and is then applied to every row.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

# Train/test split
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))


# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects the columns "ds" and "y"; here "y" is the normalized target.
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_prophet)

# Forecast over the training dates plus len(test) further month-start steps.
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)

# Keep only forecast rows whose date also appears in the test set.
# NOTE(review): the inner join silently drops any test date that is not one of
# the generated future dates — confirm the monthly series has no gaps.
df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
# (this value is the threshold the hybrid model is compared against further down)
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)

# ---- Prophet MAPE in ORIGINAL units ----
df_prophet_test["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["y"]]
).flatten()
df_prophet_test["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["yhat"]]
).flatten()
mape_prophet_orig = mean_absolute_percentage_error(
    df_prophet_test["y_orig"], df_prophet_test["yhat_orig"]
)

# Metrics dict with "Prophet_"-prefixed keys, computed on the normalized values.
prophet_metrics = compute_metrics(
    df_prophet_test["y"].values,
    df_prophet_test["yhat"].values,
    prefix="Prophet_"
)




# print(f"[Prophet]  MAPE (original)  : {mape_prophet_orig:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Train the rule-regularised NN, fit Prophet on its residuals and build the
# hybrid forecast, all in globally-normalized space.
#
# NOTE(review): every pass retrains the NN from a fresh random initialisation
# and the loop stops as soon as the hybrid's normalized TEST MAPE beats
# Prophet's, i.e. the run that is kept is selected on the hold-out months.
# The original `while True` never terminated if that did not happen; the
# number of restarts is now capped and the last attempt is kept with a warning.
MAX_PINN_RESTARTS = 25
for pinn_attempt in range(1, MAX_PINN_RESTARTS + 1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Integer time index: training rows first, test rows continue the count
    train_time = np.arange(len(train_overall)).reshape(-1, 1)
    test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

    y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

    # Columns handed to physics_residual_loss as features[:, 0], [:, 1], [:, 2]
    PHYS_COLS_ALL = [
        PHYS_COL,
        TargetCol_raw,
        ProductionCol
    ]

    # --- Build X_phys with 3 columns ---
    missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
    if missing:
        raise ValueError(f"Missing physics columns: {missing}")

    X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

    # Optional: ensure no negatives
    if np.any(X_phys < 0):
        print("Warning: Negative physics values detected. Clipping to 0.")
        X_phys = np.clip(X_phys, a_min=0, a_max=None)

    # Split train/test
    X_phys_train = X_phys[:len(train_overall)]
    X_phys_test  = X_phys[len(train_overall):]

    # Second-level scaling for the NN (fitted on training rows only)
    scaler_time = StandardScaler()
    scaler_Y    = StandardScaler()
    scaler_phys = StandardScaler()

    train_time_scaled = scaler_time.fit_transform(train_time)
    test_time_scaled  = scaler_time.transform(test_time)

    y_train_scaled = scaler_Y.fit_transform(y_train_norm)

    # NOTE(review): the rule features are standardised before they reach
    # physics_residual_loss, so its divisor column is centred on zero —
    # confirm the rule is meant to be evaluated on standardised values.
    X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

    # Torch tensors
    X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

    # Use the NNModel class defined in this cell. The code previously
    # instantiated `PINN`, which this cell never defines, so it only ran when
    # a class of that name was left in the kernel by an earlier cell.
    # NOTE(review): assumes that earlier `PINN` had the same architecture.
    model     = NNModel(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 7000
    best_loss = float("inf")
    patience  = 500
    counter   = 0
    training_log = []   # reset on every restart; holds the current run only

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss + 0.25 * phys_loss

        loss.backward()
        optimizer.step()

        # Early stopping on the training loss: stop after `patience`
        # consecutive epochs without a new minimum
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            training_log.append({"epoch": epoch, "total_loss": loss.item(), "data_loss": data_loss.item(),"phys_loss": phys_loss.item()})

    # Predict on train/test (in NN-scaled space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(
            torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

        y_test_pred_scaled = model(
            torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

    # Remove the NN scaler → back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # NN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )

    # ---- NN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    pinn_metrics = compute_metrics(
        df_pinn_test["y_norm"].values,
        df_pinn_test["pinn_pred_norm"].values,
        prefix="NN_"
    )

    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) — NORMALIZED
    # ============================================================

    # Residuals = what NN did NOT explain
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prophet expects columns: ds, y
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive)
    m_res = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res = m_res.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test = (
        forecast_res[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction
    df_hybrid_test = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test["final_pred_norm"] = (
        df_hybrid_test["pinn_pred_norm"] +
        df_hybrid_test["res_pred_norm"]
    )

    # Hybrid MAPE (normalized space — for comparison only)
    mape_hybrid_norm = mean_absolute_percentage_error(
        df_hybrid_test["y_norm"],
        df_hybrid_test["final_pred_norm"]
    )
    hybrid_metrics = compute_metrics(
        df_hybrid_test["y_norm"].values,
        df_hybrid_test["final_pred_norm"].values,
        prefix="Hybrid_"
    )

    if mape_hybrid_norm < mape_prophet_norm:
        break
else:
    # Runs only when no attempt satisfied the stopping rule.
    print(
        f"[warn] Hybrid did not beat Prophet on normalized test MAPE within "
        f"{MAX_PINN_RESTARTS} restarts; keeping the last attempt."
    )
# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
# training_log is re-created on every pass of the loop above, so this table
# shows the loss trace (sampled every 50 epochs) of the last training run only.
log_df = pd.DataFrame(training_log)
print(log_df)

# Undo the global target scaling to express predictions in the units of "y".
test_overall["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    test_overall[["pinn_pred_norm"]]
).flatten()

df_hybrid_test["final_pred_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["final_pred_norm"]]
).flatten()

df_hybrid_test["y_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["y_norm"]]
).flatten()

# ---- Hybrid MAPE in ORIGINAL units ----
mape_hybrid_orig = mean_absolute_percentage_error(
    df_hybrid_test["y_orig"], df_hybrid_test["final_pred_orig"]
)


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
print("\n==================== FINAL RESULTS ====================")

# One MAPE per model, taken unmodified from each model's metrics dict. The NN
# row used to be multiplied by 2 before display, which misreported its error.
# All three dicts were computed by compute_metrics on normalized (y_norm) values.
metrics_df = pd.DataFrame([
    {"Model": "Prophet", "MAPE": prophet_metrics["Prophet_MAPE"]},
    {"Model": "NN", "MAPE": pinn_metrics["NN_MAPE"]},
    {"Model": "Hybrid", "MAPE": hybrid_metrics["Hybrid_MAPE"]},
])

print("\n=== METRICS SUMMARY TABLE ===")
print(metrics_df.round(4))






In [None]:
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# Modified code with data augmentation pipeline
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42  # passed to set_seed() once, right after it is defined
DATE_COL      = "datetime"  # default source column for ensure_datetime_column()
TARGET_COL    = "Scope1_per_unit"  # part of the de-duplication key in clean_and_impute()
PLANT_COL     = "Plant Name"  # part of the de-duplication key in clean_and_impute()
TEST_MONTHS   = 7
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"
TargetCol_raw = "Scope1"
ProductionCol = "Production Actual Quantity (MT/Month)"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  # default number of jittered copies created per series
JITTER_STD_MULTIPLIER = 0.05  # sigma = 0.05 * std(residuals)
DECOMPOSITION_PERIOD = 12  # default seasonal period passed to seasonal_decompose


# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed NumPy's global RNG and PyTorch (CPU, plus every CUDA device if present)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS (ORIGINAL)
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start 'ds' timestamp column to ``df`` and return it.

    Values in ``date_col`` are parsed with ``errors="coerce"``. Rows that fail
    to parse (or every row, when ``date_col`` is absent) are rebuilt from the
    'year' and 'month' columns using day 1. ``df`` is modified in place.

    Raises:
        ValueError: if some dates are missing/unparseable and the frame has no
            'year' and 'month' columns to rebuild them from.
    """
    if date_col in df.columns:
        # `infer_datetime_format=True` was dropped: it is deprecated and has no
        # effect in pandas >= 2.0 (strict format inference is the default).
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Assemble first-of-month dates only for the rows that need them
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Snap every date to the first day of its month
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows, then fill zero/missing numeric values with means.

    Steps, in order:
      1. If 'ds', PLANT_COL and TARGET_COL are all present, drop rows that
         repeat that key, keeping the last occurrence.
      2. In every numeric column turn 0 into NaN and fill NaN with the column
         mean, computed per 'year' group when a 'year' column exists
         (otherwise over the whole frame).
      3. Fill whatever is still NaN with the column mean over the whole frame.
    """
    dedup_key = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_key).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_key, keep="last")

    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    def _fill_block(block: pd.DataFrame) -> pd.DataFrame:
        filled = block.copy()
        for name in numeric_names:
            filled[name] = filled[name].replace(0, np.nan)
            if not filled[name].notna().any():
                # nothing to average in this block; leave for the global pass
                continue
            filled[name] = filled[name].fillna(filled[name].mean())
        return filled

    df = (
        df.groupby("year", group_keys=False).apply(_fill_block)
        if "year" in df.columns
        else _fill_block(df)
    )

    # Global fallback for columns that were empty inside a whole block
    for name in numeric_names:
        if not df[name].isna().any():
            continue
        df[name] = df[name].fillna(df[name].mean())

    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the frame to one row per 'ds' by summing every numeric column.

    Non-numeric columns are dropped. Rows come back sorted by 'ds' with a
    fresh 0..n-1 index.

    Raises:
        ValueError: if ``df`` has no 'ds' column.
    """
    value_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    monthly_totals = df.groupby("ds", as_index=False)[value_cols].sum()
    return monthly_totals.sort_values("ds").reset_index(drop=True)


def compute_metrics(y_true, y_pred, prefix=""):
    """Return MAE, RMSE, MAPE and R2, each keyed as ``f"{prefix}<NAME>"``.

    MAPE is whatever ``mean_absolute_percentage_error`` returns (a fraction,
    not multiplied by 100).
    """
    scorers = (
        ("MAE", mean_absolute_error),
        ("RMSE", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ("MAPE", mean_absolute_percentage_error),
        ("R2", r2_score),
    )
    return {f"{prefix}{name}": scorer(y_true, y_pred) for name, scorer in scorers}


# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    """Split a series into additive trend, seasonal and residual arrays.

    If the series is shorter than two full periods, the period is reduced to
    ``max(2, len(series) // 2)`` before calling ``seasonal_decompose``.

    Returns:
        dict with keys 'trend', 'seasonal', 'residual' (NumPy arrays of the
        same length as ``series``) and 'period' (the period actually used).
        Missing trend values are back- then forward-filled and missing
        residuals are set to 0. If decomposition raises, the series itself is
        returned as the trend with all-zero seasonal and residual parts.
    """
    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        adjusted_period = max(2, len(series) // 2)
        # Report the period that is really used (the floor of 2 included)
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {adjusted_period}.")
        period = adjusted_period

    try:
        decomposition = seasonal_decompose(
            series,
            model='additive',
            period=period,
        )

        return {
            # .bfill()/.ffill() replace fillna(method=...), deprecated since pandas 2.1
            'trend': decomposition.trend.bfill().ffill().values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        # Deliberate best-effort fallback: keep the pipeline running with a
        # trend-only representation of the series.
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:
    """Return ``residuals`` plus zero-mean Gaussian noise.

    The noise standard deviation is ``std_multiplier`` times the standard
    deviation of the non-NaN residuals; a spread of exactly 0 is replaced by
    1.0 first. Noise is drawn from NumPy's global random state.
    """
    observed = residuals[~np.isnan(residuals)]
    spread = np.std(observed)
    noise_scale = std_multiplier * (1.0 if spread == 0 else spread)
    return residuals + np.random.normal(0, noise_scale, size=len(residuals))


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None) -> list:
    """Build ``num_samples`` jittered reconstructions of ``series``.

    The series is decomposed (unless ``decomposition_dict`` is supplied) and
    each sample is rebuilt as trend + seasonal + (residual + fresh jitter).
    Returns a list of NumPy arrays, one per sample.
    """
    parts = (
        decomposition_dict
        if decomposition_dict is not None
        else decompose_time_series(series)
    )
    residual = parts['residual']
    # Trend and seasonal parts are shared by every sample
    baseline = parts['trend'] + parts['seasonal']

    return [
        baseline + apply_jittering(residual.copy())
        for _ in range(num_samples)
    ]


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None) -> pd.DataFrame:
    """
    Stack the original training frame with jittered copies of its target.

    ``train_df['y_norm']`` is augmented via ``create_augmented_samples``.
    Each augmented frame keeps ``ds``, stores the augmented values in both
    ``y_norm`` and ``y``, and is tagged with ``augmented=True`` and an
    ``augmentation_id``. The original rows are placed first and are not
    tagged, so those two columns are missing (NaN) for them after the concat.

    NOTE(review): when ``train_df`` already has a ``y`` column, the original
    rows keep it while augmented rows get normalised values in ``y`` - confirm
    that mixing is intended.

    Returns:
        The concatenated frame with a fresh integer index.
    """
    normalized_target = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))

    jittered_versions = create_augmented_samples(
        normalized_target,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict
    )

    # Carry 'y' along only when the input already has it; it is overwritten below
    # either way, this just keeps the column position unchanged.
    carried_cols = ['ds', 'y'] if 'y' in train_df.columns else ['ds']

    frames = [train_df.copy()]
    for sample_id, sample_values in enumerate(jittered_versions):
        frame = train_df[carried_cols].copy()
        frame['y_norm'] = sample_values
        frame['y'] = sample_values
        frame['augmented'] = True
        frame['augmentation_id'] = sample_id
        frames.append(frame)

    augmented_train = pd.concat(frames, ignore_index=True)

    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")

    return augmented_train


def visualize_decomposition(series: pd.Series,
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Plot the series and its trend / seasonal / residual parts in four stacked panels.

    Args:
        series: The original series; its index is used as the x-axis of every panel.
        decomposition_dict: Mapping with 'trend', 'seasonal' and 'residual' entries.
        title: Title shown above the top panel.
    """
    _, axes = plt.subplots(4, 1, figsize=(12, 10))

    # (component key or None for the raw series, line format, y-axis label)
    panels = (
        (None, 'b-', 'Original'),
        ('trend', 'g-', 'Trend'),
        ('seasonal', 'orange', 'Seasonal'),
        ('residual', 'r-', 'Residual'),
    )
    for ax, (key, fmt, label) in zip(axes, panels):
        values = series.values if key is None else decomposition_dict[key]
        ax.plot(series.index, values, fmt, linewidth=1.5)
        ax.set_ylabel(label)
        if key is None:
            ax.set_title(title)
        if key == 'residual':
            ax.set_xlabel('Time Index')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Overlay the augmented series on the original one in a single figure.

    Args:
        original_series: Reference series; its index is the shared x-axis.
        augmented_samples: Sequences of values, one per augmented variant.
        title: Figure title.
    """
    palette = ('red', 'green', 'orange', 'purple', 'brown')
    time_axis = original_series.index

    plt.figure(figsize=(12, 6))

    # Reference curve, drawn thicker than the variants.
    plt.plot(time_axis, original_series.values,
             'b-', linewidth=2.5, label='Original', alpha=0.8)

    # Variants cycle through the palette when there are more than five.
    for number, sample in enumerate(augmented_samples, start=1):
        plt.plot(time_axis, sample,
                 linewidth=1, alpha=0.5,
                 label=f'Augmented {number}',
                 color=palette[(number - 1) % len(palette)])

    plt.xlabel('Time Index')
    plt.ylabel('Normalized Value')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) NN MODEL DEFINITION (ORIGINAL - PINN class assumed)
# ============================================================
class NNModel(nn.Module):
    """
    Small fully connected regressor with two equal-width ReLU hidden layers.

    Layout: Linear(in_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, hidden_dim)
    -> ReLU -> Linear(hidden_dim, out_dim), stored as ``self.net``.
    """

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        widths = (in_dim, hidden_dim, hidden_dim, out_dim)

        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.pop()  # no activation after the output layer

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x``."""
        return self.net(x)


def regression_from_numpy_normalized(x, y):
    """
    Fit a least-squares line to standardized copies of x and y, and plot it.

    Both inputs are flattened and standardized independently with
    StandardScaler; the closed-form slope and intercept of
    ``y_norm = m * x_norm + c`` are then computed. A scatter plot of the
    standardized points with the fitted line is shown.

    Returns:
        (m, c): slope and intercept in standardized space.
    """
    x_col = x.flatten().reshape(-1, 1)
    y_col = y.flatten().reshape(-1, 1)

    x_norm = StandardScaler().fit_transform(x_col).flatten()
    y_norm = StandardScaler().fit_transform(y_col).flatten()

    x_mean, y_mean = x_norm.mean(), y_norm.mean()
    dx = x_norm - x_mean
    dy = y_norm - y_mean

    # Ordinary least squares in closed form.
    m = np.sum(dx * dy) / np.sum(dx ** 2)
    c = y_mean - m * x_mean

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)

    plt.figure(figsize=(8, 5))
    plt.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")
    plt.plot(x_line, m * x_line + c, color="red", linewidth=2,
             label=f"y = {m:.4f}x + {c:.4f}")

    plt.xlabel("x (normalized)")
    plt.ylabel("y (normalized)")
    plt.title("Linear Regression on Normalized Data")
    plt.grid(True, linestyle="--", alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.75,
                          rule_coef: float = 0.4727) -> torch.Tensor:
    """
    Huber-style penalty on the relative deviation of predictions from a
    rule-based reference value (a soft rule, not a physical law).

    The reference is ``rule_coef * features[:, 0] / features[:, 2]``.
    Column 1 of ``features`` is not used by this loss.

    Args:
        y_pred: Model predictions; squeezed before being compared with the
            per-row reference.
        features: 2-D tensor with at least three columns.
        eps: Added to ``|rule|`` when normalising the deviation, and used in
            place of a denominator that is exactly zero.
        delta: Threshold between the quadratic and the linear branch.
        rule_coef: Coefficient of the reference rule.

    Returns:
        Scalar tensor: the mean penalty over all rows.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # An exact zero denominator would make the rule +/-inf and the mean loss NaN,
    # poisoning the gradient step. Only exact zeros are replaced, so every other
    # row is computed exactly as before.
    safe_prod = torch.where(prod == 0, torch.full_like(prod, eps), prod)

    # Rule-based reference (not physics)
    rule = (rule_coef * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Quadratic near the rule, linear for large deviations.
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()


# ============================================================
# MAIN PIPELINE WITH AUGMENTATION
# ============================================================

# [ORIGINAL PIPELINE UP TO TRAIN/TEST SPLIT]
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test boundary: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Need more than TEST_MONTHS={TEST_MONTHS} rows to split; got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# The scaler is fitted on the training rows only. Fitting it on the full series
# would let the mean/std of the held-out months leak into every model trained
# on y_norm. The fitted scaler is then applied to all rows.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

# Train/test split
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("=" * 60)
print("ORIGINAL DATASET")
print("=" * 60)
print(f"Train: {len(train_overall)} samples")
print(f"Test:  {len(test_overall)} samples")


# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series (positional integer index)
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict,
                       title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                           title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet.
# All frames are collected first and concatenated once; growing a DataFrame with
# pd.concat inside the loop re-copies every earlier row on each iteration.
prophet_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
for aug_values in augmented_samples:
    aug_df = train_overall[["ds"]].copy()
    aug_df["y"] = aug_values
    prophet_frames.append(aug_df)

# Original rows first, then each augmented copy; sorted by ds for Prophet.
train_prophet_augmented = (
    pd.concat(prophet_frames, ignore_index=True)
      .sort_values("ds")
      .reset_index(drop=True)
)

print("\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")

# Prepare test data (unchanged)
test_prophet = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# ============================================================
# 5) PROPHET WITH AUGMENTED DATA
# ============================================================
print("\n" + "=" * 60)
print("TRAINING PROPHET WITH AUGMENTED DATA")
print("=" * 60)

# Model configuration kept in one place so it is easy to compare with the
# residual model further down.
prophet_aug_settings = {
    "seasonality_mode": "multiplicative",
    "yearly_seasonality": True,
    "weekly_seasonality": False,
    "daily_seasonality": False,
    "changepoint_prior_scale": 0.1,
}
m_overall_augmented = Prophet(**prophet_aug_settings)

# Fit on augmented training data
m_overall_augmented.fit(train_prophet_augmented)

# Forecast over the training dates plus one month-start step per test row
future_all_augmented = m_overall_augmented.make_future_dataframe(
    periods=len(test_prophet),
    freq="MS"
)
fcst_all_augmented = m_overall_augmented.predict(future_all_augmented)

# Keep only the forecast rows whose date also appears in the test set
prophet_forecast_cols = fcst_all_augmented[["ds", "yhat"]]
prophet_actual_cols = test_prophet[["ds", "y"]]
df_prophet_test_augmented = prophet_forecast_cols.merge(
    prophet_actual_cols, on="ds", how="inner"
).sort_values("ds")

# MAPE in NORMALIZED SPACE
mape_prophet_augmented_norm = mean_absolute_percentage_error(
    df_prophet_test_augmented["y"],
    df_prophet_test_augmented["yhat"]
)

print(f"\n[Prophet - Augmented] MAPE : {mape_prophet_augmented_norm:.4f}")

# Convert actuals and forecasts back to original units
for norm_col, orig_col in (("y", "y_orig"), ("yhat", "yhat_orig")):
    df_prophet_test_augmented[orig_col] = scaler_y_global.inverse_transform(
        df_prophet_test_augmented[[norm_col]]
    ).flatten()

mape_prophet_augmented_orig = mean_absolute_percentage_error(
    df_prophet_test_augmented["y_orig"],
    df_prophet_test_augmented["yhat_orig"]
)

# Full metric set, computed in normalized space
prophet_metrics_augmented = compute_metrics(
    df_prophet_test_augmented["y"].values,
    df_prophet_test_augmented["yhat"].values,
    prefix="Prophet_Aug_"
)



# ============================================================
# 6) PINN WITH AUGMENTED DATA
# ============================================================
# NOTE(review): `PINN` is not defined in the visible part of this cell (only
# `NNModel` is) - confirm an earlier cell provides it before a Restart & Run All.
# NOTE(review): attempts are accepted or rejected by their score on the TEST set,
# so the reported hybrid metrics are optimistically biased; a separate validation
# split would be the clean way to select a run.

# Upper bound on train/evaluate restarts. The previous `while True` had no exit
# if the hybrid never beat Prophet, or if a MAPE was NaN (the comparison below is
# then False on every pass).
MAX_HYBRID_ATTEMPTS = 25

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Inputs that are identical for every attempt (prepared once) ----
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Choose multiple physics columns
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

# --- Build X_phys with 3 columns ---
missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
X_phys_test  = X_phys[len(train_overall):]

# Second-level scaling for PINN (all scalers are fitted on training rows only)
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train_norm)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

for attempt in range(1, MAX_HYBRID_ATTEMPTS + 1):
    # Fresh network and optimizer for every attempt; only the random
    # initialisation differs between attempts.
    model     = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 7000
    best_loss = float("inf")
    patience  = 500
    counter   = 0
    training_log = []

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss + 0.25 * phys_loss

        loss.backward()
        optimizer.step()

        # Early stopping on the training loss (no improvement for `patience` epochs)
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            training_log.append({
                "epoch": epoch,
                "total_loss": loss.item(),
                "data_loss": data_loss.item(),
                "phys_loss": phys_loss.item(),
            })

    # Predict on train/test (in normalized space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled  = model(X_test_t).cpu().numpy()

    # Remove PINN scaler -> back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # PINN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )

    # ---- PINN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    pinn_metrics = compute_metrics(
        df_pinn_test["y_norm"].values,
        df_pinn_test["pinn_pred_norm"].values,
        prefix="NN_"
    )

    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) WITH AUGMENTED DATA
    # ============================================================

    # Residuals on ORIGINAL training data
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prepare for Prophet
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res_augmented = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res_augmented.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res_augmented = m_res_augmented.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test_augmented = (
        forecast_res_augmented[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction
    df_hybrid_test_augmented = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test_augmented[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test_augmented["final_pred_norm"] = (
        df_hybrid_test_augmented["pinn_pred_norm"] +
        df_hybrid_test_augmented["res_pred_norm"]
    )

    # Hybrid MAPE (normalized space)
    mape_hybrid_augmented_norm = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y_norm"],
        df_hybrid_test_augmented["final_pred_norm"]
    )

    hybrid_metrics_augmented = compute_metrics(
        df_hybrid_test_augmented["y_norm"].values,
        df_hybrid_test_augmented["final_pred_norm"].values,
        prefix="Hybrid_Aug_"
    )

    # ============================================================
    # 8) CONVERT BACK TO ORIGINAL UNITS
    # ============================================================

    df_hybrid_test_augmented["final_pred_orig"] = scaler_y_global.inverse_transform(
        df_hybrid_test_augmented[["final_pred_norm"]]
    ).flatten()

    df_hybrid_test_augmented["y_orig"] = scaler_y_global.inverse_transform(
        df_hybrid_test_augmented[["y_norm"]]
    ).flatten()

    # Hybrid MAPE in ORIGINAL units
    mape_hybrid_augmented_orig = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y_orig"],
        df_hybrid_test_augmented["final_pred_orig"]
    )

    # Accept the first attempt whose hybrid beats the augmented Prophet baseline.
    if mape_hybrid_augmented_norm < mape_prophet_augmented_norm:
        break
else:
    # Runs only when no attempt triggered the `break` above.
    print(
        f"Warning: hybrid did not beat Prophet within {MAX_HYBRID_ATTEMPTS} attempts; "
        "keeping the results of the last attempt."
    )

log_df = pd.DataFrame(training_log)
print(f"\n[Training Log] {len(log_df)} epochs logged")



# ============================================================
# 9) FINAL COMPARISON: BASELINE VS AUGMENTED
# ============================================================
print("\n" + "=" * 60)
print("FINAL COMPARISON: BASELINE VS AUGMENTED")
print("=" * 60)

# Note: For baseline, we would need to also train the original models
# For now, showing augmented results
print("\n[Prophet - Augmented]")
print(f"  MAPE : {mape_prophet_augmented_norm:.4f}")

# The NN score computed by this pipeline is `mape_pinn_norm`; the previously
# referenced `mape_pinn_augmented_norm` is never assigned in this cell.
print("\n[PINN - Augmented]")
print(f"  MAPE : {mape_pinn_norm:.4f}")

print("\n[Hybrid - Augmented]")
print(f"  MAPE : {mape_hybrid_augmented_norm:.4f}")


# ============================================================
# METRICS SUMMARY TABLE
# ============================================================

# Read the metric dicts produced by THIS pipeline. Keys follow the prefixes passed
# to compute_metrics above: "Prophet_Aug_", "NN_", "Hybrid_Aug_".
# The earlier version read `prophet_metrics` / `nn_metrics` / `hybrid_metrics`,
# which are not created in this cell.
metrics_augmented_df = pd.DataFrame([
    {
        "Model": "Prophet (Aug)",
        "MAPE": prophet_metrics_augmented["Prophet_Aug_MAPE"],
        "R2": prophet_metrics_augmented["Prophet_Aug_R2"],
    },
    {
        "Model": "NN (Original)",
        "MAPE": pinn_metrics["NN_MAPE"],
        "R2": pinn_metrics["NN_R2"],
    },
    {
        "Model": "Hybrid (NN+Prophet Aug)",
        "MAPE": hybrid_metrics_augmented["Hybrid_Aug_MAPE"],
        "R2": hybrid_metrics_augmented["Hybrid_Aug_R2"],
    },
])

print("\n" + "=" * 60)
print("METRICS SUMMARY TABLE (WITH AUGMENTATION)")
print("=" * 60)
print(metrics_augmented_df.round(4).to_string(index=False))

# ============================================================
# Optional: Save results to CSV
# ============================================================
# metrics_augmented_df.to_csv("augmented_metrics.csv", index=False)
# df_hybrid_test_augmented.to_csv("hybrid_augmented_predictions.csv", index=False)

print("\n" + "=" * 60)
print("AUGMENTATION PIPELINE COMPLETED")
print("=" * 60)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))

# One curve per logged loss component, in legend order.
for loss_col, loss_label in (
    ("total_loss", "Total Loss"),
    ("data_loss", "Data Loss"),
    ("phys_loss", "Rule Loss"),
):
    plt.plot(log_df["epoch"], log_df[loss_col], label=loss_label)

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curves (Rule-Regularised NN)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED
# (Prophet, NN, Hybrid — ORIGINAL UNITS)
# ============================================================

# NOTE(review): `train` / `test` (with prophet_pred, nn_pred and hybrid_pred
# columns) are not created in this cell - confirm which earlier cell provides them.
plt.figure(figsize=(16, 7))

# -----------------------
# 1-2) Actuals: solid for training, dashed for test
# -----------------------
plt.plot(train["ds"], train["y"],
         label="Training Actual", color="black", linewidth=2)
plt.plot(test["ds"], test["y"],
         label="Test Actual", color="black", linestyle="--", linewidth=2)

# -----------------------
# 3-5) Model predictions on the test window (dotted)
# -----------------------
for pred_col, pred_label, pred_color in (
    ("prophet_pred", "Prophet Prediction", "tab:blue"),
    ("nn_pred", "NN Prediction", "tab:green"),
    ("hybrid_pred", "Hybrid Prediction", "tab:red"),
):
    plt.plot(test["ds"], test[pred_col],
             label=pred_label, linestyle=":", linewidth=2.5, color=pred_color)

# -----------------------
# 6) Vertical line at the last training date
# -----------------------
split_date = train["ds"].iloc[-1]
plt.axvline(x=split_date, color="gray", linestyle="--", linewidth=2,
            label="Train/Test Split")

# -----------------------
# Labels, title, legend
# -----------------------
plt.xlabel("Date", fontsize=12)
plt.ylabel("Scope 1 Emissions (Original Units)", fontsize=12)
plt.title(
    "Actual vs Predicted Emissions\n"
    "(Prophet vs Rule-Regularised NN vs Hybrid)",
    fontsize=14
)

plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(fontsize=12)
plt.tight_layout()

plt.show()


# jitter

In [None]:
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# Modified code with data augmentation pipeline
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Seed handed to set_seed() below.
RANDOM_SEED   = 42
# Raw date column; default `date_col` of ensure_datetime_column().
DATE_COL      = "datetime"
# Target column; also part of the duplicate key in clean_and_impute().
TARGET_COL    = "Scope1_per_unit"
# Plant identifier column; part of the duplicate key in clean_and_impute().
PLANT_COL     = "Plant Name"
# NOTE(review): presumably the number of trailing rows of the monthly series held
# out as the test set - confirm against the train/test split code.
TEST_MONTHS   = 7
# NOTE(review): the next three look like the feature columns handed to the
# rule-based loss (in this order) - confirm against the training code.
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"
TargetCol_raw = "Scope1"
ProductionCol = "Production Actual Quantity (MT/Month)"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  # default number of jittered copies per series
JITTER_STD_MULTIPLIER = 0.05  # sigma = 0.05 * std(residuals)
DECOMPOSITION_PERIOD = 12  # default seasonal period for decompose_time_series()


# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """
    Seed every random number generator this notebook can draw from.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Seed value applied to all generators.
    """
    import random  # local import: only needed for seeding

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS (ORIGINAL)
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """
    Ensure a month-start datetime column 'ds' exists.

    Values of ``date_col`` are parsed with ``errors="coerce"``. Rows that are
    missing or unparseable (or every row, when ``date_col`` is absent) are
    rebuilt from the 'year' and 'month' columns with day 1. The result is
    snapped to the first day of its month.

    Note: 'ds' is written onto the frame that was passed in, and that same
    object is returned.

    Raises:
        ValueError: if dates need rebuilding but 'year'/'month' are not both present.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated since pandas 2.0, where strict
        # format inference became the default, so it is no longer passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        # Index-aligned assignment: only the rows flagged above are replaced.
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip drops the day component -> first day of the month.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop duplicate rows and fill gaps in every numeric column.

    Steps:
      1. If 'ds', PLANT_COL and TARGET_COL are all present, rows duplicated on
         that key are dropped, keeping the last occurrence.
      2. In each numeric column, zeros are treated as missing and missing
         values are replaced by the column mean - computed per 'year' group
         when a 'year' column exists, otherwise over the whole frame.
      3. Anything still missing afterwards is filled with the overall column mean.
    """
    dedupe_keys = ["ds", PLANT_COL, TARGET_COL]
    if set(dedupe_keys).issubset(df.columns):
        df = df.drop_duplicates(subset=dedupe_keys, keep="last")

    num_cols = list(df.select_dtypes(include=[np.number]).columns)

    def _fill_block(block: pd.DataFrame) -> pd.DataFrame:
        """Zero -> NaN -> block mean, column by column, on a copy of the block."""
        filled = block.copy()
        for col in num_cols:
            filled[col] = filled[col].replace(0, np.nan)
            # A column that is entirely missing in this block has no mean to use;
            # it is left for the global fallback below.
            if filled[col].notna().any():
                filled[col] = filled[col].fillna(filled[col].mean())
        return filled

    df = (
        df.groupby("year", group_keys=False).apply(_fill_block)
        if "year" in df.columns
        else _fill_block(df)
    )

    # Global fallback for columns that were empty within a whole block.
    for col in num_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())

    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per 'ds' by summing every numeric column.

    Non-numeric columns are dropped. The result is ordered by 'ds' and
    carries a fresh integer index.

    Raises:
        ValueError: if the frame has no 'ds' column.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    value_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly_totals = df.groupby("ds", as_index=False)[value_cols].sum()
    return monthly_totals.sort_values("ds").reset_index(drop=True)


def compute_metrics(y_true, y_pred, prefix=""):
    """
    Compute MAE, RMSE, MAPE and R2 for a pair of 1-D arrays.

    Returns:
        A dict keyed ``f"{prefix}MAE"``, ``f"{prefix}RMSE"``,
        ``f"{prefix}MAPE"`` and ``f"{prefix}R2"``.
    """
    scores = (
        ("MAE", mean_absolute_error(y_true, y_pred)),
        ("RMSE", np.sqrt(mean_squared_error(y_true, y_pred))),
        ("MAPE", mean_absolute_percentage_error(y_true, y_pred)),
        ("R2", r2_score(y_true, y_pred)),
    )
    return {f"{prefix}{name}": value for name, value in scores}


# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series,
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    """
    Additively decompose a series into trend, seasonal and residual arrays.

    If the series is shorter than two full periods, the period is reduced to
    ``max(2, len(series) // 2)`` before decomposing. NaN values in the trend
    are back-filled then forward-filled; NaN residuals are set to 0.

    If the decomposition raises, the series itself is returned as the trend,
    with all-zero seasonal and residual components.

    Returns:
        dict with keys 'trend', 'seasonal', 'residual' (arrays) and 'period'
        (the period actually used).
    """
    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        adjusted_period = max(2, len(series) // 2)
        # Report the period that is really used (the floor of 2 included).
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {adjusted_period}.")
        period = adjusted_period

    try:
        decomposition = seasonal_decompose(
            series,
            model='additive',
            period=period,
        )

        return {
            # .bfill()/.ffill() replace fillna(method=...), deprecated in pandas 2.1.
            'trend': decomposition.trend.bfill().ffill().values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        # Deliberate best-effort fallback so augmentation can still proceed.
        print(f"Decomposition failed: {e}. "
              f"Using the series as trend with zero seasonal/residual components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(trend: np.ndarray, seasonal: np.ndarray, residuals: np.ndarray,
                   std_multiplier: float = JITTER_STD_MULTIPLIER,
                   trend_std: float = 20,
                   seasonal_std: float = 50) -> tuple:
    """
    Add independent zero-mean Gaussian noise to each decomposition component.

    Noise standard deviations are ``std_multiplier`` times:
      * ``trend_std`` for the trend (fixed reference spread, default 20),
      * ``seasonal_std`` for the seasonal part (fixed reference spread, default 50),
      * the standard deviation of the non-NaN residuals for the residual part.
    A spread of exactly zero is replaced by 1.0.

    NOTE(review): the trend/seasonal reference spreads are constants rather
    than measured from the data - confirm 20 and 50 suit the scale of the
    series being augmented.

    Returns:
        Tuple ``(jittered_trend, jittered_seasonal, jittered_residuals)``.
        (The previous ``np.ndarray`` annotation did not match this.)
    """
    residual_std = np.std(residuals[~np.isnan(residuals)])

    # A zero spread would silently switch the jitter off; fall back to 1.0.
    residual_std = 1.0 if residual_std == 0 else residual_std
    trend_std = 1.0 if trend_std == 0 else trend_std
    seasonal_std = 1.0 if seasonal_std == 0 else seasonal_std

    # Draw order (residual, trend, seasonal) is kept so that seeded runs
    # reproduce the same noise as before.
    jitter_noise = np.random.normal(0, std_multiplier * residual_std, size=len(residuals))
    jitter_trend = np.random.normal(0, std_multiplier * trend_std, size=len(trend))
    jitter_seasonal = np.random.normal(0, std_multiplier * seasonal_std, size=len(seasonal))

    return trend + jitter_trend, seasonal + jitter_seasonal, residuals + jitter_noise


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None, std_multiplier: float = JITTER_STD_MULTIPLIER) -> list:
    """
    Build ``num_samples`` augmented variants of a series.

    The series is split into trend, seasonal and residual parts (reusing
    ``decomposition_dict`` when one is supplied). For each variant all three
    parts are passed through ``apply_jittering``, but only the jittered
    seasonal part is used in the reconstruction:
    ``trend + jittered_seasonal + residual``.

    NOTE(review): the jittered trend and jittered residual returned by
    ``apply_jittering`` are discarded - confirm that perturbing only the
    seasonal component is intended.

    Returns:
        A list with one reconstructed series per augmented sample.
    """
    parts = decomposition_dict if decomposition_dict is not None else decompose_time_series(series)

    trend_part = parts['trend']
    seasonal_part = parts['seasonal']
    residual_part = parts['residual']

    samples = []
    for _ in range(num_samples):
        # Copies are passed so the stored components are never modified.
        _, noisy_seasonal, _ = apply_jittering(
            trend_part.copy(), seasonal_part.copy(), residual_part.copy(),
            std_multiplier=std_multiplier,
        )
        samples.append(trend_part + noisy_seasonal + residual_part)

    return samples


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None, std_multiplier: float = JITTER_STD_MULTIPLIER) -> pd.DataFrame:
    """
    Stack the original training frame with augmented copies of its target.

    ``train_df['y_norm']`` is augmented via ``create_augmented_samples``
    (``std_multiplier`` is forwarded). Each augmented frame keeps ``ds``,
    stores the augmented values in both ``y_norm`` and ``y``, and is tagged
    with ``augmented=True`` and an ``augmentation_id``. The original rows are
    placed first and are not tagged, so those two columns are missing (NaN)
    for them after the concat.

    NOTE(review): when ``train_df`` already has a ``y`` column, the original
    rows keep it while augmented rows get normalised values in ``y`` - confirm
    that mixing is intended.

    Returns:
        The concatenated frame with a fresh integer index.
    """
    normalized_target = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))

    augmented_versions = create_augmented_samples(
        normalized_target,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict,
        std_multiplier=std_multiplier
    )

    # Carry 'y' along only when the input already has it; it is overwritten below
    # either way, this just keeps the column position unchanged.
    carried_cols = ['ds', 'y'] if 'y' in train_df.columns else ['ds']

    frames = [train_df.copy()]
    for sample_id, sample_values in enumerate(augmented_versions):
        frame = train_df[carried_cols].copy()
        frame['y_norm'] = sample_values
        frame['y'] = sample_values
        frame['augmented'] = True
        frame['augmentation_id'] = sample_id
        frames.append(frame)

    augmented_train = pd.concat(frames, ignore_index=True)

    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")

    return augmented_train


def visualize_decomposition(series: pd.Series,
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Draw four stacked panels: the series itself and its 'trend', 'seasonal'
    and 'residual' entries from decomposition_dict, all against series.index.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))

    # (y-axis label, key in decomposition_dict or None for the raw series, line format)
    panel_specs = [
        ('Original', None, 'b-'),
        ('Trend', 'trend', 'g-'),
        ('Seasonal', 'seasonal', 'orange'),
        ('Residual', 'residual', 'r-'),
    ]

    for row, (ylabel, component_key, line_format) in enumerate(panel_specs):
        panel = axes[row]
        values = series.values if component_key is None else decomposition_dict[component_key]
        panel.plot(series.index, values, line_format, linewidth=1.5)
        panel.set_ylabel(ylabel)
        if row == 0:
            # Only the top panel carries the figure title.
            panel.set_title(title)
        if row == len(panel_specs) - 1:
            # Only the bottom panel labels the shared time axis.
            panel.set_xlabel('Time Index')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series", std_multiplier: float = JITTER_STD_MULTIPLIER) -> None:
    """
    Overlay every augmented sample on the original series in a single figure.

    The jitter multiplier is only used to annotate the plot title.
    """
    palette = ['red', 'green', 'orange', 'purple', 'brown']
    time_axis = original_series.index

    fig, ax = plt.subplots(figsize=(12, 6))

    # Reference curve first so it is the first legend entry.
    ax.plot(time_axis, original_series.values,
            'b-', linewidth=2.5, label='Original', alpha=0.8)

    # Thin, semi-transparent curves for the synthetic copies; colours cycle.
    for sample_number, sample in enumerate(augmented_samples, start=1):
        ax.plot(time_axis, sample,
                linewidth=1, alpha=0.5,
                label=f'Augmented {sample_number}',
                color=palette[(sample_number - 1) % len(palette)])

    ax.set_xlabel('Time Index')
    ax.set_ylabel('Normalized Value')
    ax.set_title(f"{title} (Jitter multiplier: {std_multiplier})")
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) NN MODEL DEFINITION (ORIGINAL - PINN class assumed)
# ============================================================
class NNModel(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        widths = [in_dim, hidden_dim, hidden_dim, out_dim]

        # Linear layers are created in input-to-output order, with a ReLU
        # between consecutive Linear layers (none after the last one).
        stack = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position > 0:
                stack.append(nn.ReLU())
            stack.append(nn.Linear(fan_in, fan_out))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        output = self.net(x)
        return output


def regression_from_numpy_normalized(x, y):
    """
    Standardise x and y independently, fit y = m*x + c by ordinary least
    squares on the standardised values, plot the points and fitted line,
    and return (m, c).
    """
    x_column = x.flatten().reshape(-1, 1)
    y_column = y.flatten().reshape(-1, 1)

    # Each variable gets its own scaler.
    x_norm = StandardScaler().fit_transform(x_column).flatten()
    y_norm = StandardScaler().fit_transform(y_column).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()
    x_dev = x_norm - x_mean
    y_dev = y_norm - y_mean

    # Closed-form least squares: slope = cov(x, y) / var(x).
    m = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    c = y_mean - m * x_mean

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    # Fitted line across the observed x range.
    grid_x = np.linspace(x_norm.min(), x_norm.max(), 100)
    grid_y = m * grid_x + c
    ax.plot(grid_x, grid_y, color="red", linewidth=2,
            label=f"y = {m:.4f}x + {c:.4f}")

    ax.set_xlabel("x (normalized)")
    ax.set_ylabel("y (normalized)")
    ax.set_title("Linear Regression on Normalized Data")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.75,
                          rule_coeff: float = 0.4727) -> torch.Tensor:
    """
    Huber-style penalty on the relative deviation of y_pred from a rule value.

    The rule is rule_coeff * features[:, 0] / features[:, 2]; column 1 of
    `features` is not used by this loss.

    Parameters
    ----------
    y_pred : predictions; squeezed before being compared with the rule.
    features : 2-D tensor with at least three columns.
    eps : numerical floor used for both denominators.
    delta : relative deviation at which the penalty switches from quadratic
        to linear.
    rule_coeff : multiplier applied to column 0 in the rule.

    Returns
    -------
    Scalar tensor: the mean penalty over the batch.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # Keep the denominator at least eps away from zero (sign preserved). An
    # exact zero in this column previously produced inf/NaN and poisoned the
    # whole loss; values with |prod| >= eps are left untouched.
    safe_prod = torch.where(prod < 0, prod.clamp(max=-eps), prod.clamp(min=eps))

    # Rule-based reference (not physics)
    rule = (rule_coeff * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Quadratic inside the delta band, linear outside (continuous at |diff| == delta).
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()


# ============================================================
# MAIN PIPELINE WITH AUGMENTATION
# ============================================================

# [ORIGINAL PIPELINE UP TO TRAIN/TEST SPLIT]
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split point: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Need more than TEST_MONTHS={TEST_MONTHS} rows to split; got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# Fit the scaler on the training rows only, then apply it to every row.
# Fitting on the full series would let the held-out months influence the
# mean/std used to normalise the training target (test-set leakage).
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("=" * 60)
print("ORIGINAL DATASET")
print("=" * 60)
print(f"Train: {len(train_overall)} samples")
print(f"Test:  {len(test_overall)} samples")


# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict,
                       title="Training Data Decomposition")

# Prepare test data (unchanged); it does not depend on the jitter level,
# so it is built once outside the sweep.
test_prophet = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# One record per jitter level, so the sweep can be compared after the loop
# instead of only the last iteration surviving.
augmentation_sweep_results = []

# The loop variable is deliberately NOT named JITTER_STD_MULTIPLIER: reusing
# the config constant as the loop target left it at the last swept value for
# every later cell.
for jitter_multiplier in [0.05, 0.5, 0.1, 1.0, 5.0, 10.0]:
    # Step 2: Create augmented samples
    augmented_samples = create_augmented_samples(
        original_train_series,
        num_samples=NUM_AUGMENTED_SAMPLES,
        decomposition_dict=decomposition_dict,
        std_multiplier=jitter_multiplier
    )

    print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
    print(f"[AUGMENTATION] Jitter std multiplier: {jitter_multiplier}")

    # Visualize augmentation (optional - comment out if not needed)
    visualize_augmented_samples(original_train_series, augmented_samples,
                            title="Original vs Augmented Training Samples", std_multiplier=jitter_multiplier)

    # Step 3: Prepare augmented training dataframe for Prophet.
    # Collect the original frame plus one frame per augmented sample and
    # concatenate once (repeated concat inside a loop copies the growing
    # frame every iteration).
    prophet_frames = [
        train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
    ]
    prophet_frames.extend(
        train_overall[["ds"]].assign(y=aug_values)
        for aug_values in augmented_samples
    )

    # Sort by ds for Prophet (important for time series)
    train_prophet_augmented = (
        pd.concat(prophet_frames, ignore_index=True)
        .sort_values("ds")
        .reset_index(drop=True)
    )

    print(f"\n[PROPHET TRAINING DATA]")
    print(f"Original: {len(train_overall)}")
    print(f"Augmented: {len(train_prophet_augmented)}")

    # ============================================================
    # 5) PROPHET WITH AUGMENTED DATA
    # ============================================================
    print("\n" + "=" * 60)
    print("TRAINING PROPHET WITH AUGMENTED DATA")
    print("=" * 60)

    m_overall_augmented = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    # Fit on augmented training data
    m_overall_augmented.fit(train_prophet_augmented)

    # Predict on test set
    future_all_augmented = m_overall_augmented.make_future_dataframe(
        periods=len(test_prophet),
        freq="MS"
    )
    fcst_all_augmented = m_overall_augmented.predict(future_all_augmented)

    # Extract test predictions
    df_prophet_test_augmented = (
        fcst_all_augmented[["ds", "yhat"]]
        .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
        .sort_values("ds")
    )

    # MAPE in NORMALIZED SPACE
    # NOTE(review): standardised targets sit around zero, where MAPE divides
    # by very small numbers -- treat this figure as a relative comparison only.
    mape_prophet_augmented_norm = mean_absolute_percentage_error(
        df_prophet_test_augmented["y"],
        df_prophet_test_augmented["yhat"]
    )

    print(f"\n[Prophet - Augmented] MAPE : {mape_prophet_augmented_norm:.4f}")

    augmentation_sweep_results.append({
        "std_multiplier": jitter_multiplier,
        "prophet_mape_norm": mape_prophet_augmented_norm,
    })





    # ============================================================
    # 6) PINN WITH AUGMENTED DATA
    # ============================================================


In [None]:
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# Modified code with data augmentation pipeline
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from prophet import Prophet
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

RANDOM_SEED   = 42  # passed to set_seed() below
DATE_COL      = "datetime"  # source date column parsed by ensure_datetime_column()
TARGET_COL    = "Scope1_per_unit"  # forecast target; renamed to "y" in the pipeline
PLANT_COL     = "Plant Name"  # part of the de-duplication key in clean_and_impute()
TEST_MONTHS   = 7  # number of trailing rows of the aggregated series held out for testing
# The next three columns are stacked, in this order, into the feature matrix
# handed to physics_residual_loss (columns 0, 1, 2).
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"
TargetCol_raw = "Scope1"
ProductionCol = "Production Actual Quantity (MT/Month)"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  # jittered copies generated per training series
JITTER_STD_MULTIPLIER = 0.05  # noise sigma = multiplier * std(component); apply_jittering scales trend, seasonal and residual noise separately
DECOMPOSITION_PERIOD = 12  # default `period` handed to seasonal_decompose


# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """
    Seed every random number generator this notebook can draw from.

    Covers Python's `random` module, NumPy's legacy global generator, torch's
    CPU generator and, when CUDA is available, all CUDA device generators.

    Parameters
    ----------
    seed : integer seed applied to all generators.
    """
    # Function-scope import: the notebook does not import `random` at cell level.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed once here, before any jitter noise is drawn or model weights are initialised further down.
set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS (ORIGINAL)
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """
    Ensure a month-start datetime column 'ds' exists.

    Values are taken from `date_col` when it is present and parseable; rows
    that fail to parse (or all rows when `date_col` is absent) are rebuilt
    from the 'year' and 'month' columns with day=1.

    Parameters
    ----------
    df : input frame. It is modified in place: a 'ds' column is added or
        overwritten. Pass a copy if the original must stay untouched.
    date_col : name of the column holding the source dates.

    Returns
    -------
    The same DataFrame object, now carrying 'ds'.

    Raises
    ------
    ValueError
        If some dates are missing and 'year'/'month' are not both available.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated in pandas 2.x (format inference
        # is always on), so it is no longer passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Rebuild only the rows that failed to parse.
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Snap every date to the first day of its month.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """
    De-duplicate rows and mean-impute numeric columns.

    Zeros in numeric columns are treated as missing. Gaps are filled with the
    column mean within each 'year' group when that column exists (otherwise
    over the whole frame), and anything still missing afterwards is filled
    with the overall column mean.
    """
    dedupe_keys = ["ds", PLANT_COL, TARGET_COL]
    if set(dedupe_keys).issubset(df.columns):
        df = df.drop_duplicates(subset=dedupe_keys, keep="last")

    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

    def _fill_with_group_mean(frame: pd.DataFrame) -> pd.DataFrame:
        filled = frame.copy()
        for col in numeric_columns:
            filled[col] = filled[col].replace(0, np.nan)
            if not filled[col].notna().any():
                # Nothing observed in this group; leave it for the global pass.
                continue
            filled[col] = filled[col].fillna(filled[col].mean())
        return filled

    df = (
        df.groupby("year", group_keys=False).apply(_fill_with_group_mean)
        if "year" in df.columns
        else _fill_with_group_mean(df)
    )

    # Second pass: columns that were entirely missing inside some group.
    for col in numeric_columns:
        column = df[col]
        if column.isna().any():
            df[col] = column.fillna(column.mean())

    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per 'ds' value by summing every numeric column.

    Returns a frame with 'ds' followed by the numeric columns, ordered by 'ds'
    with a fresh 0..n-1 index. Raises ValueError when 'ds' is absent.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    value_columns = list(df.select_dtypes(include=[np.number]).columns)

    totals_by_date = df.groupby("ds", as_index=False)[value_columns].sum()
    totals_by_date = totals_by_date.sort_values("ds")
    return totals_by_date.reset_index(drop=True)


def compute_metrics(y_true, y_pred, prefix=""):
    """
    Return MAE, RMSE, MAPE and R2 for the given targets and predictions.

    The result is a dict whose keys are the metric names with `prefix`
    prepended, in the order MAE, RMSE, MAPE, R2.
    """
    scores = {}
    scores[f"{prefix}MAE"] = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    scores[f"{prefix}RMSE"] = np.sqrt(mean_squared_error(y_true, y_pred))
    scores[f"{prefix}MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
    scores[f"{prefix}R2"] = r2_score(y_true, y_pred)
    return scores


# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series,
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    """
    Additively decompose `series` into trend, seasonal and residual arrays.

    Parameters
    ----------
    series : the series to decompose.
    period : seasonal period for seasonal_decompose. When the series is
        shorter than two periods it is reduced to max(2, len(series) // 2).

    Returns
    -------
    dict with keys 'trend', 'seasonal', 'residual' (NumPy arrays of the same
    length as `series`) and 'period' (the period actually used). The three
    arrays sum back to the input values. If the decomposition raises, the
    whole series is returned as 'trend' with zero 'seasonal' and 'residual'.
    """
    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        adjusted_period = max(2, len(series) // 2)
        # Report the period that is really applied (the old message printed
        # len(series)//2 even when the max(2, ...) floor took effect).
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {adjusted_period}.")
        period = adjusted_period

    try:
        decomposition = seasonal_decompose(
            series,
            model='additive',
            period=period,
        )

        # The moving-average trend is NaN at both ends; extend the nearest
        # valid value outwards. bfill()/ffill() replace the deprecated
        # fillna(method=...) spelling.
        trend = decomposition.trend.bfill().ffill().values
        seasonal = decomposition.seasonal.values

        # Where the library residual is NaN (same edge positions), use what is
        # left after removing the filled trend and the seasonal part. Filling
        # with 0 there made trend + seasonal + residual differ from the data.
        library_residual = decomposition.resid.values
        remainder = series.values - trend - seasonal
        residual = np.where(np.isnan(library_residual), remainder, library_residual)

        return {
            'trend': trend,
            'seasonal': seasonal,
            'residual': residual,
            'period': period
        }
    except Exception as e:
        # Deliberate best-effort fallback so the pipeline keeps running.
        print(f"Decomposition failed: {e}. "
              f"Falling back to trend = series with zero seasonal/residual.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(trend: np.ndarray, seasonal: np.ndarray, residuals: np.ndarray,
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> tuple:
    """
    Add independent zero-mean Gaussian noise to each decomposition component.

    The noise standard deviation for a component is
    std_multiplier * std(component), where the std ignores NaN entries.

    Parameters
    ----------
    trend, seasonal, residuals : 1-D arrays holding the three components.
    std_multiplier : scale factor applied to each component's std.

    Returns
    -------
    tuple (jittered_trend, jittered_seasonal, jittered_residuals); the return
    annotation previously claimed a single ndarray.
    """
    def _noise_scale(component: np.ndarray) -> float:
        spread = np.std(component[~np.isnan(component)])
        # NOTE(review): a constant component (std == 0) is treated as std 1.0,
        # so it still receives noise of scale std_multiplier -- confirm that
        # this is intended rather than leaving such a component unjittered.
        if spread == 0:
            spread = 1.0
        return std_multiplier * spread

    trend_noise_std = _noise_scale(trend)
    seasonal_noise_std = _noise_scale(seasonal)
    noise_std = _noise_scale(residuals)

    # Draw order (residual, trend, seasonal) is kept so a seeded run yields
    # the same noise as before.
    jitter_noise = np.random.normal(0, noise_std, size=len(residuals))
    jitter_trend = np.random.normal(0, trend_noise_std, size=len(trend))
    jitter_seasonal = np.random.normal(0, seasonal_noise_std, size=len(seasonal))

    return trend + jitter_trend, seasonal + jitter_seasonal, residuals + jitter_noise


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None,
                             std_multiplier: float = JITTER_STD_MULTIPLIER) -> list:
    """
    Create augmented time series by:
    1. Decomposing into trend, seasonal, residual (skipped when
       decomposition_dict is supplied)
    2. Jittering all three components independently via apply_jittering
    3. Reconstructing: (T + jitter) + (S + jitter) + (R + jitter)

    Parameters
    ----------
    series : the series to augment; only used when decomposition_dict is None.
    num_samples : number of augmented series to generate.
    decomposition_dict : dict with 'trend', 'seasonal' and 'residual' arrays.
    std_multiplier : jitter scale forwarded to apply_jittering.

    Returns
    -------
    list of `num_samples` arrays, one per augmented series.
    """
    if decomposition_dict is None:
        decomposition_dict = decompose_time_series(series)

    trend = decomposition_dict['trend']
    seasonal = decomposition_dict['seasonal']
    residual = decomposition_dict['residual']

    augmented_samples = []

    for _ in range(num_samples):
        # Fresh noise for every sample; copies keep the shared components intact.
        jittered_trend, jittered_seasonal, jittered_residual = apply_jittering(
            trend.copy(), seasonal.copy(), residual.copy(),
            std_multiplier=std_multiplier
        )

        augmented_series = jittered_trend + jittered_seasonal + jittered_residual
        augmented_samples.append(augmented_series)

    return augmented_samples


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None) -> pd.DataFrame:
    """
    Stack the original training frame with jittered copies of its 'y_norm' series.

    Parameters
    ----------
    train_df : DataFrame with at least 'ds' and 'y_norm' columns ('y' is optional).
    num_augmented : number of jittered copies to generate.
    decomposition_dict : pre-computed decomposition forwarded to
        create_augmented_samples; None lets that function decompose the series.

    Returns
    -------
    DataFrame containing the original rows followed by one block of rows per
    augmented sample. Every row carries 'augmented' (bool) and
    'augmentation_id' (-1 for original rows, 0..n-1 for augmented blocks).
    """
    original_series = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))

    augmented_samples = create_augmented_samples(
        original_series,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict
    )

    # Tag the untouched rows explicitly. Leaving these columns absent made the
    # concat fill them with NaN, which turns 'augmented' into an object column
    # that cannot be used as a boolean mask.
    original_df = train_df.copy()
    if 'augmented' not in original_df.columns:
        original_df['augmented'] = False
    if 'augmentation_id' not in original_df.columns:
        original_df['augmentation_id'] = -1

    # Augmented rows only carry the date plus the synthetic target columns.
    base_columns = ['ds', 'y'] if 'y' in train_df.columns else ['ds']
    frames = [original_df]
    for idx, aug_values in enumerate(augmented_samples):
        aug_df = train_df[base_columns].copy()
        aug_df['y_norm'] = aug_values
        # NOTE(review): 'y' receives the normalised augmented values, while the
        # original rows keep whatever scale train_df['y'] had -- confirm that
        # downstream consumers only read 'y_norm' or expect this mix.
        aug_df['y'] = aug_values
        aug_df['augmented'] = True
        aug_df['augmentation_id'] = idx
        frames.append(aug_df)

    augmented_train = pd.concat(frames, ignore_index=True)

    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")

    return augmented_train


def visualize_decomposition(series: pd.Series,
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Draw four stacked panels: the series itself and its 'trend', 'seasonal'
    and 'residual' entries from decomposition_dict, all against series.index.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))

    # (y-axis label, key in decomposition_dict or None for the raw series, line format)
    panel_specs = [
        ('Original', None, 'b-'),
        ('Trend', 'trend', 'g-'),
        ('Seasonal', 'seasonal', 'orange'),
        ('Residual', 'residual', 'r-'),
    ]

    for row, (ylabel, component_key, line_format) in enumerate(panel_specs):
        panel = axes[row]
        values = series.values if component_key is None else decomposition_dict[component_key]
        panel.plot(series.index, values, line_format, linewidth=1.5)
        panel.set_ylabel(ylabel)
        if row == 0:
            # Only the top panel carries the figure title.
            panel.set_title(title)
        if row == len(panel_specs) - 1:
            # Only the bottom panel labels the shared time axis.
            panel.set_xlabel('Time Index')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Overlay every augmented sample on the original series in a single figure.
    """
    palette = ['red', 'green', 'orange', 'purple', 'brown']
    time_axis = original_series.index

    fig, ax = plt.subplots(figsize=(12, 6))

    # Reference curve first so it is the first legend entry.
    ax.plot(time_axis, original_series.values,
            'b-', linewidth=2.5, label='Original', alpha=0.8)

    # Thin, semi-transparent curves for the synthetic copies; colours cycle.
    for sample_number, sample in enumerate(augmented_samples, start=1):
        ax.plot(time_axis, sample,
                linewidth=1, alpha=0.5,
                label=f'Augmented {sample_number}',
                color=palette[(sample_number - 1) % len(palette)])

    ax.set_xlabel('Time Index')
    ax.set_ylabel('Normalized Value')
    ax.set_title(title)
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) NN MODEL DEFINITION (ORIGINAL - PINN class assumed)
# ============================================================
class NNModel(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        widths = [in_dim, hidden_dim, hidden_dim, out_dim]

        # Linear layers are created in input-to-output order, with a ReLU
        # between consecutive Linear layers (none after the last one).
        stack = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position > 0:
                stack.append(nn.ReLU())
            stack.append(nn.Linear(fan_in, fan_out))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        output = self.net(x)
        return output


def regression_from_numpy_normalized(x, y):
    """
    Standardise x and y independently, fit y = m*x + c by ordinary least
    squares on the standardised values, plot the points and fitted line,
    and return (m, c).
    """
    x_column = x.flatten().reshape(-1, 1)
    y_column = y.flatten().reshape(-1, 1)

    # Each variable gets its own scaler.
    x_norm = StandardScaler().fit_transform(x_column).flatten()
    y_norm = StandardScaler().fit_transform(y_column).flatten()

    x_mean = x_norm.mean()
    y_mean = y_norm.mean()
    x_dev = x_norm - x_mean
    y_dev = y_norm - y_mean

    # Closed-form least squares: slope = cov(x, y) / var(x).
    m = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    c = y_mean - m * x_mean

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    # Fitted line across the observed x range.
    grid_x = np.linspace(x_norm.min(), x_norm.max(), 100)
    grid_y = m * grid_x + c
    ax.plot(grid_x, grid_y, color="red", linewidth=2,
            label=f"y = {m:.4f}x + {c:.4f}")

    ax.set_xlabel("x (normalized)")
    ax.set_ylabel("y (normalized)")
    ax.set_title("Linear Regression on Normalized Data")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend()
    plt.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 0.75,
                          rule_coeff: float = 0.4727) -> torch.Tensor:
    """
    Huber-style penalty on the relative deviation of y_pred from a rule value.

    The rule is rule_coeff * features[:, 0] / features[:, 2]; column 1 of
    `features` is not used by this loss.

    Parameters
    ----------
    y_pred : predictions; squeezed before being compared with the rule.
    features : 2-D tensor with at least three columns.
    eps : numerical floor used for both denominators.
    delta : relative deviation at which the penalty switches from quadratic
        to linear.
    rule_coeff : multiplier applied to column 0 in the rule.

    Returns
    -------
    Scalar tensor: the mean penalty over the batch.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # Keep the denominator at least eps away from zero (sign preserved). An
    # exact zero in this column previously produced inf/NaN and poisoned the
    # whole loss; values with |prod| >= eps are left untouched.
    safe_prod = torch.where(prod < 0, prod.clamp(max=-eps), prod.clamp(min=eps))

    # Rule-based reference (not physics)
    rule = (rule_coeff * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Quadratic inside the delta band, linear outside (continuous at |diff| == delta).
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff**2,
        delta * (abs_diff - 0.5 * delta)
    )

    return penalty.mean()


# ============================================================
# MAIN PIPELINE WITH AUGMENTATION
# ============================================================

# [ORIGINAL PIPELINE UP TO TRAIN/TEST SPLIT]
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split point: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Need more than TEST_MONTHS={TEST_MONTHS} rows to split; got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# Fit the scaler on the training rows only, then apply it to every row.
# Fitting on the full series would let the held-out months influence the
# mean/std used to normalise the training target (test-set leakage).
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("=" * 60)
print("ORIGINAL DATASET")
print("=" * 60)
print(f"Train: {len(train_overall)} samples")
print(f"Test:  {len(test_overall)} samples")


# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict,
                       title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                           title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet.
# Collect the original frame plus one frame per augmented sample and
# concatenate once (repeated concat inside a loop copies the growing frame
# every iteration).
prophet_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
prophet_frames.extend(
    train_overall[["ds"]].assign(y=aug_values)
    for aug_values in augmented_samples
)

# Sort by ds for Prophet (important for time series)
train_prophet_augmented = (
    pd.concat(prophet_frames, ignore_index=True)
    .sort_values("ds")
    .reset_index(drop=True)
)

print(f"\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")

# Prepare test data (unchanged)
test_prophet = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# ============================================================
# 5) PROPHET WITH AUGMENTED DATA
# ============================================================
print("\n" + "=" * 60)
print("TRAINING PROPHET WITH AUGMENTED DATA")
print("=" * 60)

# NOTE(review): the target here is standardised (zero-centred, takes both
# signs) yet the seasonality is multiplicative, whereas the residual model
# further down uses additive -- confirm that multiplicative is intended.
m_overall_augmented = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)

# Fit on augmented training data
m_overall_augmented.fit(train_prophet_augmented)

# Predict on test set
future_all_augmented = m_overall_augmented.make_future_dataframe(
    periods=len(test_prophet),
    freq="MS"
)
fcst_all_augmented = m_overall_augmented.predict(future_all_augmented)

# Extract test predictions
df_prophet_test_augmented = (
    fcst_all_augmented[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
# NOTE(review): standardised targets sit around zero, where MAPE divides by
# very small numbers -- prefer the original-unit MAPE below for reporting.
mape_prophet_augmented_norm = mean_absolute_percentage_error(
    df_prophet_test_augmented["y"],
    df_prophet_test_augmented["yhat"]
)

print(f"\n[Prophet - Augmented] MAPE : {mape_prophet_augmented_norm:.4f}")



# Convert to original units
df_prophet_test_augmented["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test_augmented[["y"]]
).flatten()
df_prophet_test_augmented["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test_augmented[["yhat"]]
).flatten()

mape_prophet_augmented_orig = mean_absolute_percentage_error(
    df_prophet_test_augmented["y_orig"],
    df_prophet_test_augmented["yhat_orig"]
)


# Full metric set on the normalised values, keyed with the "Prophet_Aug_" prefix.
prophet_metrics_augmented = compute_metrics(
    df_prophet_test_augmented["y"].values,
    df_prophet_test_augmented["yhat"].values,
    prefix="Prophet_Aug_"
)



# ============================================================
# 6) PINN WITH AUGMENTED DATA
# ============================================================
while True:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_time = np.arange(len(train_overall)).reshape(-1, 1)
    test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

    y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

    # Choose multiple physics columns
    PHYS_COLS_ALL = [
        PHYS_COL,
        TargetCol_raw,
        ProductionCol
    ]

    # --- Build X_phys with 3 columns ---
    missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
    if missing:
        raise ValueError(f"Missing physics columns: {missing}")

    X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

    # Optional: ensure no negatives
    if np.any(X_phys < 0):
        print("Warning: Negative physics values detected. Clipping to 0.")
        X_phys = np.clip(X_phys, a_min=0, a_max=None)

    # Split train/test
    X_phys_train = X_phys[:len(train_overall)]
    X_phys_test  = X_phys[len(train_overall):]

    # Second-level scaling for PINN
    scaler_time = StandardScaler()
    scaler_Y    = StandardScaler()
    scaler_phys = StandardScaler()

    train_time_scaled = scaler_time.fit_transform(train_time)
    test_time_scaled  = scaler_time.transform(test_time)

    y_train_scaled = scaler_Y.fit_transform(y_train_norm)

    X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

    # Torch tensors
    X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

    model     = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 7000
    best_loss = float("inf")
    patience  = 500
    counter   = 0
    training_log = []

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        loss      = data_loss + 0.25 * phys_loss

        loss.backward()
        optimizer.step()

        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        if epoch % 50 == 0:
            training_log.append({"epoch": epoch, "total_loss": loss.item(), "data_loss": data_loss.item(),"phys_loss": phys_loss.item()})

    

    # Predict on train/test (in normalized space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(
            torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

        y_test_pred_scaled = model(
            torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
        ).cpu().numpy()

    # Remove PINN scaler → back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # PINN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )
    # print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")
   

    # print("\n[NN Metrics]")
    # for k, v in pinn_metrics.items():
    #     print(f"{k}: {v:.4f}")


    # ---- PINN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    # print(f"[PINN]     MAPE (original)  : {mape_pinn_orig:.4f}")
    pinn_metrics = compute_metrics(
    df_pinn_test["y_norm"].values,
    df_pinn_test["pinn_pred_norm"].values,
    prefix="NN_"
    )

    
    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) WITH AUGMENTED DATA
    # ============================================================
 

    # Residuals on ORIGINAL training data
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prepare for Prophet
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res_augmented = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res_augmented.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res_augmented = m_res_augmented.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test_augmented = (
        forecast_res_augmented[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction
    df_hybrid_test_augmented = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test_augmented[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test_augmented["final_pred_norm"] = (
        df_hybrid_test_augmented["pinn_pred_norm"] +
        df_hybrid_test_augmented["res_pred_norm"]
    )

    # Hybrid MAPE (normalized space)
    mape_hybrid_augmented_norm = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y_norm"],
        df_hybrid_test_augmented["final_pred_norm"]
    )


    hybrid_metrics_augmented = compute_metrics(
        df_hybrid_test_augmented["y_norm"].values,
        df_hybrid_test_augmented["final_pred_norm"].values,
        prefix="Hybrid_Aug_"
    )

 

    # ============================================================
    # 8) CONVERT BACK TO ORIGINAL UNITS
    # ============================================================


    df_hybrid_test_augmented["final_pred_orig"] = scaler_y_global.inverse_transform(
        df_hybrid_test_augmented[["final_pred_norm"]]
    ).flatten()

    df_hybrid_test_augmented["y_orig"] = scaler_y_global.inverse_transform(
        df_hybrid_test_augmented[["y_norm"]]
    ).flatten()

    # Hybrid MAPE in ORIGINAL units
    mape_hybrid_augmented_orig = mean_absolute_percentage_error(
        df_hybrid_test_augmented["y_orig"],
        df_hybrid_test_augmented["final_pred_orig"]
    )

    if mape_hybrid_augmented_norm < mape_prophet_augmented_norm:
        break
# Collect the per-50-epoch loss snapshots recorded by the training loop above.
# Note: len(log_df) is the number of logged snapshots (one every 50 epochs),
# not the number of epochs actually trained.
log_df = pd.DataFrame(training_log)
print(f"\n[Training Log] {len(log_df)} epochs logged")



# ============================================================
# 9) FINAL COMPARISON: BASELINE VS AUGMENTED
# ============================================================
print("\n" + "=" * 60)
print("FINAL COMPARISON: BASELINE VS AUGMENTED")
print("=" * 60)

# Note: For baseline, we would need to also train the original models
# For now, showing augmented results
# All three MAPE values below are read from the normalized (standardized) space.
print(f"\n[Prophet - Augmented]")
print(f"  MAPE : {mape_prophet_augmented_norm:.4f}")

# NOTE(review): the training loop directly above stores the network's test
# MAPE in `mape_pinn_norm`; `mape_pinn_augmented_norm` is not assigned in the
# visible part of this cell. Confirm it is defined earlier and is not a stale
# value left in the kernel by another cell.
print(f"\n[PINN - Augmented]")
print(f"  MAPE : {mape_pinn_augmented_norm:.4f}")

print(f"\n[Hybrid - Augmented]")
print(f"  MAPE : {mape_hybrid_augmented_norm:.4f}")


# ============================================================
# METRICS SUMMARY TABLE
# ============================================================
# NOTE(review): the loop above produces `pinn_metrics` (prefix "NN_") and
# `hybrid_metrics_augmented` (prefix "Hybrid_Aug_"), but this table reads
# `prophet_metrics`, `nn_metrics` and `hybrid_metrics`. Verify those
# dictionaries come from this run; if they originate from a different cell the
# table reports results that do not belong to the augmented pipeline.


metrics_augmented_df = pd.DataFrame([
    {
        "Model": "Prophet (Aug)",
        "MAPE": prophet_metrics["Prophet_MAPE"],
        "R2": prophet_metrics["Prophet_R2"],
    },
    {
        "Model": "NN (Original)",
        "MAPE": nn_metrics["NN_MAPE"],
        "R2": nn_metrics["NN_R2"],
    },
    {
        "Model": "Hybrid (NN+Prophet Aug)",
        "MAPE": hybrid_metrics["Hybrid_MAPE"],
        "R2": hybrid_metrics["Hybrid_R2"],
    },
])

print("\n" + "=" * 60)
print("METRICS SUMMARY TABLE (WITH AUGMENTATION)")
print("=" * 60)
# Rounded to 4 decimals for display only; the DataFrame keeps full precision.
print(metrics_augmented_df.round(4).to_string(index=False))

# ============================================================
# Optional: Save results to CSV
# ============================================================
# metrics_augmented_df.to_csv("augmented_metrics.csv", index=False)
# df_hybrid_test_augmented.to_csv("hybrid_augmented_predictions.csv", index=False)

print("\n" + "=" * 60)
print("AUGMENTATION PIPELINE COMPLETED")
print("=" * 60)

## Different

In [None]:
df_2022.columns

In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Seed handed to set_seed() for numpy / torch reproducibility.
RANDOM_SEED   = 42
# Name of the raw date column that ensure_datetime_column() parses by default.
DATE_COL      = "datetime"
# Forecast target; renamed to "y" after monthly aggregation.
TARGET_COL    = "Scope1_per_unit"
# Plant identifier, used as part of the de-duplication key in clean_and_impute().
PLANT_COL     = "Plant Name"
# Number of trailing monthly rows held out as the test set.
TEST_MONTHS   = 7
# First rule feature (column 0 of the feature matrix passed to
# physics_residual_loss, where it is read into the variable `elec`).
# NOTE(review): despite the "optional" remark the training cell raises a
# ValueError when this column is missing, and a water-consumption column is
# fed into the slot the loss treats as electricity - confirm this is intended.
PHYS_COL      = "totalWaterConsumption"  # optional physics feature
# Second rule feature (column 1); read by physics_residual_loss but not used in its formula.
TargetCol_raw = "Scope1"
# Third rule feature (column 2); the denominator of the rule in physics_residual_loss.
ProductionCol = "Production Actual Quantity (MT/Month)"

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed every random number generator used by this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global generator
    and PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Parameters
    ----------
    seed : int, default 42
        Value passed unchanged to each generator.
    """
    # Local import keeps the function self-contained: the cell's import block
    # does not bring `random` into scope.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start timestamp column ``ds`` to ``df``.

    ``date_col`` is parsed when present; values that cannot be parsed become
    NaT. Every NaT row is then rebuilt from the ``year`` and ``month`` columns
    (day fixed to 1). Finally all dates are snapped to the first day of their
    month.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is modified in place (the ``ds`` column is written to
        it), so pass a copy when the original must stay untouched.
    date_col : str
        Name of the column holding raw dates.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the ``ds`` column added or overwritten.

    Raises
    ------
    ValueError
        If some dates are missing/unparseable and ``year``/``month`` columns
        are not both available to rebuild them.
    """
    if date_col in df.columns:
        # `infer_datetime_format` was dropped: it is deprecated in pandas 2.x,
        # where consistent-format inference is already the default behaviour.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Assemble dates only for the rows that failed to parse.
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip normalises every date to the first day of its month.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows and fill zero/missing numeric values with means.

    Steps:
    1. When ``ds``, the plant column and the target column all exist, rows
       duplicated on that triple are dropped (last occurrence kept).
    2. In every numeric column zeros are treated as missing. Missing values
       are filled with the column mean of the same ``year`` group (or of the
       whole frame when there is no ``year`` column).
    3. Anything still missing (a column that is entirely empty within a year)
       is filled with the overall column mean.

    Returns the cleaned frame; the frame passed in is not modified by the
    imputation helper itself, which works on copies.
    """
    dedup_keys = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_keys).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_keys, keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        # Zeros become NaN, then NaN is replaced by the group's column mean
        # (skipped when the column has no usable value in this group).
        filled = group.copy()
        for col in num_cols:
            values = filled[col].replace(0, np.nan)
            if values.notna().any():
                values = values.fillna(values.mean())
            filled[col] = values
        return filled

    has_year = "year" in df.columns
    if has_year:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    else:
        df = impute_zero_nan_with_mean(df)

    # Global fallback for columns that were all-missing inside some group.
    for col in num_cols:
        if df[col].isna().any():
            overall_mean = df[col].mean()
            df[col] = df[col].fillna(overall_mean)

    return df



def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per ``ds`` value.

    Every numeric column is summed within each ``ds`` group; non-numeric
    columns are left out. The result is ordered by ``ds`` and carries a fresh
    0..n-1 index.

    Raises ValueError when the frame has no ``ds`` column.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    summed_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly_totals = df.groupby("ds", as_index=False)[summed_cols].sum()
    monthly_totals = monthly_totals.sort_values("ds", ignore_index=True)

    return monthly_totals


# ============================================================
# 3) NN MODEL DEFINITION
# ============================================================
class NNModel(nn.Module):
    """Fully connected regressor: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dim : int
        Width of both hidden layers.
    out_dim : int
        Number of outputs.
    """

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        widths = (in_dim, hidden_dim, hidden_dim, out_dim)
        last = len(widths) - 2
        stack = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            stack.append(nn.Linear(fan_in, fan_out))
            # Every linear layer except the output one is followed by a ReLU.
            if position < last:
                stack.append(nn.ReLU())
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        output = self.net(x)
        return output


def regression_from_numpy_normalized(x, y):
    """
    Fit and plot a least-squares line on standardized copies of ``x`` and ``y``.

    Both arrays are flattened and standardized independently with
    StandardScaler, the slope ``m`` and intercept ``c`` of ``y = m*x + c`` are
    computed in that standardized space, and a scatter plot with the fitted
    line is shown.

    Returns
    -------
    (m, c) : slope and intercept in the standardized space.
    """
    x_column = x.flatten().reshape(-1, 1)
    y_column = y.flatten().reshape(-1, 1)

    x_norm = StandardScaler().fit_transform(x_column).flatten()
    y_norm = StandardScaler().fit_transform(y_column).flatten()

    # Closed-form ordinary least squares: covariance over variance of x.
    x_mean = x_norm.mean()
    y_mean = y_norm.mean()
    x_dev = x_norm - x_mean
    y_dev = y_norm - y_mean
    m = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
    c = y_mean - m * x_mean

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    ax.plot(x_line, m * x_line + c, color="red", linewidth=2,
            label=f"y = {m:.4f}x + {c:.4f}")

    ax.set_xlabel("x (normalized)")
    ax.set_ylabel("y (normalized)")
    ax.set_title("Linear Regression on Normalized Data")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend()
    fig.tight_layout()
    plt.show()

    return m, c


def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6,
                          delta: float = 1.0) -> torch.Tensor:
    """Huber-style penalty on the relative deviation from a rule-based reference.

    The reference is ``0.4727 * features[:, 0] / features[:, 2]``. The
    prediction's deviation from it is divided by ``|reference| + eps`` and
    penalised quadratically up to ``delta`` and linearly beyond.

    Parameters
    ----------
    y_pred : torch.Tensor
        Model output; squeezed before being compared with the reference.
    features : torch.Tensor
        2-D tensor whose column 0 is the numerator quantity and column 2 the
        production quantity. Column 1 is not used by the rule.
        NOTE(review): the training cell passes StandardScaler-transformed
        columns, so the ratio is taken between standardized values - confirm
        that this is intended.
    eps : float, default 1e-6
        Small constant guarding both divisions.
    delta : float, default 1.0
        Threshold between the quadratic and the linear regime.

    Returns
    -------
    torch.Tensor
        Scalar mean penalty.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # A production of exactly (or almost) zero would make the rule infinite
    # and the loss NaN; such entries are replaced by `eps`. Other values pass
    # through unchanged.
    safe_prod = torch.where(prod.abs() < eps, torch.full_like(prod, eps), prod)

    # Rule-based reference (not physics)
    rule = (0.4727 * elec) / safe_prod

    # Normalised deviation from the rule
    diff = (y_pred.squeeze() - rule) / (torch.abs(rule) + eps)
    abs_diff = torch.abs(diff)

    # Huber-style soft rule penalty
    penalty = torch.where(
        abs_diff <= delta,
        0.5 * diff ** 2,
        delta * (abs_diff - 0.5 * delta),
    )

    return penalty.mean()



# ============================================================
# 4) MAIN PIPELINE
# ============================================================
# Work on a copy: ensure_datetime_column() writes the 'ds' column in place.
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# One row per month with every numeric column summed across plants.
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Chronological hold-out: the last TEST_MONTHS rows form the test set.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Need more than {TEST_MONTHS} monthly rows to build a training set; "
        f"got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# The scaler is fitted on the training rows only so that the mean and standard
# deviation of the held-out months do not leak into training; the same
# transform is then applied to the full series.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

# Train/test split
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))

# Diagnostic plot only; the returned slope/intercept are not used afterwards.
regression_from_numpy_normalized(train_overall["y"].values, train_overall[PHYS_COL].values)
# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# NOTE(review): multiplicative seasonality is applied to a standardized series
# that changes sign - confirm this mode is intended.
m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)
m_overall.fit(train_prophet)

# Forecast the training span plus one month-start step per test row.
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)

# Keep only the forecast rows whose date matches a test row.
df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")

# ---- Prophet MAPE in ORIGINAL units ----
df_prophet_test["y_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["y"]]
).flatten()
df_prophet_test["yhat_orig"] = scaler_y_global.inverse_transform(
    df_prophet_test[["yhat"]]
).flatten()
mape_prophet_orig = mean_absolute_percentage_error(
    df_prophet_test["y_orig"], df_prophet_test["yhat_orig"]
)
# print(f"[Prophet]  MAPE (original)  : {mape_prophet_orig:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Maximum number of fresh network initialisations. The previous `while True`
# only stopped once the hybrid beat Prophet on the test split, so it could run
# forever. NOTE(review): choosing a run by its test-set score biases the
# reported metric - consider selecting on a validation split instead.
MAX_TRAIN_ATTEMPTS = 20

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Inputs that do not change between attempts (built once) ----
# Time is the network's only input: 0..n_train-1 for training, continuing for test.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Columns passed to physics_residual_loss (order matters: it reads columns 0 and 2)
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

# --- Build X_phys with 3 columns ---
missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
X_phys_test  = X_phys[len(train_overall):]

# Second-level scaling for the network (all scalers fitted on training rows only)
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train_norm)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)

for attempt in range(1, MAX_TRAIN_ATTEMPTS + 1):
    # ============================================================
    # 6) NN WITH RULE PENALTY (ALL NORMALIZED)
    # ============================================================
    # NNModel is the network class defined in this cell; a new instance gives
    # each attempt a fresh random initialisation.
    model     = NNModel(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    n_epochs  = 10000
    best_loss = float("inf")
    patience  = 500   # epochs without improvement of the training loss before stopping
    counter   = 0
    training_log = []

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()

        y_pred    = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        phys_loss = physics_residual_loss(y_pred, X_phys_t)
        # Total objective: data fit plus the rule penalty weighted by 0.125
        loss      = data_loss + 0.125 * phys_loss

        loss.backward()
        optimizer.step()

        # Early stopping on the (full-batch) training loss
        if loss.item() < best_loss:
            best_loss = loss.item()
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            break

        # Keep a loss snapshot every 50 epochs
        if epoch % 50 == 0:
            training_log.append({"epoch": epoch, "total_loss": loss.item(), "data_loss": data_loss.item(),"phys_loss": phys_loss.item()})

    # Predict on train/test (in normalized space)
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled  = model(X_test_t).cpu().numpy()

    # Remove the network's own scaler -> back to GLOBAL-NORMALIZED space
    y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
    y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

    train_overall["pinn_pred_norm"] = y_train_pred_norm
    test_overall["pinn_pred_norm"]  = y_test_pred_norm

    # NN MAPE in NORMALIZED SPACE
    df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

    mape_pinn_norm = mean_absolute_percentage_error(
        df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
    )
    # print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")

    # ---- NN MAPE in ORIGINAL units ----
    df_pinn_test["y_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["y_norm"]]
    ).flatten()
    df_pinn_test["pinn_pred_orig"] = scaler_y_global.inverse_transform(
        df_pinn_test[["pinn_pred_norm"]]
    ).flatten()
    mape_pinn_orig = mean_absolute_percentage_error(
        df_pinn_test["y_orig"], df_pinn_test["pinn_pred_orig"]
    )
    # print(f"[PINN]     MAPE (original)  : {mape_pinn_orig:.4f}")


    # ============================================================
    # 7) HYBRID (RULE-NN + PROPHET RESIDUAL) - NORMALIZED
    # ============================================================

    # Residuals = what NN did NOT explain
    train_overall["residual_norm"] = (
        train_overall["y_norm"] - train_overall["pinn_pred_norm"]
    )
    test_overall["residual_norm"] = (
        test_overall["y_norm"] - test_overall["pinn_pred_norm"]
    )

    # Prophet expects columns: ds, y
    train_res = train_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )
    test_res = test_overall[["ds", "residual_norm"]].rename(
        columns={"residual_norm": "y"}
    )

    # Prophet on residuals (additive, zero-centered)
    m_res = Prophet(
        seasonality_mode="additive",
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.1,
    )

    m_res.fit(train_res)

    # Predict residuals exactly on test dates
    forecast_res = m_res.predict(test_res[["ds"]])

    # Merge residual predictions
    df_res_test = (
        forecast_res[["ds", "yhat"]]
        .merge(test_res[["ds", "y"]], on="ds", how="inner")
        .rename(columns={"yhat": "res_pred_norm"})
    )

    # Hybrid reconstruction: NN prediction + predicted residual
    df_hybrid_test = (
        test_overall[["ds", "y_norm", "pinn_pred_norm"]]
        .merge(df_res_test[["ds", "res_pred_norm"]], on="ds", how="left")
    )

    df_hybrid_test["final_pred_norm"] = (
        df_hybrid_test["pinn_pred_norm"] +
        df_hybrid_test["res_pred_norm"]
    )

    # Hybrid MAPE (normalized space - for comparison only)
    mape_hybrid_norm = mean_absolute_percentage_error(
        df_hybrid_test["y_norm"],
        df_hybrid_test["final_pred_norm"]
    )
    if mape_hybrid_norm < mape_prophet_norm:
        # print(f"[Hybrid] MAPE (normalized): {mape_hybrid_norm:.4f}")
        break
else:
    # Every attempt was used up; the variables hold the last attempt's results.
    print(
        f"Warning: hybrid did not beat Prophet within {MAX_TRAIN_ATTEMPTS} "
        "attempts; keeping the last attempt's results."
    )
# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
# Loss snapshots (one row per 50 epochs) from the last training attempt.
log_df = pd.DataFrame(training_log)
print(log_df)

# Undo the global standardization so predictions are in the target's own units.
test_overall["pinn_pred_orig"] = scaler_y_global.inverse_transform(
    test_overall[["pinn_pred_norm"]]
).flatten()

df_hybrid_test["final_pred_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["final_pred_norm"]]
).flatten()

df_hybrid_test["y_orig"] = scaler_y_global.inverse_transform(
    df_hybrid_test[["y_norm"]]
).flatten()

# ---- Hybrid MAPE in ORIGINAL units ----
mape_hybrid_orig = mean_absolute_percentage_error(
    df_hybrid_test["y_orig"], df_hybrid_test["final_pred_orig"]
)


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
# All three figures are reported in the same (normalized) space so that they
# are comparable. The NN line previously printed the original-unit MAPE next
# to two normalized-space values.
print("\n==================== FINAL RESULTS ====================")
print(f"Prophet-only MAPE : {mape_prophet_norm:.4f}")
print(f"NN-only MAPE      : {mape_pinn_norm:.4f}")
print(f"Hybrid MAPE       : {mape_hybrid_norm:.4f}")
print("=======================================================\n")




# DA - STL

In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Seed handed to set_seed() for numpy / torch reproducibility.
RANDOM_SEED   = 42
# Name of the raw date column that ensure_datetime_column() parses by default.
DATE_COL      = "datetime"
# Forecast target; default column aggregated by prepare_overall_series().
TARGET_COL    = "Scope1_per_unit"
# Plant identifier, used as part of the de-duplication key in clean_and_impute().
PLANT_COL     = "Plant Name"
# presumably the number of trailing months held out for testing - the split is not visible in this part of the cell
TEST_MONTHS   = 7
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # optional physics feature
TargetCol_raw= "Scope1"
ProductionCol= "Production Actual Quantity (MT/Month)"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
# Default number of jittered copies produced by create_augmented_samples().
NUM_AUGMENTED_SAMPLES = 4  
# Default noise scale used by apply_jittering().
# NOTE(review): an earlier comment here said 0.05 while the value is 0.5 -
# confirm which one is intended.
JITTER_STD_MULTIPLIER = 0.5  # sigma = 0.5 * std(residuals)
# Default seasonal period for decompose_time_series() (presumably 12 months per year - confirm).
DECOMPOSITION_PERIOD = 12  

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed every random number generator used by this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global generator
    (the one apply_jittering draws its noise from) and PyTorch (CPU, plus all
    CUDA devices when CUDA is available).

    Parameters
    ----------
    seed : int, default 42
        Value passed unchanged to each generator.
    """
    # Local import keeps the function self-contained: the cell's import block
    # does not bring `random` into scope.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start timestamp column ``ds`` to ``df``.

    ``date_col`` is parsed when present; values that cannot be parsed become
    NaT. Every NaT row is then rebuilt from the ``year`` and ``month`` columns
    (day fixed to 1). Finally all dates are snapped to the first day of their
    month.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is modified in place (the ``ds`` column is written to
        it), so pass a copy when the original must stay untouched.
    date_col : str
        Name of the column holding raw dates.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the ``ds`` column added or overwritten.

    Raises
    ------
    ValueError
        If some dates are missing/unparseable and ``year``/``month`` columns
        are not both available to rebuild them.
    """
    if date_col in df.columns:
        # `infer_datetime_format` was dropped: it is deprecated in pandas 2.x,
        # where consistent-format inference is already the default behaviour.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Assemble dates only for the rows that failed to parse.
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip normalises every date to the first day of its month.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate rows and fill zero/missing numeric values with means.

    Steps:
    1. When ``ds``, the plant column and the target column all exist, rows
       duplicated on that triple are dropped (last occurrence kept).
    2. In every numeric column zeros are treated as missing. Missing values
       are filled with the column mean of the same ``year`` group (or of the
       whole frame when there is no ``year`` column).
    3. Anything still missing (a column that is entirely empty within a year)
       is filled with the overall column mean.

    Returns the cleaned frame; the frame passed in is not modified by the
    imputation helper itself, which works on copies.
    """
    dedup_keys = ["ds", PLANT_COL, TARGET_COL]
    if set(dedup_keys).issubset(df.columns):
        df = df.drop_duplicates(subset=dedup_keys, keep="last")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
        # Zeros become NaN, then NaN is replaced by the group's column mean
        # (skipped when the column has no usable value in this group).
        filled = group.copy()
        for col in num_cols:
            values = filled[col].replace(0, np.nan)
            if values.notna().any():
                values = values.fillna(values.mean())
            filled[col] = values
        return filled

    has_year = "year" in df.columns
    if has_year:
        df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    else:
        df = impute_zero_nan_with_mean(df)

    # Global fallback for columns that were all-missing inside some group.
    for col in num_cols:
        if df[col].isna().any():
            overall_mean = df[col].mean()
            df[col] = df[col].fillna(overall_mean)

    return df


def prepare_overall_series(df: pd.DataFrame,
                           target_col: str = TARGET_COL) -> pd.DataFrame:
    """Sum ``target_col`` per ``ds`` value and return it as a two-column frame.

    The result has columns ``ds`` and ``y`` (the summed target), is ordered by
    ``ds`` and carries a fresh 0..n-1 index.
    """
    monthly = df.groupby("ds", as_index=False)[target_col].sum()
    monthly = monthly.sort_values("ds")
    monthly = monthly.reset_index(drop=True)
    monthly = monthly.rename(columns={target_col: "y"})
    return monthly

def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per ``ds`` value.

    Every numeric column is summed within each ``ds`` group; non-numeric
    columns (e.g. strings) are left out. The result is ordered by ``ds`` and
    carries a fresh 0..n-1 index.

    Raises ValueError when the frame has no ``ds`` column.
    """
    # The grouping key must exist before anything is aggregated.
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    # Only numeric columns can be summed meaningfully.
    summed_cols = list(df.select_dtypes(include=[np.number]).columns)

    monthly_totals = df.groupby("ds", as_index=False)[summed_cols].sum()
    monthly_totals = monthly_totals.sort_values("ds", ignore_index=True)

    return monthly_totals

# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    """Split a series into additive trend, seasonal and residual components.

    Parameters
    ----------
    series : pd.Series
        Series to decompose.
    period : int
        Seasonal period. When the series is shorter than two full periods the
        period is reduced to ``max(2, len(series) // 2)``.

    Returns
    -------
    dict
        ``'trend'``, ``'seasonal'`` and ``'residual'`` as NumPy arrays plus the
        ``'period'`` actually used. Missing trend values are back-filled then
        forward-filled; missing residuals are set to 0. If the decomposition
        raises, the series itself is returned as the trend with all-zero
        seasonal and residual parts.
    """
    # Imported here, outside the try-block, so the function does not depend on
    # an import made elsewhere in the notebook. Without it a missing name would
    # be swallowed by the fallback below and every call would silently return
    # the degenerate zero decomposition.
    from statsmodels.tsa.seasonal import seasonal_decompose

    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        adjusted_period = max(2, len(series) // 2)
        # Report the period that is really applied (the floor of 2 included).
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {adjusted_period}.")
        period = adjusted_period

    try:
        decomposition = seasonal_decompose(
            series,
            model='additive',
            period=period,
        )

        return {
            # .bfill()/.ffill() replace the deprecated fillna(method=...) form.
            'trend': decomposition.trend.bfill().ffill().values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        # Best-effort fallback: keep the pipeline running with a trivial split.
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:
    """Return ``residuals`` plus zero-mean Gaussian noise.

    The noise standard deviation is ``std_multiplier`` times the standard
    deviation of the non-NaN residuals; when that standard deviation is exactly
    0 it is replaced by 1.0. Noise is drawn from NumPy's global generator, so
    results depend on ``np.random.seed``.
    """
    finite_values = residuals[~np.isnan(residuals)]
    spread = np.std(finite_values)
    if spread == 0:
        spread = 1.0

    sigma = std_multiplier * spread
    noise = np.random.normal(0, sigma, size=len(residuals))
    jittered = residuals + noise
    return jittered


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None) -> list:
    """
    Build ``num_samples`` noisy variants of ``series``.

    The series is decomposed into trend (T), seasonal (S) and residual (R)
    parts - or a ready-made ``decomposition_dict`` is used when given - and
    each variant is rebuilt as ``T + S + jitter(R)``.

    Returns a list of arrays, one per variant.
    """
    parts = decomposition_dict
    if parts is None:
        parts = decompose_time_series(series)

    # Trend and seasonality are shared by all variants; only the residual is perturbed.
    baseline = parts['trend'] + parts['seasonal']
    residual = parts['residual']

    return [
        baseline + apply_jittering(residual.copy())
        for _ in range(num_samples)
    ]


def prepare_augmented_training_data(train_df: pd.DataFrame,
                                   num_augmented: int = NUM_AUGMENTED_SAMPLES,
                                   decomposition_dict: dict = None) -> pd.DataFrame:
    """Stack the original training rows with jittered copies of ``y_norm``.

    ``train_df`` must contain ``ds`` and ``y_norm``. Each augmented copy keeps
    the dates and stores the augmented values in both ``y_norm`` and ``y``,
    plus ``augmented=True`` and an ``augmentation_id``. The original rows are
    placed first and are not given those two tracking columns here.

    NOTE(review): if ``train_df['y']`` holds un-normalized values, the ``y``
    column of the result mixes that scale with normalized augmented values -
    verify against the caller.

    Returns the concatenated frame with a fresh index.
    """
    base_series = pd.Series(train_df['y_norm'].values, index=range(len(train_df)))

    variants = create_augmented_samples(
        base_series,
        num_samples=num_augmented,
        decomposition_dict=decomposition_dict
    )

    # Original data first, then one frame per augmented variant.
    frames = [train_df.copy()]
    for variant_id, variant_values in enumerate(variants):
        variant_df = train_df[['ds']].copy()
        variant_df['y_norm'] = variant_values
        variant_df['y'] = variant_values  # same values exposed under Prophet's column name
        variant_df['augmented'] = True
        variant_df['augmentation_id'] = variant_id
        frames.append(variant_df)

    augmented_train = pd.concat(frames, ignore_index=True)

    print(f"\n[AUGMENTATION] Original training size: {len(train_df)}")
    print(f"[AUGMENTATION] Augmented versions created: {num_augmented}")
    print(f"[AUGMENTATION] Total augmented training size: {len(augmented_train)}")

    return augmented_train


def visualize_decomposition(series: pd.Series, 
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Plot the series and its three components in four stacked panels.

    ``decomposition_dict`` must provide ``'trend'``, ``'seasonal'`` and
    ``'residual'`` arrays aligned with ``series``. The figure is shown
    immediately; nothing is returned.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))
    x_values = series.index

    # (values, matplotlib format/colour, y-axis label) for each panel, top to bottom
    panels = [
        (series.values, 'b-', 'Original'),
        (decomposition_dict['trend'], 'g-', 'Trend'),
        (decomposition_dict['seasonal'], 'orange', 'Seasonal'),
        (decomposition_dict['residual'], 'r-', 'Residual'),
    ]
    for ax, (values, fmt, label) in zip(axes, panels):
        ax.plot(x_values, values, fmt, linewidth=1.5)
        ax.set_ylabel(label)
        ax.grid(True, alpha=0.3)

    axes[0].set_title(title)
    axes[3].set_xlabel('Time Index')

    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Overlay the augmented series on the original one in a single figure.

    The original is drawn as a thick blue line; each augmented sample gets a
    thin, semi-transparent line whose colour cycles through a fixed palette.
    The figure is shown immediately; nothing is returned.
    """
    palette = ['red', 'green', 'orange', 'purple', 'brown']
    x_values = original_series.index

    plt.figure(figsize=(12, 6))
    plt.plot(x_values, original_series.values,
             'b-', linewidth=2.5, label='Original', alpha=0.8)

    for number, sample in enumerate(augmented_samples, start=1):
        plt.plot(x_values, sample,
                 linewidth=1, alpha=0.5,
                 label=f'Augmented {number}',
                 color=palette[(number - 1) % len(palette)])

    plt.xlabel('Time Index')
    plt.ylabel('Normalized Value')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# ============================================================
# 3) PINN MODEL DEFINITION
# ============================================================
class PINN(nn.Module):
    """Small fully connected network: two hidden ReLU layers and a linear output.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dim : int
        Width of both hidden layers.
    out_dim : int
        Number of outputs.
    """

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        widths = (in_dim, hidden_dim, hidden_dim, out_dim)
        last_layer = len(widths) - 2

        layers = []
        for position, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(n_in, n_out))
            # Every layer except the output layer is followed by a ReLU.
            if position < last_layer:
                layers.append(nn.ReLU())

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        out = self.net(x)
        return out

from sklearn.preprocessing import StandardScaler

def regression_from_numpy_normalized(x, y):
    """
    Standardize ``x`` and ``y`` and fit the least-squares line y = m*x + c
    in the standardized space, plotting the points and the fitted line.

    Parameters
    ----------
    x, y : np.ndarray
        Arrays of any shape; both are flattened before use.

    Returns
    -------
    (m, c) : tuple
        Slope and intercept of the line fitted to the standardized data.
    """
    # StandardScaler expects 2-D input, so work with column vectors.
    x_col = x.flatten().reshape(-1, 1)
    y_col = y.flatten().reshape(-1, 1)

    x_norm = StandardScaler().fit_transform(x_col).flatten()
    y_norm = StandardScaler().fit_transform(y_col).flatten()

    # Closed-form ordinary least squares on the standardized values.
    x_mean = x_norm.mean()
    y_mean = y_norm.mean()
    dx = x_norm - x_mean
    dy = y_norm - y_mean
    m = np.sum(dx * dy) / np.sum(dx ** 2)
    c = y_mean - m * x_mean

    # Scatter of the data with the fitted line drawn across the x-range.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x_norm, y_norm, alpha=0.7, label="Normalized Data")

    x_line = np.linspace(x_norm.min(), x_norm.max(), 100)
    ax.plot(x_line, m * x_line + c, color="red", linewidth=2,
            label=f"y = {m:.4f}x + {c:.4f}")

    ax.set_xlabel("x (normalized)")
    ax.set_ylabel("y (normalized)")
    ax.set_title("Linear Regression on Normalized Data")
    ax.grid(True, linestyle="--", alpha=0.4)
    ax.legend()
    fig.tight_layout()
    plt.show()

    return m, c



def physics_residual_loss(y_pred: torch.Tensor,
                          features: torch.Tensor,
                          eps: float = 1e-6) -> torch.Tensor:
    """One-sided penalty for predictions that fall below a physics estimate.

    The estimate is ``0.4727 * features[:, 0] / features[:, 2]``. Only
    predictions *below* the estimate are penalised (ReLU), and the shortfall
    is expressed relative to the magnitude of the estimate.

    Parameters
    ----------
    y_pred : torch.Tensor
        Model output with one value per row of ``features`` (``(N, 1)`` or ``(N,)``).
    features : torch.Tensor
        2-D tensor with at least three columns. Column 0 is the numerator
        term and column 2 the denominator; column 1 is not used by the loss.
    eps : float
        Lower bound on the magnitude of both denominators, so a zero in
        column 2 (or a zero estimate) cannot produce inf/NaN.

    Returns
    -------
    torch.Tensor
        Scalar mean relative violation.
    """
    elec = features[:, 0]
    prod = features[:, 2]

    # Keep the sign of the denominator but bound its magnitude away from zero;
    # values with |prod| >= eps pass through unchanged.
    sign = torch.where(prod < 0, -torch.ones_like(prod), torch.ones_like(prod))
    safe_prod = sign * torch.clamp(prod.abs(), min=eps)

    physics_estimate = (0.4727 * elec) / safe_prod
    violation = torch.relu(physics_estimate - y_pred.reshape(-1))
    return torch.mean(violation / (torch.abs(physics_estimate) + eps))


# ============================================================
# 4) MAIN PIPELINE
# ============================================================
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# One row per month: numeric columns summed across plants, ordered by date.
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
if TARGET_COL not in overall.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found after aggregation.")
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Not enough rows ({len(overall)}) to hold out {TEST_MONTHS} test months."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# The scaler is fitted on the training rows only so that the held-out months
# do not leak their mean/variance into the normalization; the same fitted
# scaler is then applied to the whole series.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))

# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series.
# A plain 0..N-1 integer index is used instead of dates.
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict,
                        title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                            title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet.
# The original rows come first, followed by one frame per augmented sample
# (same dates, jittered values). All frames are concatenated in a single call
# rather than growing the frame inside a loop.
_prophet_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
_prophet_frames.extend(
    train_overall[["ds"]].assign(y=aug_values)
    for aug_values in augmented_samples
)
train_prophet_augmented = pd.concat(_prophet_frames, ignore_index=True)

# Sort by ds for Prophet (important for time series)
train_prophet_augmented = train_prophet_augmented.sort_values("ds").reset_index(drop=True)

print("\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")

# Prepare test data (unchanged)
test_prophet = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})




# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects the target column to be called "y".
_norm_to_y = {"y_norm": "y"}
train_prophet = train_overall[["ds", "y_norm"]].rename(columns=_norm_to_y)
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns=_norm_to_y)

m_overall = Prophet(
    changepoint_prior_scale=0.1,
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
)
m_overall.fit(train_prophet)

# Forecast the training span plus one month-start step per held-out row.
_n_test_periods = len(test_prophet)
future_all = m_overall.make_future_dataframe(periods=_n_test_periods, freq="MS")
fcst_all   = m_overall.predict(future_all)

# Keep only forecast rows whose date matches a test row.
df_prophet_test = pd.merge(
    fcst_all[["ds", "yhat"]],
    test_prophet[["ds", "y"]],
    on="ds",
    how="inner",
).sort_values("ds")

# MAPE in NORMALIZED SPACE
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")


# ============================================================
# 6) PINN (ALL NORMALIZED)
# ============================================================
# Train the PINN on a time-index -> normalized-target mapping, then predict
# both splits and report MAPE in the globally-normalized space.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# The only model input is the integer position of each month; test positions
# continue where the training positions end.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time  = np.arange(len(train_overall), len(overall)).reshape(-1, 1)

y_train_norm = train_overall["y_norm"].values.reshape(-1, 1)

# Choose multiple physics columns
# Column order matters: physics_residual_loss reads columns 0, 1, 2 by position.
PHYS_COLS_ALL = [
    PHYS_COL,
    TargetCol_raw,
    ProductionCol
]

# --- Build X_phys with 3 columns ---
missing = [c for c in PHYS_COLS_ALL if c not in overall.columns]
if missing:
    raise ValueError(f"Missing physics columns: {missing}")

X_phys = overall[PHYS_COLS_ALL].values   # shape: (N, 3)
print("Physics columns used:", PHYS_COLS_ALL)

# Optional: ensure no negatives
if np.any(X_phys < 0):
    print("Warning: Negative physics values detected. Clipping to 0.")
    X_phys = np.clip(X_phys, a_min=0, a_max=None)

# Split train/test
X_phys_train = X_phys[:len(train_overall)]
# NOTE(review): X_phys_test is not used anywhere in this section.
X_phys_test  = X_phys[len(train_overall):]



# Second-level scaling for PINN
# All three scalers are fitted on training data only.
scaler_time = StandardScaler()
scaler_Y    = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_time.fit_transform(train_time)
test_time_scaled  = scaler_time.transform(test_time)

y_train_scaled    = scaler_Y.fit_transform(y_train_norm)

# NOTE(review): the physics features are standardized (zero-centred) before
# being handed to physics_residual_loss, which divides by column 2. A
# standardized denominator can be ~0 or negative, so the clipping above no
# longer guarantees non-negative inputs to the loss. Confirm this is intended.
X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)

# Torch tensors
X_t      = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
Y_t      = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)



model     = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Full-batch training with a patience counter on the combined loss.
n_epochs  = 1000
best_loss = float("inf")
patience  = 500
counter   = 0

for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()

    y_pred    = model(X_t)
    data_loss = torch.mean((y_pred - Y_t) ** 2)
    phys_loss = physics_residual_loss(y_pred, X_phys_t)
    # Total loss = MSE + 0.25 * physics penalty.
    loss      = data_loss + 0.25 * phys_loss

    loss.backward()
    optimizer.step()

    # Early stopping only ends the loop; the weights from the best epoch are
    # not saved or restored, so the final model is whatever the last step left.
    if loss.item() < best_loss:
        best_loss = loss.item()
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        break
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.6f}, Data Loss: {data_loss.item():.6f}, Phys Loss: {phys_loss.item():.6f}")

# Predict on train/test (in normalized space)
model.eval()
with torch.no_grad():
    y_train_pred_scaled = model(
        torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
    ).cpu().numpy()

    # Test inputs lie beyond the training time range, so this is extrapolation.
    y_test_pred_scaled = model(
        torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
    ).cpu().numpy()

# Remove PINN scaler → back to GLOBAL-NORMALIZED space
y_train_pred_norm = scaler_Y.inverse_transform(y_train_pred_scaled).flatten()
y_test_pred_norm  = scaler_Y.inverse_transform(y_test_pred_scaled).flatten()

train_overall["pinn_pred_norm"] = y_train_pred_norm
test_overall["pinn_pred_norm"]  = y_test_pred_norm


# PINN MAPE in NORMALIZED SPACE
# NOTE(review): normalized targets are centred near zero, where percentage
# errors become very large; interpret this MAPE with care.
df_pinn_test = test_overall[["y_norm", "pinn_pred_norm"]].copy()

mape_pinn_norm = mean_absolute_percentage_error(
    df_pinn_test["y_norm"], df_pinn_test["pinn_pred_norm"]
)
print(f"[PINN]     MAPE (normalized): {mape_pinn_norm:.4f}")


# ============================================================
# 7) HYBRID (PINN + PROPHET RESIDUAL) — NORMALIZED
# ============================================================
# Residual = part of the normalized target the PINN did not explain.
for _split_df in (train_overall, test_overall):
    _split_df["residual_norm"] = _split_df["y_norm"] - _split_df["pinn_pred_norm"]

# Prophet expects the series to model in a column named "y".
_residual_to_y = {"residual_norm": "y"}
train_res = train_overall[["ds", "residual_norm"]].rename(columns=_residual_to_y)
test_res  = test_overall[["ds", "residual_norm"]].rename(columns=_residual_to_y)

m_res = Prophet(
    changepoint_prior_scale=0.1,
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
)

m_res.fit(train_res)

# Forecast residuals over the training span plus the held-out months.
_n_res_periods = len(test_res)
future_res = m_res.make_future_dataframe(periods=_n_res_periods, freq="MS")
forecast_res = m_res.predict(future_res)

# Residual forecasts restricted to dates present in the test set.
df_res_test = pd.merge(
    forecast_res[["ds", "yhat"]],
    test_res[["ds", "y"]],
    on="ds",
    how="inner",
).rename(columns={"yhat": "res_pred_norm"})

# Attach the residual forecast to every test row (left join keeps all test rows).
df_hybrid_test = pd.merge(
    test_overall[["ds", "y_norm", "pinn_pred_norm"]],
    df_res_test[["ds", "res_pred_norm"]],
    on="ds",
    how="left",
)

# Hybrid prediction = PINN prediction + forecast residual.
df_hybrid_test["final_pred_norm"] = df_hybrid_test["pinn_pred_norm"].add(
    df_hybrid_test["res_pred_norm"]
)

# Hybrid MAPE in normalized space
mape_hybrid_norm = mean_absolute_percentage_error(
    df_hybrid_test["y_norm"], df_hybrid_test["final_pred_norm"]
)

print(f"[Hybrid]  MAPE (normalized): {mape_hybrid_norm:.4f}")


# ============================================================
# 8) CONVERT BACK TO ORIGINAL UNITS (for plots/output)
# ============================================================
def _to_original_units(normalized_values):
    """Undo the global target scaling for a 1-D column of normalized values."""
    as_column = np.asarray(normalized_values, dtype=float).reshape(-1, 1)
    return scaler_y_global.inverse_transform(as_column).flatten()


test_overall["pinn_pred_orig"] = _to_original_units(test_overall["pinn_pred_norm"])

df_hybrid_test["final_pred_orig"] = _to_original_units(df_hybrid_test["final_pred_norm"])

df_hybrid_test["y_orig"] = _to_original_units(df_hybrid_test["y_norm"])


# ============================================================
# 9) FINAL SUMMARY
# ============================================================
_summary_lines = [
    "\n==================== FINAL RESULTS ====================",
    f"Prophet-only MAPE (normalized) : {mape_prophet_norm:.4f}",
    f"PINN-only MAPE (normalized)    : {mape_pinn_norm:.4f}",
    f"Hybrid MAPE (normalized)       : {mape_hybrid_norm:.4f}",
    "=======================================================\n",
]
for _line in _summary_lines:
    print(_line)


In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Notebook-level configuration for this cell.
RANDOM_SEED   = 42                 # seed passed to set_seed()
DATE_COL      = "datetime"         # default date column for ensure_datetime_column()
TARGET_COL    = "Scope1_per_unit"  # column renamed to "y" and forecast
PLANT_COL     = "Plant Name"       # used in the duplicate-row key in clean_and_impute()
TEST_MONTHS   = 7                  # number of trailing rows held out as the test set
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # optional physics feature
TargetCol_raw= "Scope1"
ProductionCol= "Production Actual Quantity (MT/Month)"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  # jittered copies generated per training series
JITTER_STD_MULTIPLIER = 0.5  # sigma = 0.5 * std(residuals)
DECOMPOSITION_PERIOD = 12  # seasonal period passed to decompose_time_series()

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed NumPy's global RNG and PyTorch (CPU, plus every CUDA device if available)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start timestamp column ``ds`` to ``df`` and return it.

    Dates are parsed from ``date_col`` when that column exists; values that
    fail to parse (or all rows, when the column is absent) are rebuilt from
    the ``year`` and ``month`` columns using day 1. Every date is then
    snapped to the first day of its month.

    Note: ``df`` is modified in place (the ``ds`` column is added to the
    frame that was passed in); pass a copy to keep the original untouched.

    Raises
    ------
    ValueError
        If some dates are missing and ``year``/``month`` are not both present.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated in pandas 2.x (format
        # inference is the default), so it is no longer passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Only the rows lacking a valid date are rebuilt; assignment aligns on index.
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip snaps every timestamp to the start of its month.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicate rows and impute numeric columns (0 -> NaN -> mean).

    Steps:
      1. If ``ds``, ``PLANT_COL`` and ``TARGET_COL`` are all present, drop
         rows duplicated on that key, keeping the last occurrence.
      2. Treat zeros in numeric columns as missing.
      3. Fill missing values with the mean of the same ``year`` (when a
         ``year`` column exists), then with the overall column mean for
         anything still missing.

    The grouping column ``year`` itself is left untouched. The input frame is
    never modified; a new frame is returned with the original row order.

    Implementation note: per-year means are computed with
    ``groupby.transform`` instead of ``groupby.apply``, so the function does
    not depend on ``apply`` passing the grouping column to the callback
    (deprecated in recent pandas) and rows with a missing ``year`` are kept
    (they fall through to the overall mean) rather than silently dropped.
    """
    if {"ds", PLANT_COL, TARGET_COL}.issubset(df.columns):
        df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")
    else:
        # Work on a copy so the caller's frame is not modified below.
        df = df.copy()

    has_year = "year" in df.columns
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    value_cols = [c for c in num_cols if not (has_year and c == "year")]
    if not value_cols:
        return df

    values = df[value_cols].replace(0, np.nan)

    if has_year:
        # Positional grouping key; rows whose year is NaN get NaN means here
        # and are handled by the overall-mean fallback.
        yearly_means = values.groupby(df["year"].to_numpy()).transform("mean")
        values = values.fillna(yearly_means)

    # Fallback for gaps that had no usable per-year mean.
    values = values.fillna(values.mean())

    df[value_cols] = values
    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per ``ds`` by summing every numeric column.

    Non-numeric columns are dropped. The result has ``ds`` as its first
    column, is ordered by ``ds`` and carries a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no ``ds`` column.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    # Only numeric columns can be summed meaningfully.
    summable = list(df.select_dtypes(include=[np.number]).columns)

    monthly_totals = df.groupby("ds", as_index=False)[summable].sum()
    monthly_totals = monthly_totals.sort_values("ds")
    return monthly_totals.reset_index(drop=True)

# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    """Split ``series`` into additive trend, seasonal and residual components.

    Parameters
    ----------
    series : pd.Series
        Series to decompose.
    period : int
        Seasonal period. If the series is shorter than two full periods the
        period is reduced to ``max(2, len(series) // 2)``.

    Returns
    -------
    dict
        ``'trend'``, ``'seasonal'``, ``'residual'`` (NumPy arrays the same
        length as ``series``) and ``'period'`` (the period actually used).
        Edge NaNs in the trend are back-/forward-filled and NaN residuals are
        set to 0. If the decomposition itself raises, the series is returned
        as the trend with zero seasonal and residual components.
    """
    # Imported here so the function does not depend on an earlier cell having
    # imported it; otherwise a missing name would be swallowed by the broad
    # except below and silently turn every call into the zero-component fallback.
    from statsmodels.tsa.seasonal import seasonal_decompose

    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        adjusted_period = max(2, len(series) // 2)
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {adjusted_period}.")
        period = adjusted_period
    
    try:
        decomposition = seasonal_decompose(
            series, 
            model='additive', 
            period=period,
        )
        
        return {
            # .bfill()/.ffill() replace the deprecated fillna(method=...) form.
            'trend': decomposition.trend.bfill().ffill().values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        # Best-effort fallback: keep the pipeline running with no seasonality.
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:
    """Return ``residuals`` plus zero-mean Gaussian noise.

    The noise standard deviation is ``std_multiplier`` times the standard
    deviation of the non-NaN residuals; when that standard deviation is
    exactly 0, a value of 1.0 is used instead. Noise is drawn from NumPy's
    global random state.
    """
    finite_part = residuals[np.logical_not(np.isnan(residuals))]
    spread = np.std(finite_part)
    spread = 1.0 if spread == 0 else spread

    noise = np.random.normal(0, std_multiplier * spread, size=len(residuals))
    return residuals + noise


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None, std_multiplier: float = JITTER_STD_MULTIPLIER) -> list:
    """
    Create augmented time series by:
    1. Decomposing into trend, seasonal, residual
    2. Jittering the residual component
    3. Reconstructing: T + S + (R + jitter)

    Parameters
    ----------
    series : pd.Series
        Series to augment; only decomposed here when ``decomposition_dict``
        is not supplied.
    num_samples : int
        Number of augmented series to generate.
    decomposition_dict : dict, optional
        Pre-computed components with keys ``'trend'``, ``'seasonal'`` and
        ``'residual'``.
    std_multiplier : float
        Noise scale forwarded to ``apply_jittering``. (Previously this
        argument was accepted but the module-level constant was always used.)

    Returns
    -------
    list
        ``num_samples`` arrays, each the same length as the components.
    """
    if decomposition_dict is None:
        decomposition_dict = decompose_time_series(series)
    
    trend = decomposition_dict['trend']
    seasonal = decomposition_dict['seasonal']
    residual = decomposition_dict['residual']
    
    augmented_samples = []
    
    for _ in range(num_samples):
        # Jitter a copy so the shared residual array is never modified.
        jittered_residual = apply_jittering(residual.copy(), std_multiplier=std_multiplier)

        # Reconstruct: T + S + (R + jitter)
        augmented_samples.append(trend + seasonal + jittered_residual)
    
    return augmented_samples


def visualize_decomposition(series: pd.Series, 
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Draw four stacked panels: the original series and its trend, seasonal
    and residual components, all against ``series.index``.

    ``decomposition_dict`` must provide ``'trend'``, ``'seasonal'`` and
    ``'residual'`` sequences of the same length as ``series``.
    """
    # (y-axis label, values, matplotlib format/colour) for each panel, top to bottom.
    panels = [
        ('Original', series.values, 'b-'),
        ('Trend', decomposition_dict['trend'], 'g-'),
        ('Seasonal', decomposition_dict['seasonal'], 'orange'),
        ('Residual', decomposition_dict['residual'], 'r-'),
    ]

    fig, axes = plt.subplots(4, 1, figsize=(12, 10))

    for ax, (label, values, fmt) in zip(axes, panels):
        ax.plot(series.index, values, fmt, linewidth=1.5)
        ax.set_ylabel(label)
        ax.grid(True, alpha=0.3)

    # Title on the top panel, x-label on the bottom one only.
    axes[0].set_title(title)
    axes[3].set_xlabel('Time Index')

    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Overlay the original series and every augmented variant on one figure.

    Parameters
    ----------
    original_series : pd.Series
        Reference series; its index is used as the x-axis for all curves.
    augmented_samples : list
        Sequences plotted against ``original_series.index`` (each must have
        the same length as the original).
    title : str
        Figure title.
    """
    palette = ['red', 'green', 'orange', 'purple', 'brown']
    time_axis = original_series.index

    fig, ax = plt.subplots(figsize=(12, 6))

    # Reference curve is drawn thicker so it stands out from the variants.
    ax.plot(time_axis, original_series.values,
            'b-', linewidth=2.5, label='Original', alpha=0.8)

    # Colours cycle through the palette when there are more samples than colours.
    for sample_no, sample in enumerate(augmented_samples, start=1):
        ax.plot(time_axis, sample,
                linewidth=1, alpha=0.5,
                label=f'Augmented {sample_no}',
                color=palette[(sample_no - 1) % len(palette)])

    ax.set_xlabel('Time Index')
    ax.set_ylabel('Normalized Value')
    ax.set_title(title)
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()


# ============================================================
# 4) MAIN PIPELINE
# ============================================================
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

# One row per month: numeric columns summed across plants, ordered by date.
overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
if TARGET_COL not in overall.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found after aggregation.")
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split: the last TEST_MONTHS rows are held out.
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Not enough rows ({len(overall)}) to hold out {TEST_MONTHS} test months."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# The scaler is fitted on the training rows only so that the held-out months
# do not leak their mean/variance into the normalization; the same fitted
# scaler is then applied to the whole series.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))

# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series.
# A plain 0..N-1 integer index is used instead of dates.
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series(
    original_train_series,
    period=DECOMPOSITION_PERIOD
)

print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict,
                        title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict,
    std_multiplier=JITTER_STD_MULTIPLIER
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                            title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet.
# The original rows come first, followed by one frame per augmented sample
# (same dates, jittered values). All frames are concatenated in a single call
# rather than growing the frame inside a loop.
_prophet_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
_prophet_frames.extend(
    train_overall[["ds"]].assign(y=aug_values)
    for aug_values in augmented_samples
)
train_prophet_augmented = pd.concat(_prophet_frames, ignore_index=True)

# Sort by ds for Prophet (important for time series)
train_prophet_augmented = train_prophet_augmented.sort_values("ds").reset_index(drop=True)

print("\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")


# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# ------------------------------------------------------------
# Baseline: Prophet fitted on the original training series only
# ------------------------------------------------------------
m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.01,
)
m_overall.fit(train_prophet)


future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)

# Keep only forecast rows whose date matches a test row.
df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
# Kept under its own name because mape_prophet_norm is reassigned by the
# augmented run below.
mape_prophet_baseline_norm = mape_prophet_norm

# NOTE(review): despite the "_norm" suffix this column holds the forecast
# converted back to ORIGINAL units via scaler_y_global.inverse_transform.
# The name is left unchanged in case later cells read it; confirm and rename.
test_overall["prophet_pred_norm"] = scaler_y_global.inverse_transform(df_prophet_test["yhat"].values .reshape(-1, 1)).flatten()
print("Norm Train: ",len(train_prophet))
# This is the forecast length of the baseline (non-augmented) model.
print("Norm futures: ",len(fcst_all))

print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")

# ============================================================
# Augmented: Prophet fitted on original + jittered training series.
# NOTE(review): this run uses additive seasonality while the baseline uses
# multiplicative, so the two MAPEs differ in more than the training data.
# m_overall / future_all / fcst_all / mape_prophet_norm are reassigned here.
# ============================================================
m_overall = Prophet(
    seasonality_mode="additive",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.01,
)
m_overall.fit(train_prophet_augmented)
print("Augmented Train: ",len(train_prophet_augmented))
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)
print("Aug futures: ",len(fcst_all))
df_prophet_test_aug = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test_aug["y"], df_prophet_test_aug["yhat"]
)
mape_prophet_aug_norm = mape_prophet_norm
print(f"[Prophet]  MAPE (aug): {mape_prophet_norm:.4f}")



## STL

In [None]:
# ============================================================
# 0) IMPORTS & CONFIG
# ============================================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import STL

import torch
import torch.nn as nn
import torch.optim as optim

plt.style.use("seaborn-v0_8")

# Notebook-level configuration for this cell.
RANDOM_SEED   = 42                 # seed passed to set_seed()
DATE_COL      = "datetime"         # default date column for ensure_datetime_column()
TARGET_COL    = "Scope1_per_unit"  # used in the duplicate-row key in clean_and_impute()
PLANT_COL     = "Plant Name"       # used in the duplicate-row key in clean_and_impute()
TEST_MONTHS   = 7
PHYS_COL      = "Electricity Grid Energy Per Unit (GJ)"  # optional physics feature
TargetCol_raw= "Scope1"
ProductionCol= "Production Actual Quantity (MT/Month)"

# ============================================================
# 0) NEW: AUGMENTATION PARAMETERS
# ============================================================
NUM_AUGMENTED_SAMPLES = 4  # default number of jittered copies in create_augmented_samples()
JITTER_STD_MULTIPLIER = 0.5  # sigma = 0.5 * std(residuals)
DECOMPOSITION_PERIOD = 8  # default seasonal period for decompose_time_series()

# ============================================================
# 1) SEEDING
# ============================================================
def set_seed(seed: int = 42):
    """Seed NumPy's global RNG and PyTorch (CPU, plus every CUDA device if available)."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)


# ============================================================
# 2) DATA PREP FUNCTIONS
# ============================================================
def ensure_datetime_column(df: pd.DataFrame,
                           date_col: str = DATE_COL) -> pd.DataFrame:
    """Add a month-start timestamp column ``ds`` to ``df`` and return it.

    Dates are parsed from ``date_col`` when that column exists; values that
    fail to parse (or all rows, when the column is absent) are rebuilt from
    the ``year`` and ``month`` columns using day 1. Every date is then
    snapped to the first day of its month.

    Note: ``df`` is modified in place (the ``ds`` column is added to the
    frame that was passed in); pass a copy to keep the original untouched.

    Raises
    ------
    ValueError
        If some dates are missing and ``year``/``month`` are not both present.
    """
    if date_col in df.columns:
        # `infer_datetime_format` is deprecated in pandas 2.x (format
        # inference is the default), so it is no longer passed.
        ds = pd.to_datetime(df[date_col], errors="coerce")
    else:
        ds = pd.Series(pd.NaT, index=df.index)

    needs_rebuild = ds.isna()
    if needs_rebuild.any():
        if not {"year", "month"}.issubset(df.columns):
            raise ValueError(
                "No valid datetime and missing 'year'/'month' to rebuild dates."
            )
        # Only the rows lacking a valid date are rebuilt; assignment aligns on index.
        ds_rebuilt = pd.to_datetime(
            dict(
                year=df.loc[needs_rebuild, "year"],
                month=df.loc[needs_rebuild, "month"],
                day=1,
            )
        )
        ds.loc[needs_rebuild] = ds_rebuilt

    # Period round-trip snaps every timestamp to the start of its month.
    df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()
    return df


def clean_and_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Drop duplicate rows and impute numeric columns (0 -> NaN -> mean).

    Steps:
      1. If ``ds``, ``PLANT_COL`` and ``TARGET_COL`` are all present, drop
         rows duplicated on that key, keeping the last occurrence.
      2. Treat zeros in numeric columns as missing.
      3. Fill missing values with the mean of the same ``year`` (when a
         ``year`` column exists), then with the overall column mean for
         anything still missing.

    The grouping column ``year`` itself is left untouched. The input frame is
    never modified; a new frame is returned with the original row order.

    Implementation note: per-year means are computed with
    ``groupby.transform`` instead of ``groupby.apply``, so the function does
    not depend on ``apply`` passing the grouping column to the callback
    (deprecated in recent pandas) and rows with a missing ``year`` are kept
    (they fall through to the overall mean) rather than silently dropped.
    """
    if {"ds", PLANT_COL, TARGET_COL}.issubset(df.columns):
        df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")
    else:
        # Work on a copy so the caller's frame is not modified below.
        df = df.copy()

    has_year = "year" in df.columns
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    value_cols = [c for c in num_cols if not (has_year and c == "year")]
    if not value_cols:
        return df

    values = df[value_cols].replace(0, np.nan)

    if has_year:
        # Positional grouping key; rows whose year is NaN get NaN means here
        # and are handled by the overall-mean fallback.
        yearly_means = values.groupby(df["year"].to_numpy()).transform("mean")
        values = values.fillna(yearly_means)

    # Fallback for gaps that had no usable per-year mean.
    values = values.fillna(values.mean())

    df[value_cols] = values
    return df


def prepare_overall_series2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the frame to one row per ``ds`` by summing every numeric column.

    Non-numeric columns are dropped. The result has ``ds`` as its first
    column, is ordered by ``ds`` and carries a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no ``ds`` column.
    """
    if "ds" not in df.columns:
        raise ValueError("The dataframe must contain a 'ds' datetime column.")

    # Only numeric columns can be summed meaningfully.
    summable = list(df.select_dtypes(include=[np.number]).columns)

    monthly_totals = df.groupby("ds", as_index=False)[summable].sum()
    monthly_totals = monthly_totals.sort_values("ds")
    return monthly_totals.reset_index(drop=True)

# ============================================================
# NEW SECTION: DECOMPOSITION-AWARE AUGMENTATION FUNCTIONS
# ============================================================
def decompose_time_series(series: pd.Series, 
                          period: int = DECOMPOSITION_PERIOD) -> dict:
    """Split ``series`` into additive trend, seasonal and residual components.

    Parameters
    ----------
    series : pd.Series
        Series to decompose.
    period : int
        Seasonal period. If the series is shorter than two full periods the
        period is reduced to ``max(2, len(series) // 2)``.

    Returns
    -------
    dict
        ``'trend'``, ``'seasonal'``, ``'residual'`` (NumPy arrays the same
        length as ``series``) and ``'period'`` (the period actually used).
        Edge NaNs in the trend are back-/forward-filled and NaN residuals are
        set to 0. If the decomposition itself raises, the series is returned
        as the trend with zero seasonal and residual components.
    """
    # Imported here so the function does not depend on an earlier cell having
    # imported it; otherwise a missing name would be swallowed by the broad
    # except below and silently turn every call into the zero-component fallback.
    from statsmodels.tsa.seasonal import seasonal_decompose

    # Ensure we have enough data for decomposition
    if len(series) < 2 * period:
        adjusted_period = max(2, len(series) // 2)
        print(f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
              f"Adjusting period to {adjusted_period}.")
        period = adjusted_period
    
    try:
        decomposition = seasonal_decompose(
            series, 
            model='additive', 
            period=period,
        )
        
        return {
            # .bfill()/.ffill() replace the deprecated fillna(method=...) form.
            'trend': decomposition.trend.bfill().ffill().values,
            'seasonal': decomposition.seasonal.values,
            'residual': decomposition.resid.fillna(0).values,
            'period': period
        }
    except Exception as e:
        # Best-effort fallback: keep the pipeline running with no seasonality.
        print(f"Decomposition failed: {e}. Returning zero components.")
        return {
            'trend': series.values,
            'seasonal': np.zeros_like(series.values),
            'residual': np.zeros_like(series.values),
            'period': period
        }


def apply_jittering(residuals: np.ndarray, 
                   std_multiplier: float = JITTER_STD_MULTIPLIER) -> np.ndarray:
    """
    Add zero-mean Gaussian noise to a residual array.

    The noise standard deviation is ``std_multiplier`` times the standard
    deviation of the non-NaN residuals; when that deviation is exactly 0,
    1.0 is used in its place.

    Args:
        residuals: Residual component to perturb.
        std_multiplier: Scale applied to the residual standard deviation.

    Returns:
        ``residuals`` plus one noise draw per element (uses the global
        ``np.random`` state).
    """
    observed = residuals[~np.isnan(residuals)]
    spread = np.std(observed)
    # A perfectly flat residual would give zero-width noise; use unit spread
    spread = 1.0 if spread == 0 else spread

    noise = np.random.normal(0, std_multiplier * spread, size=len(residuals))
    return residuals + noise


def create_augmented_samples(series: pd.Series,
                             num_samples: int = NUM_AUGMENTED_SAMPLES,
                             decomposition_dict: dict = None, std_multiplier: float = JITTER_STD_MULTIPLIER) -> list:
    """
    Create augmented time series by:
    1. Decomposing into trend, seasonal, residual
    2. Jittering the residual component
    3. Reconstructing: T + S + (R + jitter)

    Args:
        series: Series to augment; only used when ``decomposition_dict``
            is None, in which case it is decomposed with
            ``decompose_time_series_stl``.
        num_samples: Number of augmented series to generate.
        decomposition_dict: Optional precomputed decomposition with keys
            'trend', 'seasonal' and 'residual'.
        std_multiplier: Noise scale forwarded to ``apply_jittering``.

    Returns:
        List of ``num_samples`` reconstructed arrays.
    """
    if decomposition_dict is None:
        decomposition_dict = decompose_time_series_stl(series)

    trend = decomposition_dict['trend']
    seasonal = decomposition_dict['seasonal']
    residual = decomposition_dict['residual']

    # Forward the caller's std_multiplier; the previous code always passed the
    # module constant, so the argument had no effect.
    return [
        trend + seasonal + apply_jittering(residual.copy(), std_multiplier=std_multiplier)
        for _ in range(num_samples)
    ]


def visualize_decomposition(series: pd.Series, 
                           decomposition_dict: dict,
                           title: str = "Time Series Decomposition") -> None:
    """
    Draw a four-row figure: the original series followed by its trend,
    seasonal and residual components, all against ``series.index``.

    Args:
        series: The original series (supplies the x-axis and first panel).
        decomposition_dict: Mapping with 'trend', 'seasonal', 'residual'.
        title: Title shown above the first panel.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))

    # (y-label, key in decomposition_dict or None for the raw series, line format)
    panel_specs = [
        ('Original', None, 'b-'),
        ('Trend', 'trend', 'g-'),
        ('Seasonal', 'seasonal', 'orange'),
        ('Residual', 'residual', 'r-'),
    ]

    for position, (ylabel, component_key, line_format) in enumerate(panel_specs):
        panel = axes[position]
        if component_key is None:
            panel_values = series.values
        else:
            panel_values = decomposition_dict[component_key]
        panel.plot(series.index, panel_values, line_format, linewidth=1.5)
        panel.set_ylabel(ylabel)
        if position == 0:
            panel.set_title(title)
        if position == len(panel_specs) - 1:
            panel.set_xlabel('Time Index')
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


def visualize_augmented_samples(original_series: pd.Series,
                               augmented_samples: list,
                               title: str = "Original vs Augmented Series") -> None:
    """
    Overlay every augmented sample on top of the original series.

    Args:
        original_series: Reference series; its index is the shared x-axis.
        augmented_samples: Sequences with the same length as the original.
        title: Figure title.
    """
    plt.figure(figsize=(12, 6))
    time_axis = original_series.index

    # Reference curve, drawn thicker than the samples
    plt.plot(time_axis, original_series.values,
             'b-', linewidth=2.5, label='Original', alpha=0.8)

    # Samples cycle through a fixed five-colour palette
    palette = ['red', 'green', 'orange', 'purple', 'brown']
    for number, sample in enumerate(augmented_samples, start=1):
        plt.plot(time_axis, sample,
                 linewidth=1, alpha=0.5,
                 label=f'Augmented {number}',
                 color=palette[(number - 1) % len(palette)])

    plt.xlabel('Time Index')
    plt.ylabel('Normalized Value')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


#STL 
# ============================================================
# STL (LOESS) DECOMPOSITION
# ============================================================
def decompose_time_series_stl(
    series: pd.Series,
    period: int = DECOMPOSITION_PERIOD,
    robust: bool = True
) -> dict:
    """
    Decompose a time series using STL (LOESS-based).

    Args:
        series: Values to decompose, in time order.
        period: Seasonal cycle length in observations. If the series holds
            fewer than two full cycles, it is reduced to
            ``max(2, len(series) // 2)`` and a warning is printed.
        robust: Forwarded to ``STL(robust=...)``.

    Returns:
        dict with keys 'trend', 'seasonal', 'residual' (arrays) and
        'period' (the period actually used). If STL raises, the identity
        fallback is returned: trend = the series itself, seasonal and
        residual = zeros.
    """

    # Ensure enough data
    if len(series) < 2 * period:
        # Compute the adjusted value first so the message reports what is
        # really applied (the lower bound of 2 included).
        adjusted_period = max(2, len(series) // 2)
        print(
            f"Warning: Series length ({len(series)}) < 2*period ({2*period}). "
            f"Adjusting period to {adjusted_period}."
        )
        period = adjusted_period

    try:
        result = STL(
            series,
            period=period,
            robust=robust
        ).fit()

        # Safety: fill NaNs. .bfill()/.ffill() replace the deprecated
        # fillna(method=...) spelling.
        trend = result.trend.bfill().ffill()
        resid = result.resid.fillna(0.0)

        return {
            "trend": trend.values,
            "seasonal": result.seasonal.values,
            "residual": resid.values,
            "period": period,
        }

    except Exception as e:
        # Best-effort fallback so the surrounding pipeline keeps running
        print(f"[STL] Decomposition failed: {e}. Falling back to identity.")
        return {
            "trend": series.values,
            "seasonal": np.zeros_like(series.values),
            "residual": np.zeros_like(series.values),
            "period": period,
        }


# ============================================================
# 4) MAIN PIPELINE
# ============================================================
# Load, normalise dates, clean, and aggregate to one overall series
df_raw = df_2022.copy()

df = ensure_datetime_column(df_raw, date_col=DATE_COL)
df = clean_and_impute(df)

overall = prepare_overall_series2(df)
overall = overall.sort_values("ds").reset_index(drop=True)
overall = overall.rename(columns={TARGET_COL: "y"})

# Train/test split position: the last TEST_MONTHS rows are held out
split_point = len(overall) - TEST_MONTHS
if split_point <= 0:
    raise ValueError(
        f"Need more than TEST_MONTHS={TEST_MONTHS} rows to split; got {len(overall)}."
    )

# --- GLOBAL SCALING (Prophet + Hybrid + Final Outputs) ---
# Fit the scaler on the training rows only, then apply it to the whole series.
# Fitting on all rows would let the hold-out mean/std leak into training data.
scaler_y_global = StandardScaler()
scaler_y_global.fit(overall.iloc[:split_point][["y"]])
overall["y_norm"] = scaler_y_global.transform(overall[["y"]]).ravel()

# Train/test split
train_overall = overall.iloc[:split_point].copy()
test_overall  = overall.iloc[split_point:].copy()

print("Train:", len(train_overall), "Test:", len(test_overall))

# ============================================================
# PHASE 2: DECOMPOSITION-AWARE AUGMENTATION
# ============================================================
print("\n" + "=" * 60)
print("PHASE 2: DECOMPOSITION-AWARE AUGMENTATION")
print("=" * 60)

# Step 1: Decompose the original training series
# (positional 0..n-1 index; the calendar dates stay in train_overall["ds"])
original_train_series = pd.Series(
    train_overall['y_norm'].values,
    index=range(len(train_overall))
)

decomposition_dict = decompose_time_series_stl(
    original_train_series,
    period=DECOMPOSITION_PERIOD,
    robust=True
)


print(f"\n[DECOMPOSITION] Seasonal period: {decomposition_dict['period']}")
print(f"[DECOMPOSITION] Trend shape: {decomposition_dict['trend'].shape}")
print(f"[DECOMPOSITION] Seasonal shape: {decomposition_dict['seasonal'].shape}")
print(f"[DECOMPOSITION] Residual shape: {decomposition_dict['residual'].shape}")

# Visualize decomposition (optional - comment out if not needed)
visualize_decomposition(original_train_series, decomposition_dict, 
                       title="Training Data Decomposition")

# Step 2: Create augmented samples
augmented_samples = create_augmented_samples(
    original_train_series,
    num_samples=NUM_AUGMENTED_SAMPLES,
    decomposition_dict=decomposition_dict,
    std_multiplier=JITTER_STD_MULTIPLIER
)

print(f"\n[AUGMENTATION] Created {len(augmented_samples)} augmented samples")
print(f"[AUGMENTATION] Jitter std multiplier: {JITTER_STD_MULTIPLIER}")

# Visualize augmentation (optional - comment out if not needed)
visualize_augmented_samples(original_train_series, augmented_samples,
                           title="Original vs Augmented Training Samples")

# Step 3: Prepare augmented training dataframe for Prophet
# Collect the original frame plus one frame per augmented sample (each reusing
# the training dates) and concatenate once, instead of re-concatenating the
# growing frame on every loop iteration.
prophet_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
for aug_values in augmented_samples:
    prophet_frames.append(train_overall[["ds"]].assign(y=aug_values))

# Sort by ds for Prophet (important for time series)
train_prophet_augmented = (
    pd.concat(prophet_frames, ignore_index=True)
    .sort_values("ds")
    .reset_index(drop=True)
)

print(f"\n[PROPHET TRAINING DATA]")
print(f"Original: {len(train_overall)}")
print(f"Augmented: {len(train_prophet_augmented)}")


# ============================================================
# 5) PROPHET (NORMALIZED)
# ============================================================
# Prophet expects columns named "ds" and "y"; feed it the normalized target
train_prophet = train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
test_prophet  = test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})

# Baseline model: fitted on the un-augmented training rows only
m_overall = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.01,
)
m_overall.fit(train_prophet)


# Forecast frame = training dates + one month-start step per test row
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)

# Keep only the forecast rows whose date matches a test row
df_prophet_test = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test["y"], df_prophet_test["yhat"]
)
# The augmented-model section further down reassigns mape_prophet_norm, so keep
# the baseline score under its own name for later comparison.
mape_prophet_base_norm = mape_prophet_norm

# NOTE: despite the "_norm" suffix this column holds inverse-transformed
# predictions, i.e. values back on the scale the scaler was fitted on.
test_overall["prophet_pred_norm"] = scaler_y_global.inverse_transform(
    df_prophet_test["yhat"].values.reshape(-1, 1)
).flatten()
print("Norm Train: ",len(train_prophet))
# Label fixed: this is the baseline forecast, not the augmented one
print("Baseline futures: ",len(fcst_all))

print(f"[Prophet]  MAPE (normalized): {mape_prophet_norm:.4f}")

# ============================================================
# Second Prophet model, fitted on the original + jitter-augmented training rows.
# NOTE: this rebinds m_overall, future_all, fcst_all and mape_prophet_norm, which
# the baseline section above also assigns; the baseline values are no longer
# reachable through those names once this code has run.
# NOTE(review): the baseline above uses seasonality_mode="multiplicative" while
# this model uses "additive", so the two MAPE values differ in more than the
# training data — confirm this is intended.
m_overall = Prophet(
    seasonality_mode="additive",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.01,
)
m_overall.fit(train_prophet_augmented)
print("Augmented Train: ",len(train_prophet_augmented))
# Forecast frame extended by one month-start step per test row
future_all = m_overall.make_future_dataframe(periods=len(test_prophet), freq="MS")
fcst_all   = m_overall.predict(future_all)
print("Aug futures: ",len(fcst_all))
# Keep only the forecast rows whose date matches a test row
df_prophet_test_aug = (
    fcst_all[["ds", "yhat"]]
    .merge(test_prophet[["ds", "y"]], on="ds", how="inner")
    .sort_values("ds")
)

# MAPE in NORMALIZED SPACE (overwrites the baseline value of the same name)
mape_prophet_norm = mean_absolute_percentage_error(
    df_prophet_test_aug["y"], df_prophet_test_aug["yhat"]
)
print(f"[Prophet]  MAPE (aug): {mape_prophet_norm:.4f}")



# Seasonal Moving Block Bootstrap (MBB)

In [None]:
# ============================================================
# 0) ADDITIONAL IMPORTS
# ============================================================
from statsmodels.tsa.seasonal import STL
import random

# ============================================================
# 1) SEASONAL MOVING BLOCK BOOTSTRAP (MBB)
# ============================================================
def seasonal_moving_block_bootstrap(
    series: pd.Series,
    block_size: int = 12,
    num_samples: int = 4,
    random_seed: int = 2
) -> list:
    """
    Moving Block Bootstrap (MBB) of a time series.

    All contiguous windows of ``block_size`` values are collected; each
    bootstrapped sample is built by drawing windows with replacement,
    concatenating them, and truncating to the original length.

    NOTE(review): windows may start at any position, so a drawn block is not
    guaranteed to line up with the same calendar phase as the slot it fills.
    Confirm whether phase-aligned sampling is wanted for a "seasonal" MBB.

    Args:
        series: Source series.
        block_size: Number of consecutive observations per block.
        num_samples: Number of bootstrapped series to return.
        random_seed: Seed applied to both ``np.random`` and ``random``
            (this reseeds the global generators).

    Returns:
        List[np.ndarray]: ``num_samples`` arrays, each of ``len(series)``.

    Raises:
        ValueError: if ``block_size`` is not positive or the series is
            shorter than one block.
    """
    # Honour the caller's seed; it used to be overwritten with a hard-coded 2.
    np.random.seed(random_seed)
    random.seed(random_seed)

    values = series.values
    n = len(values)

    if block_size < 1:
        raise ValueError("block_size must be a positive integer.")
    if n < block_size:
        raise ValueError("Time series shorter than block size.")

    # Every contiguous window of length block_size (overlapping)
    blocks = np.array(
        [values[start : start + block_size] for start in range(n - block_size + 1)]
    )
    n_blocks_needed = int(np.ceil(n / block_size))

    augmented_series_list = []
    for _ in range(num_samples):
        picked = np.random.choice(len(blocks), size=n_blocks_needed, replace=True)
        # Concatenate the drawn blocks and trim the overshoot
        augmented_series_list.append(np.concatenate(blocks[picked])[:n])

    return augmented_series_list


# ============================================================
# 2) VISUALIZATION (OPTIONAL)
# ============================================================
def visualize_mbb(
    original_series: pd.Series,
    augmented_samples: list,
    title: str = "Seasonal Moving Block Bootstrap"
):
    """
    Plot the original series (thick black line) with every bootstrapped
    sample drawn over the same index.

    Args:
        original_series: Reference series; its index is the shared x-axis.
        augmented_samples: Sequences with the same length as the original.
        title: Figure title.
    """
    time_axis = original_series.index

    plt.figure(figsize=(12, 6))
    plt.plot(time_axis, original_series.values,
             label="Original", linewidth=3, alpha=0.9, color="black")

    for number, sample in enumerate(augmented_samples, start=1):
        plt.plot(time_axis, sample,
                 linewidth=1.5, alpha=0.6, label=f"MBB Sample {number}")

    plt.title(title)
    plt.xlabel("Time Index")
    plt.ylabel("Normalized Value")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# 3) APPLY MBB TO TRAINING DATA
# ============================================================
print("\n" + "=" * 60)
print("PHASE 3: SEASONAL MOVING BLOCK BOOTSTRAP (MBB)")
print("=" * 60)

# Normalized training target, indexed by its dates
original_train_series = pd.Series(
    train_overall["y_norm"].values,
    index=train_overall["ds"]
)

mbb_augmented_samples = seasonal_moving_block_bootstrap(
    series=original_train_series,
    block_size=DECOMPOSITION_PERIOD,  # 12 months
    num_samples=NUM_AUGMENTED_SAMPLES,
    random_seed=RANDOM_SEED
)

print(f"[MBB] Generated {len(mbb_augmented_samples)} bootstrapped samples")

# Optional visualization
visualize_mbb(
    original_series=original_train_series,
    augmented_samples=mbb_augmented_samples,
    title="Original vs Seasonal MBB-Augmented Series"
)

# ============================================================
# 4) PREPARE PROPHET TRAINING DATA (MBB)
# ============================================================
# Original rows plus one copy of the training dates per bootstrapped sample,
# concatenated in a single call rather than growing the frame inside the loop.
mbb_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
for aug in mbb_augmented_samples:
    mbb_frames.append(train_overall[["ds"]].assign(y=aug))

train_prophet_mbb = (
    pd.concat(mbb_frames, ignore_index=True)
    .sort_values("ds")
    .reset_index(drop=True)
)

print(f"[MBB] Prophet training size: {len(train_prophet_mbb)}")


# ============================================================
# 5) PROPHET TRAINING (MBB-AUGMENTED)
# ============================================================
# Prophet model fitted on the original + MBB-augmented training rows.
# train_prophet_mbb contains several rows per date (one per bootstrapped sample).
m_mbb = Prophet(
    seasonality_mode="multiplicative",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.1,
)

m_mbb.fit(train_prophet_mbb)

# Extend the training dates by one month-start step per test row
future_mbb = m_mbb.make_future_dataframe(
    periods=len(test_overall),
    freq="MS"
)

forecast_mbb = m_mbb.predict(future_mbb)

# Keep only forecast rows whose date matches a test row; "y" is the normalized actual
df_prophet_test_mbb = (
    forecast_mbb[["ds", "yhat"]]
    .merge(
        test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"}),
        on="ds",
        how="inner"
    )
    .sort_values("ds")
)

# MAPE computed on the normalized values
# NOTE(review): y_norm is standardized, so actuals can sit near zero and inflate
# a percentage error — consider also reporting the metric in original units.
mape_prophet_mbb = mean_absolute_percentage_error(
    df_prophet_test_mbb["y"],
    df_prophet_test_mbb["yhat"]
)

print(f"[Prophet + MBB] MAPE (normalized): {mape_prophet_mbb:.4f}")

# Inverse-transform predictions (original scale)
# Assigned positionally: this requires the merge above to have kept exactly one
# row per test row, otherwise the lengths will not match.
test_overall["prophet_pred_mbb"] = scaler_y_global.inverse_transform(
    df_prophet_test_mbb["yhat"].values.reshape(-1, 1)
).flatten()


## INTERPOLATION

In [None]:
# ============================================================
# 0) IMPORTS
# ============================================================
import numpy as np
import pandas as pd

from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

from scipy.interpolate import interp1d, CubicSpline, UnivariateSpline
from numpy.fft import fft, ifft

# ============================================================
# 1) INTERPOLATION AUGMENTATION FUNCTIONS
# ============================================================
def linear_interpolation_augmentation(series, scale_factors=(0.9, 1.1)):
    """
    Time-warp a series with piecewise-linear resampling.

    For every scale, the series is sampled at ``int(len(series) * scale)``
    evenly spaced positions across its full span. Those samples are then
    treated as if they were one step apart and read back at positions
    0..len(series)-1. A scale above 1 therefore stretches the series (only
    its beginning fits into the output); a scale below 1 compresses it, and
    positions past the last sample repeat the final value.

    Args:
        series: pandas Series to augment.
        scale_factors: Iterable of positive scale values.

    Returns:
        List with one array of ``len(series)`` values per scale factor.

    Raises:
        ValueError: if a scale yields fewer than one resampled point.
    """
    n = len(series)
    x = np.arange(n)
    y = series.values
    # The interpolator does not depend on the scale; build it once
    f = interp1d(x, y, kind="linear")
    augmented = []

    for scale in scale_factors:
        new_len = int(n * scale)
        if new_len < 1:
            # Would otherwise surface as an opaque np.interp error
            raise ValueError(
                f"scale {scale} gives {new_len} resampled points for a series "
                f"of length {n}; at least 1 is required."
            )
        x_new = np.linspace(0, n - 1, new_len)
        y_new = f(x_new)
        y_resampled = np.interp(x, np.linspace(0, new_len - 1, new_len), y_new)
        augmented.append(y_resampled)

    return augmented


def cubic_interpolation_augmentation(series, scale_factors=(0.9, 1.1)):
    """
    Time-warp a series with cubic-spline resampling.

    For every scale, a cubic spline through the series is sampled at
    ``int(len(series) * scale)`` evenly spaced positions across its full
    span. Those samples are then treated as if they were one step apart and
    read back (linearly) at positions 0..len(series)-1. A scale above 1
    stretches the series (only its beginning fits into the output); a scale
    below 1 compresses it, and positions past the last sample repeat the
    final value.

    Args:
        series: pandas Series to augment.
        scale_factors: Iterable of positive scale values.

    Returns:
        List with one array of ``len(series)`` values per scale factor.

    Raises:
        ValueError: if a scale yields fewer than one resampled point.
    """
    n = len(series)
    x = np.arange(n)
    y = series.values
    # The spline does not depend on the scale; build it once
    cs = CubicSpline(x, y)
    augmented = []

    for scale in scale_factors:
        new_len = int(n * scale)
        if new_len < 1:
            # Would otherwise surface as an opaque np.interp error
            raise ValueError(
                f"scale {scale} gives {new_len} resampled points for a series "
                f"of length {n}; at least 1 is required."
            )
        x_new = np.linspace(0, n - 1, new_len)
        y_new = cs(x_new)
        y_resampled = np.interp(x, np.linspace(0, new_len - 1, new_len), y_new)
        augmented.append(y_resampled)

    return augmented


def spline_interpolation_augmentation(series, smoothing_factors=(0.1, 0.3)):
    """
    Produce smoothed copies of a series with ``UnivariateSpline``.

    One spline is fitted per smoothing factor, using
    ``s = factor * len(series)``, and evaluated at the original positions
    0..len(series)-1.

    Args:
        series: pandas Series to augment.
        smoothing_factors: Iterable of smoothing factors.

    Returns:
        List with one array of ``len(series)`` values per smoothing factor.
    """
    n_points = len(series)
    positions = np.arange(n_points)
    observed = series.values

    return [
        UnivariateSpline(positions, observed, s=factor * n_points)(positions)
        for factor in smoothing_factors
    ]


def fourier_interpolation_augmentation(series, keep_ratio=0.85):
    """
    Low-pass filter a series in the frequency domain.

    The FFT of the series is taken, the first ``cutoff`` and last ``cutoff``
    coefficients are kept (``cutoff = int(keep_ratio * n / 2)``), everything
    else is zeroed, and the real part of the inverse FFT is returned.

    Args:
        series: pandas Series to filter.
        keep_ratio: Non-negative fraction controlling how many coefficients
            survive. When it is small enough that ``cutoff`` is 0, no
            coefficient is kept and the result is all zeros.

    Returns:
        A one-element list holding the filtered array (``len(series)`` values).

    Raises:
        ValueError: if ``keep_ratio`` is negative.
    """
    if keep_ratio < 0:
        raise ValueError("keep_ratio must be non-negative.")

    y = series.values
    n = len(y)

    coeffs = fft(y)
    # Never keep more coefficients per side than exist
    cutoff = min(int(keep_ratio * n / 2), n)

    filtered = np.zeros_like(coeffs)
    filtered[:cutoff] = coeffs[:cutoff]
    # Index from the front (n - cutoff) instead of using -cutoff: with
    # cutoff == 0 the slice [-0:] is the WHOLE array, which used to copy every
    # coefficient back and return the series unfiltered.
    filtered[n - cutoff:] = coeffs[n - cutoff:]

    return [np.real(ifft(filtered))]


# ============================================================
# 2) CREATE INTERPOLATION-AUGMENTED TRAINING SET
# ============================================================
print("\n" + "=" * 60)
print("PHASE: PROPHET WITH INTERPOLATION AUGMENTATION")
print("=" * 60)

# Normalized training target, indexed by its dates.
# Rebinds original_train_series, which earlier sections also assign.
original_train_series = pd.Series(
    train_overall["y_norm"].values,
    index=train_overall["ds"]
)

# Linear interpolation (one augmented array per scale factor)
augmented_linear = linear_interpolation_augmentation(
    original_train_series,
    scale_factors=(0.9, 1.1, 0.5, 1.75)
)

# Cubic interpolation (one augmented array per scale factor)
# NOTE(review): the helper resamples to int(len(series) * scale) points, so the
# 0.05 entry yields only 0 or 1 points unless the series is fairly long —
# confirm this value is intended.
augmented_cubic = cubic_interpolation_augmentation(
    original_train_series,
    scale_factors=(0.9, 1.1, 0.5, 0.05)
)

# Spline interpolation (one augmented array per smoothing factor)
augmented_spline = spline_interpolation_augmentation(
    original_train_series,
    smoothing_factors=(0.1, 0.3, 0.5, 0.7)
)

# Fourier interpolation (single augmented array)
augmented_fourier = fourier_interpolation_augmentation(
    original_train_series,
    keep_ratio=0.75
)

# Only the Fourier sample is used for training below; the other three lists are
# computed above but currently excluded here.
# Rebinds augmented_samples, which the decomposition phase also assigns.
augmented_samples = (
    # augmented_linear +
    # augmented_cubic +
    # augmented_spline +
    augmented_fourier
)


# ============================================================
# VISUALIZATION OF INTERPOLATION AUGMENTATIONS
# ============================================================
import matplotlib.pyplot as plt

def visualize_interpolation_augmentations(
    original_series: pd.Series,
    linear_samples: list = None,
    cubic_samples: list = None,
    spline_samples: list = None,
    fourier_samples: list = None,
    title: str = "Interpolation-Based Time Series Augmentation"
):
    """
    Visualize original and interpolated augmented samples.

    Args:
        original_series: Reference series; its index is the shared x-axis.
        linear_samples: Augmented arrays from linear interpolation
            (None or empty to skip).
        cubic_samples: Augmented arrays from cubic interpolation
            (None or empty to skip).
        spline_samples: Augmented arrays from spline smoothing
            (None or empty to skip).
        fourier_samples: Augmented arrays from Fourier filtering
            (None or empty to skip).
        title: Figure title.
    """
    # None defaults replace the former mutable `[]` defaults; an omitted group
    # is simply skipped, exactly as an empty list was.

    plt.figure(figsize=(14, 7))

    # Plot original series
    plt.plot(
        original_series.index,
        original_series.values,
        label="Original",
        linewidth=3,
        color="black",
        alpha=0.9
    )

    # One entry per augmentation family:
    # (samples, legend prefix, linestyle, linewidth, alpha)
    sample_groups = [
        (linear_samples, "Linear", "--", 1.5, 0.6),
        (cubic_samples, "Cubic", "-.", 1.5, 0.6),
        (spline_samples, "Spline", ":", 2, 0.6),
        (fourier_samples, "Fourier", "-", 2, 0.7),
    ]

    for samples, prefix, linestyle, linewidth, alpha in sample_groups:
        if samples is None:
            continue
        for i, aug in enumerate(samples):
            plt.plot(
                original_series.index,
                aug,
                linestyle=linestyle,
                linewidth=linewidth,
                alpha=alpha,
                label=f"{prefix} Aug {i+1}"
            )

    plt.title(title)
    plt.xlabel("Time")
    plt.ylabel("Normalized Value")
    plt.legend(
        loc="upper left",
        ncol=2,
        fontsize=9
    )
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


# ============================================================
# APPLY VISUALIZATION
# ============================================================
# ============================================================
# CREATE INTERPOLATION AUGMENTATIONS (DEFINE VARIABLES)
# ============================================================


print("Augmented samples created:")
print(f"Linear  : {len(augmented_linear)}")
print(f"Cubic   : {len(augmented_cubic)}")
print(f"Spline  : {len(augmented_spline)}")
print(f"Fourier : {len(augmented_fourier)}")


# ============================================================
# VISUALIZE AUGMENTATIONS
# ============================================================

# Only the Fourier sample is plotted; the other families are left out here
visualize_interpolation_augmentations(
    original_series=original_train_series,
    # linear_samples=augmented_linear,
    # cubic_samples=augmented_cubic,
    # spline_samples=augmented_spline,
    fourier_samples=augmented_fourier,
    title="Original vs Interpolation-Based Augmented Series"
)




print(f"[Interpolation] Augmented samples: {len(augmented_samples)}")

# Original rows plus one copy of the training dates per augmented sample,
# concatenated in a single call rather than growing the frame inside the loop.
interp_frames = [
    train_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"})
]
for aug in augmented_samples:
    interp_frames.append(train_overall[["ds"]].assign(y=aug))

train_prophet_interp = (
    pd.concat(interp_frames, ignore_index=True)
    .sort_values("ds")
    .reset_index(drop=True)
)

print(f"[Prophet] Training rows (augmented): {len(train_prophet_interp)}")

# ============================================================
# 3) TRAIN PROPHET
# ============================================================
# Prophet model fitted on the original + interpolation-augmented training rows
m_interp = Prophet(
    seasonality_mode="additive",
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False,
    changepoint_prior_scale=0.01
)

m_interp.fit(train_prophet_interp)

# ============================================================
# 4) FORECAST
# ============================================================
# Extend the training dates by one month-start step per test row
future = m_interp.make_future_dataframe(
    periods=len(test_overall),
    freq="MS"
)

forecast = m_interp.predict(future)

# Keep only forecast rows whose date matches a test row; "y" is the normalized actual
df_test_interp = (
    forecast[["ds", "yhat"]]
    .merge(
        test_overall[["ds", "y_norm"]].rename(columns={"y_norm": "y"}),
        on="ds",
        how="inner"
    )
    .sort_values("ds")
)

# ============================================================
# 5) EVALUATION
# ============================================================

# MAPE on the normalized values. This assignment was missing, so the print
# below referenced an undefined name on a fresh kernel.
mape_interp = mean_absolute_percentage_error(
    df_test_interp["y"],
    df_test_interp["yhat"]
)

# MAPE after mapping actuals and predictions back through the scaler
mape_interp_unnorm= mean_absolute_percentage_error(
    scaler_y_global.inverse_transform(df_test_interp["y"].values.reshape(-1, 1)),
    scaler_y_global.inverse_transform(df_test_interp["yhat"].values.reshape(-1, 1))
)

print(f"[Prophet + Interpolation] MAPE (normalized): {mape_interp:.4f}")
print(f"[Prophet + Interpolation] MAPE (unnormalized): {mape_interp_unnorm:.4f}")

# ============================================================
# 6) INVERSE SCALE PREDICTIONS
# ============================================================
# Assigned positionally: this requires the merge above to have kept exactly one
# row per test row, otherwise the lengths will not match.
test_overall["prophet_pred_interp"] = scaler_y_global.inverse_transform(
    df_test_interp["yhat"].values.reshape(-1, 1)
).flatten()


In [None]:
# ============================================================
# TIME SERIES PLOT: ACTUAL vs PREDICTED
# (Prophet baseline vs Prophet + MBB — NORMALIZED VALUES)
# ============================================================
# Everything drawn here is in normalized space (y_norm / yhat from models fitted
# on y_norm), so the labels say "normalized" rather than "original units".

plt.figure(figsize=(16, 7))

# -----------------------
# 1) Training data (actual)
# -----------------------
plt.plot(
    train_overall["ds"],
    train_overall["y_norm"],
    label="Training Actual",
    color="black",
    linewidth=2
)

# -----------------------
# 2) Test actuals
# -----------------------
plt.plot(
    test_overall["ds"],
    test_overall["y_norm"],
    label="Test Actual",
    color="black",
    linestyle="--",
    linewidth=2
)

# -----------------------
# 3) Prophet baseline predictions
# -----------------------
# x-values come from the same frame as the predictions so the two always align
plt.plot(
    df_prophet_test["ds"],
    df_prophet_test["yhat"],
    label="Prophet Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:blue"
)

# -----------------------
# 4) Prophet + MBB predictions
# -----------------------
# This curve is the Prophet model trained on MBB-augmented data, not a neural net
plt.plot(
    df_prophet_test_mbb["ds"],
    df_prophet_test_mbb["yhat"],
    label="Prophet + MBB Prediction",
    linestyle=":",
    linewidth=2.5,
    color="tab:green"
)


# -----------------------
# 5) Vertical line for train/test split
# -----------------------
split_date = train_overall["ds"].iloc[-1]

plt.axvline(
    x=split_date,
    color="gray",
    linestyle="--",
    linewidth=2,
    label="Train/Test Split"
)

# -----------------------
# Labels, title, legend
# -----------------------
plt.xlabel("Date", fontsize=12)
plt.ylabel("Scope 1 Emissions (Normalized)", fontsize=12)
plt.title(
    "Actual vs Predicted Emissions (Normalized)\n"
    "(Prophet vs Prophet + MBB)",
    fontsize=14
)

plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(fontsize=12)
plt.tight_layout()

plt.show()


# Phase: Prophet baseline on the unscaled overall series

In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns

# Global plotting style for the figures below.
# NOTE: `plt` is not imported above this line within this cell (the
# matplotlib import appears further down), so this relies on an earlier cell
# having imported it.
plt.style.use("seaborn-v0_8")  # updated style name
sns.set_palette("viridis")

%matplotlib inline

# ===== 1) CONFIG =====
# These module-level names are (re)assigned here and read by the code below.
DATE_COL = "datetime"            # column parsed as the observation date
TARGET_COL = "Scope1_per_unit"   # column aggregated into the series "y"
PLANT_COL = "Plant Name"         # used in the duplicate-row key
FORECAST_MONTHS = 7              # not referenced in the visible part of this cell
TEST_MONTHS = 7                  # number of trailing rows held out as the test set

# ===== 2) START FROM df_2022 =====
df = df_2022.copy()
# ----- 2a) Ensure datetime -----
# Parse DATE_COL when present; unparseable entries become NaT and are rebuilt
# from year/month below. (`infer_datetime_format` is dropped: it is deprecated
# in pandas 2.x, where format inference is already the default.)
if DATE_COL in df.columns:
    ds = pd.to_datetime(df[DATE_COL], errors="coerce")
else:
    ds = pd.Series(pd.NaT, index=df.index)

# Rows without a usable date fall back to the first day of (year, month)
needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        # Message now names the column that is actually read (DATE_COL)
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(dict(year=df.loc[needs_rebuild, "year"],
                                     month=df.loc[needs_rebuild, "month"],
                                     day=1))
    ds.loc[needs_rebuild] = ds_rebuilt

# Snap every date to the first day of its month so it lines up with freq="MS"
df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()

# ----- 2b) Cleaning -----
# Rows sharing the same (ds, plant, target value) are treated as duplicates;
# the last occurrence is kept.
df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

# ----- 2c) Imputation -----
# Every numeric column is subject to imputation (this includes TARGET_COL if
# it is numeric).
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``group`` in which, for each column in the outer-scope
    ``num_cols``, zeros are turned into NaN and NaNs are then replaced by that
    column's mean within ``group``. Columns with no non-NaN value left are
    returned as all-NaN.
    """
    g = group.copy()
    for c in num_cols:
        # Zeros are treated as missing values
        g[c] = g[c].replace(0, np.nan)
        # Skip the fill when nothing is left to average
        if g[c].notna().any():
            g[c] = g[c].fillna(g[c].mean())
    return g

# Impute within each year when a "year" column exists, otherwise over all rows
if "year" in df.columns:
    df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
else:
    df = impute_zero_nan_with_mean(df)

# Second pass: anything still NaN (e.g. a column empty for a whole year) is
# filled with the column mean over all rows.
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].mean())

# ===== 3) OVERALL SERIES =====
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Sum ``TARGET_COL`` over all rows sharing the same ``ds`` and return the
    result sorted by ``ds`` with the summed column renamed to "y".
    """
    per_date_total = df_in.groupby("ds", as_index=False)[TARGET_COL].sum()
    ordered = per_date_total.sort_values("ds")
    return ordered.rename(columns={TARGET_COL: "y"})

# Build the overall series (columns "ds" and "y").
# NOTE: this rebinds overall / train_overall / test_overall; the frames created
# here carry no "y_norm" column.
overall = prepare_overall_series(df)
overall = overall.sort_values("ds").reset_index(drop=True)

# Train/test split
# The last TEST_MONTHS rows form the test set; when the series is not longer
# than TEST_MONTHS, everything goes to train and the test set is empty.
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()

# ===== 3A) PROPHET =====
# Fitted directly on the unscaled "y" values
m_overall = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False,changepoint_prior_scale=0.1 )
m_overall.fit(train_overall)
# Forecast frame = training dates + one month-start step per test row
future_all = m_overall.make_future_dataframe(periods=len(test_overall), freq="MS")
fcst_all = m_overall.predict(future_all)
# take exactly the test horizon (no overlap with train)


# Score only the forecast rows whose date matches a test row; NaN when there
# is no test data at all.
if not test_overall.empty:
    y_pred_prophet = (fcst_all[["ds", "yhat"]]
              .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    mape_prophet = mean_absolute_percentage_error(y_pred_prophet["y"], y_pred_prophet["yhat"])
else:
    mape_prophet = np.nan
print(mape_prophet)


import matplotlib.pyplot as plt

# Attach the observed values to every forecast row and derive the residual
overall_pred = fcst_all.merge(overall, on="ds", how="left")
overall_pred["residual"] = overall_pred["y"] - overall_pred["yhat"]

# Four stacked panels: fit, trend, yearly seasonality, residuals
fig, axes = plt.subplots(4, 1, figsize=(14, 20))
ax_fit, ax_trend, ax_season, ax_resid = axes
forecast_dates = overall_pred["ds"]

# Panel 1: forecast against actuals, with the uncertainty band
ax_fit.plot(forecast_dates, overall_pred["y"], label="Actual", marker="o")
ax_fit.plot(forecast_dates, overall_pred["yhat"], label="Prophet Forecast", linestyle="--")
ax_fit.fill_between(
    forecast_dates,
    overall_pred["yhat_lower"],
    overall_pred["yhat_upper"],
    color="skyblue",
    alpha=0.3,
    label="Uncertainty",
)
ax_fit.set_title("Forecast vs Actuals")
ax_fit.set_ylabel("Scope1_per_unit")
ax_fit.legend()

# Panel 2: trend component
trend = fcst_all[["ds", "trend"]]
ax_trend.plot(trend["ds"], trend["trend"], color="green")
ax_trend.set_title("Trend Component")
ax_trend.set_ylabel("Trend")

# Panel 3: yearly seasonality component
seasonality = fcst_all[["ds", "yearly"]]
ax_season.plot(seasonality["ds"], seasonality["yearly"], color="orange")
ax_season.set_title("Yearly Seasonality Component")
ax_season.set_ylabel("Seasonality")

# Panel 4: residuals around a zero reference line
ax_resid.plot(forecast_dates, overall_pred["residual"], color="red", marker="o")
ax_resid.axhline(0, color="black", linestyle="--")
ax_resid.set_title("Residuals (Actual - Forecast)")
ax_resid.set_ylabel("Residual")

# Shared cosmetics for all panels
for panel in axes:
    panel.set_xlabel("Date")
    panel.grid(True)

plt.tight_layout()
plt.show()



# Breakdown: plants outside India

In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns

# Global plotting style for the figures below.
# NOTE: `plt` is not imported in the import block of this cell, so this relies
# on an earlier cell having imported matplotlib.pyplot.
plt.style.use("seaborn-v0_8")  # updated style name
sns.set_palette("viridis")

%matplotlib inline

# ===== 1) CONFIG =====
# These module-level names are (re)assigned here and read by the code below.
DATE_COL = "datetime"            # column parsed as the observation date
TARGET_COL = "Scope1_per_unit"   # column aggregated into the series "y"
PLANT_COL = "Plant Name"         # used in the duplicate-row key
FORECAST_MONTHS = 7              # not referenced in the visible part of this cell
TEST_MONTHS = 7                  # number of trailing rows held out as the test set

# ===== 2) START FROM df_2022 =====
df = df_2022.copy()
# Tag each row by location and keep only the plants outside India.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
df["India_Flag"] = np.where(df["Plant Location"] == "India", "India", "Outside India")
df = df[df["India_Flag"] == "Outside India"].copy()
# ----- 2a) Ensure datetime -----
# Parse DATE_COL when present; unparseable entries become NaT and are rebuilt
# from year/month below. (`infer_datetime_format` is dropped: it is deprecated
# in pandas 2.x, where format inference is already the default.)
if DATE_COL in df.columns:
    ds = pd.to_datetime(df[DATE_COL], errors="coerce")
else:
    ds = pd.Series(pd.NaT, index=df.index)

# Rows without a usable date fall back to the first day of (year, month)
needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        # Message now names the column that is actually read (DATE_COL)
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(dict(year=df.loc[needs_rebuild, "year"],
                                     month=df.loc[needs_rebuild, "month"],
                                     day=1))
    ds.loc[needs_rebuild] = ds_rebuilt

# Snap every date to the first day of its month so it lines up with freq="MS"
df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()

# ----- 2b) Cleaning -----
# Rows sharing the same (ds, plant, target value) are treated as duplicates;
# the last occurrence is kept.
df = df.drop_duplicates(subset=["ds", PLANT_COL, TARGET_COL], keep="last")

# ----- 2c) Imputation -----
# Every numeric column is subject to imputation (this includes TARGET_COL if
# it is numeric).
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``group`` in which, for each column in the outer-scope
    ``num_cols``, zeros are turned into NaN and NaNs are then replaced by that
    column's mean within ``group``. Columns with no non-NaN value left are
    returned as all-NaN.
    """
    g = group.copy()
    for c in num_cols:
        # Zeros are treated as missing values
        g[c] = g[c].replace(0, np.nan)
        # Skip the fill when nothing is left to average
        if g[c].notna().any():
            g[c] = g[c].fillna(g[c].mean())
    return g

# Impute within each year when a "year" column exists, otherwise over all rows
if "year" in df.columns:
    df = df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
else:
    df = impute_zero_nan_with_mean(df)

# Second pass: anything still NaN (e.g. a column empty for a whole year) is
# filled with the column mean over all rows.
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].mean())

# ===== 3) OVERALL SERIES =====
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Sum TARGET_COL over all rows of each month.

    Returns a frame with columns ``ds`` and ``y`` (the summed target),
    sorted by ``ds``.
    """
    s = (df_in.groupby("ds", as_index=False)[TARGET_COL]
         .sum()
         .sort_values("ds"))
    return s.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df).reset_index(drop=True)

# Train/test split: the last TEST_MONTHS rows are held out. When the series is
# not longer than TEST_MONTHS everything is used for training and the test set is empty.
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()

# ===== 3A) PROPHET =====
m_overall = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                    weekly_seasonality=False, daily_seasonality=False,
                    changepoint_prior_scale=0.1)
m_overall.fit(train_overall)
future_all = m_overall.make_future_dataframe(periods=len(test_overall), freq="MS")
fcst_all = m_overall.predict(future_all)

if not test_overall.empty:
    # Score on the dates actually present in the hold-out. Dates synthesised by
    # make_future_dataframe only coincide with them when the monthly series has
    # no gaps; otherwise an inner merge would silently drop test rows.
    fcst_test = m_overall.predict(test_overall[["ds"]])
    y_pred_prophet = (fcst_test[["ds", "yhat"]]
                      .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    # sklearn returns MAPE as a fraction (0.10 == 10 %).
    mape_prophet = mean_absolute_percentage_error(y_pred_prophet["y"], y_pred_prophet["yhat"])
else:
    # Keep the name defined so later cells can reference it safely.
    y_pred_prophet = pd.DataFrame(columns=["ds", "yhat", "y"])
    mape_prophet = np.nan
print(mape_prophet)
# ===== 3B/3C) PINN forecast + Prophet-on-residuals stack =====
# The network is re-initialised and retrained until the stacked forecast beats
# the Prophet-only MAPE on the hold-out, or until MAX_RETRAIN_ITERS attempts
# have been made (the loop used to be unbounded).
# NOTE(review): the stopping rule looks at test-set MAPE, i.e. the model is
# selected on the test data; the reported hybrid MAPE is therefore optimistic.
from sklearn.preprocessing import StandardScaler

MAX_RETRAIN_ITERS = 50   # retry budget; raise it if more attempts are acceptable
PINN_SEED = 42           # attempt i is seeded with PINN_SEED + i for reproducibility
PINN_EPOCHS = 10000
PHYS_COL = 'Electricity Grid Energy Per Unit (GJ)'


class PINN(nn.Module):
    """Three-layer fully connected ReLU network: in_dim -> hidden -> hidden -> out_dim."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.net(x)


class MAPELoss(nn.Module):
    """Mean absolute percentage error; ``eps`` guards against division by zero."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true) / (torch.abs(y_true) + self.eps))


def physics_residual_loss_mape(y_pred, features, eps=1e-6):
    """Relative one-sided penalty for predictions below ``features[:, 0] / 8``."""
    elec = features[:, 0]
    physics_estimate = elec/8
    violation = torch.relu(physics_estimate - y_pred.squeeze())
    return torch.mean(violation / (torch.abs(physics_estimate) + eps))


mape_loss_fn = MAPELoss()

# ---- Inputs that do not change between attempts ----
# Time index as the only network feature.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time = np.arange(len(train_overall), len(overall)).reshape(-1, 1)
y_train = train_overall["y"].values.reshape(-1, 1)

# Physics features (optional): monthly sum of the electricity column, divided by 1000.
if PHYS_COL in df.columns:
    agg = df.groupby("ds")[[PHYS_COL]].sum().reset_index()
    agg = agg.sort_values("ds").reset_index(drop=True)
    X_phys = (agg[[PHYS_COL]]/1000).values
else:
    X_phys = np.ones((len(overall), 3))

X_phys_train = X_phys[:len(train_overall)]
X_phys_test = X_phys[len(train_overall):]

# ==== Standardize features & target (scalers are fitted on training data only) ====
scaler_X = StandardScaler()
scaler_Y = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_X.fit_transform(train_time)
test_time_scaled = scaler_X.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)
X_phys_test_scaled = scaler_phys.transform(X_phys_test)

# Torch tensors (scaled data)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_t = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
Y_t = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

flag = True
itr = 0
while flag:
    # Fresh, reproducibly seeded initialisation for every attempt.
    torch.manual_seed(PINN_SEED + itr)
    model = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train PINN (full-batch)
    model.train()
    for epoch in range(PINN_EPOCHS):
        optimizer.zero_grad()
        y_pred = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        # Evaluated for inspection only: the objective below is the data (MSE)
        # term alone, exactly as before.
        phys_loss = physics_residual_loss_mape(y_pred, X_phys_t)
        loss = data_loss
        loss.backward()
        optimizer.step()

    # Predictions
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled = model(X_test_t).cpu().numpy()

    # Inverse transform to original units
    y_train_pred = scaler_Y.inverse_transform(y_train_pred_scaled)
    y_test_pred = scaler_Y.inverse_transform(y_test_pred_scaled)

    train_overall["pinn_pred"] = y_train_pred.flatten()
    test_overall["pinn_pred"] = y_test_pred.flatten()

    if not test_overall.empty:
        mape_pinn = mean_absolute_percentage_error(test_overall["y"], test_overall["pinn_pred"])
    else:
        mape_pinn = np.nan

    # ===== 3C) PINN + PROPHET RESIDUAL STACK =====
    train_overall["residual"] = train_overall["y"] - train_overall["pinn_pred"]
    test_overall["residual"] = test_overall["y"] - test_overall["pinn_pred"]

    train_res = train_overall[["ds", "residual"]].rename(columns={"residual": "y"})
    test_res = test_overall[["ds", "residual"]].rename(columns={"residual": "y"})

    m_res = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                    weekly_seasonality=False, daily_seasonality=False,
                    changepoint_prior_scale=0.1)
    m_res.fit(train_res)

    future_res = m_res.make_future_dataframe(periods=len(test_res), freq="MS")
    forecast_res = m_res.predict(future_res)

    # Rows after the training history are the residual forecast for the test horizon.
    res_pred = forecast_res["yhat"].iloc[len(train_res):].values
    test_overall["final_pred"] = test_overall["pinn_pred"].values + res_pred

    if not test_overall.empty:
        mape_pinn_prophet = mean_absolute_percentage_error(test_overall["y"], test_overall["final_pred"])
    else:
        mape_pinn_prophet = np.nan

    itr += 1
    if mape_pinn_prophet < mape_prophet:
        print(mape_pinn_prophet, mape_prophet)
        flag = False
    else:
        print(f'itr: {itr}, mape_pinn_prophet: {mape_pinn_prophet}, mape_prophet: {mape_prophet}')
        if itr >= MAX_RETRAIN_ITERS:
            # Give up instead of looping forever; the last attempt's results are kept.
            print(f'Stopping after {itr} attempts: hybrid did not beat Prophet-only.')
            flag = False


print("Iterations:",itr)
import matplotlib.pyplot as plt

# Residuals of the PINN (actual minus PINN prediction) for the training and
# hold-out periods, drawn on one axis around a zero reference line.
fig, ax = plt.subplots(figsize=(14, 6))

residual_series = (
    (train_res, dict(label="Train Residuals", marker="o", linestyle="-", color="blue")),
    (test_res, dict(label="Test Residuals", marker="s", linestyle="--", color="red")),
)
for frame, line_style in residual_series:
    ax.plot(frame["ds"], frame["y"], **line_style)

ax.axhline(0, color="black", linestyle="--", linewidth=1)

ax.set_title("Residuals: Train vs Test (PINN Predictions)")
ax.set_xlabel("Date")
ax.set_ylabel("Residual (y - y_pred)")
ax.legend()
ax.grid(True)
plt.show()

# ===== 4) RESULTS =====
# The MAPE values are printed as fractions (as returned by sklearn), two decimals.
print("\n--- RESULTS ---")
for title, value in (("Prophet-only", mape_prophet),
                     ("PINN-only", mape_pinn),
                     ("PINN + Prophet", mape_pinn_prophet)):
    print(f"{title} MAPE: {value:.2f}")


In [None]:
import matplotlib.pyplot as plt

# Collect the hold-out forecasts of the three approaches (empty lists when
# there is no hold-out, so nothing is drawn for them).
has_test = not test_overall.empty
predictions = {
    "Prophet-only": y_pred_prophet["yhat"].values if has_test else [],
    "PINN-only": test_overall["pinn_pred"].values if has_test else [],
    "PINN + Prophet": test_overall["final_pred"].values if has_test else []
}

# MAPE per approach, as fractions (sklearn convention: 0.10 == 10 %).
results = {
    "Prophet-only": mape_prophet,
    "PINN-only": mape_pinn,
    "PINN + Prophet": mape_pinn_prophet
}

plt.figure(figsize=(14, 6))

# Training data (actuals)
plt.plot(train_overall["ds"], train_overall["y"],
         label="Train (Actual)", color="black", linewidth=2)

# Test data (actuals)
plt.plot(test_overall["ds"], test_overall["y"],
         label="Test (Actual)", color="blue", linewidth=2)

# Forecasts
for name, y_pred in predictions.items():
    if len(y_pred) > 0:  # only plot if available
        # Prophet's forecast carries its own dates; use them so x and y always
        # have the same length.
        forecast_dates = y_pred_prophet["ds"] if name == "Prophet-only" else test_overall["ds"]
        # `:.2%` scales the fraction by 100, so the "%" in the label is correct.
        plt.plot(forecast_dates, y_pred, '--',
                 label=f"{name} (MAPE {results[name]:.2%})", linewidth=2)

# Vertical line for train/test split
if has_test:
    split_date = test_overall["ds"].iloc[0]
    plt.axvline(split_date, color="gray", linestyle="--", label="Train/Test Split")

plt.title("Carbon Emissions Forecast (Outside India): Prophet vs PINN vs Hybrid")
plt.xlabel("Date")
plt.ylabel("Monthly Emissions (Scope 1)")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns

plt.style.use("seaborn-v0_8")  # updated style name
sns.set_palette("viridis")

%matplotlib inline

# ===== 1) CONFIG =====
DATE_COL = "datetime"
TARGET_COL = "Scope1_per_unit"
PLANT_COL = "Plant Name"
FORECAST_MONTHS = 7
TEST_MONTHS = 7

# ===== 2) START FROM df_2020 =====
df = df_2022.copy()
df["India_Flag"] = df["Plant Location"].apply(lambda x: "India" if x=="India" else "Outside India")
df=df[df['India_Flag']=='India']
# ----- 2a) Ensure datetime -----
if DATE_COL in df.columns:
    ds = pd.to_datetime(df[DATE_COL], errors="coerce", infer_datetime_format=True)
else:
    ds = pd.Series(pd.NaT, index=df.index)

needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError("No valid 'Current Date' and missing 'year'/'month' to rebuild dates.")
    ds_rebuilt = pd.to_datetime(dict(year=df.loc[needs_rebuild, "year"],
                                     month=df.loc[needs_rebuild, "month"],
                                     day=1))
    ds.loc[needs_rebuild] = ds_rebuilt

df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()

# ----- 2b) Cleaning -----
# Of rows sharing the same month, plant and target value, only the last is kept.
df = df.drop_duplicates(keep="last", subset=["ds", PLANT_COL, TARGET_COL])

# ----- 2c) Imputation -----
num_cols = list(df.select_dtypes(include=[np.number]).columns)


def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with zeros/NaNs in numeric columns mean-filled.

    For every column named in the module-level ``num_cols``, zeros are turned
    into NaN and the gaps are then filled with the mean of the remaining
    values. A column left with no value at all stays entirely NaN.
    """
    filled = group.copy()
    for name in num_cols:
        series = filled[name].replace(0, np.nan)
        if series.notna().any():
            series = series.fillna(series.mean())
        filled[name] = series
    return filled


# First pass: fill inside each "year" group when that column exists,
# otherwise across the whole frame at once.
df = (
    df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    if "year" in df.columns
    else impute_zero_nan_with_mean(df)
)

# Second pass: anything still missing gets the mean of the whole column.
for name in num_cols:
    column = df[name]
    if column.isna().any():
        df[name] = column.fillna(column.mean())

# ===== 3) OVERALL SERIES =====
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Sum TARGET_COL over all rows of each month.

    Returns a frame with columns ``ds`` and ``y`` (the summed target),
    sorted by ``ds``.
    """
    s = (df_in.groupby("ds", as_index=False)[TARGET_COL]
         .sum()
         .sort_values("ds"))
    return s.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df).reset_index(drop=True)

# Train/test split: the last TEST_MONTHS rows are held out. When the series is
# not longer than TEST_MONTHS everything is used for training and the test set is empty.
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()

# ===== 3A) PROPHET =====
m_overall = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                    weekly_seasonality=False, daily_seasonality=False,
                    changepoint_prior_scale=0.1)
m_overall.fit(train_overall)
future_all = m_overall.make_future_dataframe(periods=len(test_overall), freq="MS")
fcst_all = m_overall.predict(future_all)

if not test_overall.empty:
    # Score on the dates actually present in the hold-out. Dates synthesised by
    # make_future_dataframe only coincide with them when the monthly series has
    # no gaps; otherwise an inner merge would silently drop test rows.
    fcst_test = m_overall.predict(test_overall[["ds"]])
    y_pred_prophet = (fcst_test[["ds", "yhat"]]
                      .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    # sklearn returns MAPE as a fraction (0.10 == 10 %).
    mape_prophet = mean_absolute_percentage_error(y_pred_prophet["y"], y_pred_prophet["yhat"])
else:
    # Keep the name defined so later cells can reference it safely.
    y_pred_prophet = pd.DataFrame(columns=["ds", "yhat", "y"])
    mape_prophet = np.nan
print(mape_prophet)
# ===== 3B/3C) PINN forecast + Prophet-on-residuals stack =====
# The network is re-initialised and retrained until the stacked forecast beats
# the Prophet-only MAPE on the hold-out, or until MAX_RETRAIN_ITERS attempts
# have been made (the loop used to be unbounded).
# NOTE(review): the stopping rule looks at test-set MAPE, i.e. the model is
# selected on the test data; the reported hybrid MAPE is therefore optimistic.
from sklearn.preprocessing import StandardScaler

MAX_RETRAIN_ITERS = 50   # retry budget; raise it if more attempts are acceptable
PINN_SEED = 42           # attempt i is seeded with PINN_SEED + i for reproducibility
PINN_EPOCHS = 10000
PHYS_COL = 'Electricity Grid Energy Per Unit (GJ)'


class PINN(nn.Module):
    """Three-layer fully connected ReLU network: in_dim -> hidden -> hidden -> out_dim."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.net(x)


class MAPELoss(nn.Module):
    """Mean absolute percentage error; ``eps`` guards against division by zero."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true) / (torch.abs(y_true) + self.eps))


def physics_residual_loss_mape(y_pred, features, eps=1e-6):
    """Relative one-sided penalty for predictions below ``features[:, 0] / 8``."""
    elec = features[:, 0]
    physics_estimate = elec/8
    violation = torch.relu(physics_estimate - y_pred.squeeze())
    return torch.mean(violation / (torch.abs(physics_estimate) + eps))


mape_loss_fn = MAPELoss()

# ---- Inputs that do not change between attempts ----
# Time index as the only network feature.
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time = np.arange(len(train_overall), len(overall)).reshape(-1, 1)
y_train = train_overall["y"].values.reshape(-1, 1)

# Physics features (optional): monthly sum of the electricity column, divided by 1000.
if PHYS_COL in df.columns:
    agg = df.groupby("ds")[[PHYS_COL]].sum().reset_index()
    agg = agg.sort_values("ds").reset_index(drop=True)
    X_phys = (agg[[PHYS_COL]]/1000).values
else:
    X_phys = np.ones((len(overall), 3))

X_phys_train = X_phys[:len(train_overall)]
X_phys_test = X_phys[len(train_overall):]

# ==== Standardize features & target (scalers are fitted on training data only) ====
scaler_X = StandardScaler()
scaler_Y = StandardScaler()
scaler_phys = StandardScaler()

train_time_scaled = scaler_X.fit_transform(train_time)
test_time_scaled = scaler_X.transform(test_time)

y_train_scaled = scaler_Y.fit_transform(y_train)

X_phys_train_scaled = scaler_phys.fit_transform(X_phys_train)
X_phys_test_scaled = scaler_phys.transform(X_phys_test)

# Torch tensors (scaled data)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_t = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
Y_t = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_phys_train_scaled, dtype=torch.float32).to(device)

flag = True
itr = 0
while flag:
    # Fresh, reproducibly seeded initialisation for every attempt.
    torch.manual_seed(PINN_SEED + itr)
    model = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train PINN (full-batch)
    model.train()
    for epoch in range(PINN_EPOCHS):
        optimizer.zero_grad()
        y_pred = model(X_t)
        data_loss = torch.mean((y_pred - Y_t) ** 2)
        # Evaluated for inspection only: the objective below is the data (MSE)
        # term alone, exactly as before.
        phys_loss = physics_residual_loss_mape(y_pred, X_phys_t)
        loss = data_loss
        loss.backward()
        optimizer.step()

    # Predictions
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled = model(X_test_t).cpu().numpy()

    # Inverse transform to original units
    y_train_pred = scaler_Y.inverse_transform(y_train_pred_scaled)
    y_test_pred = scaler_Y.inverse_transform(y_test_pred_scaled)

    train_overall["pinn_pred"] = y_train_pred.flatten()
    test_overall["pinn_pred"] = y_test_pred.flatten()

    if not test_overall.empty:
        mape_pinn = mean_absolute_percentage_error(test_overall["y"], test_overall["pinn_pred"])
    else:
        mape_pinn = np.nan

    # ===== 3C) PINN + PROPHET RESIDUAL STACK =====
    train_overall["residual"] = train_overall["y"] - train_overall["pinn_pred"]
    test_overall["residual"] = test_overall["y"] - test_overall["pinn_pred"]

    train_res = train_overall[["ds", "residual"]].rename(columns={"residual": "y"})
    test_res = test_overall[["ds", "residual"]].rename(columns={"residual": "y"})

    m_res = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                    weekly_seasonality=False, daily_seasonality=False,
                    changepoint_prior_scale=0.1)
    m_res.fit(train_res)

    future_res = m_res.make_future_dataframe(periods=len(test_res), freq="MS")
    forecast_res = m_res.predict(future_res)

    # Rows after the training history are the residual forecast for the test horizon.
    res_pred = forecast_res["yhat"].iloc[len(train_res):].values
    test_overall["final_pred"] = test_overall["pinn_pred"].values + res_pred

    if not test_overall.empty:
        mape_pinn_prophet = mean_absolute_percentage_error(test_overall["y"], test_overall["final_pred"])
    else:
        mape_pinn_prophet = np.nan

    itr += 1
    if mape_pinn_prophet < mape_prophet:
        print(mape_pinn_prophet, mape_prophet)
        flag = False
    else:
        print(f'itr: {itr}, mape_pinn_prophet: {mape_pinn_prophet}, mape_prophet: {mape_prophet}')
        if itr >= MAX_RETRAIN_ITERS:
            # Give up instead of looping forever; the last attempt's results are kept.
            print(f'Stopping after {itr} attempts: hybrid did not beat Prophet-only.')
            flag = False


print("Iterations:",itr)
import matplotlib.pyplot as plt

# Residuals of the PINN (actual minus PINN prediction) for the training and
# hold-out periods, drawn on one axis around a zero reference line.
fig, ax = plt.subplots(figsize=(14, 6))

residual_series = (
    (train_res, dict(label="Train Residuals", marker="o", linestyle="-", color="blue")),
    (test_res, dict(label="Test Residuals", marker="s", linestyle="--", color="red")),
)
for frame, line_style in residual_series:
    ax.plot(frame["ds"], frame["y"], **line_style)

ax.axhline(0, color="black", linestyle="--", linewidth=1)

ax.set_title("Residuals: Train vs Test (PINN Predictions)")
ax.set_xlabel("Date")
ax.set_ylabel("Residual (y - y_pred)")
ax.legend()
ax.grid(True)
plt.show()

# ===== 4) RESULTS =====
# The MAPE values are printed as fractions (as returned by sklearn), two decimals.
print("\n--- RESULTS ---")
for title, value in (("Prophet-only", mape_prophet),
                     ("PINN-only", mape_pinn),
                     ("PINN + Prophet", mape_pinn_prophet)):
    print(f"{title} MAPE: {value:.2f}")


In [None]:
import matplotlib.pyplot as plt

# Collect the hold-out forecasts of the three approaches (empty lists when
# there is no hold-out, so nothing is drawn for them).
has_test = not test_overall.empty
predictions = {
    "Prophet-only": y_pred_prophet["yhat"].values if has_test else [],
    "PINN-only": test_overall["pinn_pred"].values if has_test else [],
    "PINN + Prophet": test_overall["final_pred"].values if has_test else []
}

# MAPE per approach, as fractions (sklearn convention: 0.10 == 10 %).
results = {
    "Prophet-only": mape_prophet,
    "PINN-only": mape_pinn,
    "PINN + Prophet": mape_pinn_prophet
}

plt.figure(figsize=(14, 6))

# Training data (actuals)
plt.plot(train_overall["ds"], train_overall["y"],
         label="Train (Actual)", color="black", linewidth=2)

# Test data (actuals)
plt.plot(test_overall["ds"], test_overall["y"],
         label="Test (Actual)", color="blue", linewidth=2)

# Forecasts
for name, y_pred in predictions.items():
    if len(y_pred) > 0:  # only plot if available
        # Prophet's forecast carries its own dates; use them so x and y always
        # have the same length.
        forecast_dates = y_pred_prophet["ds"] if name == "Prophet-only" else test_overall["ds"]
        # `:.2%` scales the fraction by 100, so the "%" in the label is correct.
        plt.plot(forecast_dates, y_pred, '--',
                 label=f"{name} (MAPE {results[name]:.2%})", linewidth=2)

# Vertical line for train/test split
if has_test:
    split_date = test_overall["ds"].iloc[0]
    plt.axvline(split_date, color="gray", linestyle="--", label="Train/Test Split")

# The preceding modelling cell keeps only rows with India_Flag == 'India'.
plt.title("Carbon Emissions Forecast (India): Prophet vs PINN vs Hybrid")
plt.xlabel("Date")
plt.ylabel("Monthly Emissions (Scope 1)")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


# Other Models

In [None]:
# Start again from a fresh copy of df_2022 (earlier cells rebound `df` to
# filtered/imputed versions) and print its column dtypes and non-null counts.
df=df_2022.copy()
df.info()

In [None]:
# Display the column index of the working frame.
df.columns

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

# ---------------------------
# Feature Engineering
# ---------------------------
def create_features(df_final):
    """Aggregate per-date totals and append calendar features.

    The four value columns are summed per distinct ``datetime``; ``datetime``
    is renamed to ``ds`` and ``Scope1_per_unit`` to ``y``. The columns
    ``year``, ``month``, ``quarter``, ``dayofyear`` and the cyclic encodings
    ``sin_month`` / ``cos_month`` are derived from ``ds`` and appended in that
    order.

    NOTE(review): the same-date ``Scope1`` and production totals stay in the
    output next to ``y`` - confirm they are meant to be used as predictors.
    """
    value_cols = [
        'Scope1_per_unit',
        'Electricity Grid Energy Per Unit (GJ)',
        'Scope1',
        'Production Actual Quantity (MT/Month)',
    ]
    monthly = (
        df_final.groupby('datetime')[value_cols]
        .sum()
        .reset_index()
        .rename(columns={'datetime': 'ds', 'Scope1_per_unit': 'y'})
    )
    stamp = monthly['ds'].dt
    month_no = stamp.month
    angle = 2 * np.pi * month_no/12
    return monthly.assign(
        year=stamp.year,
        month=month_no,
        quarter=stamp.quarter,
        dayofyear=stamp.dayofyear,
        sin_month=np.sin(angle),
        cos_month=np.cos(angle),
    )

df_ml = create_features(df)

# Train/Test split (same as earlier): train on [2021-01-01, 2025-01-01),
# test on [2025-01-01, 2026-05-01].
train_ml = df_ml[df_ml['ds'].between('2021-01-01', '2025-01-01', inclusive='left')]
test_ml = df_ml[df_ml['ds'].between('2025-01-01', '2026-05-01')]

# Everything except the date and the target is used as a predictor.
X_train, y_train = train_ml.drop(columns=['ds', 'y']), train_ml['y']
X_test, y_test = test_ml.drop(columns=['ds', 'y']), test_ml['y']

# ---------------------------
# Machine Learning Models
# ---------------------------

models = dict(
    RandomForest=RandomForestRegressor(n_estimators=200, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=200, random_state=42),
    XGBoost=XGBRegressor(n_estimators=200, random_state=42),
    SVR=SVR(kernel='rbf', C=200, gamma=0.1),
)

results = {}

import matplotlib.pyplot as plt

# ---------------------------
# Train ML models and store predictions
# ---------------------------
predictions = {}

# Fit each estimator on the training window; keep its test forecast and its
# test MAPE (a fraction, as returned by sklearn).
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions[name] = y_pred = model.predict(X_test)
    results[name] = mape = mean_absolute_percentage_error(y_test, y_pred)

# ---------------------------
# Plot Train, Test, and Predictions
# ---------------------------
plt.figure(figsize=(18,8))

# Training data
plt.plot(train_ml['ds'], y_train, label="Train", color="black", linewidth=2)

# Test actual
plt.plot(test_ml['ds'], y_test, label="Test (Actual)", color="blue", linewidth=2)

# Forecasts from ML models.
# `results` holds MAPE as a fraction (sklearn convention); `:.2%` multiplies by
# 100 and appends "%", so the label shows a real percentage.
for name, y_pred in predictions.items():
    plt.plot(test_ml['ds'], y_pred, '--', label=f"{name} (MAPE {results[name]:.2%})", linewidth=2)

# Vertical line for train/test split
plt.axvline(pd.Timestamp("2025-01-01"), color="gray", linestyle="--", label="Train/Test Split")

plt.title("Scope1 Emissions Forecast (Machine Learning): RF, Gradient Boosting, XGBoost, SVR")
plt.xlabel("Date")
plt.ylabel("Total Scope1 per Unit of production MT")
plt.legend()
plt.grid(alpha=0.3)
plt.show()


# ---------------------------
# Compare All Models
# ---------------------------
print("\nModel Comparison:")
# Loop variables are named so they do not overwrite the fitted `model` estimator.
for model_name, model_mape in results.items():
    print(f"{model_name}: {model_mape:.2%}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_percentage_error

# ===== 1) CONFIG =====
DATE_COL = "datetime"            # timestamp column of the source frame
TARGET_COL = "Scope1_per_unit"   # series that is aggregated and forecast
PLANT_COL = "Plant Name"
FORECAST_MONTHS = 7              # not referenced anywhere else in this cell
TEST_MONTHS = 7                  # trailing rows of the monthly series held out for evaluation

# ===== 2) START FROM df_2022 =====
df = df_2022.copy()
# ----- 2a) Ensure datetime -----
# Unparseable dates become NaT and are rebuilt from year/month below.
# (`infer_datetime_format` is deprecated in pandas 2.x and therefore not passed.)
if DATE_COL in df.columns:
    ds = pd.to_datetime(df[DATE_COL], errors="coerce")
else:
    ds = pd.Series(pd.NaT, index=df.index)

needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError(
            f"No valid '{DATE_COL}' values and missing 'year'/'month' columns to rebuild dates."
        )
    ds_rebuilt = pd.to_datetime(dict(year=df.loc[needs_rebuild, "year"],
                                     month=df.loc[needs_rebuild, "month"],
                                     day=1))
    ds.loc[needs_rebuild] = ds_rebuilt

# Snap every timestamp to the first day of its month so rows aggregate per month.
df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()

# ----- 2b) Cleaning -----
# Of rows sharing the same month, plant and target value, only the last is kept.
df = df.drop_duplicates(keep="last", subset=["ds", PLANT_COL, TARGET_COL])

# ----- 2c) Imputation -----
num_cols = list(df.select_dtypes(include=[np.number]).columns)


def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with zeros/NaNs in numeric columns mean-filled.

    For every column named in the module-level ``num_cols``, zeros are turned
    into NaN and the gaps are then filled with the mean of the remaining
    values. A column left with no value at all stays entirely NaN.
    """
    filled = group.copy()
    for name in num_cols:
        series = filled[name].replace(0, np.nan)
        if series.notna().any():
            series = series.fillna(series.mean())
        filled[name] = series
    return filled


# First pass: fill inside each "year" group when that column exists,
# otherwise across the whole frame at once.
df = (
    df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    if "year" in df.columns
    else impute_zero_nan_with_mean(df)
)

# Second pass: anything still missing gets the mean of the whole column.
for name in num_cols:
    column = df[name]
    if column.isna().any():
        df[name] = column.fillna(column.mean())

# ===== 3) OVERALL SERIES =====
# Prophet is not among this cell's imports; import it here so the cell does not
# depend on an earlier cell having been run.
from prophet import Prophet


def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Sum TARGET_COL over all rows of each month.

    Returns a frame with columns ``ds`` and ``y`` (the summed target),
    sorted by ``ds``.
    """
    s = (df_in.groupby("ds", as_index=False)[TARGET_COL]
         .sum()
         .sort_values("ds"))
    return s.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df).reset_index(drop=True)

# Train/test split: the last TEST_MONTHS rows are held out. When the series is
# not longer than TEST_MONTHS everything is used for training and the test set is empty.
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
train_overall = overall.iloc[:split_point].copy()
test_overall = overall.iloc[split_point:].copy()

# ===== 3A) PROPHET =====
m_overall = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                    weekly_seasonality=False, daily_seasonality=False,
                    changepoint_prior_scale=0.1)
m_overall.fit(train_overall)
future_all = m_overall.make_future_dataframe(periods=len(test_overall), freq="MS")
fcst_all = m_overall.predict(future_all)

if not test_overall.empty:
    # Score on the dates actually present in the hold-out. Dates synthesised by
    # make_future_dataframe only coincide with them when the monthly series has
    # no gaps; otherwise an inner merge would silently drop test rows.
    fcst_test = m_overall.predict(test_overall[["ds"]])
    y_pred_prophet = (fcst_test[["ds", "yhat"]]
                      .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    # sklearn returns MAPE as a fraction (0.10 == 10 %).
    mape_prophet = mean_absolute_percentage_error(y_pred_prophet["y"], y_pred_prophet["yhat"])
else:
    # Keep the name defined so later code can reference it safely.
    y_pred_prophet = pd.DataFrame(columns=["ds", "yhat", "y"])
    mape_prophet = np.nan
print(mape_prophet)

# Number of hold-out points every model has to forecast.
n_test = len(test_overall)

# -------------------------
# (2) Holt-Winters (monthly seasonality)
# -------------------------
# Additive trend and additive 12-period seasonality.
hw_model = ExponentialSmoothing(train_overall['y'],
                                trend='add',
                                seasonal='add',
                                seasonal_periods=12).fit()
hw_forecast = hw_model.forecast(n_test)

# -------------------------
# (3) ARIMA
# -------------------------
arima_model = ARIMA(train_overall['y'], order=(12,1,12))
arima_fit = arima_model.fit()
arima_forecast = arima_fit.forecast(n_test)

# Seasonal ARIMA (orders chosen by pmdarima's stepwise search)
import pmdarima as pm

sarima_model = pm.auto_arima(train_overall['y'],
                             seasonal=True,
                             m=12,  # 12 months in a seasonal cycle
                             stepwise=True,
                             suppress_warnings=True)
sarima_forecast = sarima_model.predict(n_periods=n_test)

# -------------------------
# Evaluate MAPE
# -------------------------
# sklearn returns a fraction; these three are stored as percentages (x100).
sarima_mape = mean_absolute_percentage_error(test_overall['y'], sarima_forecast) * 100
hw_mape = mean_absolute_percentage_error(test_overall['y'], hw_forecast) * 100
arima_mape = mean_absolute_percentage_error(test_overall['y'], arima_forecast) * 100
# mape_prophet is still a fraction, so scale it here to match the other lines.
print(f"Prophet MAPE: {mape_prophet * 100:.2f}%")
print(f"Holt-Winters MAPE: {hw_mape:.2f}%")
print(f"ARIMA MAPE: {arima_mape:.2f}%")
print(f"SARIMA MAPE: {sarima_mape:.2f}%")



# -------------------------
# Plot Comparison
# -------------------------


plt.figure(figsize=(14,5))
plt.plot(train_overall['ds'], train_overall['y'], label="Train", color = 'black', linewidth=2)
plt.plot(test_overall['ds'], test_overall['y'], label="Test (Actual)", color='blue',linewidth=2)

# Prophet forecast: use y_pred_prophet, the frame this cell builds and scores
# (the previous `eval_df` is never defined in this cell). mape_prophet is a
# fraction, hence the x100 for the percentage label.
plt.plot(y_pred_prophet['ds'], y_pred_prophet['yhat'], '--',
         label=f"Prophet (MAPE={mape_prophet * 100:.2f}%)")

# Holt-Winters forecast
plt.plot(test_overall['ds'], hw_forecast, '--', label=f"Holt-Winters (MAPE={hw_mape:.2f}%)")

# SARIMA / ARIMA forecasts
plt.plot(test_overall['ds'], sarima_forecast, '--', label=f"SARIMA (MAPE={sarima_mape:.2f}%)")
plt.plot(test_overall['ds'], arima_forecast, '--', label=f"ARIMA (MAPE={arima_mape:.2f}%)")

# Mark the real split: the first hold-out date (the split is "last TEST_MONTHS
# rows", not a fixed calendar date).
if not test_overall.empty:
    plt.axvline(test_overall['ds'].iloc[0], color='gray', linestyle='--', label="Train/Test Split")
plt.title("Carbon Emissions Forecast (Classical Models): Prophet, Holt-Winters, SARIMA, ARIMA ")
plt.xlabel("Date"); plt.ylabel("Total Emissions")
plt.legend(); plt.grid(alpha=0.3)
plt.show()


# SINDy

In [None]:
# Column names and horizon lengths used by the SINDy experiment below.
DATE_COL = "datetime"
TARGET_COL = "Scope1_per_unit"
PLANT_COL = "Plant Name"
FORECAST_MONTHS = 7
TEST_MONTHS = 7

# ===== 2) START FROM df_2022 =====
# Fresh copy of the source frame; .info() prints its dtypes and non-null counts.
df = df_2022.copy()
df.info()

In [None]:
# Candidate driver columns for the SINDy model. The electricity column name is
# written without a trailing space, matching how the column is referenced
# elsewhere in this notebook.
col=['Scope_2','totalWaterConsumption','Electricity Grid Energy Per Unit (GJ)']

In [None]:
# ===== 0) Imports =====
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
from prophet import Prophet
import pysindy as ps
from torch.utils.data import TensorDataset, DataLoader

plt.style.use("seaborn-v0_8")  # updated style name
sns.set_palette("viridis")

%matplotlib inline

# ===== 1) CONFIG =====
DATE_COL = "datetime"
TARGET_COL = "Scope1_per_unit"
PLANT_COL = "Plant Name"
FORECAST_MONTHS = 7
TEST_MONTHS = 7

col1='Scope_2'
col2='totalWaterConsumption'
col3='Electricity Grid Energy Per Unit (GJ)'

# ===== 2) START FROM df_2020 =====
df = df_2022.copy()
# ----- 2a) Ensure datetime -----
if DATE_COL in df.columns:
    ds = pd.to_datetime(df[DATE_COL], errors="coerce", infer_datetime_format=True)
else:
    ds = pd.Series(pd.NaT, index=df.index)

needs_rebuild = ds.isna()
if needs_rebuild.any():
    if not {"year", "month"}.issubset(df.columns):
        raise ValueError("No valid 'Current Date' and missing 'year'/'month' to rebuild dates.")
    ds_rebuilt = pd.to_datetime(dict(year=df.loc[needs_rebuild, "year"],
                                     month=df.loc[needs_rebuild, "month"],
                                     day=1))
    ds.loc[needs_rebuild] = ds_rebuilt

df["ds"] = pd.to_datetime(ds).dt.to_period("M").dt.to_timestamp()

# ----- 2b) Cleaning -----
# Of rows sharing the same month, plant and target value, only the last is kept.
df = df.drop_duplicates(keep="last", subset=["ds", PLANT_COL, TARGET_COL])

# ----- 2c) Imputation -----
num_cols = list(df.select_dtypes(include=[np.number]).columns)


def impute_zero_nan_with_mean(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with zeros/NaNs in numeric columns mean-filled.

    For every column named in the module-level ``num_cols``, zeros are turned
    into NaN and the gaps are then filled with the mean of the remaining
    values. A column left with no value at all stays entirely NaN.
    """
    filled = group.copy()
    for name in num_cols:
        series = filled[name].replace(0, np.nan)
        if series.notna().any():
            series = series.fillna(series.mean())
        filled[name] = series
    return filled


# First pass: fill inside each "year" group when that column exists,
# otherwise across the whole frame at once.
df = (
    df.groupby("year", group_keys=False).apply(impute_zero_nan_with_mean)
    if "year" in df.columns
    else impute_zero_nan_with_mean(df)
)

# Second pass: anything still missing gets the mean of the whole column.
for name in num_cols:
    column = df[name]
    if column.isna().any():
        df[name] = column.fillna(column.mean())

# ===== 3) OVERALL SERIES =====
def prepare_overall_series(df_in: pd.DataFrame) -> pd.DataFrame:
    """Sum the target and the three driver columns over all rows of each month.

    Returns a frame with ``ds``, ``y`` (the summed TARGET_COL) and the summed
    ``col1``/``col2``/``col3`` columns, sorted by ``ds``.
    """
    s = (df_in.groupby("ds", as_index=False)[[TARGET_COL,col1,col2,col3]]
         .sum()
         .sort_values("ds"))
    return s.rename(columns={TARGET_COL: "y"})

overall = prepare_overall_series(df).reset_index(drop=True)

# Train/test split: the last TEST_MONTHS rows are held out. When the series is
# not longer than TEST_MONTHS everything is used for training and the test set is empty.
split_point = len(overall) - TEST_MONTHS if len(overall) > TEST_MONTHS else len(overall)
# Prophet / PINN only see the date and the target ...
train_overall = overall[['ds','y']].iloc[:split_point].copy()
test_overall = overall[['ds','y']].iloc[split_point:].copy()

# ... while the SINDy frames keep the driver columns as well.
sindyTrain=overall.iloc[:split_point].copy()
sindyTest=overall.iloc[split_point:].copy()



# ===== 3A) PROPHET =====
m_overall = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True,
                    weekly_seasonality=False, daily_seasonality=False,
                    changepoint_prior_scale=0.1)
m_overall.fit(train_overall)
future_all = m_overall.make_future_dataframe(periods=len(test_overall), freq="MS")
fcst_all = m_overall.predict(future_all)

if not test_overall.empty:
    # Score on the dates actually present in the hold-out. Dates synthesised by
    # make_future_dataframe only coincide with them when the monthly series has
    # no gaps; otherwise an inner merge would silently drop test rows.
    fcst_test = m_overall.predict(test_overall[["ds"]])
    y_pred_prophet = (fcst_test[["ds", "yhat"]]
                      .merge(test_overall[["ds", "y"]], on="ds", how="inner"))
    # sklearn returns MAPE as a fraction (0.10 == 10 %).
    mape_prophet = mean_absolute_percentage_error(y_pred_prophet["y"], y_pred_prophet["yhat"])
else:
    # Keep the name defined so later code can reference it safely.
    y_pred_prophet = pd.DataFrame(columns=["ds", "yhat", "y"])
    mape_prophet = np.nan
print(mape_prophet)



# SINDy
import numpy as np

# Drivers = every column of the train frame except the target and the date.
X_train = sindyTrain.drop(columns=['y', 'ds'])
y_train = sindyTrain['y']

# Standardise drivers and target separately, fitted on the train rows only.
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
y_train_scaled = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1)).flatten()

# -------------------------
# 3) Build SINDy model using precomputed derivatives
# -------------------------
# SINDy state: the scaled drivers followed by the scaled target as the last column.
X_sindy = np.column_stack((X_train_scaled, y_train_scaled)).astype(float)
N = len(X_sindy)

# Uniform time grid, one step per row.
dt = 1.0
t = dt * np.arange(N)  # 1D numeric time vector

# Finite-difference derivative of every state column along the time axis.
x_dot = np.gradient(X_sindy, t, axis=0)

# Degree-2 polynomial library + sequentially thresholded least squares,
# fitted against the precomputed derivatives.
feature_library = ps.PolynomialLibrary(degree=2)
optimizer = ps.STLSQ(threshold=0.05)  # lower threshold so we don't prune everything
model_sindy = ps.SINDy(optimizer=optimizer, feature_library=feature_library)
model_sindy.fit(X_sindy, x_dot=x_dot, t=dt)
print("Discovered SINDy equations:")
model_sindy.print()

def sindy_predict_dy_dt(X_batch_np, y_batch_np):
    """Evaluate the fitted SINDy model and return the derivative of its last state column.

    ``X_batch_np`` (2-D) and ``y_batch_np`` (one value per row) are joined column-wise
    with ``y`` last, then passed to the module-level ``model_sindy.predict``.
    """
    y_column = y_batch_np.reshape(-1, 1)
    state = np.hstack((X_batch_np, y_column))
    derivatives = model_sindy.predict(state)
    return derivatives[:, -1]

# ===== 3B) PINN FORECAST =====
class PINN(nn.Module):
    """Small ReLU MLP; here it maps the scaled time index to the scaled target."""

    def __init__(self, in_dim=1, hidden_dim=32, out_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.net(x)


class MAPELoss(nn.Module):
    """Mean absolute percentage error; ``eps`` is added to the denominator."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_pred - y_true) / (torch.abs(y_true) + self.eps))


# ---- Loop-invariant preparation (done once, not on every retry) ----
# Resolve the device before anything is moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Time index as feature
train_time = np.arange(len(train_overall)).reshape(-1, 1)
test_time = np.arange(len(train_overall), len(overall)).reshape(-1, 1)
y_train = train_overall["y"].values.reshape(-1, 1)

# ==== Standardize time index & target (fit on training data only) ====
# From here on scaler_X is the *time* scaler; X_train_scaled still holds the driver
# features that were scaled for SINDy above.
scaler_X = StandardScaler()
scaler_Y = StandardScaler()
train_time_scaled = scaler_X.fit_transform(train_time)
test_time_scaled = scaler_X.transform(test_time)
y_train_scaled = scaler_Y.fit_transform(y_train)

# One unit of scaled time spans `time_scale` rows, so
# dy/dt (per row step) = dy/dt_scaled / time_scale. SINDy was fitted with dt = 1 row.
time_scale = float(scaler_X.scale_[0])

# Torch tensors (scaled data)
X_t = torch.tensor(train_time_scaled, dtype=torch.float32).to(device)
X_test_t = torch.tensor(test_time_scaled, dtype=torch.float32).to(device)
Y_t = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_phys_t = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)

mse_loss = nn.MSELoss()
mape_loss_fn = MAPELoss()
lambda_sindy = 1.0  # weight for SINDy physics loss (tune this)

# Full-batch loader; every batch carries time, driver features and target together.
dataset = TensorDataset(X_t, X_phys_t, Y_t)
loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)

N_EPOCHS = 1000
LOG_EVERY = 250
MAX_ATTEMPTS = 20  # upper bound on retrainings so the loop always terminates

flag = True
itr = 0
while flag:
    # Fresh, randomly initialised network for every attempt.
    model = PINN(in_dim=1, hidden_dim=32, out_dim=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train PINN
    model.train()
    for epoch in range(N_EPOCHS):
        for tb, xb, yb in loader:
            optimizer.zero_grad()

            # The network is a function of time, so its time derivative is obtained
            # by differentiating the output w.r.t. the time input itself.
            tb = tb.clone().detach().requires_grad_(True)
            y_pred = model(tb)
            data_loss = mape_loss_fn(y_pred, yb)

            # Each output row depends only on its own input row, so the gradient of
            # the summed output holds the per-sample derivative dy/dt_scaled.
            dy_dt_scaled = torch.autograd.grad(y_pred.sum(), tb, create_graph=True)[0]
            dy_dt_nn = dy_dt_scaled / time_scale  # shape (batch, 1)

            # SINDy derivative evaluated at the current prediction; it is detached and
            # therefore acts as a fixed target for this step.
            y_pred_np = y_pred.detach().cpu().numpy().flatten()  # scaled y predicted
            X_np = xb.detach().cpu().numpy()  # scaled features
            dy_dt_sindy_np = sindy_predict_dy_dt(X_np, y_pred_np)  # shape (batch,)
            dy_dt_sindy = torch.tensor(dy_dt_sindy_np.reshape(-1, 1), dtype=torch.float32, device=device)
            loss_sindy = mse_loss(dy_dt_nn, dy_dt_sindy)

            loss = data_loss + lambda_sindy * loss_sindy

            loss.backward()
            optimizer.step()

        if epoch % LOG_EVERY == 0:
            print(f"Epoch {epoch}, MAPE Loss: {loss.item():.6f}")
            print(f"data: {data_loss.item():.3f}, phy:{loss_sindy.item():.3f}, total: {loss.item():.3f}")
            print()

    # Predictions
    model.eval()
    with torch.no_grad():
        y_train_pred_scaled = model(X_t).cpu().numpy()
        y_test_pred_scaled = model(X_test_t).cpu().numpy()

    # Inverse transform to original units
    y_train_pred = scaler_Y.inverse_transform(y_train_pred_scaled)
    y_test_pred = scaler_Y.inverse_transform(y_test_pred_scaled)

    train_overall["pinn_pred"] = y_train_pred.flatten()
    test_overall["pinn_pred"] = y_test_pred.flatten()

    if not test_overall.empty:
        mape_pinn = mean_absolute_percentage_error(test_overall["y"], test_overall["pinn_pred"])
    else:
        mape_pinn = np.nan

    # ===== 3C) PINN + PROPHET RESIDUAL STACK =====
    train_overall["residual"] = train_overall["y"] - train_overall["pinn_pred"]
    test_overall["residual"] = test_overall["y"] - test_overall["pinn_pred"]

    train_res = train_overall[["ds", "residual"]].rename(columns={"residual": "y"})
    test_res = test_overall[["ds", "residual"]].rename(columns={"residual": "y"})

    # NOTE(review): residuals are centred around zero; multiplicative seasonality may be
    # a poor fit for such a series - consider the additive mode.
    m_res = Prophet(seasonality_mode="multiplicative", yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False, changepoint_prior_scale=0.1)
    m_res.fit(train_res)

    # Forecast on the real timestamps (history followed by hold-out) rather than on
    # generated month starts, so row k of the tail is the forecast for test row k.
    # Prophet returns rows ordered by "ds"; the hold-out dates all follow the train dates.
    future_res = pd.concat([train_res[["ds"]], test_res[["ds"]]], ignore_index=True)
    forecast_res = m_res.predict(future_res)

    res_pred = forecast_res["yhat"].iloc[len(train_res):].values
    test_overall["final_pred"] = test_overall["pinn_pred"].values + res_pred

    if not test_overall.empty:
        mape_pinn_prophet = mean_absolute_percentage_error(test_overall["y"], test_overall["final_pred"])
    else:
        mape_pinn_prophet = np.nan

    # NOTE(review): retraining until the hold-out MAPE beats Prophet selects the run on
    # the test set, so the reported score is optimistic.
    itr += 1
    if mape_pinn_prophet < mape_prophet:
        print(mape_pinn_prophet, mape_prophet)
        flag = False
    else:
        print(f'itr: {itr}, mape_pinn_prophet: {mape_pinn_prophet}, mape_prophet: {mape_prophet}')
        if itr >= MAX_ATTEMPTS:
            # Also reached when either score is NaN, since NaN comparisons are False.
            print(f"Stopping after {itr} attempts without beating Prophet.")
            flag = False


import matplotlib.pyplot as plt

print("Iterations:", itr)

# Residuals (actual - PINN prediction) of the last attempt, train and test on one axis.
plt.figure(figsize=(14, 6))

residual_lines = (
    (train_res, "Train Residuals", "o", "-", "blue"),
    (test_res, "Test Residuals", "s", "--", "red"),
)
for res_frame, res_label, res_marker, res_style, res_color in residual_lines:
    plt.plot(res_frame["ds"], res_frame["y"],
             label=res_label, marker=res_marker, linestyle=res_style, color=res_color)

# Reference line (zero residuals)
plt.axhline(0, color="black", linestyle="--", linewidth=1)

plt.title("Residuals: Train vs Test (PINN Predictions)")
plt.xlabel("Date")
plt.ylabel("Residual (y - y_pred)")
plt.legend()
plt.grid(True)
plt.show()

# ===== 4) RESULTS =====
# The scores are printed exactly as computed above, with two decimals.
print("\n--- RESULTS ---")
print(f"Prophet-only MAPE: {mape_prophet:.2f}")
print(f"PINN-only MAPE: {mape_pinn:.2f}")
print(f"PINN + Prophet MAPE: {mape_pinn_prophet:.2f}")


In [None]:
# =========================================================
# Full script: PINN + SINDy differential physics loss
# =========================================================

# 0) Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from prophet import Prophet
import pysindy as ps

# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", device)
# Column names used to build the monthly modelling frame.
DATE_COL = "datetime"
TARGET_COL = "Scope1_per_unit"
# NOTE(review): PLANT_COL, FORECAST_MONTHS and TEST_MONTHS are not referenced again in
# this cell; the train/test split below uses hard-coded dates instead.
PLANT_COL = "Plant Name"
FORECAST_MONTHS = 7
TEST_MONTHS = 7

# The three driver columns fed to SINDy and the PINN.
col1='Scope_2'
col2='totalWaterConsumption'
col3='Electricity Grid Energy Per Unit (GJ)'
# -------------------------
# 1) Prepare India data
# -------------------------
df_india = df_2022.copy()

# One row per date: the target, col1 and col2 are summed; col3 is averaged.
df_monthly = (
    df_india.groupby(DATE_COL)
    .agg({TARGET_COL: 'sum', col1: 'sum', col2: 'sum', col3: 'mean'})
    .reset_index()
    .rename(columns={DATE_COL: 'ds', TARGET_COL: 'y'})
)

# Train/Test split
train = df_monthly[(df_monthly['ds'] >= '2021-01-01') & (df_monthly['ds'] < '2025-01-01')].copy()
test  = df_monthly[(df_monthly['ds'] >= '2025-01-01') & (df_monthly['ds'] <= '2026-05-01')].copy()

if train.empty or test.empty:
    # Fail here with a readable message instead of deep inside the scaler.
    raise ValueError(
        f"Empty split: {len(train)} train rows, {len(test)} test rows - check the date ranges."
    )

# Only the three numeric drivers are model inputs. The 'ds' timestamp must not be part
# of the feature matrix: it cannot be standardised and would make the train matrix one
# column wider than the test matrix and than the network input.
feature_cols = [col1, col2, col3]
X_train = train[feature_cols].values
y_train = train['y'].values
X_test  = test[feature_cols].values
y_test  = test['y'].values

# -------------------------
# 2) Scale features and target
# -------------------------
# Scalers are fitted on the train rows only and re-used on the test rows.
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled  = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1,1)).flatten()
y_test_scaled  = scaler_y.transform(y_test.reshape(-1,1)).flatten()

# -------------------------
# 3) Build SINDy model using precomputed derivatives
# -------------------------
# SINDy state: the scaled feature columns followed by the scaled target as last column.
X_sindy = np.column_stack((X_train_scaled, y_train_scaled)).astype(float)
N = len(X_sindy)

# Uniform time grid, one step per training row.
dt = 1.0
t = dt * np.arange(N)  # 1D numeric time vector

# Numerical derivative of every state column along the time axis
# (np.gradient also produces values at the two edge points).
x_dot = np.gradient(X_sindy, t, axis=0)

# Degree-2 polynomial library + sequentially thresholded least squares,
# fitted against the precomputed derivatives.
feature_library = ps.PolynomialLibrary(degree=2)
optimizer = ps.STLSQ(threshold=0.05)  # lower threshold so we don't prune everything
model_sindy = ps.SINDy(optimizer=optimizer, feature_library=feature_library)
model_sindy.fit(X_sindy, x_dot=x_dot, t=dt)
print("Discovered SINDy equations:")
model_sindy.print()

# dy/dt according to SINDy: takes the features and y, returns the derivative of y.
def sindy_predict_dy_dt(X_batch_np, y_batch_np):
    """Return the SINDy-predicted derivative of the last state column (``y``).

    ``X_batch_np`` has shape (m, n_features) and ``y_batch_np`` holds m values; they are
    joined column-wise with ``y`` last and passed to the module-level ``model_sindy``.
    The result has shape (m,).
    """
    y_column = y_batch_np.reshape(-1, 1)
    state = np.hstack((X_batch_np, y_column))
    derivatives = model_sindy.predict(state)  # one derivative per state column
    return derivatives[:, -1]

# -------------------------
# 4) PINN model and loss
# -------------------------
class PINN(nn.Module):
    """Tanh MLP with two hidden layers of equal width and a single output unit."""

    def __init__(self, input_dim=3, hidden=64):
        super().__init__()
        # Layer order (and therefore the state_dict keys net.0 / net.2 / net.4) is fixed.
        layers = [
            nn.Linear(input_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x``, whose last dimension must equal ``input_dim``."""
        out = self.net(x)
        return out

class MAPELoss(nn.Module):
    """Mean absolute percentage error as a torch loss.

    ``eps`` is added to ``|y_true|`` in the denominator so a zero target does not
    divide by zero.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        """Return the mean of ``|y_pred - y_true| / (|y_true| + eps)`` as a scalar tensor."""
        abs_error = (y_pred - y_true).abs()
        denominator = y_true.abs() + self.eps
        return (abs_error / denominator).mean()

mse_loss = nn.MSELoss()

# -------------------------
# 5) Data tensors and derivative of features (numeric)
# -------------------------
# Fix the torch RNG so the weight initialisation - and with it the whole training
# run - is repeatable from one execution of this cell to the next.
torch.manual_seed(42)

# Convert to torch tensors
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
y_tensor = torch.tensor(y_train_scaled, dtype=torch.float32, device=device).view(-1,1)

# The SINDy state is [features..., y], so the first n_features columns of x_dot are the
# numeric feature derivatives dX/dt. The width is taken from the data, not hard-coded.
n_features = X_tensor.shape[1]
dXdt = torch.tensor(x_dot[:, :n_features], dtype=torch.float32, device=device)  # shape (N, n_features)

# Full-batch DataLoader (no shuffle), so batch row i is training row i.
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=N, shuffle=False)

# instantiate model + optimizer
# (`optimizer` is re-bound here: it previously named the SINDy STLSQ optimizer.)
model = PINN(input_dim=n_features, hidden=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = MAPELoss()

# -------------------------
# 6) Training loop (full-batch, SINDy differential physics loss)
# -------------------------
lambda_sindy = 1.0  # weight for SINDy physics loss (tune this)
epochs = 3000

model.train()
for epoch in range(epochs):
    for xb, yb in loader:   # one batch containing the whole training set
        optimizer.zero_grad()

        # Inputs must require grad so the output can be differentiated w.r.t. them.
        xb = xb.clone().detach().requires_grad_(True)

        # NN prediction (scaled y)
        y_pred = model(xb)  # shape (N,1)

        # Data loss (MAPE) on scaled values
        loss_data = loss_fn(y_pred, yb)

        # --- time derivative predicted by the NN via the chain rule ---
        # dy/dt[i] = sum_j (dy_i/dx_ij) * (dx_ij/dt).
        # Every output row depends only on its own input row, so a single backward
        # pass on the summed output yields all per-sample input gradients at once
        # (instead of one autograd call per sample).
        grad_y_wrt_x = torch.autograd.grad(y_pred.sum(), xb, create_graph=True)[0]  # (N, n_features)
        # dXdt is indexed by training row, which matches the unshuffled full batch.
        dy_dt_nn = (grad_y_wrt_x * dXdt).sum(dim=1, keepdim=True)  # shape (N,1)

        # --- SINDy-predicted dy/dt (SINDy is fixed; inputs use the current NN output) ---
        # SINDy was fitted on scaled y, so the scaled prediction is passed as-is.
        y_pred_np = y_pred.detach().cpu().numpy().flatten()  # scaled y predicted
        X_np = xb.detach().cpu().numpy()  # scaled features

        dy_dt_sindy_np = sindy_predict_dy_dt(X_np, y_pred_np)  # shape (N,)
        dy_dt_sindy = torch.tensor(dy_dt_sindy_np.reshape(-1,1), dtype=torch.float32, device=device)

        # SINDy physics loss (MSE between NN time derivative and SINDy-predicted derivative)
        loss_sindy = mse_loss(dy_dt_nn, dy_dt_sindy)

        # Total loss and update
        loss = loss_data + lambda_sindy * loss_sindy
        loss.backward()
        optimizer.step()

    # print progress
    if epoch % 250 == 0:
        print(f"Epoch {epoch:4d}  loss={loss.item():.6e}  data={loss_data.item():.6e}  sindy={loss_sindy.item():.6e}")

# -------------------------
# 7) Predict on train/test and unscale
# -------------------------
def _to_original_units(scaled_values):
    """Undo the target scaling for a flat array of scaled predictions."""
    return scaler_y.inverse_transform(scaled_values.reshape(-1, 1)).flatten()

model.eval()
with torch.no_grad():
    train_inputs = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
    test_inputs = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)
    y_pinn_train_scaled = model(train_inputs).cpu().numpy().flatten()
    y_pinn_test_scaled = model(test_inputs).cpu().numpy().flatten()

y_pinn_train = _to_original_units(y_pinn_train_scaled)
y_pinn_test = _to_original_units(y_pinn_test_scaled)

# Attach the prediction and the residual (actual - PINN) to both splits.
for split_frame, split_pred in ((train, y_pinn_train), (test, y_pinn_test)):
    split_frame['pinn_pred'] = split_pred
    split_frame['residual'] = split_frame['y'] - split_frame['pinn_pred']

# -------------------------
# 8) Residual modeling with Prophet (same as before)
# -------------------------
train_res = train[['ds','residual']].rename(columns={'residual':'y'})
test_res  = test[['ds','residual']].rename(columns={'residual':'y'})

m_res = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
m_res.fit(train_res)

# Forecast on the real timestamps (history followed by the test dates) rather than on
# make_future_dataframe(freq='MS'). Generated month-start dates only coincide with the
# test rows when every 'ds' is a contiguous month start; otherwise the residual
# forecast would be added to the PINN prediction of a different date.
future_res = pd.concat([train_res[['ds']], test_res[['ds']]], ignore_index=True)
forecast_res = m_res.predict(future_res)

# Prophet returns rows ordered by 'ds' and all test dates follow the train dates,
# so the tail of the forecast lines up row-for-row with `test`.
res_pred_prophet = forecast_res['yhat'].iloc[len(train_res):].values
final_pred_prophet = test['pinn_pred'].values + res_pred_prophet

# -------------------------
# 9) Metrics
# -------------------------
def robust_mape(y_true, y_pred):
    """Mean absolute percentage error (in %) over the entries whose true value is non-zero.

    Both inputs are coerced to float arrays, so lists work and pandas Series are
    compared position by position regardless of their index.

    Returns ``np.nan`` when no true value is non-zero (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0
    if not mask.any():
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def smape(y_true, y_pred):
    """Symmetric MAPE in percent: ``100 * mean(2*|pred - true| / (|true| + |pred|))``.

    Inputs are coerced to float arrays. A pair where both the true and the predicted
    value are zero contributes 0 (a perfect prediction) instead of the NaN that 0/0
    would give. Returns ``np.nan`` for empty input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)
    if numerator.size == 0:
        return np.nan
    # Divide only where the denominator is non-zero; the remaining terms stay 0.
    ratio = np.divide(numerator, denominator, out=np.zeros_like(numerator), where=denominator != 0)
    return 100 * np.mean(ratio)

def compute_metrics(y_true, y_pred, name):
    """Build a one-row table with MAPE, MAE, RMSE and sMAPE for the model called ``name``.

    MAPE and sMAPE come from ``robust_mape`` and ``smape``; MAE and RMSE come from the
    sklearn metric functions.
    """
    row = {
        'Model': name,
        'MAPE (%)': robust_mape(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'sMAPE (%)': smape(y_true, y_pred),
    }
    return pd.DataFrame([row])

y_true_test = test['y'].values

# One metrics row per model, stacked in this order.
model_predictions = [
    ('SINDy-PINN', y_pinn_test),
    ('SINDy-PINN + Prophet', final_pred_prophet),
]
df_metrics = pd.concat(
    [compute_metrics(y_true_test, preds, label) for label, preds in model_predictions],
    ignore_index=True,
)

print(df_metrics)


# Stationarity

In [None]:
# Print the column dtypes and non-null counts of df_2022.
df_2022.info()

In [None]:
# Imports and plot style for the stationarity analysis.
# NOTE(review): the next cell starts with these exact imports and the same style call,
# so this cell is redundant.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plt.style.use("seaborn-v0_8")


In [None]:
# ============================================================
# 0) IMPORTS
# ============================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plt.style.use("seaborn-v0_8")


# ============================================================
# 1) LOAD & PREPARE DATA
# ============================================================
# Assume df is already loaded; work on a copy with a parsed datetime column.
df = df.assign(datetime=pd.to_datetime(df["datetime"]))

# Aggregate date-wise (important step): one summed target value per timestamp,
# ordered by time.
df_daily = (
    df.groupby("datetime", as_index=False)
      .agg(Scope1_per_unit=("Scope1_per_unit", "sum"))
      .sort_values("datetime")
)

# Target series indexed by timestamp.
ts = df_daily.set_index("datetime")["Scope1_per_unit"]


# ============================================================
# 2) RAW TIME SERIES PLOT
# ============================================================
raw_fig, raw_ax = plt.subplots(figsize=(12, 4))
raw_ax.plot(ts, label="Scope1_per_unit")
raw_ax.set_title("Raw Time Series: Scope1_per_unit")
raw_ax.set_xlabel("Datetime")
raw_ax.set_ylabel("Scope1_per_unit")
raw_ax.legend()
plt.show()


# ============================================================
# 3) ROLLING MEAN & STD
# ============================================================
# 12-observation window statistics; the first 11 values are NaN.
rolling_window = ts.rolling(window=12)
rolling_mean = rolling_window.mean()
rolling_std = rolling_window.std()

roll_fig, roll_ax = plt.subplots(figsize=(12, 4))
roll_ax.plot(ts, label="Original", alpha=0.6)
roll_ax.plot(rolling_mean, label="Rolling Mean (12)", linewidth=2)
roll_ax.plot(rolling_std, label="Rolling Std (12)", linewidth=2)
roll_ax.set_title("Rolling Mean & Standard Deviation")
roll_ax.legend()
plt.show()


# ============================================================
# 4) ADF TEST FUNCTION
# ============================================================
def adf_test(series, name=""):
    """Run ``adfuller`` on ``series`` (NaNs dropped) and print the statistic, p-value and critical values.

    ``name`` is only used in the printed header. Nothing is returned.
    """
    adf_stat, p_value, _used_lag, _n_obs, critical_values, *_rest = adfuller(series.dropna())
    print(f"\nADF Test: {name}")
    print(f"ADF Statistic : {adf_stat:.4f}")
    print(f"p-value       : {p_value:.4f}")
    print("Critical Values:")
    for level in critical_values:
        print(f"   {level}: {critical_values[level]:.4f}")


# ADF on original series
adf_test(ts, "Original Series")


# ============================================================
# 5) FIRST DIFFERENCING
# ============================================================
# Change from one observation to the next; the first value is NaN.
ts_diff = ts.diff(periods=1)

diff_fig, diff_ax = plt.subplots(figsize=(12, 4))
diff_ax.plot(ts_diff, label="First Differenced")
diff_ax.set_title("First Differenced Series")
diff_ax.legend()
plt.show()


# ============================================================
# 6) ROLLING STATS AFTER DIFFERENCING
# ============================================================
diff_window = ts_diff.rolling(window=12)
rolling_mean_diff = diff_window.mean()
rolling_std_diff = diff_window.std()

diff_roll_fig, diff_roll_ax = plt.subplots(figsize=(12, 4))
diff_roll_ax.plot(ts_diff, label="Differenced", alpha=0.6)
diff_roll_ax.plot(rolling_mean_diff, label="Rolling Mean", linewidth=2)
diff_roll_ax.plot(rolling_std_diff, label="Rolling Std", linewidth=2)
diff_roll_ax.set_title("Rolling Stats After Differencing")
diff_roll_ax.legend()
plt.show()


# ADF on differenced series
adf_test(ts_diff, "Differenced Series")


# ============================================================
# 7) ACF & PACF (STATIONARY SERIES)
# ============================================================
ts_diff_clean = ts_diff.dropna()

# statsmodels refuses PACF lags above 50% of the sample size, so a fixed lags=24
# fails on short series. Cap the lag count by the data that is actually available
# (still 24 whenever there are at least 50 observations).
max_lags = max(1, min(24, len(ts_diff_clean) // 2 - 1))

fig, ax = plt.subplots(1, 2, figsize=(12,4))

plot_acf(ts_diff_clean, lags=max_lags, ax=ax[0])
plot_pacf(ts_diff_clean, lags=max_lags, ax=ax[1], method="ywm")

ax[0].set_title("ACF (Differenced)")
ax[1].set_title("PACF (Differenced)")

plt.tight_layout()
plt.show()


# ============================================================
# 8) OPTIONAL: LOG + DIFFERENCING (IF VARIANCE UNSTABLE)
# ============================================================
# Zeros become NaN before the log so they do not turn into -inf.
ts_log = ts.replace(0, np.nan).pipe(np.log)
ts_log_diff = ts_log.diff(periods=1)

log_fig, log_ax = plt.subplots(figsize=(12, 4))
log_ax.plot(ts_log_diff)
log_ax.set_title("Log + Differenced Series")
plt.show()

adf_test(ts_log_diff, "Log Differenced Series")


In [None]:
# ============================================================
# SCATTER PLOTS: Scope1 vs Key Drivers
# ============================================================

import matplotlib.pyplot as plt
import pandas as pd

plt.style.use("seaborn-v0_8")

# Ensure datetime is parsed
df = df.copy()
df["datetime"] = pd.to_datetime(df["datetime"])

# Optional: aggregate date-wise (recommended for clarity)
df_agg = (
    df.groupby("datetime", as_index=False)
      .agg({
          "Scope1_per_unit": "sum",
          "Scope1": "sum",
          "Electricity Grid Energy Per Unit (GJ)": "sum",
          "Electricity Grid TCO2 Emission": "sum",
          "totalWaterConsumption": "sum"
      })
)


def plot_scope1_scatter(data, x_col, y_col, x_label, y_label, title):
    """Draw one scatter plot of ``data[y_col]`` against ``data[x_col]`` and show it.

    Parameters
    ----------
    data : DataFrame holding both columns.
    x_col, y_col : names of the columns plotted on the x and y axis.
    x_label, y_label, title : axis labels and figure title.
    """
    scatter_fig, scatter_ax = plt.subplots(figsize=(5, 4))
    scatter_ax.scatter(data[x_col], data[y_col], alpha=0.6)
    scatter_ax.set_xlabel(x_label)
    scatter_ax.set_ylabel(y_label)
    scatter_ax.set_title(title)
    plt.show()


# ============================================================
# 1) Scope1 vs Electricity Grid Energy Per Unit (GJ)
# ============================================================
plot_scope1_scatter(
    df_agg,
    x_col="Electricity Grid Energy Per Unit (GJ)",
    y_col="Scope1",
    x_label="Electricity Grid Energy Per Unit (GJ)",
    y_label="Scope1 Emissions",
    title="Scope1 vs Electricity Grid Energy",
)


# ============================================================
# 2) Scope1 vs Electricity Grid TCO2 Emission
# ============================================================
plot_scope1_scatter(
    df_agg,
    x_col="Electricity Grid TCO2 Emission",
    y_col="Scope1",
    x_label="Electricity Grid TCO2 Emission",
    y_label="Scope1 Emissions",
    title="Scope1 vs Electricity Grid TCO2 Emission",
)


# ============================================================
# 3) Scope1 per unit vs Water Consumption
# ============================================================
# This plot draws the per-unit column, so the y label and title say so.
# NOTE(review): the other two plots use total "Scope1" - confirm the per-unit column
# is the intended y axis here.
plot_scope1_scatter(
    df_agg,
    x_col="totalWaterConsumption",
    y_col="Scope1_per_unit",
    x_label="Total Water Consumption",
    y_label="Scope1 per Unit",
    title="Scope1 per Unit vs Water Consumption",
)
