In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ------------------------------------------------------------------
# 1. Load data and pick the target column
df = pd.read_csv("combined_dataset.csv", parse_dates=["datetime"])
df.set_index("datetime", inplace=True)
stage = df["stage_m"]

# ------------------------------------------------------------------
# 2. Define the split (same as your LSTM notebooks)
train_end = pd.Timestamp("2018-04-21")
val_start = pd.Timestamp("2019-01-01")

val_series = stage.loc[val_start:]          # Validation period only

# ------------------------------------------------------------------
# 3. Compute persistence errors for horizons 1‒7 days
results = []
for h in range(1, 8):                       # 1-day … 7-day ahead
    y_true = val_series.shift(-h).dropna()  # "future" values
    
    # Get the corresponding past values (h days before)
    past_indices = y_true.index - pd.Timedelta(days=h)
    
    # Keep only indices that exist in val_series
    valid_mask = past_indices.isin(val_series.index)
    valid_indices = y_true.index[valid_mask]
    
    # Get the aligned true and predicted values
    y_true_aligned = y_true.loc[valid_indices]
    y_pred_aligned = val_series.loc[past_indices[valid_mask]]
    
    if len(y_true_aligned) > 0:
        mae  = mean_absolute_error(y_true_aligned, y_pred_aligned)
        rmse = np.sqrt(mean_squared_error(y_true_aligned, y_pred_aligned))
        r2   = r2_score(y_true_aligned, y_pred_aligned)
        results.append({"Horizon(day)": h, "MAE(m)": mae, "RMSE(m)": rmse, "R²": r2})

baseline_df = pd.DataFrame(results)
print(baseline_df.to_string(index=False, formatters={
    "MAE(m)" : "{:.4f}".format,
    "RMSE(m)": "{:.4f}".format,
    "R²"     : "{:.4f}".format}))


 Horizon(day) MAE(m) RMSE(m)      R²
            1 0.2993  0.5640  0.4527
            2 0.4044  0.7136  0.1198
            3 0.4547  0.7766 -0.0508
            4 0.4863  0.8139 -0.1583
            5 0.5094  0.8384 -0.2309
            6 0.5131  0.8352 -0.2219
            7 0.5192  0.8269 -0.1999


In [3]:
# Let's check the date range of our data
print(f"Full dataset range: {df.index.min()} to {df.index.max()}")
print(f"Validation series range: {val_series.index.min()} to {val_series.index.max()}")
print(f"Number of validation samples: {len(val_series)}")


Full dataset range: 1995-10-01 00:00:00 to 2025-06-18 00:00:00
Validation series range: 2019-01-01 00:00:00 to 2025-06-18 00:00:00
Number of validation samples: 2361
