In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import tensorflow as tf
import joblib

print("Loading data and models...")
# Cut-off used by every train/validation split in this script: rows whose
# timestamp is <= this date are treated as training data.
split_date_train_end = '2019-01-01'

# --- STEP 1: GENERATE PREDICTIONS FROM THE LSTM MODEL ---

# 1a. Load the raw dataset and the pre-fitted scaler used by the LSTM.
df_lstm = pd.read_csv(
    'combined_dataset.csv',
    index_col='datetime',
    parse_dates=True,
)
# Put the target `stage_m` in column 0 and keep the remaining columns in
# their file order; the inverse-scaling step later relies on column 0 being
# the target.
lstm_column_order = ['stage_m', *(name for name in df_lstm.columns if name != 'stage_m')]
df_lstm = df_lstm.loc[:, lstm_column_order]
# The scaler is loaded, not re-fitted.
# NOTE(review): presumably saved by the best LSTM training script with this
# same column order - confirm.
scaler_lstm = joblib.load('multivariate_scaler.joblib')
data_scaled_lstm = scaler_lstm.transform(df_lstm)

# 1b. Window sizes for the LSTM sequences
N_PAST = 7    # rows of history fed to the LSTM per window
N_FUTURE = 7  # number of forecast horizons (target_day_1 .. target_day_7)
def create_lstm_sequences(data, n_past):
    """Build every length-``n_past`` sliding window over the rows of ``data``.

    Window ``k`` covers rows ``k`` through ``k + n_past - 1``. The last
    window ends on the final row of ``data``, so an input is produced for
    every position from which a window can be formed, including the most
    recent one.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_features)``.
        n_past: Number of consecutive rows in each window; must be >= 1.

    Returns:
        ``numpy.ndarray`` of shape ``(n_rows - n_past + 1, n_past,
        n_features)``. When ``data`` has fewer than ``n_past`` rows the
        result is an empty array of shape ``(0, n_past, n_features)``.

    Raises:
        ValueError: If ``n_past`` is smaller than 1.
    """
    if n_past < 1:
        # A non-positive window length would silently yield zero-length
        # windows instead of failing.
        raise ValueError("n_past must be a positive integer")

    data = np.asarray(data)
    windows = [
        data[end - n_past:end, :]
        for end in range(n_past, len(data) + 1)
    ]
    if not windows:
        # Keep the rank and trailing dimensions even when no window fits, so
        # downstream shape checks see a well-formed (empty) batch.
        return np.empty((0, n_past) + data.shape[1:], dtype=data.dtype)
    return np.array(windows)
# One input window per position where a full N_PAST-row history exists.
X_lstm = create_lstm_sequences(data_scaled_lstm, N_PAST)

# 1c. Run the saved LSTM over every window
lstm_model = tf.keras.models.load_model('best_multivariate_model.keras')
# One prediction row per window, i.e. len(df_lstm) - N_PAST + 1 rows. The
# values are inverse-transformed with scaler_lstm later in the script.
lstm_raw_output = lstm_model.predict(X_lstm)
lstm_preds = np.squeeze(lstm_raw_output)


# --- STEP 2: GENERATE PREDICTIONS FROM THE XGBOOST MODEL ---

# 2a. Reload the dataset and rebuild the lagged feature table for XGBoost
print("Recreating identical intelligent feature set for XGBoost...")
df_xgb_base = pd.read_csv('combined_dataset.csv', index_col='datetime', parse_dates=True)
df_features = pd.DataFrame(index=df_xgb_base.index)

# (source column, lags in rows), listed in the exact order the feature
# columns must appear: the scaler and the model below consume the columns
# positionally, so this order must not change.
lag_spec = (
    ('stage_m', (1, 2, 3, 4)),
    ('prcp', (1, 2)),
    ('rhum', (1, 2)),
    ('tavg', (1, 3, 5)),
    ('wspd', (1,)),
    ('pres', (1,)),
)
for source_col, lags in lag_spec:
    for lag in lags:
        df_features[f'{source_col}_lag_{lag}'] = df_xgb_base[source_col].shift(lag)

# Precipitation summed over the three rows immediately before the current
# row (the extra shift(1) keeps the current row's own value out of the sum).
df_features['prcp_rolling_3day_sum'] = df_xgb_base['prcp'].rolling(window=3).sum().shift(1)

# 2b. Build the XGBoost design matrix and the multi-horizon targets
X_xgb = df_features.copy()
# target_day_k holds the stage observed k rows after each timestamp.
y_xgb_targets = pd.DataFrame({
    f'target_day_{horizon}': df_xgb_base['stage_m'].shift(-horizon)
    for horizon in range(1, N_FUTURE + 1)
})

# Lag features leave NaNs at the head and lead targets leave NaNs at the
# tail; keep only rows where every feature and every target is present.
df_full_xgb = pd.concat([X_xgb, y_xgb_targets], axis=1).dropna()
xgb_feature_cols = X_xgb.columns
X_xgb = df_full_xgb[xgb_feature_cols]
y_xgb = df_full_xgb[y_xgb_targets.columns]

# 2c. Chronological split; the scaler is fitted on the training rows only.
# NOTE(review): training rows close to the split date have targets up to
# N_FUTURE rows later, i.e. after split_date_train_end - confirm this
# overlap with the validation period is acceptable.
train_indices = X_xgb.index <= split_date_train_end
X_xgb_train = X_xgb.loc[train_indices]
X_xgb_full = X_xgb  # every complete row, used for prediction below
y_xgb_train = y_xgb.loc[train_indices]

scaler_xgb = StandardScaler().fit(X_xgb_train)
X_xgb_train_scaled = scaler_xgb.transform(X_xgb_train)
X_xgb_full_scaled = scaler_xgb.transform(X_xgb_full)

# 2d. Fit one XGBoost regressor per horizon, then predict for ALL rows
# (training and validation alike) so the meta-dataset can be assembled.
xgb_base_params = dict(
    objective='reg:squarederror',
    n_estimators=109,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=4,
    subsample=1.0,
    colsample_bytree=0.9,
    n_jobs=-1,
    random_state=42,
)
xgb_model = MultiOutputRegressor(estimator=xgb.XGBRegressor(**xgb_base_params))
xgb_model.fit(X_xgb_train_scaled, y_xgb_train)
xgb_preds = xgb_model.predict(X_xgb_full_scaled)


# --- STEP 3: ASSEMBLE THE META-DATASET AND TRAIN THE HYBRID MODEL ---

# 3a. Align ground truth and both models' predictions on the datetime index
# Ground truth: target_day_k is the stage observed k rows after each row.
y_true_df = pd.DataFrame({
    f'target_day_{horizon}': df_lstm['stage_m'].shift(-horizon)
    for horizon in range(1, N_FUTURE + 1)
})

# Inverse-scale the LSTM predictions. scaler_lstm works on as many columns
# as data_scaled_lstm has, so the predictions go into column 0 (the
# `stage_m` position) of a zero-padded array and only that column is kept.
# NOTE(review): assumes the scaler transforms each column independently.
n_features_lstm = data_scaled_lstm.shape[1]
flat_lstm_preds = lstm_preds.flatten()
dummy_preds_lstm = np.zeros((flat_lstm_preds.size, n_features_lstm))
dummy_preds_lstm[:, 0] = flat_lstm_preds
unscaled_target_column = scaler_lstm.inverse_transform(dummy_preds_lstm)[:, 0]
lstm_preds_unscaled = unscaled_target_column.reshape(lstm_preds.shape)

# Label each LSTM prediction with the timestamp of the LAST row of its input
# window: the first window ends at position N_PAST - 1, and there is one
# prediction per window from there to the end of the data.
lstm_index_start = N_PAST - 1
lstm_index_stop = lstm_index_start + len(lstm_preds_unscaled)
correct_lstm_index = df_lstm.index[lstm_index_start:lstm_index_stop]

horizon_days = range(1, N_FUTURE + 1)
lstm_preds_df = pd.DataFrame(
    lstm_preds_unscaled,
    index=correct_lstm_index,
    columns=[f'lstm_pred_day_{d}' for d in horizon_days],
)
# XGBoost predictions were produced row-for-row from X_xgb, so they share
# its index.
xgb_preds_df = pd.DataFrame(
    xgb_preds,
    index=X_xgb.index,
    columns=[f'xgb_pred_day_{d}' for d in horizon_days],
)

# 3b. Join everything and keep only timestamps where the truth and both
# models' predictions are all available.
meta_df = y_true_df.join(lstm_preds_df).join(xgb_preds_df).dropna()

# 3c. Fit and score one linear blender per forecast horizon.
# NOTE(review): the XGBoost predictions on the training rows come from a
# model fitted on those same rows (in-sample), so the blender may over-trust
# XGBoost; out-of-fold predictions would avoid this - confirm intent.
print("\n--- HYBRID Model Performance ---")

# The chronological split depends only on meta_df's index, which is the same
# for every horizon, so the masks are computed once.
train_indices_meta = meta_df.index <= split_date_train_end
val_indices_meta = ~train_indices_meta

for day in range(1, N_FUTURE + 1):
    # Inputs: both base models' forecasts for this horizon; target: the truth.
    meta_feature_cols = [f'lstm_pred_day_{day}', f'xgb_pred_day_{day}']
    X_meta = meta_df.loc[:, meta_feature_cols]
    y_meta = meta_df.loc[:, f'target_day_{day}']

    X_meta_train = X_meta.loc[train_indices_meta]
    X_meta_val = X_meta.loc[val_indices_meta]
    y_meta_train = y_meta.loc[train_indices_meta]
    y_meta_val = y_meta.loc[val_indices_meta]

    # Linear blend of the two forecasts, fitted on the training period only.
    meta_model = LinearRegression().fit(X_meta_train, y_meta_train)

    # Score on the held-out period.
    hybrid_predictions = meta_model.predict(X_meta_val)
    r2 = r2_score(y_meta_val, hybrid_predictions)

    # Coefficients follow the column order of meta_feature_cols.
    lstm_weight, xgb_weight = meta_model.coef_
    print(f"Day {day} Ahead -> R²: {r2:.4f} (LSTM Weight: {lstm_weight:.2f}, XGB Weight: {xgb_weight:.2f})")

Loading data and models...
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step
Recreating identical intelligent feature set for XGBoost...

--- HYBRID Model Performance ---
Day 1 Ahead -> R²: 0.8581 (LSTM Weight: 0.86, XGB Weight: 0.29)
Day 2 Ahead -> R²: 0.5522 (LSTM Weight: 0.35, XGB Weight: 0.79)
Day 3 Ahead -> R²: 0.3311 (LSTM Weight: -0.19, XGB Weight: 1.29)
Day 4 Ahead -> R²: 0.2163 (LSTM Weight: -0.63, XGB Weight: 1.65)
Day 5 Ahead -> R²: 0.1652 (LSTM Weight: -0.85, XGB Weight: 1.82)
Day 6 Ahead -> R²: 0.1628 (LSTM Weight: -0.98, XGB Weight: 1.92)
Day 7 Ahead -> R²: 0.1544 (LSTM Weight: -1.08, XGB Weight: 2.02)
