In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import matplotlib.pyplot as plt
import joblib

# 1. Load and Prepare Data
df = pd.read_csv('combined_dataset.csv', index_col='datetime', parse_dates=True)

# 2. Intelligent Feature Engineering
print("Building intelligent feature set based on PACF/CCF analysis...")
df_features = pd.DataFrame(index=df.index)

# Based on PACF analysis of stage_m
df_features['stage_m_lag_1'] = df['stage_m'].shift(1)
df_features['stage_m_lag_2'] = df['stage_m'].shift(2)
df_features['stage_m_lag_3'] = df['stage_m'].shift(3)
df_features['stage_m_lag_4'] = df['stage_m'].shift(4)

# Based on CCF analysis for each weather variable
df_features['prcp_lag_1'] = df['prcp'].shift(1)
df_features['prcp_lag_2'] = df['prcp'].shift(2)
df_features['rhum_lag_1'] = df['rhum'].shift(1)
df_features['rhum_lag_2'] = df['rhum'].shift(2)
df_features['tavg_lag_1'] = df['tavg'].shift(1)
df_features['tavg_lag_3'] = df['tavg'].shift(3)
df_features['tavg_lag_5'] = df['tavg'].shift(5)
df_features['wspd_lag_1'] = df['wspd'].shift(1)
df_features['pres_lag_1'] = df['pres'].shift(1)
df_features['prcp_rolling_3day_sum'] = df['prcp'].rolling(window=3).sum().shift(1)

# --- Create Future Target Features (y) ---
N_FUTURE = 7
df_targets = pd.DataFrame()
for i in range(1, N_FUTURE + 1):
    df_targets[f'target_day_{i}'] = df['stage_m'].shift(-i)

# --- Combine and clean data ---
df_full = pd.concat([df_features, df_targets], axis=1)
df_full.dropna(inplace=True)

feature_cols = [col for col in df_full.columns if 'lag' in col or 'rolling' in col]
target_cols = [col for col in df_full.columns if 'target' in col]
X = df_full[feature_cols]
y = df_full[target_cols]

# --- Split data ---
split_date_train_end = '2019-01-01'
train_indices = X.index <= split_date_train_end
val_indices = ~train_indices
X_train, X_val = X[train_indices], X[val_indices]
y_train, y_val = y[train_indices], y[val_indices]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 5. Build and Train XGBoost Model
print("\nTraining XGBoost model...")
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=109,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=4,
    subsample=1.0,
    colsample_bytree=0.9,
    n_jobs=-1,
    random_state=42
)

# Wrap it in MultiOutputRegressor
multioutput_model = MultiOutputRegressor(estimator=xgb_regressor)

# Fit the model
multioutput_model.fit(X_train_scaled, y_train)

# 6. Evaluation
print("\n--- XGBoost Model Performance ---")
predictions = multioutput_model.predict(X_val_scaled)
for i in range(N_FUTURE):
    day = i + 1
    y_true_day = y_val.iloc[:, i]
    y_pred_day = predictions[:, i]
    r2 = r2_score(y_true_day, y_pred_day)
    print(f"Day {day} Ahead -> R²: {r2:.4f}")

# 7. Save the trained model and scaler
print("\nSaving model and scaler...")
joblib.dump(multioutput_model, 'best_xgboost_model.joblib')
joblib.dump(scaler, 'xgboost_scaler.joblib')
print("✅ Model and scaler saved successfully!")

Building intelligent feature set based on PACF/CCF analysis...

Training XGBoost model...

--- XGBoost Model Performance ---
Day 1 Ahead -> R²: 0.6533
Day 2 Ahead -> R²: 0.4727
Day 3 Ahead -> R²: 0.3787
Day 4 Ahead -> R²: 0.3207
Day 5 Ahead -> R²: 0.2843
Day 6 Ahead -> R²: 0.2713
Day 7 Ahead -> R²: 0.2625

Saving model and scaler...
✅ Model and scaler saved successfully!
