# Streamway Depth Forecasting Model

This notebook develops a forecasting model for the OFD 1 streamway depth.
It uses lagged rainfall (2-7 hours ago) and forecast precipitation to predict future depth.
We compare two approaches:
1. Predicting absolute depth.
2. Predicting change in depth.

In [9]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import requests
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import os


In [10]:
# 1. Load Streamway Data
if os.path.exists('streamway_data.csv'):
    df_streamway = pd.read_csv('streamway_data.csv', parse_dates=['created_at'], index_col='created_at')
    df_streamway = df_streamway.sort_index()
    df_streamway = df_streamway[~df_streamway.index.duplicated(keep='first')]
    # Resample to 10min
    df_streamway = df_streamway.resample('10min').mean().interpolate(method='time')
    print(f"Loaded {len(df_streamway)} rows of streamway data.")
else:
    print("Error: streamway_data.csv not found.")

Loaded 79298 rows of streamway data.


In [11]:
# 2. Load/Fetch Precipitation Data (Open-Meteo)
# We fetch hourly data. For historical, we use the archive/historical forecast.

latitude = 51.8258112
longitude = -3.6611301
start_date = df_streamway.index.min().strftime('%Y-%m-%d')
end_date = df_streamway.index.max().strftime('%Y-%m-%d')

forecast_url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
forecast_params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": start_date,
    "end_date": end_date,
    "hourly": "precipitation",
    "timezone": "auto"
}

print("Fetching precipitation data...")
response = requests.get(forecast_url, params=forecast_params)
if response.status_code == 200:
    data = response.json()
    hourly_data = data['hourly']
    df_precip = pd.DataFrame({
        'time': pd.to_datetime(hourly_data['time']),
        'precip': hourly_data['precipitation']
    })
    df_precip.set_index('time', inplace=True)
    print(f"Loaded {len(df_precip)} rows of precipitation data.")
else:
    print("Error fetching precipitation data.")
    df_precip = pd.DataFrame()

Fetching precipitation data...
Loaded 13224 rows of precipitation data.


In [12]:
# plot the residules of the historical precipitation against the predicted precipitation
# this will help us understand the accuracy of the forecast data we are using




In [13]:
# 3. Feature Engineering

# Align data
# Upsample precipitation to 10min (ffill to simulate availability)
df_precip_10min = df_precip.resample('10min').ffill()

df = df_streamway.join(df_precip_10min, how='inner')

# Create Lags (2-7 hours ago)
# 1 hour = 6 steps
for h in range(2, 8):
    df[f'precip_lag_{h}h'] = df['precip'].shift(h * 6)

# Create Forecast Features (Future Rain relative to T)
# We assume 'precip' at T, T+1h... is available as forecast.
# For prediction horizon of 4 hours, we might use rain at T, T+1, T+2, T+3.
prediction_horizon_hours = 4
prediction_steps = prediction_horizon_hours * 6

# Future rain features (from T to T+horizon)
# Note: shift(-k) brings future data to current row. 
# In production, this comes from the forecast API.
for h in range(0, prediction_horizon_hours):
    # We use rolling window or specific points? Let's use hourly points.
    # T+0h, T+1h, ...
    df[f'precip_forecast_{h}h'] = df['precip'].shift(-h * 6)

# Create Targets
# 1. Absolute Depth at T + 4h
df['target_depth'] = df['streamway_depth_mm'].shift(-prediction_steps)

# 2. Change in Depth (T+4h - T)
df['target_change'] = df['target_depth'] - df['streamway_depth_mm']

df_model = df.dropna()
print(f"Model data shape: {df_model.shape}")
df_model.head()

Model data shape: (79232, 15)


Unnamed: 0,entry_id,streamway_depth_mm,precip,precip_lag_2h,precip_lag_3h,precip_lag_4h,precip_lag_5h,precip_lag_6h,precip_lag_7h,precip_forecast_0h,precip_forecast_1h,precip_forecast_2h,precip_forecast_3h,target_depth,target_change
2024-05-17 08:50:00,1080.0,610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,610.0,0.0
2024-05-17 09:00:00,1081.0,610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,605.0,-5.0
2024-05-17 09:10:00,1082.0,610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,605.0,-5.0
2024-05-17 09:20:00,1083.0,610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,605.0,-5.0
2024-05-17 09:30:00,1084.0,610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,605.0,-5.0


In [14]:
# 4. Validation and Hyperparameter Tuning

features = [f'precip_lag_{h}h' for h in range(2, 8)] + \
           [f'precip_forecast_{h}h' for h in range(0, prediction_horizon_hours)]

print("Features:", features)

# We focus on tuning the 'Change in Depth' model as it's likely more robust
X = df_model[features]
y = df_model['target_change']

# Time Series Cross-Validation
# This respects the temporal order of observations.
tscv = TimeSeriesSplit(n_splits=5)

# Grid Search for Hyperparameters
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200]
}

model = xgb.XGBRegressor(random_state=42)

print("Running GridSearchCV...")
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE (Validation): {-grid_search.best_score_:.2f}")

best_model_change = grid_search.best_estimator_

Features: ['precip_lag_2h', 'precip_lag_3h', 'precip_lag_4h', 'precip_lag_5h', 'precip_lag_6h', 'precip_lag_7h', 'precip_forecast_0h', 'precip_forecast_1h', 'precip_forecast_2h', 'precip_forecast_3h']
Running GridSearchCV...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best RMSE (Validation): 104.45


In [15]:
# 5. Final Model Training and Testing

# Split Data (80% Train, 20% Test)
split_idx = int(len(df_model) * 0.8)
train = df_model.iloc[:split_idx]
test = df_model.iloc[split_idx:]

X_train = train[features]
X_test = test[features]

# Model A: Absolute Depth (Using default params for comparison)
y_train_depth = train['target_depth']
y_test_depth = test['target_depth']

model_depth = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5)
model_depth.fit(X_train, y_train_depth)
pred_depth = model_depth.predict(X_test)

# Model B: Change in Depth (Using Tuned Model)
y_train_change = train['target_change']
y_test_change = test['target_change']

# Retrain best model on full training set
best_model_change.fit(X_train, y_train_change)
pred_change = best_model_change.predict(X_test)

# Reconstruct Depth from Change
# Predicted Depth = Current Depth + Predicted Change
current_depth_test = test['streamway_depth_mm']
pred_depth_reconstructed = current_depth_test + pred_change

# Evaluation
rmse_a = np.sqrt(mean_squared_error(y_test_depth, pred_depth))
rmse_b = np.sqrt(mean_squared_error(y_test_depth, pred_depth_reconstructed))
r2_a = r2_score(y_test_depth, pred_depth)
r2_b = r2_score(y_test_depth, pred_depth_reconstructed)

print(f"Model A (Absolute) RMSE: {rmse_a:.2f}")
print(f"Model A (Absolute) R2: {r2_a:.2f}")
print(f"Model B (Change - Tuned) RMSE: {rmse_b:.2f}")
print(f"Model B (Change - Tuned) R2: {r2_b:.2f}")

Model A (Absolute) RMSE: 185.46
Model A (Absolute) R2: 0.53
Model B (Change - Tuned) RMSE: 97.49
Model B (Change - Tuned) R2: 0.87


In [16]:
# 6. Visualization with Plotly

# Select a window to plot
window_start = test.index[0]
window_end = window_start + pd.Timedelta(days=2)
plot_data = test.loc[window_start:window_end]
plot_preds_a = pred_depth[:len(plot_data)]
plot_preds_b = pred_depth_reconstructed[:len(plot_data)]

# Create Figure
fig = go.Figure()

# Actual Depth (Target)
fig.add_trace(go.Scatter(
    x=plot_data.index + pd.Timedelta(hours=4),
    y=plot_data['target_depth'],
    mode='lines',
    name='Actual Depth (T+4h)',
    line=dict(color='black', width=2)
))

# Model A Predictions
fig.add_trace(go.Scatter(
    x=plot_data.index + pd.Timedelta(hours=4),
    y=plot_preds_a,
    mode='lines',
    name='Predicted Absolute (Model A)',
    line=dict(color='blue', width=2, dash='dash')
))

# Model B Predictions
fig.add_trace(go.Scatter(
    x=plot_data.index + pd.Timedelta(hours=4),
    y=plot_preds_b,
    mode='lines',
    name='Predicted Change -> Depth (Model B)',
    line=dict(color='red', width=2, dash='dot')
))

# Current Depth (Input)
fig.add_trace(go.Scatter(
    x=plot_data.index,
    y=plot_data['streamway_depth_mm'],
    mode='lines',
    name='Current Depth (T)',
    line=dict(color='gray', width=1)
))

fig.update_layout(
    title='Streamway Depth Forecast (4-Hour Horizon)',
    xaxis_title='Time',
    yaxis_title='Depth (mm)',
    template='plotly_white',
    hovermode='x unified'
)

fig.show()

In [17]:
# Visualization of Data Availability for a Single Prediction
# To show "data used when asking for a prediction is only in the range of data I can expect to have"

sample_time = test.index[100]
prediction_time = sample_time + pd.Timedelta(hours=4)

print(f"Prediction made at: {sample_time}")
print(f"Forecasting for: {prediction_time}")

# Get data available at sample_time
# Past Rain (2-7h lag)
past_rain_times = [sample_time - pd.Timedelta(hours=h) for h in range(2, 8)]
past_rain_vals = [test.loc[sample_time, f'precip_lag_{h}h'] for h in range(2, 8)]

# Forecast Rain (0-3h future)
forecast_rain_times = [sample_time + pd.Timedelta(hours=h) for h in range(0, 4)]
forecast_rain_vals = [test.loc[sample_time, f'precip_forecast_{h}h'] for h in range(0, 4)]

fig = go.Figure()

# Plot Past Rain
fig.add_trace(go.Bar(
    x=past_rain_times,
    y=past_rain_vals,
    name='Past Rain (Input)',
    marker_color='blue',
    opacity=0.6
))

# Plot Forecast Rain
fig.add_trace(go.Bar(
    x=forecast_rain_times,
    y=forecast_rain_vals,
    name='Forecast Rain (Input)',
    marker_color='cyan',
    opacity=0.6,
    marker_pattern_shape='/'
))

# Plot Prediction Point
pred_val = pred_depth_reconstructed.iloc[100]
actual_val = test.loc[sample_time, 'target_depth']

fig.add_trace(go.Scatter(
    x=[prediction_time],
    y=[pred_val],
    mode='markers',
    name='Predicted Depth',
    marker=dict(color='red', size=12, symbol='circle')
))

fig.add_trace(go.Scatter(
    x=[prediction_time],
    y=[actual_val],
    mode='markers',
    name='Actual Depth',
    marker=dict(color='black', size=12, symbol='x')
))

# Plot Current Depth
fig.add_trace(go.Scatter(
    x=[sample_time],
    y=[test.loc[sample_time, 'streamway_depth_mm']],
    mode='markers',
    name='Current Depth (Known)',
    marker=dict(color='green', size=12, symbol='circle')
))

# Add vertical line for Prediction Time
fig.add_vline(x=sample_time, line_width=2, line_dash="dash", line_color="gray")
fig.add_annotation(x=sample_time, y=0, text="Prediction Time (Now)", showarrow=True, arrowhead=1, yref="paper")

fig.update_layout(
    title=f'Data Availability for Prediction at {sample_time}',
    xaxis_title='Time',
    yaxis_title='Value',
    template='plotly_white'
)

fig.show()

Prediction made at: 2025-08-01 05:40:00
Forecasting for: 2025-08-01 09:40:00
