# Multi Day Model

Combining rainfall data and stream flow makes the model better at predicting extreme changes in water level. Now we will input multiple days' worth of data into the neural network to predict the next day.

In [None]:
# imports, setup and helper functions
import numpy as np
import pandas as pd
import altair as alt
import requests
import json
from datetime import datetime
import pprint
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Load the three raw datasets; the first CSV column becomes each frame's index.
water_level_df, rainfall_df, stream_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "datasets/warragamba_dam_level_raw_2008_2022.csv",
        "datasets/rainfall_within_0.2_lat_degrees_of_warragamba_dam_2008_2022.csv",
        "datasets/cox_and_wollon_stream_level_2008-2022.csv",
    )
)

In [None]:
# Clean water_level_df: readings whose quality code is 201 or 255 are set to
# NaN so they are treated as missing rather than as real measurements.

print(water_level_df["q"].unique())
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
water_level_df.loc[water_level_df["q"].isin([201, 255]), "v"] = np.nan

water_level_df

In [None]:
# Give the generic "v"/"q" columns unique names ahead of the merge into the
# main frame; water_level_col is kept for later look-ups of the level column.
water_level_col = "v_212242_130"
column_renames = {"v": water_level_col, "q": "q_212242_130"}
water_level_df = water_level_df.rename(columns=column_renames)

In [None]:
# Build the main frame: inner-join the water level, rainfall and stream frames
# on the shared "t" key, keeping only the keys present in all three.
df = (
    water_level_df
    .merge(rainfall_df, how="inner", on="t")
    .merge(stream_df, how="inner", on="t")
)

df

In [None]:
# Change in water level per row: the current reading minus the previous one.
water_level = df[water_level_col]
# shift(1) moves every reading down one row, i.e. it yields the *previous*
# reading. No fill value is used: the first row has no predecessor, so its
# difference stays NaN instead of becoming a spurious jump of the full water
# level (level - 0.0). Rows with a NaN difference are filtered out further down.
water_level_yesterday = water_level.shift(1)

df["water_level_difference"] = water_level - water_level_yesterday
df

In [None]:
# Add look-ahead copies of the water level difference: the value one row ahead
# and the value two rows ahead. Rows with no such future value get 0.0.
difference = df["water_level_difference"]
df["water_level_difference_plus_1"] = difference.shift(-1, fill_value=0.0)
df["water_level_difference_plus_2"] = difference.shift(-2, fill_value=0.0)

difference_cols = [
    "water_level_difference",
    "water_level_difference_plus_1",
    "water_level_difference_plus_2",
]
df[difference_cols]

In [None]:
# Goal of the next few cells: wherever a q_ (quality) column holds 201 or 255,
# set the corresponding v_ column to 0.0.
# Step 1: collect the names of all value ("v_"-prefixed) columns.

value_frame = df.filter(regex="^v_")
value_cols = value_frame.columns.to_list()
print(value_cols)
df.columns

In [None]:
# Names of all quality-code ("q_"-prefixed) columns; iterating a DataFrame
# yields its column labels.
quality_cols = list(df.filter(regex="^q_"))
print(quality_cols)

In [None]:
# Zero out readings whose quality code is 201 or 255.
# Each value column is paired with its quality column by name
# ("v_<id>" -> "q_<id>") rather than by list position, so a differing column
# order, or a value column that has no quality column, cannot cause the wrong
# column to be masked.
known_quality_cols = set(quality_cols)
for value_col in value_cols:
    quality_col = "q_" + value_col[len("v_"):]
    if quality_col not in known_quality_cols:
        # No quality information for this column; leave it untouched.
        continue
    df.loc[df[quality_col].isin([201, 255]), value_col] = 0.0

df

In [None]:
# The quality codes are no longer needed: keep every column whose name does
# not match the "q_" prefix pattern.
is_quality_col = df.columns.str.contains('^q_')
df = df.loc[:, ~is_quality_col]

df

In [None]:
# Add lagged copies of every value column so each row carries the current
# reading plus the two preceding ones:
#   <col>_dm1 -> value from one row earlier
#   <col>_dm2 -> value from two rows earlier
# Rows without enough history are padded with 0.0.
current_values = df[value_cols]

yesterday_values = (
    current_values
    .shift(1, fill_value=0.0)
    .rename(columns=lambda c: f"{c}_dm1")
)
day_minus_2_values = (
    current_values
    .shift(2, fill_value=0.0)
    .rename(columns=lambda c: f"{c}_dm2")
)

df = pd.concat([df, yesterday_values, day_minus_2_values], axis=1)
df

In [None]:
# Keep only the rows that have a known label, i.e. a non-null
# 'water_level_difference'.
has_label = df["water_level_difference"].notnull()
df = df.loc[has_label]
df

In [None]:
# Random 80/20 train/test split (fixed seed so the split is reproducible),
# then pop the label column off each part so features and labels are separate.
label_col = "water_level_difference"

train_data = df.sample(frac=0.8, random_state=12345)
test_data = df.drop(index=train_data.index)

train_features, test_features = train_data.copy(), test_data.copy()

train_labels = train_features.pop(label_col)
test_labels = test_features.pop(label_col)

In [None]:
# Model inputs: every "v_" column whose name contains "_10" or "_100" after
# the prefix. The trailing ".*" means the lagged "_dm1"/"_dm2" copies of those
# columns are selected as well.
# NOTE(review): presumably 10/100 are the rainfall and stream variable codes - confirm.
feature_pattern = "^v_.*_(100|10).*$"
feature_columns = train_features.filter(regex=feature_pattern).columns

In [None]:
# Create a normaliser which will normalise input via (input - mean) / sqrt(var),
# with the statistics learned from the training features only.
# The training features are converted to a NumPy array like the test features:
# Normalization.adapt() is documented for NumPy arrays / tf.data Datasets, and
# this keeps every np_* variable an actual NumPy array.
np_train_features = train_features[feature_columns].values
# Labels are reshaped to a single column, shape (n_rows, 1).
np_train_labels = train_labels.values.reshape(-1, 1)

np_test_features = test_features[feature_columns].values
np_test_labels = test_labels.values

normaliser = tf.keras.layers.Normalization(axis=1)
normaliser.adapt(np_train_features)

# Show the labelled DataFrame view of the training features for inspection.
display(train_features[feature_columns])


In [None]:
# Small feed-forward regressor: normalised inputs -> dropout -> 32-unit ReLU
# layer -> dropout -> single linear output (the predicted water level difference).
# The input width follows the selected feature columns instead of a hard-coded
# count, so changing the feature selection or the number of lag days does not
# break the model definition.
n_features = len(feature_columns)

test_model = tf.keras.Sequential(name="stream_rain_temporal_model", layers=[
    layers.Input(shape=(n_features,)),
    normaliser,
    layers.Dropout(rate=0.5),
    layers.Dense(units=32, activation="relu"),
    layers.Dropout(rate=0.5),
    layers.Dense(units=1),
])

test_model.summary()

In [None]:
# Smoke test: run the untrained model on a one-row batch to confirm it accepts
# input shaped like the features.
single_row_batch = np_train_features[:1]
test_model.predict(single_row_batch)

In [None]:
# Train with Adam (learning rate 0.001), minimising the mean absolute error.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
test_model.compile(optimizer=adam, loss='mean_absolute_error')

In [None]:
# Train for 50 epochs; `history` records the per-epoch loss and validation loss.
history = test_model.fit(
    np_train_features,
    np_train_labels,
    epochs=50,
    # Log progress for every epoch (set verbose=0 to suppress logging).
    verbose=1,
    # Calculate validation results on 20% of the training data.
    validation_split=0.2)

In [None]:
# Plot training loss (default colour) and validation loss (orange) per epoch.
hist_df = pd.DataFrame(history.history)
hist_df["epoch"] = history.epoch

loss_base = alt.Chart(hist_df).encode(x="epoch:Q")
training_curve = loss_base.mark_line().encode(y="loss:Q")
validation_curve = loss_base.mark_line(color="orange").encode(y="val_loss:Q")

training_curve + validation_curve

In [None]:
# Loss (mean absolute error) on the held-out test split.
test_model.evaluate(x=np_test_features, y=np_test_labels)

In [None]:
# Compare predictions to the real values over the whole of df, i.e. the
# training and the test rows together (df holds both).

y = test_model.predict(df[feature_columns].values)

# One row per df row: key "t", the actual difference and the model's prediction.
compare_df = pd.DataFrame({
    "t": df["t"].values,
    "actual": df["water_level_difference"].values,
    "prediction": y.flatten(),
})

compare_df.describe()

In [None]:
# Overlay actual (default colour) and predicted (orange) values for the first
# 5000 rows, plotted against the row number.
# NOTE(review): presumably capped at 5000 to stay within Altair's default row limit - confirm.
plot_df = compare_df.reset_index().iloc[:5000]
base = alt.Chart(plot_df).encode(x="index:Q")

actual_line = base.mark_line().encode(y="actual:Q")
prediction_line = base.mark_line(color="orange").encode(y="prediction:Q")

(actual_line + prediction_line).interactive()