# Warragamba

Install dependencies

```
poetry install
```

Activate the Poetry environment
```
poetry shell
```

If using VSCode, select the Python environment denoted as 'Poetry env'.

# Imports and Setup

In [None]:
import numpy as np # Numerical python lib
import pandas as pd # Data frame lib
import altair as alt # Chart plotting lib
import tensorflow as tf # Neural Network lib
import requests
import json
import pprint

from tensorflow.keras import layers

In [None]:
water_nsw_api_endpoint = "https://realtimedata.waternsw.com.au/cgi/webservice.pl"


def get_ts_trace(station_id: str, start_time: str, end_time: str, variable: str, interval: str, aggregate: str, datasource="A", multiplier="1"):
    """Request a single time-series trace from the WaterNSW web service.

    Posts a ``get_ts_traces`` (version 2) request to
    ``water_nsw_api_endpoint`` and extracts the first trace of the reply.

    Args:
        station_id: Sent as ``site_list``.
        start_time: Sent as ``start_time`` (e.g. ``"20080130000000"``).
        end_time: Sent as ``end_time`` (same format as ``start_time``).
        variable: Sent as ``var_list`` (e.g. ``"10.00"``).
        interval: Sent as ``interval`` (e.g. ``"day"``).
        aggregate: Sent as ``data_type`` (e.g. ``"mean"``).
        datasource: Sent as ``datasource``.
        multiplier: Sent as ``multiplier``.

    Returns:
        The ``trace`` entry of the first element of ``_return.traces``.
        If the reply does not have that structure (a key is missing or the
        trace list is empty), the whole decoded JSON reply is returned
        instead so the caller can inspect it.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    payload = json.dumps({
        "function": "get_ts_traces",
        "version": 2,
        "params": {
            "site_list": station_id,
            "datasource": datasource,
            "start_time": start_time,
            "end_time": end_time,
            "var_list": variable,
            "interval": interval,
            "multiplier": multiplier,
            "data_type": aggregate,
        },
    })

    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.post(water_nsw_api_endpoint, payload, timeout=60)
    json_response = response.json()

    try:
        return json_response["_return"]["traces"][0]["trace"]
    except (KeyError, IndexError):
        # An empty trace list is treated like a missing key: hand back the
        # raw reply rather than failing inside this helper.
        return json_response

## Fetch Data from API

In [None]:
# Stations queried for variable "10.00"; their frames are collected in rainfall_dfs.
weather_station_ids = ["563035", "563046", "563079", "568045", "568051"]
# Stations queried for variable "100.00"; their frames are collected in stream_dfs.
stream_station_ids = ["212250", "212270"]

def cast_data_types(df):
    """Coerce the ``v``, ``q`` and ``t`` columns of a trace frame in place.

    ``v`` and ``q`` become floats; ``t`` is parsed as a datetime from the
    ``YYYYmmddHHMMSS`` layout. The same (mutated) frame is returned so the
    call can be nested inside other expressions.
    """
    for numeric_col in ("v", "q"):
        df[numeric_col] = df[numeric_col].astype(float)
    df["t"] = pd.to_datetime(df["t"], format="%Y%m%d%H%M%S")
    return df
    

# Query window shared by every request in this cell.
START_TIME = "20080130000000"  # when the warragamba dam measurements start
END_TIME = "20220111000000"  # when warragamba dam measurements end


def _fetch_daily_mean_df(station_id, variable):
    """Fetch one station's daily-mean trace for `variable` as a typed DataFrame."""
    trace = get_ts_trace(
        station_id,
        START_TIME,
        END_TIME,
        variable=variable,
        interval="day",
        aggregate="mean",
        datasource="CP",
    )
    return cast_data_types(pd.json_normalize(trace))


# Variable 130.00 at station 212242 is the series used as the water level.
water_level_df = _fetch_daily_mean_df("212242", "130.00")

# One frame per station, in the same order as the station id lists.
rainfall_dfs = [
    _fetch_daily_mean_df(station, "10.00") for station in weather_station_ids
]
stream_dfs = [
    _fetch_daily_mean_df(station, "100.00") for station in stream_station_ids
]

## Deriving 'Water Level Difference'

In [None]:
water_level = water_level_df["v"]
# Change in level relative to the previous row (the previous day in the daily
# trace). diff() leaves the first row as NaN because it has no predecessor;
# comparing it against a filler of 0.0 would record the raw water level as a
# "difference" and inject a large outlier into the labels.
water_level_df["water_level_difference"] = water_level.diff()
# Keep rows whose level did not drop by 0.3 or more. NaN fails the comparison,
# so the first row (no defined difference) is removed here as well.
water_level_df = water_level_df[water_level_df["water_level_difference"] > -0.3]

## Cleaning

In [None]:
# Drop water-level rows whose quality code marks them as unusable.
# 255 is filtered in addition to 201 and 205 so this step covers the same
# codes ([201, 255]) that the stream and rainfall cleaning treats as bad.
# NOTE(review): 205 may be a typo for 255 - confirm against the WaterNSW
# quality-code list and remove it if so.
water_level_df = water_level_df[~water_level_df["q"].isin([201, 205, 255])]
water_level_df

In [None]:
# For stream and rainfall frames, rows carrying quality code 201 or 255 keep
# their timestamp but have the value overwritten with 0.0.
flagged_codes = [201, 255]
for frames in (stream_dfs, rainfall_dfs):
    for frame in frames:
        is_flagged = frame["q"].isin(flagged_codes)
        frame.loc[is_flagged, "v"] = 0.0

In [None]:
def rename_cols_with_id(dfs, station_ids: list, variable_no: str):
    """Suffix the ``v`` and ``q`` columns of each frame with station and variable.

    Frames and station ids are paired positionally; each frame is renamed in
    place so that ``v`` becomes ``v_<station_id>_<variable_no>`` and ``q``
    becomes ``q_<station_id>_<variable_no>``. Nothing is returned.
    """
    for frame, station_id in zip(dfs, station_ids):
        suffix = f"{station_id}_{variable_no}"
        new_names = {"v": f"v_{suffix}", "q": f"q_{suffix}"}
        frame.rename(columns=new_names, inplace=True)


# Give every frame station/variable-specific column names so the joins below
# do not collide on "v" and "q".
water_level_columns = {"v": "v_212242_130", "q": "q_212242_130"}
water_level_df = water_level_df.rename(columns=water_level_columns)

rename_cols_with_id(rainfall_dfs, weather_station_ids, "10")
rename_cols_with_id(stream_dfs, stream_station_ids, "100")

# Inner-join each station frame onto the water-level frame by timestamp,
# rainfall stations first and stream stations after them.
df = water_level_df
for station_df in rainfall_dfs + stream_dfs:
    df = df.merge(station_df, how="inner", on="t")

df.columns

## Prepare Data for Model

In [None]:
# Model inputs: the value columns of the five rainfall stations (variable 10)
# followed by the two stream stations (variable 100).
feature_columns = [
    "v_568051_10", "v_568045_10", "v_563079_10",
    "v_563046_10", "v_563035_10",
    "v_212250_100", "v_212270_100",
]

label_column = "water_level_difference"
model_columns = [*feature_columns, label_column]

# Seeded random 80% sample for training; every remaining row is test data.
model_frame = df[model_columns]
train_data = model_frame.sample(frac=0.8, random_state=12345)
test_data = model_frame.drop(index=train_data.index)

# Work on copies so train_data / test_data keep their label column.
train_features = train_data.copy()
test_features = test_data.copy()

# pop() removes the label from the feature frames; labels become (n, 1) arrays.
train_labels = train_features.pop(label_column).to_numpy().reshape(-1, 1)
test_labels = test_features.pop(label_column).to_numpy().reshape(-1, 1)

## Building the Model

In [None]:
# Normalisation statistics are learned from the training features only.
normaliser = tf.keras.layers.Normalization(axis=1)
# adapt() receives a NumPy array, matching what fit() is given, rather than
# the DataFrame itself.
# NOTE(review): newer Keras releases appear to reject a DataFrame in adapt()
# - confirm against the Keras documentation.
normaliser.adapt(train_features.to_numpy())

test_model = tf.keras.Sequential(name="stream_and_rain_model", layers=[
    # Input width follows the feature table instead of a hard-coded 7, so
    # changing the feature list does not silently break the model.
    layers.Input(shape=(train_features.shape[1],)),
    normaliser,
    layers.Dropout(rate=0.5),
    layers.Dense(units=16, activation="relu"),
    layers.Dropout(rate=0.5),
    # Single linear output: the predicted water level difference.
    layers.Dense(units=1)
])

test_model.summary()

## Training

In [None]:
# Adam optimiser, mean absolute error between predicted and actual difference.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
test_model.compile(optimizer=adam, loss="mean_absolute_error")

history = test_model.fit(
    x=train_features.values,
    y=train_labels,
    epochs=30,
    # verbose=1 prints a progress bar for every epoch.
    verbose=1,
    # Hold out 20% of the training data to compute validation results.
    validation_split=0.2,
)

In [None]:
# One row per epoch with readable names for the two loss curves.
loss_names = {"loss": "training_loss", "val_loss": "validation_loss"}
hist_df = pd.DataFrame(history.history).rename(columns=loss_names)
hist_df["epoch"] = history.epoch

# Fold both loss columns into (variable, loss) pairs so they can share one
# y axis and be coloured by series.
loss_chart = (
    alt.Chart(hist_df)
    .mark_line()
    .transform_fold(
        fold=['training_loss', 'validation_loss'],
        as_=['variable', 'loss'],
    )
    .encode(x="epoch:Q", y="loss:Q", color="variable:N")
)
loss_chart

## Results

In [None]:
# Compute the compiled loss (mean absolute error) on the held-out test split.
test_model.evaluate(test_features, test_labels)

In [None]:
# Predict over every merged row (training and test rows alike).
y = test_model.predict(df[feature_columns])

# Timestamp, observed difference and predicted difference side by side.
compare_df = pd.DataFrame({
    "t": df["t"].to_numpy(),
    "actual": df["water_level_difference"].to_numpy(),
    "prediction": y.flatten(),
})

# Plot the first 5000 rows against their positional index.
base = alt.Chart(compare_df.reset_index().iloc[:5000]).encode(x="index:Q")

actual_line = base.mark_line().encode(y="actual:Q")
prediction_line = base.mark_line(color="orange").encode(y="prediction:Q")

(actual_line + prediction_line).interactive()