# River flow predictor

In [None]:
import data_processing as dp
import training as trn
import pandas as pd

## Create a configured dataset, select and train models

In [None]:
# ============================== CONFIG ===============================================

# Settings for this run; each value is only read further down in this cell
epochs = 1000                           # passed to trn.train
lrn_param = 0.1                         # passed to trn.train (presumably the learning rate - confirm in training.py)
hidden_layers = 1                       # passed to trn.initialise_network
nodes_per_layer = 5                     # passed to trn.initialise_network
standardisation_range = (0.1, 0.9)      # (lower, upper) passed to dp.standardise_data / dp.destandardise_val
dataset_split = (0.6, 0.2, 0.2)         # three fractions passed to dp.split_data (presumably train/validation/test - confirm)



# ============================= DATA PROCESSING ========================================

raw_df = dp.read_baseline_csv()

# Copy the DataFrame before standardising values
# as we will use the raw data for visualization
main_df = raw_df.copy()

main_df = dp.split_data(main_df, dataset_split[0], dataset_split[1], dataset_split[2])
# min_val and max_val are kept so the modelled values can be destandardised later
main_df, min_val, max_val = dp.standardise_data(main_df, standardisation_range[0], standardisation_range[1])
print(main_df.head().to_string() + "\n")




# ============================= MODEL TRAINING ===========================================

# Initialise and train neural network

# First argument is the column count minus one
# (presumably every column except the predictand is a network input - confirm)
network = trn.initialise_network(main_df.shape[1]-1, hidden_layers, nodes_per_layer)
print("Network created")

# Train using backpropagation and calculate root-mean-square error after every epoch
network = trn.train(network, main_df, lrn_param, epochs)

# Record real values vs values modelled by trained network
# Map real to modelled values by date as modelled values are in random order
# real_vals: the column(s) of the unstandardised data whose second-level label is "p"
real_vals = raw_df.xs("p", axis=1, level=1)
# model_vals: predictions indexed by main_df's row index with its outermost level removed,
# so that they can be joined onto real_vals by index below
model_vals = pd.DataFrame(data = trn.predict(network, main_df), index = main_df.droplevel(level=0).index)

# Destandardise the modelled values
model_vals = model_vals.apply(lambda x: dp.destandardise_val(x, standardisation_range[0], standardisation_range[1], min_val, max_val))

# Join on the index, keeping every row of real_vals, then give the two columns readable names
outcome_df = real_vals.join(model_vals)
outcome_df = outcome_df.set_axis(["Actual", "Modelled"], axis="columns")

print("Training complete")
print(outcome_df.head())

## Data visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider, RadioButtons, CheckButtons
%matplotlib widget
# Close every figure that is still open before creating new ones
plt.close("all")
# The visualisation cells work on their own copy of the raw (unstandardised) data
vis_df = raw_df.copy()

### Plot data by type and cull by std devs

In [None]:
plt.close("all")
# Sorted copy of the visualisation data; std_update reads this on every redraw
std_df = vis_df.sort_index()

# ============================ GRAPH ==================================


# Line graph showing flow/rain datasets after culling with different std dev values

# Initial graph - flow data with no culling
fig, ax_graph = plt.subplots()
# Extra axes that host the widgets; each rect is [left, bottom, width, height]
# in figure fractions
ax_buttons = fig.add_axes([0.2, 0.05, 0.15, 0.1])
ax_slider = fig.add_axes([0.6, 0.12, 0.3, 0.03])
# "f" selects the flow columns by their outer column label
std_df["f"].plot(ax=ax_graph,
    title=f"Flow data - no culling - {std_df.shape[0]} rows")



# ============================== COMPONENTS ================================


# Radio buttons for flow/rain
# std_update uses the first letter of the selected label to pick the columns

dataset_radio = RadioButtons(ax=ax_buttons, labels=["Flow", "Rain"])



# Slider for # of std devs
# Whole-number steps from 1 to 8; std_update treats the maximum (8) as "no culling"

sd_slider = Slider(ax=ax_slider, label="Cutoff # of std devs (8=none)",
                   valmin=1, valmax=8, valinit=8, valstep=1)

# ======================== EVENT HANDLER FUNCTIONS =======================

def std_update(val):
    """Redraw the graph for the selected dataset and std-dev cutoff.

    Registered as the callback of both the Flow/Rain radio buttons and the
    cutoff slider. The value passed in by the widget is ignored; the current
    state is read from ``dataset_radio`` and ``sd_slider`` instead.

    A cutoff of 8 means "no culling". For any other cutoff, every row in
    which at least one column lies more than ``sd`` standard deviations from
    that column's mean is removed. Rows that already contain a NaN are
    removed as well, because ``dropna`` cannot tell them apart.

    Args:
        val: Value supplied by the widget that fired the event (unused).
    """
    data_type = dataset_radio.value_selected
    sd = sd_slider.val

    std_data = std_df.copy()
    title_text = f"{data_type} data - no culling - {std_data.shape[0]} rows"
    if sd != 8:
        # Distance of every value from its own column's mean. The previous
        # mask subtracted the std instead of the mean and compared against
        # mean + sd * std, which is not a "within sd std devs" test.
        deviation = (std_data - std_data.mean()).abs()
        within_cutoff = deviation <= sd * std_data.std()
        # Boolean-frame indexing turns rejected cells into NaN; dropna then
        # removes every row that has at least one rejected cell.
        std_data = std_data[within_cutoff].dropna()
        title_text = f"{data_type} data - culling values >{sd} std devs from mean - {std_data.shape[0]} rows"

    ax_graph.clear()

    # "Flow" -> "f", "Rain" -> "r"
    dt = data_type[0].lower()
    if dt == "f":
        std_ax_data = std_data.loc[:, dt]
    else:
        # Every column whose inner label does not start with "f"
        std_ax_data = std_data[[c for c in std_data if c[1][0] != "f"]]
    std_ax_data.plot(ax=ax_graph, title=title_text)
    # Keep the space reserved for the widgets below the graph
    fig.subplots_adjust(bottom=0.25)
    fig.canvas.draw()


# Redraw the graph whenever the dataset selection or the cutoff changes
dataset_radio.on_clicked(std_update)
sd_slider.on_changed(std_update)

# Figure styling
# Reserve the bottom quarter of the figure for the radio buttons and slider
fig.subplots_adjust(bottom=0.25)
fig.set_size_inches((12, 8))
# header_visible is an attribute of the interactive widget canvas
# (presumably hides the figure title bar - confirm against the ipympl docs)
fig.canvas.header_visible = False

### Plot predictand against predictors with transformations

In [None]:
plt.close("all")
corr_data = vis_df.copy()
# Drop the outer column level so columns are addressed by their inner label (e.g. "p")
corr_data = corr_data.droplevel(level=0, axis=1)

# ============================ GRAPHS ==================================


# Scatter plot of the predictand "p" against one selectable column,
# with optional transforms applied to either axis

# Layout: the scatter plot ("main") spans the left side, the three control
# panels are stacked on the right; the left column is five times as wide
corr_fig, corr_axes = plt.subplot_mosaic(
    [
        ["main", "xdata"],
        ["main", "x_transforms"],
        ["main", "y_transforms"],
    ],
    width_ratios=[5, 1])
corr_axes["x_transforms"].set_title("x transforms")
corr_axes["y_transforms"].set_title("y transforms")

# Initial graph - "p" against the first column, no transforms applied
pd_init_x = corr_data.columns[0]
# Correlation coefficient between "p" and the initial column, shown in the title
corr_init = corr_data.corr()["p"][pd_init_x]
corr_data.plot(x=pd_init_x, y="p", kind="scatter", ax=corr_axes["main"],
               title=f"Correlation between p and {pd_init_x} - r={corr_init:.4f}")

# ============================== COMPONENTS ================================


# Radio buttons for predictor to plot
# One button per column of corr_data (this includes "p" itself)

predictor_radio = RadioButtons(ax=corr_axes["xdata"],
                               labels=corr_data.columns)

# Check buttons for predictor transforms

def transform_labels(axis):
    """Return the seven transform check-box labels for the given axis name.

    The order is log, sqrt, square, cube, e^x, e^-x, reciprocal.

    Args:
        axis: Name to insert into each label, e.g. "x" or "y".

    Returns:
        A list of seven label strings.
    """
    templates = (
        "log({0})",
        "sqrt({0})",
        "{0}^2",
        "{0}^3",
        "e^{0}",
        "e^-{0}",
        "1/{0}",
    )
    return [template.format(axis) for template in templates]

# One check box per label returned by transform_labels; apply_transforms reads
# the boxes by position, so the label order must stay in step with it
x_transform_checks = CheckButtons(ax=corr_axes["x_transforms"],
                                labels=transform_labels("x"))


# Check buttons for predictand transforms

y_transform_checks = CheckButtons(ax=corr_axes["y_transforms"],
                                  labels=transform_labels("y"))

# ======================== EVENT HANDLER FUNCTIONS =======================

def apply_transforms(checks, dat):
    """Apply every ticked transform to ``dat`` and return the result.

    Transforms are chained in a fixed order (log, sqrt, square, cube, e^x,
    e^-x, reciprocal), each one working on the output of the previous one.
    The reciprocal maps exact zeros to 0 instead of dividing by zero.

    Args:
        checks: Object whose ``get_status()`` returns one boolean per
            transform, in the order listed above.
        dat: Data exposing ``.apply(func)``; the callers in this file pass
            a single DataFrame column.

    Returns:
        The transformed data, or ``dat`` itself when nothing is ticked.
    """
    # Position i in this tuple corresponds to check box i
    pipeline = (
        np.log,
        np.sqrt,
        np.square,
        lambda v: np.power(v, 3),
        lambda v: np.power(np.e, v),
        lambda v: np.power(np.e, -v),
        lambda v: 1 / v if v != 0 else 0,
    )
    status = checks.get_status()
    for position, transform in enumerate(pipeline):
        if status[position]:
            dat = dat.apply(transform)
    return dat

def pd_update(val):
    """Redraw the scatter plot for the selected predictor and ticked transforms.

    Registered as the callback of the predictor radio buttons and of both
    sets of transform check boxes. The value passed in by the widget is
    ignored; the current state is read from the widgets themselves.

    Args:
        val: Value supplied by the widget that fired the event (unused).
    """
    selected = predictor_radio.value_selected

    # Work on a copy so the transforms never accumulate in corr_data
    frame = corr_data.copy()
    frame[selected] = apply_transforms(x_transform_checks, frame[selected])
    frame["p"] = apply_transforms(y_transform_checks, frame["p"])

    # Correlation between the (transformed) predictor and predictand
    r_value = frame.corr().loc[selected, "p"]

    main_ax = corr_axes["main"]
    main_ax.clear()
    frame.plot(
        x=selected,
        y="p",
        kind="scatter",
        ax=main_ax,
        title=f"Correlation between p and {selected} - r={r_value:.4f}",
    )
    corr_fig.canvas.draw()

# Redraw the scatter plot whenever the predictor or any transform box changes
predictor_radio.on_clicked(pd_update)
x_transform_checks.on_clicked(pd_update)
y_transform_checks.on_clicked(pd_update)

# Figure styling
corr_fig.set_size_inches((12, 8))
# header_visible is an attribute of the interactive widget canvas
# (presumably hides the figure title bar - confirm against the ipympl docs)
corr_fig.canvas.header_visible = False

### Heatmap of correlations between all columns

In [None]:
# Correlation between every pair of columns, addressed by their inner label
heatmap_df = vis_df.droplevel(level=0, axis=1).corr()
# Last expression of the cell, so the notebook renders the table;
# each cell is shaded on the "Blues" colour map according to its value
heatmap_df.style.background_gradient(cmap='Blues')

### RMSE of model during training (currently not working)

In [None]:
#plt.close("all")
#
#rmse_fig, rmse_ax_graph = plt.subplots()
#
#rmse_ax_graph.plot(rmse_arr)
#
#plt.xlabel("epochs")
#plt.ylabel("Root MSE")

### Line graph of actual and modelled values over time

In [None]:
plt.close("all")

# Rows are ordered by ascending modelled value before plotting.
# NOTE(review): the heading of this cell describes a plot over time, but this
# sort orders the rows by value rather than by index - confirm which is intended
# (outcome_df.sort_index() would keep index order instead).
graph_df = outcome_df.sort_values(by=["Modelled"])
# One line per column ("Actual" and "Modelled") on a 12x8 inch figure
graph_df.plot(figsize=(12, 8))

### Scatter plot of modelled vs actual values

In [None]:
plt.close("all")

# Correlation coefficient between the actual and modelled columns
outcome_corr = outcome_df.corr().loc["Modelled", "Actual"]

# Both axes share one range, derived from the largest actual value with a
# 10% margin on either side
lim = outcome_df["Actual"].max()
axis_range = (-lim * 0.1, lim * 1.1)

scatter_ax = outcome_df.plot.scatter(
    x="Actual",
    y="Modelled",
    figsize=(8, 8),
    title=f"r = {outcome_corr}",
    xlim=axis_range,
    ylim=axis_range,
)

# Dashed red y = x reference line: points on it are modelled exactly
scatter_ax.axline((0, 0), slope=1, linestyle="--", color="r")