# Load Data

In [3]:
import json
import re

import lxml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential

2024-09-09 07:23:05.323393: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-09 07:23:05.328274: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-09 07:23:05.344716: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-09 07:23:05.371090: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-09 07:23:05.377587: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-09 07:23:05.390310: I tensorflow/core/platform/cpu_feature_gu

In [None]:
# Station codes accepted by the endpoint below; the station name for each
# code is given in its trailing comment.
LOWER_MEKONG_STATION_CODES = [
    "STR",  # StungTreng
    "KRA",  # Kratie
    "KOM",  # Kompong Cham
    "PPB",  # Phnom Penh (Bassac)
    "PPP",  # Phnom Penh Port
    "KOH",  # Koh Khel (Bassac)
    "NEA",  # Neak Luong
    "PRE",  # Prek Kdam (Tonle Sap)
    "TCH",  # Tan Chau
    "CDO",  # Chau Doc (Bassac)
]
BASE_URL = "http://ffw.mrcmekong.org/fetchwet_st.php?StCode="

# The single station analysed by this notebook (fourth entry of the list).
STATION_CODE = LOWER_MEKONG_STATION_CODES[3]

# Fetch the raw payload. A timeout keeps a stalled server from hanging the
# kernel, and raise_for_status() surfaces HTTP errors here instead of as a
# confusing JSON decode failure further down.
# NOTE(review): verify=False is kept from the original call; it disables TLS
# certificate checks if the request is ever served over HTTPS - confirm it is
# still required.
r = requests.get(BASE_URL + STATION_CODE, verify=False, timeout=30)
r.raise_for_status()
data_string = r.content.decode('utf-8')

# The payload uses bare (unquoted) object keys, which json.loads rejects.
# Quote the known named keys first ...
for key in ('date_gmt', 'Max', 'Min', 'AVG', 'floodLevel', 'alarmLevel'):
    data_string = data_string.replace(f'{key}:', f'"{key}":')

# ... then every bare four-digit year key. A pattern is used instead of a
# fixed 1992-2024 loop so that newly published years are handled as well;
# the look-behind skips digits that are part of a longer token or that are
# already preceded by a quote.
data_string = re.sub(r'(?<![\w"])((?:19|20)\d{2}):', r'"\1":', data_string)

# Drop the trailing comma before a closing bracket (invalid in JSON).
data_string = data_string.replace(',]', ']')

# Parse into a list of dictionaries and load it into a DataFrame.
data = json.loads(data_string)
df = pd.DataFrame(data)

# Keep only the 2nd and 3rd dash-separated fields of date_gmt, joined by '-'.
df['date_gmt'] = df['date_gmt'].str.split('-').str[1:3].str.join('-')
df['station'] = STATION_CODE

# Use date_gmt as the index (the column itself is kept as well).
df.index = df['date_gmt']

df.describe()

# Preprocess

In [None]:
# Year columns used for modelling.
YEARS = ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']

# Select the date column plus the year columns. reset_index(drop=True)
# replaces the set_index/reset_index round trip and returns a new frame, so
# nothing is mutated in place on a slice of `df`.
df_filtered = df[['date_gmt', *YEARS]].reset_index(drop=True)

# Wide -> long: one row per (date_gmt, Year) reading.
df_long = pd.melt(df_filtered, id_vars=['date_gmt'], value_vars=YEARS,
                  var_name='Year', value_name='water_level')

# Build real timestamps from "<date_gmt>-<Year>". errors='coerce' turns
# combinations that are not valid calendar dates (e.g. 02-29 in a non-leap
# year) into NaT instead of aborting the whole cell.
df_long['DATE_GMT'] = pd.to_datetime(
    df_long['date_gmt'] + '-' + df_long['Year'],
    format='%m-%d-%Y',
    errors='coerce',
)

# Coerce readings to numbers in case the feed delivers them as strings, so
# that the `!= 0` filter below compares numerically.
df_long['water_level'] = pd.to_numeric(df_long['water_level'], errors='coerce')

# Drop rows with an invalid date or a missing reading (NaN would otherwise
# pass the `!= 0` filter and propagate into scaling/training), then order
# the series chronologically.
df_long = (
    df_long[['DATE_GMT', 'water_level']]
    .dropna()
    .sort_values('DATE_GMT')
)

# Exclude zero readings and index by timestamp. No `freq` is set on the
# index: rows are filtered out above, so a gap-free daily frequency cannot
# be assumed.
df_non_zero = df_long[df_long['water_level'] != 0].set_index('DATE_GMT')

df_non_zero.plot(figsize=(12, 6));

# Train

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping

# Number of leading rows used for training; the remaining rows form the test set.
TRAIN_SIZE = 1222

look_back = 10  # Number of previous time steps to consider for prediction
batch_size = 32  # Batch size

# Fail early with a clear message: each split must hold more than `look_back`
# rows, otherwise its generator cannot produce a single (window, target) pair.
if len(df_non_zero) <= TRAIN_SIZE + look_back:
    raise ValueError(
        f"df_non_zero has {len(df_non_zero)} rows; need more than "
        f"{TRAIN_SIZE + look_back} for a {TRAIN_SIZE}-row training split "
        f"with look_back={look_back}"
    )

# Split the dataset into training and testing sets (positional, no shuffling)
train = df_non_zero.iloc[:TRAIN_SIZE]
test = df_non_zero.iloc[TRAIN_SIZE:]

# Scale the data: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(train)
scaled_test = scaler.transform(test)

# Create TimeseriesGenerator for training and testing data; the same array is
# passed as both inputs and targets, with windows of `look_back` steps.
train_generator = TimeseriesGenerator(scaled_train, scaled_train, length=look_back, batch_size=batch_size)
test_generator = TimeseriesGenerator(scaled_test, scaled_test, length=look_back, batch_size=batch_size)

In [None]:
# LSTM regressor: one 50-unit LSTM layer over windows of `look_back` steps,
# followed by a Dense layer with one output per input feature.
n_features = scaled_train.shape[1]
model = Sequential([
    LSTM(50, activation='relu', input_shape=(look_back, n_features)),
    Dense(n_features),
])
model.compile(optimizer='adam', loss='mse')

# Stop when val_loss has not improved for 10 epochs and restore the best weights.
# NOTE(review): validation_data is test_generator, the same data the
# evaluation cell scores afterwards - consider a separate validation split.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=100,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Show the summary of the model built and fitted in the previous cell.
model.summary()

In [None]:
# Evaluate the model on the test generator. The model was compiled with
# loss='mse' and the generator is built from scaled_test, so this value is
# in scaled units.
loss = model.evaluate(test_generator)
print(f"Test Loss: {loss}")

# Make predictions for every window of the test generator
predictions = model.predict(test_generator)

# Skip the first `look_back` rows of the scaled test array so the reference
# values line up with the predictions (the generator needs `look_back` rows
# of history before it can emit its first target).
test_data = scaled_test[look_back:]

# Inverse scale predictions and reference values back to the original scale
# (done once each; the original cell repeated the prediction transform).
predictions_inverse = scaler.inverse_transform(predictions)
original_test_data_inverse = scaler.inverse_transform(test_data)

# Plot predicted vs original values
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(original_test_data_inverse, label='Original Values')
ax.plot(predictions_inverse, label='Predicted Values', linestyle='dashed')
ax.set_title('Predicted vs Original Values')
ax.set_xlabel('Time Steps')
ax.set_ylabel('Values')
ax.legend()
plt.show()