In [37]:
import polars as pl
import torch
import numpy as np
import os

DATA_PATH = "../data/processed"
MODEL_PATH = "../models"


In [None]:
# load data
train = pl.read_parquet(os.path.join(DATA_PATH, "train.parquet"))
test = pl.read_parquet(os.path.join(DATA_PATH, "test.parquet"))

In [26]:
train.shape

(65730, 12)

In [28]:
train.columns

['date',
 'mr_lag_1',
 'mr_lag_2',
 'mr_lag_3',
 'mr_lag_4',
 'mr_lag_5',
 'mr_lag_6',
 'mr_lag_7',
 'location_longitude',
 'location_latitude',
 'station_name',
 'y']

In [29]:
# Keep only rows with full values
train = train.filter(pl.col("mr_lag_7").is_not_null())
test = test.filter(pl.col("mr_lag_7").is_not_null())
train.shape

(65520, 12)

In [30]:
# train pytorch LSTM model predicting whether it will rain the next day or not on time series data.
# use the value of rainfall in the last 7 days along with location as input features, and whether 
# it rained the next day as the target variable.
class RainfallLSTM(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RainfallLSTM, self).__init__()
        self.lstm = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return torch.sigmoid(out)


In [31]:
# prepare data for training
drop_cols = ["date", "station_name", "y"]
X_train = train.drop(drop_cols).to_numpy()
X_test = test.drop(drop_cols).to_numpy()

y_train = train["y"].to_numpy().reshape(-1, 1)
y_test = test["y"].to_numpy().reshape(-1, 1)

In [32]:
# initiate model, loss function and optimizer
model = RainfallLSTM(input_size=X_train.shape[1], hidden_size=64, output_size=1)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# train model
num_epochs = 20
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train), batch_size):
        X_batch = torch.tensor(X_train[i:i + batch_size], dtype=torch.float32).unsqueeze(1)
        y_batch = torch.tensor(y_train[i:i + batch_size], dtype=torch.float32)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')
        

Epoch [1/20], Loss: 0.6735
Epoch [2/20], Loss: 0.6733
Epoch [3/20], Loss: 0.6727
Epoch [4/20], Loss: 0.6714
Epoch [5/20], Loss: 0.6677
Epoch [6/20], Loss: 0.6690
Epoch [7/20], Loss: 0.6677
Epoch [8/20], Loss: 0.6669
Epoch [9/20], Loss: 0.6666
Epoch [10/20], Loss: 0.6672
Epoch [11/20], Loss: 0.6669
Epoch [12/20], Loss: 0.6661
Epoch [13/20], Loss: 0.6648
Epoch [14/20], Loss: 0.6647
Epoch [15/20], Loss: 0.6641
Epoch [16/20], Loss: 0.6633
Epoch [17/20], Loss: 0.6628
Epoch [18/20], Loss: 0.6623
Epoch [19/20], Loss: 0.6623
Epoch [20/20], Loss: 0.6601


In [36]:
# Calculate accuracy on test set
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
    outputs = model(X_test_tensor)
    predicted = (outputs > 0.5).float()
    accuracy = (predicted == y_test_tensor).float().mean()
    print(f'Test Accuracy: {accuracy.item():.4f}')

Test Accuracy: 0.6294


In [39]:
# Output model
torch.save(model.state_dict(), os.path.join(MODEL_PATH, "rainfall_lstm_baseline.pth"))
print("Model saved to", os.path.join(MODEL_PATH, "rainfall_lstm_baseline.pth"))

Model saved to ../models/rainfall_lstm_baseline.pth
