# Baseline Models for WxRADNet

In [18]:
import os
import numpy as np
import zipfile
import datetime
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.cuda.amp import GradScaler, autocast

import optuna
import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import seaborn as sns
plt.style.use("https://drive.google.com/uc?id=1NKA45YUOjoDwewGrI88Nx_hrqtBv5kuI&export=download")
%config InlineBackend.figure_format = "retina"
%matplotlib inline

PATH = "../parser/data/"
RANDOM_STATE = 0

INPUT_SIZE = 65536
OUTPUT_SIZE = 65536
BATCH_SIZE = 32
HIDDEN_SIZE = 256
NUM_LAYERS = 2

In [5]:
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

False


In [7]:
def compute_rmse(test_targets, test_predictions):
    # Reshape to (batch_size * sequence_length, features)
    test_targets_flat = test_targets.reshape(-1, test_targets.shape[-1])
    test_predictions_flat = test_predictions.reshape(-1, test_predictions.shape[-1])

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(test_targets_flat, test_predictions_flat))
    return rmse

In [9]:
class ThunderstormDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.dataset_sizes = [np.load(file_path, mmap_mode="r").shape[0] for file_path in file_paths]
        self.cumulative_sizes = np.cumsum(self.dataset_sizes)
        self.total_size = self.cumulative_sizes[-1]

    def __len__(self):
        return self.total_size

    def __getitem__(self, idx):
        # Find which file this idx belongs to
        file_idx = np.searchsorted(self.cumulative_sizes, idx, side='right')
        if file_idx == 0:
            within_file_idx = idx
        else:
            within_file_idx = idx - self.cumulative_sizes[file_idx - 1]
        
        file_path = self.file_paths[file_idx]
        data = np.load(file_path, mmap_mode='r')
        item = data[within_file_idx]
        
        # Split the item into inputs and targets
        inputs = item[:6].reshape(-1)  # Flatten the first 6 images
        targets = item[6:].reshape(-1)  # Flatten the last 6 images
        
        return torch.tensor(inputs, dtype=torch.float32), torch.tensor(targets, dtype=torch.float32)

In [11]:
train_file_paths = [PATH + file for file in sorted(os.listdir(PATH))][0:29]
valid_file_paths = [PATH + file for file in sorted(os.listdir(PATH))][29:38]
test_file_paths = [PATH + file for file in sorted(os.listdir(PATH))][38:41]

In [28]:
train_dataset = ThunderstormDataset(train_file_paths)
valid_dataset = ThunderstormDataset(valid_file_paths)
test_dataset = ThunderstormDataset(test_file_paths)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

## 1. Constant Model

In [31]:
def baseline_model(test_loader, device):
    all_targets = []
    all_predictions = []

    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Reshape to (batch_size, 6, 256*256) before applying the baseline prediction
        inputs = inputs.view(inputs.size(0), 6, 256*256)
        targets = targets.view(targets.size(0), 6, 256*256)
        
        # Predict the last seen image
        baseline_pred = inputs[:, -1, :].unsqueeze(1).repeat(1, 6, 1)
        
        all_targets.append(targets.cpu().numpy())
        all_predictions.append(baseline_pred.cpu().numpy())

    all_targets = np.concatenate(all_targets, axis=0)
    all_predictions = np.concatenate(all_predictions, axis=0)
    rmse = compute_rmse(all_targets, all_predictions)
    
    return rmse

In [33]:
baseline_rmse = baseline_model(test_loader, device)
print(f"Baseline RMSE: {baseline_rmse:.4f}")

KeyboardInterrupt: 

## 2. LightGBM

In [49]:
data = np.load('../thunderstorm_data.npy')

In [51]:
data.shape

(34240, 12, 256, 256, 1)

In [53]:
X, y = [], []
for sequence in data:
    inputs = sequence[:6].reshape(-1)
    targets = sequence[6:].reshape(-1)
    X.append(inputs)
    y.append(targets)

In [55]:
X = np.array(X)
X.shape

(34240, 393216)

In [None]:
y = np.array(y).reshape(-1, 6 * 65536)
y.shape

In [41]:
class LightGBMDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.dataset_sizes = [np.load(file_path, mmap_mode='r').shape[0] for file_path in file_paths]
        self.cumulative_sizes = np.cumsum(self.dataset_sizes)
        self.total_size = self.cumulative_sizes[-1]

    def __len__(self):
        return self.total_size

    def __getitem__(self, idx):
        file_idx = np.searchsorted(self.cumulative_sizes, idx, side='right')
        if file_idx == 0:
            within_file_idx = idx
        else:
            within_file_idx = idx - self.cumulative_sizes[file_idx - 1]
        
        file_path = self.file_paths[file_idx]
        data = np.load(file_path, mmap_mode='r')
        item = data[within_file_idx]
        
        inputs = item[:6].reshape(-1)  # Flatten the first 6 images
        targets = item[6:].reshape(-1)  # Flatten the last 6 images
        
        return inputs, targets

In [43]:
def get_data_targets(dataset):
    data, targets = [], []
    for inputs, outputs in dataset:
        data.append(inputs)
        targets.append(outputs)
        
    data = np.array(data)
    targets = np.array(targets).reshape(-1, 6 * 65536)
    return data, targets

In [45]:
train_dataset = LightGBMDataset(train_file_paths)
valid_dataset = LightGBMDataset(valid_file_paths)
test_dataset = LightGBMDataset(test_file_paths)

In [47]:
train_data, train_targets = get_data_targets(train_dataset)
valid_data, valid_targets = get_data_targets(valid_dataset)
test_data, test_targets = get_data_targets(test_dataset)

OSError: [Errno 24] Too many open files

In [None]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 200),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }

    train_data_lgb = lgb.Dataset(train_data, label=train_targets)
    valid_data_lgb = lgb.Dataset(valid_data, label=valid_targets, reference=train_data_lgb)

    model = lgb.train(params, train_data_lgb, valid_sets=[valid_data_lgb])
    y_pred = model.predict(valid_data)
    
    rmse = np.sqrt(mean_squared_error(valid_targets, y_pred))
    return rmse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)