In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import copy

# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
device = (
    torch.device('cuda:0')
    if torch.cuda.is_available()
    else torch.device('cpu')
)

# Helper functions

In [3]:
def comp_distance(X1, X2):
    """
    Compute the great-circle distance (in km) from every location in X1
    to every location in X2 with the spherical law of cosines.
    Use earth radius R = 6,371 km.

    Args:
    X1: tensor of coordinates in degrees, shape (m, 2)
    X2: tensor of coordinates in degrees, shape (n, 2)

    NOTE(review): the formula below uses column 1 as the latitude and
    column 0 as the longitude. Confirm this matches the column order of
    the locations file.

    Returns:
    output: distances, shape (m, n) where each row is a location
    in X1, each column is a location in X2.
    """
    R = 6371.
    X1 = X1 * np.pi / 180.
    X2 = X2 * np.pi / 180.

    # cos(d) = cos(p1) cos(p2) cos(l1 - l2) + sin(p1) sin(p2)
    cos_angle = torch.cos(X1[:, [1]]) @ torch.cos(X2[:, [1]]).T
    cos_angle = cos_angle * torch.cos(X1[:, [0]] - X2[:, [0]].T)
    cos_angle = cos_angle + torch.sin(X1[:, [1]]) @ torch.sin(X2[:, [1]]).T

    # Rounding can push the cosine slightly outside [-1, 1], where arccos
    # returns NaN. Clamp both ends, keeping the input dtype and device.
    cos_angle = torch.clamp(cos_angle, min=-1., max=1.)

    return R * torch.arccos(cos_angle)


def comp_idweight(filepath, train_idx, test_idx):
    """
    Compute Inverse Distance Weights of target station(s)
    wrt. other stations. Use float64 for better accuracy.

    Args:
    filepath: CSV of station coordinates with one header row; every
        column of the selected rows is passed to comp_distance.
    train_idx: row positions of the reference (training) stations
    test_idx: row positions of the stations to compute weights for

    Returns:
    float32 tensor of shape (num_test_stations, num_train_stations);
    each row is normalised to sum to 1.
    """
    loc_df = pd.read_csv(filepath, header=0)
    train_loc = torch.tensor(
        loc_df.iloc[train_idx].to_numpy(),
        dtype=torch.float64
    )
    test_loc = torch.tensor(
        loc_df.iloc[test_idx].to_numpy(),
        dtype=torch.float64
    )
    distance = comp_distance(test_loc, train_loc)
    # A zero distance (coincident stations) would give inf / inf = NaN
    # after normalisation. Floor the distance at 1e-6 km so that station
    # simply receives (almost) all of the weight.
    distance = torch.clamp(distance, min=1e-6)
    inv_distance = 1. / distance
    idw_matrix = inv_distance / inv_distance.sum(dim=1, keepdim=True)
    return idw_matrix.to(torch.float32)


def mape_loss(pred, target, reduction='mean'):
    """
    Absolute percentage error |pred - target| / |target|, expressed as
    a fraction (not multiplied by 100).

    Zero targets are replaced by 1e-6 (in both numerator and
    denominator) to avoid division by zero.

    Args:
    pred, target: tensors of the same shape
    reduction: 'mean' (default), 'sum', or 'none' (element-wise errors)

    Returns:
    scalar tensor for 'mean' / 'sum', tensor shaped like the input
    for 'none'.

    Raises:
    ValueError: if reduction is not one of the supported values.
    """
    if reduction not in ('mean', 'sum', 'none'):
        raise ValueError(
            f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}"
        )
    # The 1e-6 stand-in cannot be stored in an integer tensor.
    if not torch.is_floating_point(target):
        target = target.to(torch.get_default_dtype())
    # masked_fill keeps the dtype and device of target, so no global
    # device is needed.
    safe_target = target.masked_fill(target == 0, 1e-6)
    abs_pct_error = ((pred - safe_target) / safe_target).abs()
    if reduction == 'mean':
        return abs_pct_error.mean()
    if reduction == 'sum':
        return abs_pct_error.sum()
    return abs_pct_error


# Split data & define PM2.5 dataset

In [4]:
def split_dataset(filepath):
    """
    Implement a 60:20:20 contiguous split.

    The CSV is read without a header (its first line is skipped), so
    columns are labelled 0..n-1.

    Args:
    filepath: path to the CSV file

    Returns:
    (train_df, valid_df, test_df): consecutive, non-overlapping row
    ranges of the table.
    """
    pm_df = pd.read_csv(
        filepath, header=None, skiprows=1
    )
    length = len(pm_df)
    train_end = int(0.6 * length)
    valid_end = int(0.8 * length)
    # Use positional, half-open slicing. Label-based .loc slices include
    # the end label, which would put each boundary row into two splits.
    train_df = pm_df.iloc[:train_end]
    valid_df = pm_df.iloc[train_end:valid_end]
    test_df = pm_df.iloc[valid_end:]
    return train_df, valid_df, test_df


class PMDataset(Dataset):
    """
    Sliding-window dataset over a station table: each item is a window
    of consecutive rows from the input stations plus the target value(s)
    at the last row of that window.
    """

    def __init__(
        self, pm_df, target_idx,
        test_idx, window, training=False
        ):
        """
        pm_df: dataframe, one column per station
        target_idx: integer position of target station
        test_idx: iterable of positions of test stations
        window: number of consecutive rows per input sequence
        training: True -> targets are the target station,
                  False -> targets are the test stations
        """
        test_idx = list(test_idx)
        # Select input columns by position, matching the positional
        # (iloc) selection used for the targets below.
        excluded = set(test_idx) | {target_idx}
        input_cols = [
            col for col in range(pm_df.shape[1]) if col not in excluded
        ]
        self.input_data = torch.tensor(
            pm_df.iloc[:, input_cols].to_numpy(),
            dtype=torch.float32
        )
        target_cols = [target_idx] if training else test_idx
        self.target_data = torch.tensor(
            pm_df.iloc[:, target_cols].to_numpy(),
            dtype=torch.float32
        )
        self.window = window

    def __len__(self):
        # A table shorter than one window holds no samples; never
        # report a negative length.
        return max(0, len(self.input_data) - self.window + 1)

    def __getitem__(self, idx):
        """
        input shape: (seq_len, num_train_stations)
        target shape: (1, ), if testing (num_test_stations, )
        """
        last = idx + self.window
        window_input = self.input_data[idx: last]
        # The target is taken at the final time step of the window.
        window_target = self.target_data[last - 1]
        return window_input, window_target
    

# Define layers and models

In [5]:
class BiLSTM(nn.Module):
    """
    Encode one station's univariate series: bidirectional LSTM, then
    BatchNorm, then Linear + ReLU, then BatchNorm.
    """

    def __init__(
        self, lstm_input_size, lstm_hidden_size,
        lstm_num_layers, linear_hidden_size
    ):
        super().__init__()
        self.blstm = nn.LSTM(
            input_size=lstm_input_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            bidirectional=True,
        )
        self.batchnorm1 = nn.BatchNorm1d(lstm_hidden_size)
        self.linear = nn.Linear(lstm_hidden_size, linear_hidden_size)
        self.relu = nn.ReLU()
        self.batchnorm2 = nn.BatchNorm1d(linear_hidden_size)

    def forward(self, input):
        """
        input shape: (batch, seq_length)
        output shape: (batch, linear_hidden_output)
        """
        # Time-major layout with one feature per step: (window, batch, 1)
        sequence = input.T.unsqueeze(-1)
        states, _ = self.blstm(sequence)

        # The last axis holds [forward | backward] hidden states.
        half = states.shape[-1] // 2
        # Forward direction has seen the whole series at the last step,
        # the backward direction at the first step.
        summary = states[-1, :, :half] + states[0, :, half:]
        summary = self.batchnorm1(summary)

        # FC + ReLU + BN head
        return self.batchnorm2(self.relu(self.linear(summary)))


class IDWLayer(nn.Module):
    """
    Scale each station's block of features by that station's inverse
    distance weight, then apply Linear + Sigmoid.

    idweight is kept as a plain attribute (not a parameter or buffer),
    so it can be swapped at evaluation time by assigning a new tensor.
    """

    def __init__(self, input_size, hidden_size, idweight):
        """
        input_size: total feature width (features per station * stations)
        hidden_size: output width of the linear layer
        idweight: tensor of shape (1, num_train_stations)
        """
        super().__init__()
        self.idweight = idweight
        self.input_size = input_size
        self.linear = nn.Linear(input_size, hidden_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """
        input shape: (batch, input_size)
        output shape: (batch, hidden_size)
        """
        num_train = self.idweight.shape[1]
        features_per_station = self.input_size // num_train
        # Repeat each station weight across that station's features:
        # (num_train, 1) * (k,) -> (num_train, k) -> (1, num_train * k).
        # new_ones follows the dtype and device of idweight, so no global
        # device is needed.
        weight = self.idweight.T * self.idweight.new_ones(
            features_per_station
        )
        # Follow the input in case idweight lives on another device.
        weight = weight.reshape((1, -1)).to(input.device)
        output = self.linear(input * weight)
        output = self.sigmoid(output)
        return output

    
class IDW_BiLSTM_Model(nn.Module):
    """
    One BiLSTM encoder per training station; the concatenated encodings
    go through the IDW layer and a final linear layer that outputs a
    single value.
    """

    def __init__(
        self, lstm_input_size, lstm_hidden_size,
        lstm_num_layers, linear_hidden_size,
        idw_hidden_size, idweight
    ):
        super().__init__()
        # idweight has one column per training station.
        station_count = idweight.shape[1]
        encoders = [
            BiLSTM(
                lstm_input_size, lstm_hidden_size,
                lstm_num_layers, linear_hidden_size
            )
            for _ in range(station_count)
        ]
        self.blstm = nn.ModuleList(encoders)
        self.idw = IDWLayer(
            linear_hidden_size * station_count,
            idw_hidden_size, idweight
        )
        self.linear = nn.Linear(idw_hidden_size, 1)

    def forward(self, X):
        """
        X shape: (batch, seq_len, num_train)
        output shape: (batch, 1)
        """
        # Encode every station's series with its own BiLSTM.
        encoded = [
            self.blstm[station](X[:, :, station])
            for station in range(X.shape[-1])
        ]
        # Station encodings are placed side by side along the features.
        features = torch.cat(encoded, dim=1)
        return self.linear(self.idw(features))


# Define training/eval loop

In [6]:
def train_loop(model, dataloader, loss_fn, optimizer):
    """
    Train model for one pass over dataloader.

    Args:
    model: module to optimise (put into train mode here)
    dataloader: iterable of (input, target) batches
    loss_fn: callable(pred, target) -> scalar loss tensor
    optimizer: optimizer over the model's parameters

    Returns:
    Square root of the mean per-batch loss as a float (an RMSE when
    loss_fn is a mean-reduced MSE). Also printed.
    """
    model.train()
    # Run on whichever device the model lives on instead of relying on
    # a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = (
        first_param.device if first_param is not None
        else torch.device('cpu')
    )

    batch_losses = []
    for batch_input, batch_target in dataloader:
        # forward pass
        batch_input = batch_input.to(run_device)
        batch_target = batch_target.to(run_device)
        pred = model(batch_input)
        loss = loss_fn(pred, batch_target)
        # Store a plain float so the autograd history of every batch is
        # not kept alive until the end of the epoch.
        batch_losses.append(loss.item())

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # print loss
    total_loss = torch.tensor(batch_losses).mean().sqrt().item()
    print(f'Train: {total_loss:>.4f}')
    return total_loss


def eval_loop(model, dataloader, idw_matrix):
    """
    Evaluate model on the test stations.

    A deep copy of model is evaluated, so swapping the IDW weights per
    test station does not touch the model being trained.

    Args:
    model: trained model exposing model.idw.idweight
    dataloader: iterable of (input, target) batches, target shape
        (batch, num_test_stations)
    idw_matrix: IDW weights, row i belongs to test station i

    Returns:
    (rmse, mae, mape) as floats over all evaluated entries. Also printed.

    Raises:
    ValueError: if dataloader yields no samples.
    """
    mse = mae = mape = 0.
    num_entries = 0
    with torch.no_grad():
        test_model = copy.deepcopy(model)
        test_model.eval()
        # Run on whichever device the model lives on instead of relying
        # on a notebook-level global.
        first_param = next(test_model.parameters(), None)
        run_device = (
            first_param.device if first_param is not None
            else torch.device('cpu')
        )
        for batch_input, batch_target in dataloader:
            # forward pass
            batch_input = batch_input.to(run_device)
            batch_target = batch_target.to(run_device)
            num_test_stations = batch_target.shape[1]
            pred = []
            for i in range(num_test_stations):
                # Change idweight of another test station.
                test_model.idw.idweight = idw_matrix[[i]]
                pred.append(test_model(batch_input))
            pred = torch.cat(pred, dim=1)

            # aggregate metrics
            mse += F.mse_loss(pred, batch_target, reduction='sum')
            mae += F.l1_loss(pred, batch_target, reduction='sum')
            mape += mape_loss(pred, batch_target, reduction='sum')
            # Count what was actually scored, so the averages stay right
            # even if the loader drops or subsamples batches.
            num_entries += batch_target.numel()

    if num_entries == 0:
        raise ValueError('eval_loop received no samples to evaluate')

    rmse = torch.sqrt(mse / num_entries).item()
    mae = (mae / num_entries).item()
    mape = (mape / num_entries).item()

    print(f'Val  : {rmse:>.4f} | {mae:>.4f} | {mape:>.4f}')
    return rmse, mae, mape


# Training time

**Lưu ý:**
- Tư duy như baseline 1, sử dụng 1 trạm làm target, dùng các trạm còn lại để học cách dự đoán trạm target (trên tinh thần đổi mới sáng tạo, paper gốc viết không rõ). Các em có thể chỉnh sửa các hàm tải dữ liệu để dùng tập trạm test đánh giá hiệu quả mô hình. Mô hình này là BiLSTM + IDW (ngược lại so với baseline 1).
- Mô hình có thể cần regularize, tune. Config dưới đây là của paper gốc.

In [None]:
# data location
data_path = ("/content/drive/MyDrive/pm2.5/"
            "data/test_data/(long's)pm2.5.csv")
loc_path = ("/content/drive/MyDrive/pm2.5/"
           "data/test_data/(long's)locations.csv")

# station layout
window = 5                   # time steps per input sequence
target_idx = 14              # station predicted during training
test_idx = [0,1,4,11,15,24,25,27,32,33,37,39]   # stations held out for evaluation
num_stations = 40
# sorted(): set iteration order is not guaranteed, and the IDW weight
# columns must line up with the dataset's input columns, which keep the
# table's ascending column order.
train_idx = sorted(
    set(range(num_stations)) - set(test_idx) - {target_idx})

# model architecture
lstm_input_size = 1
lstm_hidden_size = 64
lstm_num_layers = 2
linear_hidden_size = 256
idw_hidden_size = 128

# training hyper-params
batch_size = 128
learning_rate = 1e-2
train_loss = []

# fix the seed so weight init and batch shuffling are reproducible
torch.manual_seed(0)

# load dataset
train_df, valid_df, test_df = split_dataset(data_path)
train_dataset = PMDataset(
    train_df, target_idx, test_idx, window, training=True)
valid_dataset = PMDataset(
    valid_df, target_idx, test_idx, window, training=False)
test_dataset = PMDataset(
    test_df, target_idx, test_idx, window, training=False)

# create dataloaders (only training batches need shuffling; the
# evaluation metrics are sums over all samples)
train_dataloader = DataLoader(
    train_dataset, batch_size, shuffle=True)
valid_dataloader = DataLoader(
    valid_dataset, batch_size, shuffle=False)
test_dataloader = DataLoader(
    test_dataset, batch_size, shuffle=False)

# create model; during training the IDW layer uses the weights of the
# target station wrt. the training stations
idweight = comp_idweight(
    loc_path, train_idx, [target_idx]).to(device)
model = IDW_BiLSTM_Model(
    lstm_input_size,
    lstm_hidden_size, lstm_num_layers,
    linear_hidden_size, idw_hidden_size,
    idweight
)
model = model.to(device)

# IDW weights of every test station, used by eval_loop
idw_matrix = comp_idweight(
    loc_path, train_idx, test_idx).to(device)

# create loss and optimizer
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate
)


In [None]:
# Train for a fixed number of epochs, recording the training loss each
# epoch and reporting validation metrics after every 10th epoch.
num_epochs = 100
for epoch in tqdm(range(num_epochs)):
    train_loss.append(train_loop(
        model, train_dataloader, loss_fn, optimizer
    ))
    if (epoch + 1) % 10 == 0:
        eval_loop(model, valid_dataloader, idw_matrix)
