# Homework 1: COVID-19 Cases Prediction (Regression)

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive
[Errno 2] No such file or directory: '/MyDrive/Colab Notebooks/Machine Learning 2023 Spring'
/content


In [3]:
%cd "drive/MyDrive/Colab Notebooks/Machine Learning 2023 Spring"

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks/Machine Learning 2023 Spring'
/content/drive/MyDrive/Colab Notebooks/Machine Learning 2023 Spring


In [1]:
%pwd

'g:\\My Drive\\Colab Notebooks\\Machine Learning 2023 Spring'

In [1]:
!nvidia-smi

Thu Feb 29 14:34:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
!gdown --fuzzy "https://drive.google.com/file/d/1Xmtq-NZlaWNhjV2FOonpbvBTXIzyXauf/view?usp=sharing" --output "covid_train.csv"
!gdown --fuzzy "https://drive.google.com/file/d/1V-JKYCKyjktlHA9Dh1jQHbvHsewQOtsr/view?usp=sharing" --output "covid_test.csv"

Downloading...
From: https://drive.google.com/uc?id=1Xmtq-NZlaWNhjV2FOonpbvBTXIzyXauf
To: /content/covid_train.csv
100% 2.16M/2.16M [00:00<00:00, 151MB/s]
Downloading...
From: https://drive.google.com/uc?id=1V-JKYCKyjktlHA9Dh1jQHbvHsewQOtsr
To: /content/covid_test.csv
100% 638k/638k [00:00<00:00, 128MB/s]


In [2]:
# numerical operations
import math
import numpy as np

# reading/writing data
import pandas as pd
import os
import csv

# for progress bar
from tqdm import tqdm

# pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# for plotting learning curve
from torch.utils.tensorboard import SummaryWriter




## Some Utility Functions

In [3]:
def same_seed(seed):
  """Fixes random numer generator see"""
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

def train_valid_split(data_set, valid_ratio, seed):
  """Split provided training data into training set and validation set"""
  valid_set_size = int(valid_ratio*len(data_set))
  train_set_size = len(data_set) - valid_set_size
  train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
  return np.array(train_set), np.array(valid_set)

def predict(test_loader, model, device):
  model.eval()  # set your model to evaluation mode
  preds = []
  for x in tqdm(test_loader):
    x = x.to(device)
    with torch.no_grad():
      pred = model(x)
      preds.append(pred.detach().cpu())
  preds = torch.cat(preds, dim=0).numpy()
  return preds

## Dataset

In [4]:
import torch
from torch.utils.data import Dataset

In [5]:
class COVID90Dataset(Dataset):
    """
    x: Features
    y: Targets, if none, do prediction
    """
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]
        
    def __len__(self):
        return(len(self.x))

## Neural Network Model

In [7]:
import torch.nn as nn

In [6]:
class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # todo: modify model's stucture, be aware of dimensions
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
    
    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)  # (B, 1) -> (B)
        return x

## Feature Selection

In [7]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    """Selects useful features to preform regression"""
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = [0, 1, 2, 3, 4]  # todo: select suitable feature columns
    
    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid

## Training Loop

In [19]:
def trainer(train_loader, valid_loader, model, config, device):
    criterion = nn.MSELoss(reduction="mean")
    optimizer = torch.optim.SGD(model.parameters(), lr=config["learning_rate"], momentum=0.9)
    writer = SummaryWriter()  # writer to tensorboard

    if not os.path.isdir("./models"):
        os.mkdir("./models")  # create a directory of saving model

    n_epochs, best_loss, step, early_stop_count = config["n_epochs"], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train()  # set model to train mode
        loss_record = []
        train_pbar = tqdm(train_loader, position=0, leave=True)  # visualize training progress

        for x, y in train_pbar:
            optimizer.zero_grad()  # set gradient to zero
            x, y = x.to(device), y.to(device)  # move your data to device

            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()  # compute gradient (backpropagation)
            optimizer.step()  # update parameters

            step += 1
            loss_record.append(loss.detach().item())

            # display current epoch number and loss on tqdm progress bar
            train_pbar.set_description(f"Epoch [{epoch+1}/{n_epochs}]")
            train_pbar.set_postfix({"loss": loss.detach().item()})

        mean_train_loss = sum(loss_record) / len(loss_record)
        writer.add_scalar("Loss/train", mean_train_loss, step)

        model.eval()  # set model to evaluation mode
        loss_record = []
        
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)
            
            loss_record.append(loss.item())

        mean_valid_loss = sum(loss_record) / len(loss_record)
        print(f"Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss}, Valid loss: {mean_valid_loss}")
        writer.add_scalar("Loss/valid", mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config["save_path"])
            print("Saving model with loss {:.3f}...".format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= config["early_stop"]:
            print("\nModel is not improving, so we halt the training session.")
            return

## Configurations

In [8]:
import torch

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
config = {
    "seed": 5201314,  # seed number
    "select_all": True, # whether to use all features
    "valid_ratio": 0.2,  # validation_size = train_size * valid_ratio
    "n_epochs": 10,  # number of epochs
    "batch_size": 256,
    "learning_rate": 1e-5,
    "early_stop": 600,  # if model has not imporved for this many consecutive epochs, stop training
    "save_path": "./models/model.ckpt"  # model will be saved here
}

## DataLoader

In [15]:
same_seed(config["seed"])

train_data, test_data = pd.read_csv("covid_train.csv").values, pd.read_csv("covid_test.csv").values
train_data, valid_data = train_valid_split(train_data, config["valid_ratio"], config["seed"])

print(f"""
train_data size: {train_data.shape},
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}"""
)

x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config["select_all"])
print(f"number of features: {x_train.shape[1]}")

train_dataset, valid_dataset, test_dataset = COVID90Dataset(x_train, y_train), COVID90Dataset(x_valid, y_valid), COVID90Dataset(x_test)

# pytorch dataloader loads pytorch dataset into batches
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config["batch_size"], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False, pin_memory=True)


train_data size: (2408, 89),
valid_data size: (601, 89)
test_data size: (997, 88)
number of features: 88


## Start Training

In [20]:
model = My_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]: 100%|██████████| 10/10 [00:00<00:00, 81.20it/s, loss=99.6]


Epoch [1/10]: Train loss: 124.23699951171875, Valid loss: 110.10228474934895
Saving model with loss 110.102...


Epoch [2/10]: 100%|██████████| 10/10 [00:00<00:00, 151.56it/s, loss=95.4]


Epoch [2/10]: Train loss: 95.08738784790039, Valid loss: 78.36477406819661
Saving model with loss 78.365...


Epoch [3/10]: 100%|██████████| 10/10 [00:00<00:00, 180.97it/s, loss=66.5]


Epoch [3/10]: Train loss: 80.2079231262207, Valid loss: 71.30179341634114
Saving model with loss 71.302...


Epoch [4/10]: 100%|██████████| 10/10 [00:00<00:00, 150.05it/s, loss=74.7]


Epoch [4/10]: Train loss: 73.01406631469726, Valid loss: 67.63429005940755
Saving model with loss 67.634...


Epoch [5/10]: 100%|██████████| 10/10 [00:00<00:00, 147.89it/s, loss=82.8]


Epoch [5/10]: Train loss: 70.99601211547852, Valid loss: 62.68150329589844
Saving model with loss 62.682...


Epoch [6/10]: 100%|██████████| 10/10 [00:00<00:00, 150.43it/s, loss=67.3]

Epoch [6/10]: Train loss: 70.48241119384765, Valid loss: 62.7203369140625







Epoch [7/10]: 100%|██████████| 10/10 [00:00<00:00, 147.81it/s, loss=55.2]


Epoch [7/10]: Train loss: 58.941522216796876, Valid loss: 51.40812301635742
Saving model with loss 51.408...


Epoch [8/10]: 100%|██████████| 10/10 [00:00<00:00, 146.42it/s, loss=57.3]


Epoch [8/10]: Train loss: 51.26047554016113, Valid loss: 43.128859202067055
Saving model with loss 43.129...


Epoch [9/10]: 100%|██████████| 10/10 [00:00<00:00, 148.53it/s, loss=36]


Epoch [9/10]: Train loss: 45.366061782836915, Valid loss: 29.626510620117188
Saving model with loss 29.627...


Epoch [10/10]: 100%|██████████| 10/10 [00:00<00:00, 150.02it/s, loss=106]

Epoch [10/10]: Train loss: 136.80146179199218, Valid loss: 117.99960835774739





## Plot learning curves with `tensorboard`

In [22]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/

Reusing TensorBoard on port 6007 (pid 4072), started 0:01:02 ago. (Use '!kill 4072' to kill it.)