# **Homework 1: COVID-19 Cases Prediction (Regression)**

# **Download Data**


If the Google drive links are dead, you can download data from [kaggle](https://www.kaggle.com/c/ml2021spring-hw1/data), and upload data manually to the workspace.

In [1]:
'''Setup experiment'''

# basic
exp_num = '_UMD_1'
# day_seq = [3]
day_seq = [1,3,7,10,14]
cuda_num = 0
data_num = 59

# model and target
model_num = 2
model_pr = 'origin'
target_only = False

# model parameters
n_epochs = 5000
batch_size = 4
# batch_size = 16
optimizer = 'Adam'
lr = 0.001
weight_decay = 0
betas = (0.9, 0.999)
early_stop = 50
# early_stop = 350

# **Import Some Packages**

In [2]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os
import pandas as pd

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# **Some Utilities**

You do not need to modify this part.

In [3]:
def get_device():
    ''' Get device (if GPU is available, use GPU) '''
    return f'cuda:{cuda_num}' if torch.cuda.is_available() else 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss) '''
    total_steps = len(loss_record['train'])
    x_1 = range(total_steps)
    x_2 = x_1[::len(loss_record['train']) // len(loss_record['dev'])]
    figure(figsize=(6, 4))
    plt.plot(x_1, loss_record['train'], c='tab:red', label='train')
    plt.plot(x_2, loss_record['dev'], c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.savefig(f'results/learning_curve/learning_curve_day{day_num}_exp{exp_num}')
    plt.close()
#     plt.show()


def plot_valid(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Plot validation of your DNN '''
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Validation')
    plt.savefig(f'results/validation/validation_day{day_num}_exp{exp_num}')
    plt.close()
#     plt.show()
    
def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Plot prediction of your DNN '''
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.savefig(f'results/prediction/prediction_day{day_num}_exp{exp_num}')
    plt.close()
#     plt.show()

# **Preprocess**

We have three kinds of datasets:
* `train`: for training
* `dev`: for validation
* `test`: for testing (w/o target value)

## **Dataset**

The `COVID19Dataset` below does:
* read `.csv` files
* extract features
* split `covid.train.csv` into train/dev sets
* normalize features

Finishing `TODO` below might make you pass medium baseline.

In [4]:
class COVID19Dataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset '''
    def __init__(self,
                 path,
                 mode='train',
                 target_only=False):
        self.mode = mode

        # Read data into numpy arrays
        with open(path, 'r') as fp:
            data = list(csv.reader(fp))
            data = np.array(data[1:])[:, 1:].astype(float)
        
        if not target_only:
            # feats = [:, -1]
            feats = list(range(day_num*data_num-1))
        else:
            # TODO: Using 40 states & 2 tested_positive features (indices = 57 & 75)
            # feats = [57,75] + [40,41,42,43] + [58,59,60,61] + [76,77,78,79]
            # feats = list(range(day_num*4)) + list(range(day_num*17, day_num*18-1))
            feats = list(range(day_num*(data_num-1), day_num*data_num-1))
            pass

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
            target = data[:, -1]
            self.target = torch.FloatTensor(target)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]
            
            # Splitting training data into train & dev sets
            # 0 → 2
            if mode == 'train':
                indices = [i for i in range(len(data)) if i % 10 != 1]
            elif mode == 'dev':
                indices = [i for i in range(len(data)) if i % 10 == 1]
            
            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        # Normalize features (you may remove this part to see what will happen)
        # self.data[:, 40:] = \
        #     (self.data[:, 40:] - self.data[:, 40:].mean(dim=0, keepdim=True)) \
        #     / self.data[:, 40:].std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]

        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            # return self.data[index]
            return self.data[index], self.target[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)

## **DataLoader**

A `DataLoader` loads data from a given `Dataset` into batches.


In [5]:
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    ''' Generates a dataset, then is put into a dataloader. '''
    dataset = COVID19Dataset(path, mode=mode, target_only=target_only)  # Construct dataset
    dataloader = DataLoader(
        dataset, batch_size,
        shuffle=(mode == 'train'), drop_last=False,
        num_workers=n_jobs, pin_memory=True)                            # Construct dataloader
    return dataloader

# **Deep Neural Network**

`NeuralNet` is an `nn.Module` designed for regression.
The DNN consists of 2 fully-connected layers with ReLU activation.
This module also included a function `cal_loss` for calculating loss.


In [6]:
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network '''
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        # model 1
        if model_num == 1:
            self.net = nn.Sequential(
                nn.Linear(input_dim, 1),
            )
        
        # model 2
        elif model_num == 2:
            self.net = nn.Sequential(        
                nn.Linear(input_dim, 64),
                nn.ReLU(),
                nn.Linear(64, 1),
            )
            
        # model 3
        elif model_num == 3:
            self.net = nn.Sequential(
                nn.Linear(input_dim, 64),
                nn.ReLU(),
                nn.Linear(64, 256),
                nn.ReLU(),
                nn.Linear(256, 64),
                nn.ReLU(),
                nn.Linear(64, 1),
            )
            
        # model 4
        elif model_num == 4:
            self.net = nn.Sequential(
                nn.Linear(input_dim, 64),
                nn.ReLU(),
                nn.Linear(64, 256),
                nn.ReLU(),
                nn.Linear(256, 512),
                nn.ReLU(),
                nn.Linear(512, 1024),
                nn.ReLU(),
                nn.Linear(1024, 2048),
                nn.ReLU(),
                nn.Linear(2048, 2048),
                nn.ReLU(),
                nn.Linear(2048, 2048),
                nn.ReLU(),
                nn.Linear(2048, 2048),
                nn.ReLU(),
                nn.Linear(2048, 2048),
                nn.ReLU(),
                nn.Linear(2048, 1024),
                nn.ReLU(),
                nn.Linear(1024, 512),
                nn.ReLU(),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Linear(256, 64),
                nn.ReLU(),
                nn.Linear(64, 1),
            )
        
        else:
            print("model selection error")
        

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss '''
        # TODO: you may implement L1/L2 regularization here
        return self.criterion(pred, target)

# **Train/Dev/Test**

## **Training**

In [7]:
def train(tr_set, dv_set, model, config, device):
    ''' DNN training '''
    print('\n')
    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(model.parameters(), **config['optim_hparas'])

    min_mse = 1000.0
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0
    final_epoch = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()                           # set model to training mode
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'.format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            final_epoch = epoch + 1
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record, final_epoch

## **Validation**

In [8]:
def dev(dv_set, model, device):
    model.eval()                                # set model to evalutation mode
    total_loss = 0
    for x, y in dv_set:                         # iterate through the dataloader
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
        total_loss += mse_loss.detach().cpu().item() * len(x)  # accumulate loss
    total_loss = total_loss / len(dv_set.dataset)              # compute averaged loss

    return total_loss

## **Testing**

In [9]:
def test(tt_set, model, device):

    model.eval()                                # set model to evalutation mode
    testing_loss = 0
    for x, y in tt_set:                         # iterate through the dataloader
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
        testing_loss += mse_loss.detach().cpu().item() * len(x)  # accumulate loss
    testing_loss = testing_loss / len(tt_set.dataset)            # compute averaged loss
    
    return testing_loss

# **Setup Hyper-parameters**

`config` contains hyper-parameters for training and the path to save your model.

In [10]:
device = get_device()                    # Get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)     # The trained model will be saved to ./models/
target_only = target_only                # Use selected features

config = {
    'n_epochs': n_epochs,                   # maximum number of epochs
    'batch_size': batch_size,               # mini-batch size for dataloader
    'optimizer': optimizer,                 # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                       # hyper-parameters for the optimizer
        'lr': lr,                           # learning rate
        'weight_decay': weight_decay,       # weight decay: to avoid overfitting
        'betas': betas
#         'momentum': 0.9                     # momentum for SGD
    },
    'early_stop': early_stop,               # early stopping epochs (the number epochs since model's last improvement)
    'save_path': 'models/model.pth'         # save model
}

# '''Best: n_epochs = 5000, batch size = 16,  lr=0.0013, weight_decay=0.0005, early_stop = 350'''
# '''Scnd: n_epochs = 5000, batch size = 16,  lr=0.0006, weight_decay=0.0015, early_stop = 350'''
# '''Orgn: n_epochs = 3000, batch size = 270, lr=0.001,  optimizer = SGD,     early_stop = 200''' 

# **Load data and model**

In [11]:
final_epochs = []
train_final_loss_total = []
testing_loss_total = []

for i in range(len(day_seq)):
    
    day_num = day_seq[i]

    print(f"-------------------------------Training day{day_num}-------------------------------")

    '''Set random seed'''
    myseed = 42069  # set a random seed for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(myseed)
    torch.manual_seed(myseed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(myseed)

    '''Set data path'''
    tr_path = f'data/UMD/training/covid.train.{day_num}day.csv'  # path to training data
    tt_path = f'data/UMD/training/covid.test.{day_num}day.csv'   # path to testing data

    '''Load data and model'''
    tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
    dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
    tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)

    model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device

    '''Start Training'''
    model_loss, model_loss_record, final_epoch = train(tr_set, dv_set, model, config, device)

    '''Save plots'''
    plot_learning_curve(model_loss_record, title='deep model')

    del model
    model = NeuralNet(tr_set.dataset.dim).to(device)
    ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
    model.load_state_dict(ckpt)
    plot_valid(dv_set, model, device)  # Show prediction on the validation set

    '''Testing'''
    testing_loss = test(tt_set, model, device)  # predict COVID-19 cases with your model
    plot_pred(tt_set, model, device)  # Show prediction on the testing set

    '''Save results'''
    final_epochs.append(final_epoch)
    train_final_loss_total.append(model_loss)
    testing_loss_total.append(testing_loss)

    '''Print results'''
    print('\nResult:')
    print(f'final_epoch_day{day_num} = {final_epoch}')
    print(f'train_final_loss_day{day_num} = {model_loss}')
    print(f'testing_loss_day{day_num} = {testing_loss}')
    print('\n')

-------------------------------Training day1-------------------------------
Finished reading the train set of COVID19 Dataset (97 samples found, each dim = 58)
Finished reading the dev set of COVID19 Dataset (11 samples found, each dim = 58)
Finished reading the test set of COVID19 Dataset (20 samples found, each dim = 58)


Saving model (epoch =    1, loss = 0.0001)
Saving model (epoch =    2, loss = 0.0001)
Saving model (epoch =    3, loss = 0.0000)
Saving model (epoch =    6, loss = 0.0000)
Saving model (epoch =    9, loss = 0.0000)
Saving model (epoch =   18, loss = 0.0000)
Saving model (epoch =   25, loss = 0.0000)
Saving model (epoch =   54, loss = 0.0000)
Saving model (epoch =   62, loss = 0.0000)
Saving model (epoch =   67, loss = 0.0000)
Saving model (epoch =   71, loss = 0.0000)
Saving model (epoch =  109, loss = 0.0000)
Saving model (epoch =  113, loss = 0.0000)
Saving model (epoch =  122, loss = 0.0000)
Saving model (epoch =  129, loss = 0.0000)
Saving model (epoch =  130, 

In [12]:
'''Build dataframe'''
final_epoch_sr = pd.Series(final_epochs)
train_final_loss_sr = pd.Series(train_final_loss_total)
testing_loss_sr = pd.Series(testing_loss_total)

result = pd.DataFrame({'exp_num': exp_num,
                       'day_num': day_seq,
                       'target_only': target_only,
                       'model': model_num,
                       'model_pr.': model_pr,
                       'n_epochs': n_epochs,
                       'batch_size': batch_size,
                       'optimizer': optimizer,
                       'lr': lr,
                       'weight_decay': weight_decay,
                       'betas': str(betas),
                       'early_stop': early_stop,
                       'final_epoch': final_epoch_sr, 
                       'train_final_loss': train_final_loss_sr, 
                       'testing_loss': testing_loss_sr})

result.to_csv(f'results/data/data_exp{exp_num}.csv')

In [13]:
result

Unnamed: 0,exp_num,day_num,target_only,model,model_pr.,n_epochs,batch_size,optimizer,lr,weight_decay,betas,early_stop,final_epoch,train_final_loss,testing_loss
0,_UMD_1,1,False,2,origin,5000,4,Adam,0.001,0,"(0.9, 0.999)",50,170,1.7e-05,0.197199
1,_UMD_1,3,False,2,origin,5000,4,Adam,0.001,0,"(0.9, 0.999)",50,257,1e-05,0.204751
2,_UMD_1,7,False,2,origin,5000,4,Adam,0.001,0,"(0.9, 0.999)",50,115,1.3e-05,0.230467
3,_UMD_1,10,False,2,origin,5000,4,Adam,0.001,0,"(0.9, 0.999)",50,80,7e-06,0.190874
4,_UMD_1,14,False,2,origin,5000,4,Adam,0.001,0,"(0.9, 0.999)",50,222,6e-06,0.174499
