# **Homework 1: COVID-19 Cases Prediction (Regression)**

Objectives:
* Solve a regression problem with deep neural networks (DNN).
* Understand basic DNN training tips.
* Familiarize yourself with PyTorch.

If you have any questions, please contact the TAs via TA hours, NTU COOL, or email to mlta-2023-spring@googlegroups.com

In [174]:
# check gpu type
!nvidia-smi

Fri Apr 19 22:00:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.79                 Driver Version: 537.79       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA T600 Laptop GPU       WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   59C    P8              N/A / ERR! |    735MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Import packages

In [175]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
# from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

# Optuna 调参
import optuna

# Some Utility Functions

You do not need to modify this part.

In [176]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in test_loader:
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

# Dataset

In [177]:
class COVID19Dataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

# Neural Network Model
Try out different model architectures by modifying the class below.

In [178]:
class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions. 
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

# Feature Selection
Choose features you deem useful by modifying the function below.

In [179]:
from sklearn.feature_selection import SelectKBest, f_regression # type: ignore

def select_feat(train_data, valid_data, test_data, no_select_all=True):
    '''Selects useful features to perform regression'''
    global config
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data

    if not no_select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        # feat_idx = [0,1,2,3,4] # TODO: Select suitable feature columns.
        k = config['k']
        selector = SelectKBest(score_func=f_regression, k=k)
        result = selector.fit(train_data[:, 35:-1], train_data[:, -1])
        idx = np.argsort(result.scores_)[::-1]
        feat_idx = list(np.sort(idx[:k]))
        # feat_idx = list([34, 36, 51, 52, 54, 70, 72, 69])
        
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

# Training Loop

In [180]:
def trainer(train_loader, valid_loader, model, config, device):
    print('training...')
    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.

    # Define your optimization algorithm. 
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    if config['optim'] == 'SGD':
        if config['no_momentum']:
            optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=config['momentum'], weight_decay=config['weight_decay'])
    elif config['optim'] == 'Adam':
        optimizer = torch.optim.Adam(model.paremeters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    
    
    
    writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count= config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
        # train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_loader:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x)             
            loss = criterion(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            # train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            # train_pbar.set_postfix({'loss': loss.detach().item()})

        mean_train_loss = sum(loss_record)/len(loss_record)
        # writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        # print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        # writer.add_scalar('Loss/valid', mean_valid_loss, step)
        if not config['no_tensorboard']:
            writer.add_scalar('Loss/train', mean_train_loss, step)
            writer.add_scalar('Loss/valid', mean_valid_loss, step)




        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))         
            # print('Saving model with loss {:.3f}...'.format(best_loss))

            early_stop_count = 0
        else: 
            early_stop_count += 1

        
        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            break
    return best_loss

# Save predictions

In [181]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

# Configurations
`config` contains hyper-parameters for training and the path to save your model.

In [182]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)
    'valid_ratio': 0.1,   # validation_size = train_size * valid_ratio
    'n_epochs': 10000,     # Number of epochs.            
    'batch_size': 256, 
    'learning_rate': 1e-5,
    'weight_decay': 1e-5,              
    'early_stop': 1000,    # If model has not improved for this many consecutive epochs, stop training.     
    'save_path': './models/best_model.ckpt',  # Your model will be saved here.
    'layer': [16, 16],   # NN结点数
    'k': 16,    # 选择k个特征
    'optim': 'SGD',
    'momentum': 0.7,
    'no_select_all': True,  # 是否选择全部特征
    'no_momentum': False,    # 是否使用动量
    'no_normal': True,      # 是否归一化数据
    # 'no_k_cross': False,    # 是否K折交叉验证
    'no_save': False,       # 是否保存模型
    'no_tensorboard': True # 是否记录训练过程
}
# 设置 k-fold 中的 k，这里是根据 valid_ratio 设定的
# k = int(1 / config['valid_ratio'])

# Start training!
``config``包含需要调整的超参数和模型保存路径

``objective()``可以自动调参, 设置``AUTO_TUNE_PARAM``为``False``可以取消 

In [183]:
# Set seed for reproducibility
same_seed(config['seed'])

train_data, test_data = pd.read_csv('./covid_train.csv').values, pd.read_csv('./covid_test.csv').values
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

# num_valid_samples = len(training_data) // k
# np.random.shuffle(training_data)
valid_losses = []  # 记录 valid_loss

def objective(trial):
    if trial is not None:
        print('\nNew trial here')
        # 定义需要调优的超参数空间
        config['learning_rate'] = trial.suggest_float('lr', 1e-6, 1e-3)
        config['momentum'] = trial.suggest_float('momentum', 0, 0.7)
        # config['batch_size'] = trial.suggest_categorical('batch_size', [128])
        config['k'] = trial.suggest_int('k_feats', 1, 32)
        # config['layer'][0] = config['k']
    
    # 打印所需的超参数
    print(f'''hyper-parameter: 
        optimizer: {config['optim']},
        lr: {config['learning_rate']}, 
        momentum: {config['momentum']},
        select_feats: {config['k']}, 
        layer: {config['layer']}''')
    
    global valid_losses
    # global valid_scores
    # 每次 trial 初始化 valid_scores，可以不初始化，通过 trial * k + fold 来访问当前 trial 的 valid_score，
    # 这样可以让 trainer() 保存 trials 中最好的模型参数，但这并不意味着该参数对应的 k-fold validation loss 最低。
    # valid_scores = []
    # for fold in range(k):
    #     # Data split
    #     valid_data = training_data[num_valid_samples * fold:
    #                             num_valid_samples * (fold + 1)]
    #     train_data = np.concatenate((
    #         training_data[:num_valid_samples * fold],
    #         training_data[num_valid_samples * (fold + 1):]))

    # Normalization
    if not config['no_normal']:
        train_mean = np.mean(train_data[:, 35:-1], axis=0)  # 前 35 列为 one-hot vector，我并没有对他们做 normalization，可以自行设置
        train_std = np.std(train_data[:, 35:-1], axis=0)
        train_data[:, 35:-1] -= train_mean
        train_data[:, 35:-1] /= train_std
        valid_data[:, 35:-1] -= train_mean
        valid_data[:, 35:-1] /= train_std
        test_data[:, 35:] -= train_mean
        test_data[:, 35:] /= train_std

    x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['no_select_all'])
    # Print out the data size.
    print(f"""train_data size: {x_train.shape} 
    valid_data size: {x_valid.shape} 
    test_data size: {x_test.shape}""")

    train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \
                                            COVID19Dataset(x_valid, y_valid), \
                                            COVID19Dataset(x_test)

    # Pytorch data loader loads pytorch dataset into batches.
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
    
    model = My_Model(input_dim=x_train.shape[1]).to(device) # put your model and data on the same computation device.
    best_loss = trainer(train_loader, valid_loader, model, config, device)
    # valid_scores.append(valid_score)
    valid_losses.append(best_loss)
    # if not config['no_k_cross']:
    #     break
        
    # if valid_score > 2:
    #     print(f'在第{fold+1}折上欠拟合') # 提前终止，减少计算资源
    #     break       
    
    print(f'best_loss: {best_loss}')
    
    if best_loss<min(valid_losses):
        model = My_Model(input_dim=x_test.shape[1]).to(device)
        model.load_state_dict(torch.load(config['save_path']))
        preds = predict(test_loader, model, device)
        save_pred(preds, 'submission.csv')

    if trial is not None:
        return best_loss
    else:
        return x_test, test_loader



AUTO_TUNE_PARAM = False  # Whether to tune parameters automatically

if AUTO_TUNE_PARAM:
    # 使用Optuna库进行超参数搜索
    n_trials = 20  # 设置试验数量
    print(f'AUTO_TUNE_PARAM: {AUTO_TUNE_PARAM}\nn_trials: {n_trials}')
    study = optuna.create_study(pruner=optuna.pruners.MedianPruner(), direction='minimize')
    
    # sample config
    study.enqueue_trial(
    {
        "lr": 1e-5,
        "momentum": 0.7,
        "k": 88,
    }
)
    study.optimize(objective, n_trials=n_trials)

    # 输出最优的超参数组合和性能指标
    print('Best hyperparameters: {}'.format(study.best_params))
    print('Best performance: {:.4f}'.format(study.best_value))
    x_test, test_loader = objective(None)
else:
    # 注意，只有非自动调参时才进行了predict，节省一下计算资源
    print(f'You could set AUTO_TUNE_PARAM True to tune parameters automatically.\nAUTO_TUNE_PARAM: {AUTO_TUNE_PARAM}')
    x_test, test_loader = objective(None)
    model = My_Model(input_dim=x_test.shape[1]).to(device)
    model.load_state_dict(torch.load(config['save_path']))
    preds = predict(test_loader, model, device)
    save_pred(preds, 'submission.csv')

You could set AUTO_TUNE_PARAM True to tune parameters automatically.
AUTO_TUNE_PARAM: False
hyper-parameter: 
        optimizer: SGD,
        lr: 1e-05, 
        momentum: 0.7,
        select_feats: 16, 
        layer: [16, 16]
train_data size: (2709, 16) 
    valid_data size: (300, 16) 
    test_data size: (997, 16)
training...
Saving model with loss 465.029...
Saving model with loss 409.405...
Saving model with loss 393.558...
Saving model with loss 381.241...
Saving model with loss 367.695...
Saving model with loss 365.394...


Saving model with loss 359.091...
Saving model with loss 351.640...
Saving model with loss 349.572...
Saving model with loss 340.735...
Saving model with loss 338.859...
Saving model with loss 337.285...
Saving model with loss 324.225...
Saving model with loss 311.644...
Saving model with loss 290.031...
Saving model with loss 277.639...
Saving model with loss 274.475...
Saving model with loss 271.205...
Saving model with loss 263.278...
Saving model with loss 125.993...
Saving model with loss 95.050...
Saving model with loss 84.723...
Saving model with loss 79.207...
Saving model with loss 78.528...
Saving model with loss 76.781...
Saving model with loss 71.727...
Saving model with loss 61.277...
Saving model with loss 60.844...
Saving model with loss 60.083...
Saving model with loss 59.602...
Saving model with loss 59.552...
Saving model with loss 54.933...
Saving model with loss 51.905...
Saving model with loss 51.110...
Saving model with loss 48.015...
Saving model with loss 47.714

# Reference
This notebook uses code written by Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)