In [1]:
%load_ext autoreload

In [142]:
# Utilities 
from options.options import is_notebook, get_options, update_options, print_options
import argparse
import math
import time
import random
import os
from tqdm import tqdm

# Data
import numpy as np
from sklearn.model_selection import train_test_split
from datautils.ts_dataset import TSDataset
from datautils.data import prepare_dataloaders
import pandas as pd

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Transformer
#import tstransformer.Constants as Constants
from tstransformer.Models import Transformer
from tstransformer.Optim import ScheduledOptim

%autoreload 2

In [143]:
# Build the run configuration and echo it, so the settings in effect are
# recorded next to the results produced further down.
opt = get_options()
print_options(opt)

[93m[1m           train_path [0m:  /home/yannic/master-thesis/data_air/prsa_data.parquet
[93m[1m             val_path [0m:  /home/yannic/master-thesis/data_air/prsa_data.parquet
[93m[1m                epoch [0m:  10
[93m[1m           batch_size [0m:  32
[93m[1m              d_model [0m:  512
[93m[1m       d_inner_hidden [0m:  2048
[93m[1m                d_key [0m:  64
[93m[1m              d_value [0m:  64
[93m[1m           d_sequence [0m:  512
[93m[1m               n_head [0m:  8
[93m[1m             n_layers [0m:  6
[93m[1m       n_warmup_steps [0m:  4000
[93m[1m               lr_mul [0m:  2.0
[93m[1m                 seed [0m:  False
[93m[1m              dropout [0m:  0.1
[93m[1m    embs_share_weight [0m:  False
[93m[1m    proj_share_weight [0m:  False
[93m[1m     scale_emb_or_prj [0m:  prj
[93m[1m           output_dir [0m:  ./output
[93m[1m               use_tb [0m:  True
[93m[1m            use_wandb [0m:  False
[93m[1m 

In [4]:
# https://pytorch.org/docs/stable/notes/randomness.html
# For reproducibility.
# The option dump shows `seed: False` as the "no seed" value. `False is not None`
# is true, so the old check still seeded the generators (torch.manual_seed(False)
# is equivalent to seeding with 0). Treat both None and False as "seeding disabled";
# identity checks are used so that an explicit integer seed of 0 stays valid.
if opt["seed"] is not None and opt["seed"] is not False:
    torch.manual_seed(opt["seed"])
    # cudnn autotuning may pick different kernels between runs; turn it off when seeding.
    torch.backends.cudnn.benchmark = False
    # torch.set_deterministic(True)
    np.random.seed(opt["seed"])
    random.seed(opt["seed"])

In [5]:
# An output directory is mandatory for this notebook. A bare `raise` outside an
# `except` block only produces "RuntimeError: No active exception to reraise",
# so raise the same exception type with a message that names the real problem.
if not opt["output_dir"]:
    print('No experiment result will be saved.')
    raise RuntimeError('opt["output_dir"] is empty: set an output directory before training.')

In [6]:
# Create the output directory (and any missing parents). `exist_ok=True` makes the
# cell safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs(opt["output_dir"], exist_ok=True)

In [136]:
# Define the compute device.
# CUDA is used only when it is both requested in the options and actually available;
# otherwise fall back to the CPU instead of failing later on the first `.to(device)`.
device = torch.device('cuda' if opt["cuda"] and torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [8]:
# Load the dataset and preprocess it.
# The file location comes from the options instead of a hard-coded absolute path,
# so changing `train_path` in the configuration takes effect here as well.
aq = pd.read_parquet(opt["train_path"])

# One-hot encode the station name (no extra NaN indicator column), then drop the
# original `station` column and the `wind_direction` column.
aq_prep = (
    pd.concat(
        [aq, pd.get_dummies(aq['station'], prefix='station', dummy_na=False)],
        axis=1,
    )
    .drop(['station'], axis=1)
    .drop(columns=["wind_direction"])
)
aq_prep.head()

Unnamed: 0,year,month,day,hour,pm25,pm10,so2,no2,co,o3,...,station_Dingling,station_Dongsi,station_Guanyuan,station_Gucheng,station_Huairou,station_Nongzhanguan,station_Shunyi,station_Tiantan,station_Wanliu,station_Wanshouxigong
0,2013,3,1,0,6.0,18.0,5.0,,800.0,88.0,...,0,0,0,1,0,0,0,0,0,0
1,2013,3,1,1,6.0,15.0,5.0,,800.0,88.0,...,0,0,0,1,0,0,0,0,0,0
2,2013,3,1,2,5.0,18.0,,,700.0,52.0,...,0,0,0,1,0,0,0,0,0,0
3,2013,3,1,3,6.0,20.0,6.0,,,,...,0,0,0,1,0,0,0,0,0,0
4,2013,3,1,4,5.0,17.0,5.0,,600.0,73.0,...,0,0,0,1,0,0,0,0,0,0


# DataLoader and Transformer

# Functions

In [None]:
def cal_performance(pred, gold, trg_pad_idx, smoothing=False):
    ''' Compute the loss and the prediction counts for one batch.

    Positions whose target equals `trg_pad_idx` are excluded from both counts.

    Args:
        pred: per-position class scores; the arg-max along dim 1 is the prediction.
        gold: target indices; flattened to one dimension before comparison.
        trg_pad_idx: target value that marks padding.
        smoothing: forwarded to `cal_loss` to enable label smoothing.

    Returns:
        (loss, n_correct, n_word): the loss returned by `cal_loss`, the number of
        correctly predicted non-padding positions, and the number of non-padding
        positions.
    '''
    loss = cal_loss(pred, gold, trg_pad_idx, smoothing=smoothing)

    flat_gold = gold.contiguous().view(-1)
    _, predicted = pred.max(1)

    # True wherever the target is a real token rather than padding.
    keep = flat_gold.ne(trg_pad_idx)
    hits = predicted.eq(flat_gold).masked_select(keep)

    n_correct = hits.sum().item()
    n_word = keep.sum().item()
    return loss, n_correct, n_word

In [None]:
def cal_loss(pred, gold, trg_pad_idx, smoothing=False):
    ''' Summed cross-entropy loss over non-padding targets.

    Args:
        pred: class scores with the class dimension at dim 1.
        gold: target class indices; flattened to one dimension.
        trg_pad_idx: target value that marks padding and is excluded from the loss.
        smoothing: when true, use label smoothing with eps = 0.1.

    Returns:
        A scalar tensor holding the SUM of the per-position losses; callers are
        expected to normalise it themselves.
    '''
    flat_gold = gold.contiguous().view(-1)

    if not smoothing:
        return F.cross_entropy(pred, flat_gold, ignore_index=trg_pad_idx, reduction='sum')

    eps = 0.1
    n_class = pred.size(1)

    # Smoothed target distribution: 1 - eps on the true class, eps shared
    # equally between the remaining n_class - 1 classes.
    target = torch.zeros_like(pred).scatter(1, flat_gold.view(-1, 1), 1)
    target = target * (1 - eps) + (1 - target) * eps / (n_class - 1)

    per_position = -(target * F.log_softmax(pred, dim=1)).sum(dim=1)

    # Drop padding positions, then sum (averaging is left to the caller).
    keep = flat_gold.ne(trg_pad_idx)
    return per_position.masked_select(keep).sum()

In [None]:
def patch_src(src, pad_idx):
    ''' Return `src` with its first two dimensions swapped.

    `pad_idx` is accepted for signature compatibility but is not used.
    '''
    return src.transpose(0, 1)

In [None]:
def patch_trg(trg, pad_idx):
    ''' Split a target batch into decoder input and flattened gold labels.

    The first two dimensions of `trg` are swapped; the decoder input is every
    position except the last, and the gold labels are every position except the
    first, flattened to one dimension. `pad_idx` is accepted but not used.

    Returns:
        (trg, gold)
    '''
    swapped = trg.transpose(0, 1)
    decoder_input = swapped[:, :-1]
    gold = swapped[:, 1:].contiguous().view(-1)
    return decoder_input, gold

In [None]:
def train_epoch(model, training_data, optimizer, opt, device, smoothing):
    ''' Epoch operation in training phase.

    Runs one pass over `training_data`, taking one optimizer step per batch.

    Args:
        model: called as `model(src_seq, trg_seq)`.
        training_data: iterable of batches exposing `.src` and `.trg`.
        optimizer: wrapper providing `zero_grad()` and `step_and_update_lr()`.
        opt: options mapping; `opt["src_pad_idx"]` and `opt["trg_pad_idx"]` are read.
        device: device the batch tensors are moved to.
        smoothing: whether label smoothing is applied to the training loss.

    Returns:
        (loss_per_word, accuracy), both normalised by the number of
        non-padding target positions seen during the epoch.
    '''

    model.train()
    total_loss, n_word_total, n_word_correct = 0, 0, 0

    # The options are subscripted everywhere else in this notebook
    # (opt["epoch"], opt["src_pad_idx"] = 0, ...), so read them the same way here;
    # attribute access (opt.src_pad_idx) fails on a plain mapping.
    src_pad_idx = opt["src_pad_idx"]
    trg_pad_idx = opt["trg_pad_idx"]

    desc = '  - (Training)   '
    for batch in tqdm(training_data, mininterval=2, desc=desc, leave=False):

        # prepare data
        src_seq = patch_src(batch.src, src_pad_idx).to(device)
        trg_seq, gold = map(lambda x: x.to(device), patch_trg(batch.trg, trg_pad_idx))

        # forward
        optimizer.zero_grad()
        pred = model(src_seq, trg_seq)

        # backward and update parameters
        loss, n_correct, n_word = cal_performance(
            pred, gold, trg_pad_idx, smoothing=smoothing)
        loss.backward()
        optimizer.step_and_update_lr()

        # note keeping
        n_word_total += n_word
        n_word_correct += n_correct
        total_loss += loss.item()

    loss_per_word = total_loss/n_word_total
    accuracy = n_word_correct/n_word_total
    return loss_per_word, accuracy

In [None]:
def eval_epoch(model, validation_data, device, opt):
    ''' Epoch operation in evaluation phase.

    Runs one pass over `validation_data` with gradients disabled and without
    label smoothing.

    Args:
        model: called as `model(src_seq, trg_seq)`.
        validation_data: iterable of batches exposing `.src` and `.trg`.
        device: device the batch tensors are moved to.
        opt: options mapping; `opt["src_pad_idx"]` and `opt["trg_pad_idx"]` are read.

    Returns:
        (loss_per_word, accuracy), both normalised by the number of
        non-padding target positions seen during the pass.
    '''

    model.eval()
    total_loss, n_word_total, n_word_correct = 0, 0, 0

    # The options are subscripted everywhere else in this notebook, so read them
    # the same way here; attribute access (opt.src_pad_idx) fails on a plain mapping.
    src_pad_idx = opt["src_pad_idx"]
    trg_pad_idx = opt["trg_pad_idx"]

    desc = '  - (Validation) '
    with torch.no_grad():
        for batch in tqdm(validation_data, mininterval=2, desc=desc, leave=False):

            # prepare data
            src_seq = patch_src(batch.src, src_pad_idx).to(device)
            trg_seq, gold = map(lambda x: x.to(device), patch_trg(batch.trg, trg_pad_idx))

            # forward
            pred = model(src_seq, trg_seq)
            loss, n_correct, n_word = cal_performance(
                pred, gold, trg_pad_idx, smoothing=False)

            # note keeping
            n_word_total += n_word
            n_word_correct += n_correct
            total_loss += loss.item()

    loss_per_word = total_loss/n_word_total
    accuracy = n_word_correct/n_word_total
    return loss_per_word, accuracy

In [None]:
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Start training.

    For every epoch: train, validate, write a checkpoint according to
    `opt["save_mode"]`, append a row to the CSV logs and, when `opt["use_tb"]`
    is set, push the scalars to TensorBoard.

    Args:
        model: the network to train.
        training_data: batches passed to `train_epoch`.
        validation_data: batches passed to `eval_epoch`.
        optimizer: scheduled optimizer wrapper; the wrapped optimizer is read via
            `optimizer._optimizer` to report the current learning rate.
        device: device the batches are moved to.
        opt: options mapping; reads "output_dir", "epoch", "label_smoothing",
            "save_mode" and "use_tb".
    '''

    # Use wandb to plot curves, e.g. perplexity, accuracy, learning rate
    # TODO: Implement this

    # The TensorBoard writer was used below without ever being created.
    # Imported lazily so TensorBoard is only required when it is enabled.
    tb_writer = None
    if opt["use_tb"]:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(log_dir=os.path.join(opt["output_dir"], 'tensorboard'))

    log_train_file = os.path.join(opt["output_dir"], 'train.log')
    log_valid_file = os.path.join(opt["output_dir"], 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.format(log_train_file, log_valid_file))

    # Start both logs from scratch with a CSV header.
    with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, accu, start_time, lr):
        print('  - {header:12} , accuracy: {accu:3.3f} %, lr: {lr:8.5f}, elapse: {elapse:3.3f} min'.format(
                  header=f"({header})",accu=100*accu, elapse=(time.time()-start_time)/60, lr=lr))

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(opt["epoch"]):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(
            model, training_data, optimizer, opt, device, smoothing=opt["label_smoothing"])
        # Perplexity was logged below but never computed for the training pass.
        # The exponent is capped at 100, as for validation, to avoid overflow.
        train_ppl = math.exp(min(train_loss, 100))

        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']
        print_performances('Training', train_accu, start, lr)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt)
        valid_ppl = math.exp(min(valid_loss, 100))
        # print_performances takes (header, accu, start_time, lr); the extra
        # perplexity argument passed here before raised a TypeError.
        print_performances('Validation', valid_accu, start, lr)

        valid_losses.append(valid_loss)

        checkpoint = {'epoch': epoch_i, 'settings': opt, 'model': model.state_dict()}

        if opt["save_mode"] == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100*valid_accu)
            # Keep every checkpoint inside the output directory, like the 'best' mode does.
            torch.save(checkpoint, os.path.join(opt["output_dir"], model_name))
        elif opt["save_mode"] == 'best':
            model_name = 'model.chkpt'
            # valid_losses already contains the current loss, so this is true
            # whenever the current epoch is at least as good as every earlier one.
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt["output_dir"], model_name))
                print('    - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
            log_tf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=train_loss,
                ppl=train_ppl, accu=100*train_accu))
            log_vf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=valid_loss,
                ppl=valid_ppl, accu=100*valid_accu))

        if tb_writer is not None:
            tb_writer.add_scalars('ppl', {'train': train_ppl, 'val': valid_ppl}, epoch_i)
            tb_writer.add_scalars('accuracy', {'train': train_accu*100, 'val': valid_accu*100}, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)

    # Flush pending events and release the event file.
    if tb_writer is not None:
        tb_writer.close()


--------------------------------------------------------------------------------------------------------------------------------------------------------------------

# New Code 

In [132]:
def train_epoch(model, train_dataloader, optimizer, opt, device):
    """Run the forward half of one training epoch (work in progress).

    Puts the model in training mode, then for every `(data, labels)` batch moves
    both tensors to `device`, clears the gradients and calls `model(data, labels)`.
    No loss is computed and no optimizer step is taken yet, and nothing is
    returned. `opt` is currently unused.
    """
    # Set the model into training mode
    model.train()

    progress = tqdm(train_dataloader, mininterval=2, desc='  - (Training)   ', leave=False)
    for batch_inputs, batch_targets in progress:

        # TODO: prepare data
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # TODO: forward pass
        optimizer.zero_grad()
        pred = model(batch_inputs, batch_targets)

        # TODO: backward pass
    
    

In [133]:
def train(model, train_dataloader, validation_data, optimizer, device, opt):
    """Drive the epoch loop (work in progress).

    Prints a header for each of the `opt["epoch"]` epochs and calls `train_epoch`
    once per epoch. `validation_data` is not used yet because evaluation is still
    to be implemented, and the nested `print_performance` helper is defined but
    not called yet.
    """
    # TODO: Implement wandb connection

    # TODO: Implement logging

    def print_performance(header, accu, start_time, lr):
        elapsed_min = (time.time()-start_time)/60
        print('  - {header:12} , accuracy: {accu:3.3f} %, lr: {lr:8.5f}, elapse: {elapse:3.3f} min'.format(
                  header=f"({header})", accu=100*accu, elapse=elapsed_min, lr=lr))

    # TODO: Training Epoch Loop
    n_epochs = opt["epoch"]
    for epoch_i in range(n_epochs):
        print("[Epoch: {:>3}]".format(epoch_i))

        # TODO: write function: train_epoch()
        train_epoch(model, train_dataloader, optimizer, opt, device)

        # TODO: write function: eval_epoch()
        #eval_epoch()
    
    

In [150]:
update_options(opt)
# Validate on the held-out loader rather than on the training loader:
# passing `train_dataloader` twice would report validation metrics on the
# very data the model was fitted to.
train(transformer, train_dataloader, test_dataloader, optimizer, device, opt)

[Epoch:   0]


                                                                                                                                                                                   

RuntimeError: The size of tensor a (32) must match the size of tensor b (512) at non-singleton dimension 1

In [25]:

# Shape probe: push a single batch through a stand-alone Linear(27 -> 512) layer
# to see how a linear projection treats the dataloader output.
linear_layer = nn.Linear(27, 512)

# NOTE(review): `desc` is assigned but not used below; the progress bar gets a
# literal description instead.
desc = '  - (Training)   '
for data, labels in tqdm(train_dataloader, mininterval=2, desc="Training: ", leave=False):
    
    print("Data:        ",data.shape)
    print("Layer shape: ",linear_layer.weight.size())
    # The layer acts on the last dimension only; leading dimensions are kept.
    output = linear_layer(data)
    print("Output:      ",output.shape)
    
    # One batch is enough for a shape check.
    break




                                                                                                                                                                                   

Data:         torch.Size([32, 10, 27])
Layer shape:  torch.Size([512, 27])
Output:       torch.Size([32, 10, 512])




In [91]:
# Use the defined transformer: run a single forward pass on one batch as a smoke
# test, without computing a loss or stepping the optimizer.

transformer.train()
# NOTE(review): `desc` is assigned but not used below; the progress bar gets a
# literal description instead.
desc = '  - (Training)   '
for data, labels in tqdm(train_dataloader, mininterval=2, desc="Training: ", leave=False):
        
    #TODO: prepare data
    data = data.to(device)
    labels = labels.to(device)
    
    #TODO: forward pass
    #optimizer.zero_grad()
    pred = transformer(data, labels)
    # Stop after the first batch.
    break


                                                                                                                                                                                   

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/yannic/anaconda3/envs/jupyter-lab/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3369, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1562746/3941019311.py", line 13, in <cell line: 5>
    pred = transformer(data, labels)
  File "/home/yannic/anaconda3/envs/jupyter-lab/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yannic/master-thesis/models/tstransformer/Models.py", line 338, in forward
    enc_output, *_ = self.encoder(src_seq, src_mask)
  File "/home/yannic/anaconda3/envs/jupyter-lab/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/yannic/master-thesis/models/tstransformer/Models.py", line 139, in forward
    enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
  File "/ho

# DataLoader and Transformer

In [148]:
# Load Dataset
# Split without shuffling. train_test_split shuffles rows by default, which would
# scramble the order of the hourly records before prepare_dataloaders builds its
# windows (window_size=10 -- presumably over consecutive rows; verify in
# datautils.data) and would interleave test rows with training rows.
# With shuffle=False the last 10% of the rows form the test split.
train_split, test_split = train_test_split(aq_prep, test_size=0.10, shuffle=False)

train_dataset, train_dataloader = prepare_dataloaders(train_split.values, opt["batch_size"],window_size=10)
test_dataset, test_dataloader = prepare_dataloaders(test_split.values,  opt["batch_size"],window_size=10)

# NOTE(review): 27 is hard-coded; it should equal the feature width of the
# batches produced by the dataloaders -- confirm whenever the preprocessing changes.
opt["src_sequence_size"] = 27
opt["trg_sequence_size"] = 27
opt["src_pad_idx"] = 0
opt["trg_pad_idx"] = 0

# Define Transformer
transformer = Transformer(
        opt["src_sequence_size"],
        opt["trg_sequence_size"],
        src_pad_idx=opt["src_pad_idx"],
        trg_pad_idx=opt["trg_pad_idx"],
        trg_emb_prj_weight_sharing=opt["proj_share_weight"],
        emb_src_trg_weight_sharing=opt["embs_share_weight"],
        d_k=opt["d_key"],
        d_v=opt["d_value"],
        d_model=opt["d_model"],
        d_sequence_vec=opt["d_sequence"],
        d_inner=opt["d_inner_hidden"],
        n_layers=opt["n_layers"],
        n_head=opt["n_head"],
        dropout=opt["dropout"],
        n_position=opt["d_sequence"],
        scale_emb_or_prj=opt["scale_emb_or_prj"]).to(device)

# Define Optimizer
# Adam wrapped in the warm-up learning-rate schedule; the schedule is configured
# from lr_mul, d_model and n_warmup_steps.
optimizer = ScheduledOptim(
                optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
                opt["lr_mul"], opt["d_model"], opt["n_warmup_steps"])

# Start the training process
# train(transformer, train_dataloader, train_dataloader, optimizer, device, opt)

In [149]:
# Display the module tree to check the layer layout and dimensions.
transformer

Transformer(
  (encoder): Encoder(
    (linear_emb): Linear(in_features=27, out_features=512, bias=True)
    (position_enc): PositionalEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layer_stack): ModuleList(
      (0): EncoderLayer(
        (slf_attn): MultiHeadAttention(
          (w_qs): Linear(in_features=512, out_features=512, bias=False)
          (w_ks): Linear(in_features=512, out_features=512, bias=False)
          (w_vs): Linear(in_features=512, out_features=512, bias=False)
          (fc): Linear(in_features=512, out_features=512, bias=False)
          (attention): ScaledDotProductAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (pos_ffn): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512