In [15]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Utilities 
from options.options import is_notebook, get_options, update_options, print_options
import argparse
import math
import time
import random
import os
from tqdm import tqdm

# Data
import numpy as np
from sklearn.model_selection import train_test_split
from datautils.ts_dataset import TSDataset
from datautils.data import prepare_dataloaders
import pandas as pd

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Transformer
#import tstransformer.Constants as Constants
from tstransformer.Models import Transformer
from tstransformer.Optim import ScheduledOptim

%autoreload 2

In [17]:
# Helper function to determine GPU memory usage
def get_gpu_memory(name = "GPU Memory", verbose = True, device_index = 0):
    """Query NVML for the memory state of one GPU and optionally print it.

    Args:
        name: Label printed in the report header.
        verbose: When True, print the total/free/used figures.
        device_index: NVML index of the card to query. Defaults to 0, the
            card that was previously hard-coded.

    Returns:
        dict with the keys "total", "free" and "used"; each value is the
        byte count reported by NVML divided by 1024 * 1024 (i.e. MB).
    """
    # Local import: nvidia_smi is only needed by this helper.
    import nvidia_smi

    bytes_per_mb = 1024 * 1024

    nvidia_smi.nvmlInit()
    try:
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_index)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        memory = {
            "total": info.total / bytes_per_mb,
            "free": info.free / bytes_per_mb,
            "used": info.used / bytes_per_mb,
        }
    finally:
        # Always release NVML, even when the query above raised.
        nvidia_smi.nvmlShutdown()

    if verbose:
        print("#### ", name , "####")
        print("Total memory:", memory["total"], "MB")
        print("Free memory: ", memory["free"], "MB")
        print("Used memory: ", memory["used"], "MB")
        print("")

    return memory

# Report GPU memory, ask PyTorch to release its cached CUDA memory, then report again.
# The return value is bound to `_` so the notebook does not echo the returned dict.
_=get_gpu_memory("Before Empty Cache")
torch.cuda.empty_cache()
_=get_gpu_memory("After Empty Cache")

####  Before Empty Cache ####
Total memory: 12288.0 MB
Free memory:  10239.1875 MB
Used memory:  2048.8125 MB

####  After Empty Cache ####
Total memory: 12288.0 MB
Free memory:  10349.1875 MB
Used memory:  1938.8125 MB



In [71]:
# Fetch the experiment options and print them; `opt` is indexed by string key
# (e.g. opt["seed"], opt["output_dir"]) in the cells that follow.
opt = get_options()
print_options(opt)

[93m[1m           train_path [0m:  /home/yannic/master-thesis/data_air/prsa_data.parquet
[93m[1m             val_path [0m:  /home/yannic/master-thesis/data_air/prsa_data.parquet
[93m[1m                epoch [0m:  10
[93m[1m           batch_size [0m:  64
[93m[1m              d_model [0m:  512
[93m[1m       d_inner_hidden [0m:  2048
[93m[1m                d_key [0m:  512
[93m[1m              d_value [0m:  512
[93m[1m           d_sequence [0m:  512
[93m[1m               n_head [0m:  8
[93m[1m             n_layers [0m:  6
[93m[1m       n_warmup_steps [0m:  4000
[93m[1m               lr_mul [0m:  2.0
[93m[1m            loss_func [0m:  rmse
[93m[1m                 seed [0m:  False
[93m[1m              dropout [0m:  0.1
[93m[1m    embs_share_weight [0m:  False
[93m[1m    proj_share_weight [0m:  False
[93m[1m     scale_emb_or_prj [0m:  prj
[93m[1m           output_dir [0m:  ./output
[93m[1m               use_tb [0m:  False
[93m[1

In [19]:
# https://pytorch.org/docs/stable/notes/randomness.html
# For reproducibility
# The printed options show `seed: False` when no seed is configured, so both
# None and False mean "do not seed". Checking only `is not None` would silently
# seed every generator with False (== 0). An explicit seed of 0 is still honoured.
if opt["seed"] is not None and opt["seed"] is not False:
    torch.manual_seed(opt["seed"])
    torch.backends.cudnn.benchmark = False
    # torch.set_deterministic(True)
    np.random.seed(opt["seed"])
    random.seed(opt["seed"])

In [20]:
# Abort early when no output directory is configured.
if not opt["output_dir"]:
    print('No experiment result will be saved.')
    # A bare `raise` outside an `except` block only produces
    # "RuntimeError: No active exception to reraise"; raise the same exception
    # type with a message that names the actual problem.
    raise RuntimeError('opt["output_dir"] is empty; configure an output directory before training.')

In [21]:
# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(opt["output_dir"], exist_ok=True)

In [22]:
# Define the CUDA device; fall back to the CPU when CUDA was requested but is
# not available on this machine.
device = torch.device('cuda' if opt["cuda"] and torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [76]:
# Read the raw parquet file, then derive the model input frame from it
aq = pd.read_parquet("/home/yannic/master-thesis/data_air/prsa_data.parquet")

# With Stations
#aq_prep = pd.concat([aq,pd.get_dummies(aq['station'], prefix='station',dummy_na=False)],axis=1).drop(['station'],axis=1).drop(columns=["wind_direction"])

# Without Stations: drop both columns, skip the first four remaining columns,
# replace missing values with 0 and keep the first 300 rows
aq_prep = (
    aq.drop(columns=["station", "wind_direction"])
    .iloc[:, 4:]
    .fillna(0)
    .iloc[:300]
)
aq_prep.tail()

Unnamed: 0,pm25,pm10,so2,no2,co,o3,temperatur,pressure,dew_point,rain,wind_speed
295,14.0,14.0,8.0,61.0,800.0,47.0,3.7,1021.5,-10.4,0.0,1.6
296,18.0,36.0,12.0,62.0,1000.0,47.0,4.9,1022.7,-11.7,0.0,2.2
297,24.0,37.0,17.0,45.0,900.0,47.0,6.1,1023.0,-11.1,0.0,1.4
298,15.0,15.0,13.0,23.0,800.0,69.0,6.5,1022.9,-12.7,0.0,1.7
299,14.0,6.0,9.0,19.0,500.0,75.0,6.8,1022.2,-12.9,0.0,3.4


# DataLoader and Transformer

# Old Functions (superseded by the definitions in the "New Code" section below)

In [24]:
def cal_performance(pred, gold, trg_pad_idx, smoothing=False):
    ''' Compute the loss plus the number of correct and total non-pad targets.

    The loss comes from cal_loss (optionally label-smoothed). A prediction is
    the index of the largest value along dim 1 of `pred`; positions whose
    target equals `trg_pad_idx` are excluded from both counts.

    Returns:
        (loss, n_correct, n_word)
    '''

    loss = cal_loss(pred, gold, trg_pad_idx, smoothing=smoothing)

    _, predicted = pred.max(1)
    flat_gold = gold.contiguous().view(-1)
    keep = flat_gold.ne(trg_pad_idx)
    hits = predicted.eq(flat_gold).masked_select(keep)

    return loss, hits.sum().item(), keep.sum().item()

In [25]:
def cal_loss(pred, gold, trg_pad_idx, smoothing=False):
    ''' Summed cross entropy over non-pad targets, optionally label-smoothed.

    With smoothing, the true class gets weight 1 - eps and the remaining eps
    is spread evenly over the other classes (eps = 0.1). The result is a sum,
    not a mean; the caller averages later.
    '''

    targets = gold.contiguous().view(-1)

    if not smoothing:
        return F.cross_entropy(pred, targets, ignore_index=trg_pad_idx, reduction='sum')

    eps = 0.1
    n_class = pred.size(1)

    hot = torch.zeros_like(pred).scatter(1, targets.view(-1, 1), 1)
    smoothed = hot * (1 - eps) + (1 - hot) * eps / (n_class - 1)
    log_prb = F.log_softmax(pred, dim=1)

    per_token = -(smoothed * log_prb).sum(dim=1)
    keep = targets.ne(trg_pad_idx)
    return per_token.masked_select(keep).sum()  # average later

In [26]:
def patch_src(src, pad_idx):
    ''' Swap the first two dimensions of `src`; `pad_idx` is accepted but unused. '''
    return src.transpose(0, 1)

In [27]:
def patch_trg(trg, pad_idx):
    ''' Build the decoder input and the flattened gold targets from `trg`.

    After swapping the first two dimensions, the decoder input drops the last
    position along dim 1 and the gold targets drop the first one (flattened to
    1-D). `pad_idx` is accepted but unused.
    '''
    swapped = trg.transpose(0, 1)
    decoder_in = swapped[:, :-1]
    gold = swapped[:, 1:].contiguous().view(-1)
    return decoder_in, gold

In [28]:
def train_epoch(model, training_data, optimizer, opt, device, smoothing):
    ''' Run one training pass over `training_data`.

    Note that `opt` is read through attributes here (opt.src_pad_idx,
    opt.trg_pad_idx), and that a later cell in this notebook defines another
    function with the same name.

    Returns:
        (loss_per_word, accuracy), both averaged over non-pad target tokens.
    '''

    model.train()
    loss_sum = 0
    words_seen = 0
    words_right = 0

    progress = tqdm(training_data, mininterval=2, desc='  - (Training)   ', leave=False)
    for batch in progress:

        # move source and target tensors to the device
        src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
        trg_seq, gold = (t.to(device) for t in patch_trg(batch.trg, opt.trg_pad_idx))

        # forward pass
        optimizer.zero_grad()
        pred = model(src_seq, trg_seq)

        # backward pass and parameter / learning-rate update
        loss, n_correct, n_word = cal_performance(
            pred, gold, opt.trg_pad_idx, smoothing=smoothing)
        loss.backward()
        optimizer.step_and_update_lr()

        # bookkeeping
        words_seen += n_word
        words_right += n_correct
        loss_sum += loss.item()

    return loss_sum/words_seen, words_right/words_seen

In [29]:
def eval_epoch(model, validation_data, device, opt):
    ''' Run one evaluation pass over `validation_data` without gradients.

    `opt` is read through attributes here (opt.src_pad_idx, opt.trg_pad_idx).

    Returns:
        (loss_per_word, accuracy), both averaged over non-pad target tokens.
    '''

    model.eval()
    loss_sum = 0
    words_seen = 0
    words_right = 0

    with torch.no_grad():
        progress = tqdm(validation_data, mininterval=2, desc='  - (Validation) ', leave=False)
        for batch in progress:

            # move source and target tensors to the device
            src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
            trg_seq, gold = (t.to(device) for t in patch_trg(batch.trg, opt.trg_pad_idx))

            # forward pass only; label smoothing is never applied here
            pred = model(src_seq, trg_seq)
            loss, n_correct, n_word = cal_performance(
                pred, gold, opt.trg_pad_idx, smoothing=False)

            # bookkeeping
            words_seen += n_word
            words_right += n_correct
            loss_sum += loss.item()

    return loss_sum/words_seen, words_right/words_seen

In [30]:
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Run the training/validation loop, write CSV logs and save checkpoints.

    Args:
        model: Network to train; its state_dict is stored in each checkpoint.
        training_data: Iterable of batches passed to train_epoch.
        validation_data: Iterable of batches passed to eval_epoch.
        optimizer: Scheduled optimizer; the wrapped optimizer is read through
            `optimizer._optimizer` to report the current learning rate.
        device: Device forwarded to train_epoch / eval_epoch.
        opt: Options indexed by key. Reads "output_dir", "epoch",
            "label_smoothing", "save_mode" and "use_tb".
    '''

    # Use wandb to plot curves, e.g. perplexity, accuracy, learning rate
    # TODO: Implement this

    log_train_file = os.path.join(opt["output_dir"], 'train.log')
    log_valid_file = os.path.join(opt["output_dir"], 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.format(log_train_file, log_valid_file))

    with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    # The TensorBoard writer was referenced below but never created.
    tb_writer = None
    if opt["use_tb"]:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(log_dir=os.path.join(opt["output_dir"], 'tensorboard'))

    def print_performances(header, accu, start_time, lr):
        print('  - {header:12} , accuracy: {accu:3.3f} %, lr: {lr:8.5f}, ''elapse: {elapse:3.3f} min'.format(
                  header=f"({header})",accu=100*accu, elapse=(time.time()-start_time)/60, lr=lr))

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(opt["epoch"]):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(
            model, training_data, optimizer, opt, device, smoothing=opt["label_smoothing"])
        # Perplexity is logged below, so it has to be computed for the training
        # pass as well; the exponent is capped at 100 exactly as for validation.
        train_ppl = math.exp(min(train_loss, 100))

        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']
        print_performances('Training', train_accu, start, lr)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt)
        valid_ppl = math.exp(min(valid_loss, 100))
        # print_performances takes (header, accu, start_time, lr); passing the
        # perplexity as an extra positional argument raised a TypeError.
        print_performances('Validation', valid_accu, start, lr)

        valid_losses += [valid_loss]

        checkpoint = {'epoch': epoch_i, 'settings': opt, 'model': model.state_dict()}

        if opt["save_mode"] == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100*valid_accu)
            # Save next to the logs, consistent with the 'best' mode below.
            torch.save(checkpoint, os.path.join(opt["output_dir"], model_name))
        elif opt["save_mode"] == 'best':
            model_name = 'model.chkpt'
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt["output_dir"], model_name))
                print('    - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
            log_tf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=train_loss,
                ppl=train_ppl, accu=100*train_accu))
            log_vf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=valid_loss,
                ppl=valid_ppl, accu=100*valid_accu))

        if tb_writer is not None:
            tb_writer.add_scalars('ppl', {'train': train_ppl, 'val': valid_ppl}, epoch_i)
            tb_writer.add_scalars('accuracy', {'train': train_accu*100, 'val': valid_accu*100}, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)

    if tb_writer is not None:
        # Flush pending events and release the event file.
        tb_writer.close()


--------------------------------------------------------------------------------------------------------------------------------------------------------------------

# New Code 

In [64]:
def calc_loss(pred, target, lossfn="rmse"):
    """Compute the regression loss between predictions and targets.

    Args:
        pred: Predictions, compared against the flattened target.
        target: Targets; every dimension from dim 1 onwards is flattened so
            the shape matches the predictions.
        lossfn: Name of the loss, either "rmse" or "mse".

    Returns:
        The loss tensor returned by rmse_loss or mse_loss.

    Raises:
        ValueError: If `lossfn` is not a supported name. Previously this fell
            through and failed with an UnboundLocalError on `loss`.
    """

    # flatten targets in order to match predictions
    target = target.flatten(start_dim=1)

    #TODO: Check if NaNs are in target, and mask them out in the loss calculation

    # calc loss
    if lossfn == "rmse":
        return rmse_loss(pred, target)

    if lossfn == "mse":
        return mse_loss(pred, target)

    raise ValueError(
        "Unknown loss function {!r}; expected 'rmse' or 'mse'".format(lossfn))

def mse_loss(pred, target):
    """" Mean Squared Error loss function """
    # thin wrapper around the functional API, default reduction
    return F.mse_loss(pred, target)


def rmse_loss(pred, target):
    """ Root Mean Squared Error: square root of the mean squared error """
    mean_squared = F.mse_loss(pred, target)
    return mean_squared.sqrt()

In [81]:
def train_epoch(model, train_dataloader, optimizer, opt, device):
    """Train the model for one pass over `train_dataloader`.

    Args:
        model: Model called as `model(data, decoder_input)`.
        train_dataloader: Iterable yielding `(data, target)` pairs.
        optimizer: Scheduled optimizer exposing `zero_grad()` and
            `step_and_update_lr()`.
        opt: Options indexed by key; `opt["loss_func"]` selects the loss.
        device: Device the batches are moved to.

    Returns:
        List with one detached scalar loss tensor per batch.
    """

    # Set the model into training mode
    model.train()

    # Performance
    losses = []

    desc = '  - (Training)   '
    for data, target in tqdm(train_dataloader, mininterval=2, desc=desc, leave=False, ascii=" ▖▘▝▗▚▞█"): #ascii="░▒█"):

        # prepare data
        data = data.to(device)
        target = target.to(device)

        # Take the last step of the encoder input sequence to use as a starting point for the decoder
        decoder_input = data[:,-1,:].unsqueeze(dim=1)

        # forward pass
        optimizer.zero_grad()
        pred = model(data, decoder_input)

        # backward pass
        loss = calc_loss(pred, target, opt["loss_func"])
        loss.backward()
        optimizer.step_and_update_lr()

        # Store the loss detached from the autograd graph: keeping the tensor
        # with its grad_fn would keep every batch's history referenced until
        # the list is dropped. Callers can still use `.item()` on the entries.
        losses.append(loss.detach())

    return losses
    

In [82]:
def train(model, train_dataloader, validation_data, optimizer, device, opt):
    """Run the training loop for `opt["epoch"]` epochs.

    Args:
        model: Model forwarded to train_epoch.
        train_dataloader: Training batches forwarded to train_epoch.
        validation_data: Currently unused; reserved for the evaluation step
            that is still a TODO below.
        optimizer: Scheduled optimizer; the wrapped optimizer is read through
            `optimizer._optimizer` to report the current learning rate.
        device: Device forwarded to train_epoch.
        opt: Options indexed by key. Reads "epoch" and "loss_func".

    Returns:
        List with one float per epoch: the sum of that epoch's batch losses.
        (Previously this list was built and then discarded.)
    """

    # TODO: Implement wandb connection

    # TODO: Implement logging
    epoch_losses = []

    def print_performance(header, loss_name, loss, start_time, lr):
        print("  - ({header:12}) | {loss_name}: {loss:3.3f} | lr: {lr:8.5f} | elapse: {elapse:3.3f} min".format(header=header,loss_name=loss_name, loss=loss, elapse=(time.time()-start_time)/60, lr=lr))

    # Training epoch loop
    for epoch_i in range(opt["epoch"]):
        print("[Epoch: {:>3}]".format(epoch_i))

        # trains one epoch
        start = time.time()
        losses = train_epoch(model,train_dataloader,optimizer,opt,device)

        # calculate epoch loss (sum over batches) and add to logging
        total_epoch_loss = sum(loss.item() for loss in losses)
        epoch_losses.append(total_epoch_loss)

        lr = optimizer._optimizer.param_groups[0]['lr']

        print_performance('Training',opt["loss_func"], total_epoch_loss, start, lr)

        #TODO: write function: eval_epoch()
        #eval_epoch()

    return epoch_losses
    

In [83]:
# NOTE(review): `transformer`, `train_dataloader` and `optimizer` are created in a
# cell that appears further down in this notebook, so this call only works after
# that cell has been executed. The training loader is also passed as the
# validation data here.
train(transformer,train_dataloader,train_dataloader,optimizer,device,opt)

[Epoch:   0]


                                                                                                                                                                                   

[tensor(845.5779, device='cuda:0', grad_fn=<SqrtBackward0>), tensor(800.0784, device='cuda:0', grad_fn=<SqrtBackward0>), tensor(740.5040, device='cuda:0', grad_fn=<SqrtBackward0>), tensor(776.2935, device='cuda:0', grad_fn=<SqrtBackward0>), tensor(905.9479, device='cuda:0', grad_fn=<SqrtBackward0>)]
<class 'list'>
<class 'list'>
<class 'torch.Tensor'>




KeyError: 'mse'

In [80]:
# Load Dataset
opt = get_options()

# shuffle=False keeps the rows in their original order. train_test_split
# shuffles by default, which would scramble the time series before it is cut
# into windows (window_size suggests prepare_dataloaders builds windows over
# consecutive rows — TODO confirm).
train_split, test_split = train_test_split(aq_prep, test_size=0.10, shuffle=False)

train_dataset, train_dataloader = prepare_dataloaders(train_split.values, opt["batch_size"],window_size=10)
test_dataset, test_dataloader = prepare_dataloaders(test_split.values,  opt["batch_size"],window_size=10)

# NOTE(review): 11 equals the number of columns shown for aq_prep above;
# presumably these are the per-step feature counts — confirm against Transformer.
opt["src_sequence_size"] = 11
opt["trg_sequence_size"] = 11
opt["src_pad_idx"] = 0
opt["trg_pad_idx"] = 0

# Define Transformer
transformer = Transformer(
        n_src_sequence=opt["src_sequence_size"],
        n_trg_sequence=opt["trg_sequence_size"],
        src_pad_idx=opt["src_pad_idx"],
        trg_pad_idx=opt["trg_pad_idx"],
        trg_emb_prj_weight_sharing=opt["proj_share_weight"],
        emb_src_trg_weight_sharing=opt["embs_share_weight"],
        d_k=opt["d_key"],
        d_v=opt["d_value"],
        d_model=opt["d_model"],
        d_sequence_vec=opt["d_sequence"],
        d_inner=opt["d_inner_hidden"],
        n_layers=opt["n_layers"],
        n_head=opt["n_head"],
        dropout=opt["dropout"],
        n_position=opt["d_sequence"],
        scale_emb_or_prj=opt["scale_emb_or_prj"]).to(device)

# Define Optimizer
optimizer = ScheduledOptim(
                optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
                opt["lr_mul"], opt["d_model"], opt["n_warmup_steps"])

# Start the training process; the held-out loader is passed as validation data
# (the training loader was passed twice before).
train(transformer, train_dataloader, test_dataloader, optimizer, device, opt)

[Epoch:   0]


                                                                                                                                                                                   

<class 'float'>




TypeError: 'float' object is not subscriptable

## Testing and Stuff