In [1]:
%load_ext autoreload

In [2]:
# Utilities 
from options.options import is_notebook, get_options, update_options, print_options
import argparse
import math
import time
import random
import os
from tqdm.notebook import tqdm, trange
import sys

# Data
import numpy as np
from sklearn.model_selection import train_test_split
from data_utils.ts_dataset import TSDataset
from data_utils.data_utils import prepare_dataloaders
import pandas as pd

from preprocessor.preprocessor import PreprocessingPipeline

# Training Utils
from train_utils.train_utils import *
import wandb

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Transformer
from tstransformer.Models import Transformer
from tstransformer.Optim import ScheduledOptim

%autoreload 2

In [3]:
# Helper function to determin GPU usage
def get_gpu_memory(name = "GPU Memory", verbose = True):
    import nvidia_smi
    nvidia_smi.nvmlInit()

    
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    
    memory = {"total":  info.total / (1024 * 1024), "free": info.free / (1024 * 1024), "used": info.used / (1024 * 1024)}
    
    if verbose:
        print("#### ", name , "####")
        print("Total memory:", memory["total"], "MB")
        print("Free memory: ", memory["free"], "MB")
        print("Used memory: ", memory["used"], "MB")
        print("")
    
    nvidia_smi.nvmlShutdown()
    
    return memory

#_=get_gpu_memory("Before Empty Cache")
torch.cuda.empty_cache()
_=get_gpu_memory("After Empty Cache")

####  After Empty Cache ####
Total memory: 12288.0 MB
Free memory:  10885.1875 MB
Used memory:  1402.8125 MB



In [4]:
opt = get_options()
print_options(opt)

[93m[1m train_path [0m:  /home/yannic/master-thesis/data_air/prsa_data.parquet
[93m[1m val_path [0m:  /home/yannic/master-thesis/data_air/prsa_data.parquet
[93m[1m src_sequence_size [0m:  11
[93m[1m trg_sequence_size [0m:  11
[93m[1m src_pad_idx [0m:  0
[93m[1m trg_pad_idx [0m:  0
[93m[1m nsamples [0m:  0.1
[93m[1m epoch [0m:  10
[93m[1m batch_size [0m:  64
[93m[1m window_size [0m:  256
[93m[1m loss_func [0m:  rmse
[93m[1m learning_rate [0m:  0.001
[93m[1m d_model [0m:  512
[93m[1m d_inner_hidden [0m:  2048
[93m[1m d_key [0m:  512
[93m[1m d_value [0m:  512
[93m[1m d_sequence [0m:  512
[93m[1m n_head [0m:  8
[93m[1m n_layers [0m:  6
[93m[1m lr_mul [0m:  2.0
[93m[1m seed [0m:  False
[93m[1m dropout [0m:  0.1
[93m[1m embs_share_weight [0m:  False
[93m[1m proj_share_weight [0m:  False
[93m[1m scale_emb_or_prj [0m:  prj
[93m[1m output_dir [0m:  ./output
[93m[1m use_wandb [0m:  True
[93m[1m save_mode [0m:  be

# Data Preprocessing

In [5]:
# Load the dataset and preprocess it 
aq = pd.read_parquet("/home/yannic/master-thesis/data_air/prsa_data.parquet")

# With Stations
#aq_prep = pd.concat([aq,pd.get_dummies(aq['station'], prefix='station',dummy_na=False)],axis=1).drop(['station'],axis=1).drop(columns=["wind_direction"])

# Without Stations
aq_prepared = aq.drop(['station'],axis=1).drop(columns=["wind_direction"]).iloc[:,4:]


fill_type = 'ema_fast'           # OPTIONS: 'nan', 'median', 'locf', 'nocb', 'ema', 'ema_fast'
clip_quantile_value = 0.99   # OPTIONS: range(0.0 , 1.0)

preprocessing_pipe = PreprocessingPipeline(fill_type, clip_quantile_value)
aq_preprocessed = preprocessing_pipe.fit_transform(aq_prepared)
print("Preprocessed Shape:",aq_preprocessed.shape,"\n")

aq_preprocessed = aq_preprocessed.iloc[:,:]
print("Preprocessed Dataset:")
print("Nans : ",aq_preprocessed.isnull().sum().sum())
print("Shape: ",aq_preprocessed.shape)
aq_preprocessed.head()

Preprocessed Shape: (420768, 11) 

Preprocessed Dataset:
Nans :  0
Shape:  (420768, 11)


Unnamed: 0,pm25,pm10,so2,no2,co,o3,temperatur,pressure,dew_point,rain,wind_speed
0,0.257708,0.172894,0.479068,0.042473,0.398344,0.337877,-0.180366,-0.016343,-0.2049,1.52216,0.604078
1,0.118495,0.027765,0.392871,-0.148608,0.346235,0.405559,-0.292978,0.074749,-0.345709,1.413122,0.822465
2,-0.020333,-0.106805,0.257591,-0.331794,0.282495,0.379272,-0.396448,0.159459,-0.476346,1.301458,1.052115
3,-0.152077,-0.23101,0.190721,-0.505281,0.109226,0.223226,-0.491079,0.250003,-0.602353,1.188858,1.084803
4,-0.280683,-0.354956,0.116248,-0.668109,0.043444,0.26834,-0.577962,0.336914,-0.715936,1.076822,1.181538


# DataLoader and Transformer

In [17]:
def train(model, train_dataloader, validation_datalaoder, optimizer, device, opt):
    
    if opt["use_wandb"]:
        wandb.init(project="masters-thesis", entity="yannicj", config=opt)
        #wandb.run.name = "test_run"
        #wandb.config = opt

    for epoch in range(opt["epoch"]):
        
        # Initiate running losses for epoch
        running_train_loss = 0
        train_losses = []
        running_val_loss = 0
        val_losses = []
        
        
        with tqdm(train_dataloader, unit="batch", mininterval=1, leave=True) as tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            
            # Train epoch
            for data, target in train_dataloader:
                tepoch.update(1)
                
                # Prepare data
                data = data.to(device)
                target = target.to(device)
                decoder_input = data[:,-1,:].unsqueeze(dim=1)
                
                # Trainig step
                optimizer.zero_grad()
                pred = model(data, decoder_input)

                loss = calc_loss(pred, target, opt["loss_func"])
                running_train_loss += loss.item()
                train_losses.append(loss.item())

                loss.backward()
                optimizer.step()
                
                # Update progress bar
                temp_mean_train_loss = sum(train_losses) / len(train_losses)
                tqdm_loss = "{:10.4f}".format(temp_mean_train_loss)
                tepoch.set_postfix(mean_train_loss=tqdm_loss)
            
            # Validate epoch
            running_val_loss, val_losses = validate_epoch(model, validation_datalaoder, device, opt)
            
            # Update Progress Bar
            mean_tain_loss = sum(train_losses) / len(train_losses)
            mean_val_loss = sum(val_losses) / len(val_losses)
            tqdm_train_loss = "{:10.4f}".format(mean_tain_loss)
            tqdm_val_losses = "{:10.4f}".format(mean_val_loss)
            
            tepoch.set_postfix({"loss": tqdm_train_loss, "val_loss": tqdm_val_losses})
            tepoch.close()
            
            if opt["use_wandb"]:
                wandb.log({"train_loss": mean_tain_loss, "val_loss": mean_val_loss})
                wandb.watch(model)
                
            # save model localy
            
    
    if opt["use_wandb"]:
        wandb.finish()

In [18]:
# define the cuda devide
device = torch.device('cuda' if opt["cuda"] else 'cpu')

# Load options
opt = get_options()

# create train, eval, test split
data = aq_preprocessed[:int(np.round(opt["nsamples"]*aq_preprocessed.shape[0]))]
train_split, test_split = train_test_split(data, test_size=0.15, shuffle=False)
train_split, eval_split = train_test_split(train_split, test_size=0.18, shuffle=False)

# Training dataset, Evaluation dataset in each epoch, Testing dataset after training
train_dataset, train_dataloader = prepare_dataloaders(train_split.values, opt["batch_size"],window_size=opt["window_size"], device=device)
eval_dataset, eval_dataloader = prepare_dataloaders(eval_split.values,  opt["batch_size"],window_size=opt["window_size"], device=device)
test_dataset, test_dataloader = prepare_dataloaders(test_split.values,  opt["batch_size"],window_size=opt["window_size"], device=device)

# Define Transformer
transformer = Transformer(
        n_src_sequence=opt["src_sequence_size"],
        n_trg_sequence=opt["trg_sequence_size"],
        src_pad_idx=opt["src_pad_idx"],
        trg_pad_idx=opt["trg_pad_idx"],
        trg_emb_prj_weight_sharing=opt["proj_share_weight"],
        emb_src_trg_weight_sharing=opt["embs_share_weight"],
        d_k=opt["d_key"],
        d_v=opt["d_value"],
        d_model=opt["d_model"],
        d_sequence_vec=opt["d_sequence"],
        d_inner=opt["d_inner_hidden"],
        n_layers=opt["n_layers"],
        n_head=opt["n_head"],
        dropout=opt["dropout"],
        n_position=opt["d_sequence"],
        scale_emb_or_prj=opt["scale_emb_or_prj"]).to(device)

# Define Optimizer
optimizer = optim.Adam(transformer.parameters(), lr= opt["learning_rate"], betas=(0.9, 0.98), eps=1e-09)

# print training settings
print_training_settings(device, train_split, eval_split, test_split, opt["loss_func"])

# Start the training process
train(transformer, train_dataloader, train_dataloader, optimizer, device, opt)

[95m[1m#-------------------------------------#[0m
[4m[1mTraining Settings:[0m
- [1mdevice:[0m cuda
- [1mtrain_size:[0m 0.7% (2932, 11)
- [1meval_size:[0m 0.15% (644, 11)
- [1mtest_size:[0m 0.15% (632, 11)
- [1mloss_fn:[0m rmse
[95m[1m#-------------------------------------#[0m


Epoch 0: 100%|███████████████████████████████████████████████████████| 42/42 [00:16<00:00,  2.54batch/s, loss=1.3605, val_loss=1.1960]
Epoch 1: 100%|███████████████████████████████████████████████████████| 42/42 [00:17<00:00,  2.38batch/s, loss=0.9562, val_loss=1.1185]
Epoch 2: 100%|███████████████████████████████████████████████████████| 42/42 [00:17<00:00,  2.46batch/s, loss=0.9139, val_loss=1.1415]
Epoch 3: 100%|███████████████████████████████████████████████████████| 42/42 [00:15<00:00,  2.69batch/s, loss=0.9169, val_loss=1.2298]
Epoch 4: 100%|███████████████████████████████████████████████████████| 42/42 [00:17<00:00,  2.37batch/s, loss=0.9574, val_loss=1.0665]
Epoch 5: 100%|███████████████████████████████████████████████████████| 42/42 [00:15<00:00,  2.73batch/s, loss=0.9394, val_loss=1.0295]
Epoch 6: 100%|███████████████████████████████████████████████████████| 42/42 [00:17<00:00,  2.39batch/s, loss=0.9071, val_loss=1.0359]
Epoch 7: 100%|█████████████████████████████████████████

VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_loss,█▂▁▁▂▂▁▁▁▁
val_loss,▇▄▅█▂▁▁▁▁▂

0,1
train_loss,0.8871
val_loss,1.04959


In [12]:
wandb.finish()

## Make Prediction

In [None]:

data = data.to(device)
target = target.to(device)
decoder_input = data[:,-1,:].unsqueeze(dim=1)

# Old Functions

In [None]:
def train_epoch(model, training_data, optimizer, opt, device, smoothing):
    ''' Epoch operation in training phase'''

    model.train()
    total_loss, n_word_total, n_word_correct = 0, 0, 0 

    desc = '  - (Training)   '
    for batch in tqdm(training_data, mininterval=2, desc=desc, leave=False):
        
        
        # prepare data
        src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
        trg_seq, gold = map(lambda x: x.to(device), patch_trg(batch.trg, opt.trg_pad_idx))

        # forward
        optimizer.zero_grad()
        pred = model(src_seq, trg_seq)

        # backward and update parameters
        loss, n_correct, n_word = cal_performance(
            pred, gold, opt.trg_pad_idx, smoothing=smoothing) 
        loss.backward()
        optimizer.step_and_update_lr()

        # note keeping
        n_word_total += n_word
        n_word_correct += n_correct
        total_loss += loss.item()

    loss_per_word = total_loss/n_word_total
    accuracy = n_word_correct/n_word_total
    return loss_per_word, accuracy

In [None]:
def eval_epoch(model, validation_data, device, opt):
    ''' Epoch operation in evaluation phase '''

    model.eval()
    total_loss, n_word_total, n_word_correct = 0, 0, 0

    desc = '  - (Validation) '
    with torch.no_grad():
        for batch in tqdm(validation_data, mininterval=2, desc=desc, leave=False):

            # prepare data
            src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
            trg_seq, gold = map(lambda x: x.to(device), patch_trg(batch.trg, opt.trg_pad_idx))

            # forward
            pred = model(src_seq, trg_seq)
            loss, n_correct, n_word = cal_performance(
                pred, gold, opt.trg_pad_idx, smoothing=False)

            # note keeping
            n_word_total += n_word
            n_word_correct += n_correct
            total_loss += loss.item()

    loss_per_word = total_loss/n_word_total
    accuracy = n_word_correct/n_word_total
    return loss_per_word, accuracy

In [None]:
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Start training '''
    
    # Use wandb to plot curves, e.g. perplexity, accuracy, learning rate
    # TODO: Implement this

    log_train_file = os.path.join(opt["output_dir"], 'train.log')
    log_valid_file = os.path.join(opt["output_dir"], 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.format(log_train_file, log_valid_file))

    with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, accu, start_time, lr):
        print('  - {header:12} , accuracy: {accu:3.3f} %, lr: {lr:8.5f}, ''elapse: {elapse:3.3f} min'.format(
                  header=f"({header})",accu=100*accu, elapse=(time.time()-start_time)/60, lr=lr))

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(opt["epoch"]):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(
            model, training_data, optimizer, opt, device, smoothing=opt["label_smoothing"])
        
        
        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']
        print_performances('Training', train_accu, start, lr)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt)
        valid_ppl = math.exp(min(valid_loss, 100))
        print_performances('Validation', valid_ppl, valid_accu, start, lr)

        valid_losses += [valid_loss]

        checkpoint = {'epoch': epoch_i, 'settings': opt, 'model': model.state_dict()}
        
        if opt["save_mode"] == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100*valid_accu)
            torch.save(checkpoint, model_name)
        elif opt["save_mode"] == 'best':
            model_name = 'model.chkpt'
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt["output_dir"], model_name))
                print('    - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
            log_tf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=train_loss,
                ppl=train_ppl, accu=100*train_accu))
            log_vf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                epoch=epoch_i, loss=valid_loss,
                ppl=valid_ppl, accu=100*valid_accu))

        if opt["use_tb"]:
            tb_writer.add_scalars('ppl', {'train': train_ppl, 'val': valid_ppl}, epoch_i)
            tb_writer.add_scalars('accuracy', {'train': train_accu*100, 'val': valid_accu*100}, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)


--------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Testing and Stuff

In [None]:
train_dataset, train_dataloader = prepare_dataloaders(aq_preprocessed.values, opt["batch_size"],window_size=10)

In [None]:
i = 1
j = 0
for data, label in train_dataloader:
    x = data[0]
    y = label[0]
    if j==i:
        break
    j+=1

x_df = pd.DataFrame(x.numpy(), columns=['pm25', 'pm10', 'so2', 'no2', 'co', 'o3', 'temperatur', 'pressure',
       'dew_point', 'rain', 'wind_speed'])

y_df = pd.DataFrame(y.numpy(), columns=['pm25', 'pm10', 'so2', 'no2', 'co', 'o3', 'temperatur', 'pressure',
       'dew_point', 'rain', 'wind_speed'])

sample = pd.concat([x_df, y_df]).reset_index().drop(columns=["index"]).round(2).astype(str)
real = aq_preprocessed[i+0:i+11].reset_index().drop(columns=["index"]).round(2).astype(str)

In [None]:
sample

In [None]:
real

In [None]:
(sample == real).any().any()

In [None]:
pd.DataFrame(train_dataset[3][0].numpy(), columns=['pm25', 'pm10', 'so2', 'no2', 'co', 'o3', 'temperatur', 'pressure',
       'dew_point', 'rain', 'wind_speed']).round(2)