<a href="https://colab.research.google.com/github/textnorms/date_text_norm/blob/master/V14_PL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! nvidia-smi

Thu Jun 18 02:30:44 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8     8W /  75W |      0MiB /  7611MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install pytorch-lightning --quiet

[K     |████████████████████████████████| 256kB 9.1MB/s 
[K     |████████████████████████████████| 829kB 14.7MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


# Imports

In [3]:
! rm -rf date*
! git clone https://github.com/textnorms/date_text_norm.git
! cp -r date_text_norm/syntetic_data/ .

! pip install -q num2words transformers
! pip install -q transformers

Cloning into 'date_text_norm'...
remote: Enumerating objects: 94, done.[K
remote: Counting objects: 100% (94/94), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 305 (delta 55), reused 58 (delta 28), pack-reused 211[K
Receiving objects: 100% (305/305), 1.46 MiB | 3.28 MiB/s, done.
Resolving deltas: 100% (173/173), done.
[K     |████████████████████████████████| 102kB 5.8MB/s 
[K     |████████████████████████████████| 675kB 16.8MB/s 
[K     |████████████████████████████████| 1.1MB 27.1MB/s 
[K     |████████████████████████████████| 890kB 52.2MB/s 
[K     |████████████████████████████████| 3.8MB 54.3MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [4]:
#Basics
from argparse import Namespace
import matplotlib.pyplot as plt
import random
import pandas as pd
import numpy as np

# Synthetic data generator
from syntetic_data import DateTextGenerator
from syntetic_data import RelativeDateTextGenerator

#Pytorch
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader

#Pytorch Lightning
import pytorch_lightning as pl

# Transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Sklearn
from sklearn.model_selection import train_test_split

# LightningBase



In [5]:
class LightningBase:

    '''
        hparams needs to contain:
            - lr (float)
            - optimizer (str)
            - optimizer_kwargs (Dict[str,Optional])
            - train_batch_size (int)
            - val_batch_size (int)
            - shuffle_train (bool)
            - num_workers (int)
        Properties needed:
            - train_dataset (Dataset)
            - val_dataset (Dataset)
            - test_dataset (Dataset)
    '''

    def _average_key(self, outputs, key):
        return torch.stack([o[key] for o in outputs]).float().mean()

    def get_dataloader(self, dataset, batch_size, shuffle, num_workers):
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers
        )

    def get_optimizer(self,):
        optimizer_name = self.hparams.optimizer
        lr = self.hparams.lr
        optimizer_hparams = self.hparams.optimizer_kwargs
        optimizer = getattr(torch.optim, optimizer_name)
        return optimizer(self.parameters(), lr=lr, **optimizer_hparams)

    def train_dataloader(self,):
        return self.get_dataloader(
            self.train_dataset,
            batch_size=self.hparams.train_batch_size,
            shuffle=self.hparams.shuffle_train,
            num_workers=self.hparams.num_workers
        )

    def val_dataloader(self,):
        return self.get_dataloader(
            self.valid_dataset,
            batch_size=self.hparams.val_batch_size,
            shuffle=False,
            num_workers=self.hparams.num_workers
        )

    def test_dataloader(self,):
        return self.get_dataloader(
            self.test_dataset,
            batch_size=self.hparams.val_batch_size,
            shuffle=False,
            num_workers=self.hparams.num_workers
        )

    def configure_optimizers(self):
        optimizer = self.get_optimizer()
        return optimizer

# Dataset

In [6]:
# Dataset params
LANGUAGE = 'pt'

# Model params
MODEL_SZ = 't5-small' # 't5-base'
TOK = T5Tokenizer.from_pretrained(MODEL_SZ)
MAX_LEN_SRC  = 48
MAX_LEN_TRGT = 12

# Train params
BATCH_SZ = 16
N_EPOCHS = 50
WINDOW   = 7

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [7]:
pd.set_option('display.max_rows',70)

print('Generating absolute and incomplete dates')
dates = DateTextGenerator(start_date='01/01/1921',
                          end_date='31/12/2120',
                          text_noise_rate=0.1,
                          language=LANGUAGE)

print('Generating relative dates')
rel_dates = RelativeDateTextGenerator(n_samples=500, text_noise_rate=0.0,
                                       max_noise_occurences_per_sample=3,
                                       samples_per_method=18,
                                       language=LANGUAGE)

df = dates.generate_date_dataset()
df = df.append(rel_dates.generate_date_dataset(),ignore_index=True)

Generating absolute and incomplete dates
Generating relative dates


In [8]:
df['Target Format'].value_counts()

DD/MM/YYYY    73049
Relative       9000
MM/YYYY        7200
DD/MM          2562
Name: Target Format, dtype: int64

In [9]:
# Removing dates in the defined Target Format
df = df.loc[df['Target Format'] == 'Relative']
print(f'# of patterns: {df["Input Pattern"].nunique()}')
df.head()

# of patterns: 36


Unnamed: 0,Input Pattern,Noise Type,Input,Target,Target Format
82811,1,,há 1 dia,-1d,Relative
82812,29,,ao primeiro mês,1m,Relative
82813,28,,ao primeiro dia,1d,Relative
82814,36,,um ano atrás,-1y,Relative
82815,14,,por 1 mês,1m,Relative


## Function to split the dataset

In [10]:
def split_data(df, test_size=0.2, verbose=True):
    l = list(set(df['Input Pattern'].values))
    num_test = int(len(l)*test_size)
    test_methods = [random.randint(1, len(l)) for _ in range(num_test)]
    print(test_methods)
    df_test = df[df['Input Pattern'].isin(test_methods)]
    print(df_test.shape)
    x_test = df_test.Input.values
    y_test = df_test.Target.values

    df_train = df[~df['Input Pattern'].isin(test_methods)]

    x_train, x_val, y_train, y_val = train_test_split(
        df_train.Input.values,
        df_train.Target.values,
        shuffle=True, 
        test_size=test_size,
        random_state=2357
        )
    if verbose:
        print(f'Date types of test set: {test_methods} with len: {len(test_methods)}')
        print(f'x_train: {len(x_train)}  --  y_train: {len(y_train)}\n\
x_val:   {len(x_val)}  --  y_val:   {len(y_val)}\n\
x_test:  {len(x_test)}  --  y_test:  {len(y_test)}')

    return x_train, y_train, x_val, y_val, x_test, y_test

# creating sets
x_train, y_train, x_val, y_val, x_test, y_test = split_data(df, 
                                                            test_size=0.25, 
                                                            verbose=True)

[10, 34, 30, 15, 8, 6, 36, 2, 8]
(2009, 5)
Date types of test set: [10, 34, 30, 15, 8, 6, 36, 2, 8] with len: 9
x_train: 5243  --  y_train: 5243
x_val:   1748  --  y_val:   1748
x_test:  2009  --  y_test:  2009


In [11]:
class DateDataset(Dataset):
    def __init__(self, data, label, tokenizer, source_max_length, target_max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.label = label
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        source = self.data[idx]
        target = self.label[idx]

        source_tokenized = self.tokenizer.encode_plus(
            f'{source} {self.tokenizer.eos_token}',
            max_length=self.source_max_length,
            pad_to_max_length=True,
            return_tensors='pt')

        target_tokenized = self.tokenizer.encode_plus(
            f'{target} {self.tokenizer.eos_token}',
            max_length=self.target_max_length,
            pad_to_max_length=True,
            return_tensors='pt')

        source_token_ids = source_tokenized['input_ids'].squeeze()
        source_mask = source_tokenized['attention_mask'].squeeze()
        target_token_ids = target_tokenized['input_ids'].squeeze()
        
        return source_token_ids, source_mask, target_token_ids

In [12]:
class DateBase(LightningBase):

    ## AS FUNÇÕES ABAIXO SÃO NECESSÁRIAS PARA O PYTORCH LIGHTNING ##
    
    ## O único método exigido nesta classe é prepare_data
    ## outros métodos podem auxiliares podem ser criados acima

    def prepare_data(self):
        '''
            O dataset deve ser preparado utilizando este método.
            Ao final da execução devemos ter declarado as seguinte variáveis
                - self.train_dataset
                - self.valid_dataset
                - self.test_dataset
        '''
        self.train_dataset = DateDataset(x_train, y_train, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
        self.valid_dataset = DateDataset(x_val, y_val, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
        self.test_dataset  = DateDataset(x_test, y_test, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)

# Model

In [13]:
class DateNet(torch.nn.Module):
    def __init__(self):
        super(DateNet, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_SZ)
    
    def forward(self, token_ids, att_mask, labels):
        outputs = self.model.forward(
            input_ids=token_ids, 
            attention_mask=att_mask,
            lm_labels=labels
            )
        return outputs[0]
    
    @torch.no_grad()    
    def generate(self, token_ids, att_mask, max_len_target):
        predict = self.model.generate(
            input_ids=token_ids, 
            attention_mask=att_mask,
            max_length=max_len_target
            )
        return predict
    
    @torch.no_grad()  
    def generate_example(self, text_input, tokenizer, max_len_source=MAX_LEN_SRC):

        self.model.eval()
        
        example_tokenized = tokenizer.encode_plus(
            f'{text_input} {tokenizer.eos_token}',
            max_length=max_len_source,
            pad_to_max_length=True,
            return_tensors='pt')
            
        example_token_ids = example_tokenized['input_ids']
        example_mask = example_tokenized['attention_mask']

        predicted_example = self.model.generate(
            input_ids=example_token_ids.to(device), 
            attention_mask=example_mask.to(device),
            max_length=MAX_LEN_TRGT
            )

        self.model.train()

        out_text = [tokenizer.decode(text) for text in predicted_example]
        
        return out_text

# Pytorch Lightning

In [15]:
class DatePL(DateNet, pl.LightningModule):

    def _handle_batch(self, batch):
        source_token_ids, source_mask, target_token_ids = batch
        loss = self(source_token_ids, 
                    source_mask, 
                    target_token_ids
                    )
        return loss

    def _handle_eval_batch(self, batch):
        outputs = self._handle_batch(batch)
        return outputs

    def _handle_eval_epoch_end(self, outputs, phase):
        loss_avg = self._average_key(outputs, f'{phase}_loss')
        return loss_avg

    ## AS FUNÇÕES ABAIXO SÃO NECESSÁRIAS PARA O PYTORCH LIGHTNING ##

    def training_step(self, batch, batch_idx):
        outputs = self._handle_batch(batch)
        return {'loss': outputs}

    def validation_step(self, batch, batch_idx):

        outputs = self._handle_eval_batch(batch)
        return {'val_loss': outputs}

    def test_step(self, batch, batch_idx):
        outputs = self._handle_eval_batch(batch)
        return {'test_loss': outputs}

    def validation_epoch_end(self, outputs):
        loss_avg = self._handle_eval_epoch_end(outputs, phase='val')

        progress_bar = {'val_loss': loss_avg} #add 'f1': f1
        return {'val_loss': loss_avg, 'progress_bar': progress_bar}

    def test_epoch_end(self, outputs):
        loss_avg = self._handle_eval_epoch_end(outputs, phase='test')
        return {'test_loss': loss_avg}

# Final Class

In [16]:
class DateTuner(DateBase, DatePL):

    '''
        Classe final responsável por controlar os hiper parâmetros e
        combinar as classes do modelo e dataset;

        default_hparams deve conter sugestões de todos hiper parâmetros
        necessários para se criar o modelo
    '''

    default_hparams = {
        "lr": 5e-4,
        "optimizer": 'Adam',
        "optimizer_kwargs": {},
        "train_batch_size": 16,
        "val_batch_size": 16,
        "shuffle_train": True,
        "num_workers": 4,
        "deterministic": False,
        "seed": 2357,
    }

    def __init__(self, hparams=None):
        super(DatePL, self).__init__()
        self.hparams = self._construct_hparams(hparams)

    def _construct_hparams(self, hparams):
        default_hparams = self.default_hparams.copy()
        if hparams is not None:
            default_hparams.update(hparams)

        if default_hparams['deterministic']:
            pl.seed_everything(default_hparams['seed'])

        return Namespace(**default_hparams)

# Training

In [17]:
# hiper parametros que serão utilizados pelas classes acima
hparams = {
    "train_batch_size": 16,
    "val_batch_size": 16,
    "deterministic": True,
    "seed": 2357
}

# hiper parametros que serão utilizados pela class Trainer do Lightning
trainer_hparams = {
    "gpus": 1,
    "max_epochs": 20, 
    "progress_bar_refresh_rate":80,
}

# atualizamos o primeiro com o segundo para que todos fiquem salvos no arquivo
# de hiper parametros hparams.yaml
hparams.update(trainer_hparams)

In [18]:
try:
  del model
except:
  print('Model already erased, starting a new one!')

model = DateTuner()

Model already erased, starting a new one!


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242136741.0, style=ProgressStyle(descri…




## Dev Run

Primeiramente fazemos *fast_dev_run* para ter certeza que não erros no códigos.

O *fast_dev_run* irá rodar um *step* de cada fase: treino, validação e teste.

In [19]:
dev_run = True

if dev_run:
    trainer = pl.Trainer(fast_dev_run=True, **trainer_hparams)
    trainer.fit(model)

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 60 M  
1   | model.shared                                                          | Embedding                  | 16 M  
2   | model.encoder                                                         | T5Stack                    | 35 M  
3   | model.encoder.block                                                   | ModuleList                 | 18 M  
4   | model.encoder.block.0                                                 | T5Block                    | 3

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




## Fit

Por fim fazemos *fit* no dataset inteiro

In [20]:
trainer = pl.Trainer(**trainer_hparams)

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


In [None]:
trainer.fit(model)


    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 60 M  
1   | model.shared                                                          | Embedding                  | 16 M  
2   | model.encoder                                                         | T5Stack                    | 35 M  
3   | model.encoder.block                                                   | ModuleList                 | 18 M  
4   | model.encoder.block.0                                                 | T5Block                    | 3 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 3 M   
6   | model.encoder.block.0.layer.0                                         | T5LayerSe

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…