<a href="https://colab.research.google.com/github/textnorms/date_text_norm/blob/master/T5_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [166]:
!pip install -q num2words transformers
!rm -rf *
!git clone https://github.com/textnorms/date_text_norm.git
!cp -r date_text_norm/syntetic_data/ .

Cloning into 'date_text_norm'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 35 (delta 13), reused 27 (delta 8), pack-reused 0[K
Unpacking objects: 100% (35/35), done.


In [167]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
! nvidia-smi

Mon May 25 00:26:54 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    33W / 250W |   6691MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
# Basics
import numpy as np
import pandas as pd

# Synthetic data generator
from syntetic_data import DateTextGenerator

# PyTorch
import torch 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split

# Transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

### Função para reproduzir resultados

In [169]:
# Seed shared by every random number generator and by the train/validation split.
manual_seed = 0


def deterministic(rep=True):
    """Seed the random number generators and configure cuDNN for reproducibility.

    Args:
        rep: When True (default), seed Python's ``random`` module, NumPy and
            PyTorch (CPU and, when available, all CUDA devices) with
            ``manual_seed`` and set the cuDNN flags below. When False, leave
            every generator untouched and only report that the run is unseeded.

    Returns:
        None. A one-line status message is printed in both cases.
    """
    if not rep:
        print('Random experiment')
        return

    # Imported locally so this cell does not depend on an edit to the import
    # cell; the stdlib generator was previously left unseeded.
    import random

    random.seed(manual_seed)
    np.random.seed(manual_seed)
    torch.manual_seed(manual_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(manual_seed)
        torch.cuda.manual_seed_all(manual_seed)

    # cuDNN is switched off entirely, and its autotuner / non-deterministic
    # kernels are disabled as well.
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    print(f'Deterministic experiment, seed: {manual_seed}')


deterministic()

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Deterministic experiment, seed: 0
Using device: cuda


# Dataset

In [0]:
# Name of the pretrained T5 checkpoint; the same name is used later to load the model.
model_size = 't5-small'
# Tokenizer loaded from that checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_size)

In [171]:
# Synthetic date generator configured for the range 01/01/1900 - 31/12/2020.
datas = DateTextGenerator(start_date='01/01/1900',end_date='31/12/2020')
# Print the written-out variants the generator produces for one example date.
datas.generate_demo(date='28/05/2020')

Gerando demostração dos formatos de datas geradas para a canônica: 28/05/2020
Método: 1 --- vinte e oito do cinco de dois mil e vinte
----------------------------------------------------------------------------------------------------
Método: 2 --- 28.05.2020
----------------------------------------------------------------------------------------------------
Método: 3 --- vinte e oito de maio de dois mil e vinte
----------------------------------------------------------------------------------------------------
Método: 4 --- vinte e oito de mai de dois mil e vinte
----------------------------------------------------------------------------------------------------


In [172]:
# Build the full dataset; the trailing `df` renders the frame as the cell output.
# The 'Entrada' (input text) and 'Canônico' (canonical date) columns are used below.
df = datas.generate_date_dataset(); df

Unnamed: 0,Tipo padrão,Entrada,Canônico
0,1,"um do um de mil, novecentos",01/01/1900
1,1,"dois do um de mil, novecentos",02/01/1900
2,1,"três do um de mil, novecentos",03/01/1900
3,1,"quatro do um de mil, novecentos",04/01/1900
4,1,"cinco do um de mil, novecentos",05/01/1900
...,...,...,...
176775,4,vinte e sete de dez de dois mil e vinte,27/12/2020
176776,4,vinte e oito de dez de dois mil e vinte,28/12/2020
176777,4,vinte e nove de dez de dois mil e vinte,29/12/2020
176778,4,trinta de dez de dois mil e vinte,30/12/2020


In [0]:
# Shuffled 70/30 train/validation split, seeded with `manual_seed`.
# Inputs are the written-out dates, targets the canonical date strings.
source_texts = df['Entrada'].values
canonical_dates = df['Canônico'].values

x_train, x_val, y_train, y_val = train_test_split(
    source_texts, canonical_dates,
    test_size=0.3, shuffle=True, random_state=manual_seed)

In [0]:
# Token-length limits passed to the tokenizer for model inputs and targets.
max_len_source = 40
max_len_target = 12


class DateDataset(Dataset):
    """Map-style dataset of (input text, canonical date) pairs, tokenized on access.

    Args:
        data: Indexable collection of input strings.
        label: Indexable collection of target strings, aligned with ``data``.
        tokenizer: Object exposing ``eos_token`` and ``encode_plus``.
        source_max_length: ``max_length`` used when encoding inputs.
        target_max_length: ``max_length`` used when encoding targets.
    """

    def __init__(self, data, label, tokenizer, source_max_length, target_max_length):
        self.data, self.label = data, label
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length

    def __len__(self):
        """Number of examples, taken from ``data``."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Append the tokenizer's EOS token to ``text`` and encode it, padded to ``max_length``."""
        return self.tokenizer.encode_plus(
            f'{text} {self.tokenizer.eos_token}',
            max_length=max_length,
            pad_to_max_length=True,
            return_tensors='pt')

    def __getitem__(self, idx):
        """Return ``(source_token_ids, source_mask, target_token_ids)`` for example ``idx``.

        Each tensor has its singleton dimensions squeezed away.
        """
        encoded_source = self._encode(self.data[idx], self.source_max_length)
        encoded_target = self._encode(self.label[idx], self.target_max_length)

        return (
            encoded_source['input_ids'].squeeze(),
            encoded_source['attention_mask'].squeeze(),
            encoded_target['input_ids'].squeeze(),
        )

## Teste da classe

In [175]:
# Smoke test of DateDataset: wrap the training split and inspect a single example.
dataset_debug = DateDataset(
    x_train, 
    y_train,
    tokenizer,
    max_len_source,
    max_len_target,
    )

# batch_size=1, so every tensor printed below carries a leading batch dimension of 1.
dataloader_debug = DataLoader(
    dataset_debug, 
    batch_size=1, 
    shuffle=True, 
    num_workers=0
    )

# Pull one batch and print each tensor together with its shape.
source_token_ids, source_mask, target_token_ids = next(iter(dataloader_debug))
print(f'source_token_ids:\n {source_token_ids} --- shape:{source_token_ids.shape}')
print(f'source_mask:\n {source_mask} --- shape:{source_mask.shape}')
print(f'target_token_ids:\n {target_token_ids} --- shape:{target_token_ids.shape}')

source_token_ids:
 tensor([[ 8013, 12900,  9887,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]) --- shape:torch.Size([1, 40])
source_mask:
 tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) --- shape:torch.Size([1, 40])
target_token_ids:
 tensor([[  586, 31497,  9887,     1,     0,     0,     0,     0,     0,     0,
             0,     0]]) --- shape:torch.Size([1, 12])


## Datasets e Dataloaders

In [176]:
BATCH_SZ = 128


def _make_dataset(inputs, targets):
    """Wrap aligned inputs/targets in a DateDataset with the shared tokenizer and length limits."""
    return DateDataset(inputs, targets, tokenizer, max_len_source, max_len_target)


def _make_loader(dataset, shuffle):
    """Build a DataLoader with the batch size and worker settings common to every split."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SZ,
        shuffle=shuffle,
        num_workers=2,
        pin_memory=True)


# datasets -- 'debug' holds only the first BATCH_SZ training examples (a single batch)
ds_debug = _make_dataset(x_train[:BATCH_SZ], y_train[:BATCH_SZ])
ds_train = _make_dataset(x_train, y_train)
ds_valid = _make_dataset(x_val, y_val)

# dataloaders -- only the validation split is iterated without shuffling
dataloaders = {
    'debug': _make_loader(ds_debug, shuffle=True),
    'train': _make_loader(ds_train, shuffle=True),
    'valid': _make_loader(ds_valid, shuffle=False),
}

# sanity check: number of batches per split
dl_sizes = {split: len(loader) for split, loader in dataloaders.items()}; dl_sizes 

{'debug': 1, 'train': 967, 'valid': 415}

In [0]:
# Smoke-test the training dataloader by fetching a single batch.
source_token_ids, source_mask, target_token_ids = next(iter(dataloaders['train']))

In [0]:
class Net(torch.nn.Module):
    """Wrapper around a pretrained T5 conditional-generation model.

    ``forward`` returns the training loss while the module is in training
    mode and generated token ids otherwise. ``generate_seq`` is a convenience
    helper that maps one raw string to decoded output text.

    Args:
        train: Initial value of the module's ``training`` flag.
    """

    def __init__(self, train=True):
        super(Net, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_size)
        # NOTE: `training` is the same attribute that nn.Module.train()/eval()
        # toggle, so this only sets the wrapper's initial mode; the next call
        # to .train() or .eval() on this module overwrites it.
        self.training = train

    def forward(self, token_ids, att_mask, labels=None):
        """Return the loss (training mode) or generated token ids (eval mode).

        Args:
            token_ids: Input token ids passed to the model as ``input_ids``.
            att_mask: Attention mask matching ``token_ids``.
            labels: Target token ids; only used in training mode.

        Returns:
            The first element of the model output (the loss) in training
            mode, otherwise the result of ``generate`` limited to
            ``max_len_target`` tokens.
        """
        if self.training:
            # NOTE(review): `lm_labels` is the keyword this notebook was written
            # against; confirm it against the installed transformers release.
            outputs = self.model(
                input_ids=token_ids,
                attention_mask=att_mask,
                lm_labels=labels)
            return outputs[0]  # loss

        return self.model.generate(
            input_ids=token_ids,
            attention_mask=att_mask,
            max_length=max_len_target)

    def generate_seq(self, text_input, tokenizer, max_len_source=max_len_source):
        """Generate and decode the model output for a single input string.

        Args:
            text_input: Raw input text; the tokenizer's EOS token is appended.
            tokenizer: Tokenizer providing ``eos_token``, ``encode_plus`` and ``decode``.
            max_len_source: ``max_length`` used when encoding the input.

        Returns:
            A list with one decoded string per generated sequence.
        """
        sample_tokenized = tokenizer.encode_plus(
            f'{text_input} {tokenizer.eos_token}',
            max_length=max_len_source,
            pad_to_max_length=True,
            return_tensors='pt')

        # Send the inputs to wherever the weights actually live instead of
        # relying on the notebook-level `device` global.
        model_device = next(self.model.parameters()).device

        # Remember the current mode so it can be restored afterwards. The
        # previous version always switched back to train mode, silently
        # re-enabling training behaviour on a model that had been put in eval mode.
        was_training = self.model.training
        self.model.eval()
        try:
            predicted_samples = self.model.generate(
                input_ids=sample_tokenized['input_ids'].to(model_device),
                attention_mask=sample_tokenized['attention_mask'].to(model_device),
                max_length=max_len_target)
        finally:
            self.model.train(was_training)

        return [tokenizer.decode(text) for text in predicted_samples]
        


In [179]:
# Sanity check before any fine-tuning: run the pretrained model on a sample
# string, print the decoded output, then drop the reference to the model.
model = Net()
model.to(device)
sample = 'oi mundo'
print(f'amostra: {sample} ---- saída de amostra: {model.generate_seq(sample,tokenizer)}')
del model

amostra: oi mundo ---- saída de amostra: ['oi mundo']


## Funções de treino e eval

In [0]:
def acc_in_text(trues, preds):
    """Score predictions against references by exact match.

    Args:
        trues: Iterable of reference values.
        preds: Iterable of predicted values, aligned with ``trues``.

    Returns:
        A list of ints, one per (true, pred) pair: 1 when the two are equal,
        0 otherwise. Pairs are formed with ``zip``, so surplus items in the
        longer iterable are ignored.
    """
    return [1 if expected == predicted else 0
            for expected, predicted in zip(trues, preds)]

def train(model, device, train_loader, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Callable module returning a scalar loss for
            ``(source_token_ids, source_mask, target_token_ids)``.
        device: Device every batch tensor is moved to.
        train_loader: Iterable yielding 3-tuples of tensors.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in train_loader:
        src_ids, src_mask, tgt_ids = batch
        src_ids = src_ids.to(device)
        src_mask = src_mask.to(device)
        tgt_ids = tgt_ids.to(device)

        optimizer.zero_grad()

        batch_loss = model(src_ids, src_mask, tgt_ids)
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(batch_losses)

def evaluate_fn(model, device, val_loader):
    """Generate predictions for every batch and score them by exact text match.

    Args:
        model: Callable returning predicted token sequences for
            ``(source_token_ids, source_mask, target_token_ids)``.
        device: Device every batch tensor is moved to.
        val_loader: Iterable yielding 3-tuples of tensors.

    Returns:
        ``(accuracy, all_trues, all_preds)``: the mean of the per-example 0/1
        match scores, the decoded targets and the decoded predictions.
        Decoding uses the notebook-level ``tokenizer``.
    """
    model.eval()
    model.training = False  # generate
    match_flags, decoded_preds, decoded_trues = [], [], []

    for batch in val_loader:
        src_ids, src_mask, tgt_ids = batch
        src_ids = src_ids.to(device)
        src_mask = src_mask.to(device)
        tgt_ids = tgt_ids.to(device)

        generated = model(src_ids, src_mask, tgt_ids)

        batch_preds = [tokenizer.decode(seq) for seq in generated]
        batch_trues = [tokenizer.decode(seq) for seq in tgt_ids]

        match_flags.extend(acc_in_text(batch_trues, batch_preds))
        decoded_preds.extend(batch_preds)
        decoded_trues.extend(batch_trues)

    accuracy = np.array(match_flags).mean()
    return accuracy, decoded_trues, decoded_preds

# Overfit em poucas amostras

In [0]:
# Optional sanity check: train repeatedly on the single 'debug' batch to confirm
# the model can overfit it. Disabled by default; set `overfit = True` to run.
overfit = False
if overfit:
  # CUDA events used to time the whole loop.
  # NOTE(review): these presumably require a CUDA runtime even though `device`
  # can fall back to CPU -- confirm before running without a GPU.
  start = torch.cuda.Event(enable_timing=True)
  end = torch.cuda.Event(enable_timing=True)
  # Re-seed so the experiment starts from the same random state every time.
  deterministic() 

  model = Net(train=True).to(device)
  optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

  # NOTE(review): `epoch` is not read anywhere in this cell.
  epoch = 0
  N_EPOCHS  = 1000

  #--------------------------------------------------------------------------
  start.record()
  for step in range(1, N_EPOCHS+1):
      # Train and evaluate on the same debug batch.
      loss_t = train(model, device, dataloaders['debug'], optimizer)
      acc, trues, preds = evaluate_fn(model, device, dataloaders['debug'])
      # Report the first epoch, then every 50th epoch along with a few decoded samples.
      if step == 1:
          print(f'[Epoch [{step}/{N_EPOCHS}] |', end=' ')
          print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
      if step % 50 == 0:
          print(f'[Epoch [{step}/{N_EPOCHS}] |', end=' ')
          print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
          print(f'  Trues: {trues[:7]}\n  Preds: {preds[:7]}')
  end.record()
  # Wait for queued GPU work so the end event has been recorded before reading it.
  torch.cuda.synchronize()    
  #--------------------------------------------------------------------------

  # elapsed_time() is in milliseconds; convert to minutes for the report.
  print(f'Tempo: {start.elapsed_time(end)/1000/60 :.3f} min.')
  del model

In [182]:
# Full training run: fine-tune on the training split and score exact-match
# accuracy on the validation split after every epoch.
# NOTE(review): the CUDA timing events presumably require a CUDA runtime even
# though `device` can fall back to CPU -- confirm before running without a GPU.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
# Re-seed so the run starts from the same random state every time.
deterministic() 

model = Net(train=True).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# NOTE(review): `epoch` is not read anywhere in this cell.
epoch = 0
N_EPOCHS  = 1

#--------------------------------------------------------------------------
start.record()
for step in range(1, N_EPOCHS+1):
    loss_t = train(model, device, dataloaders['train'], optimizer)
    acc, trues, preds = evaluate_fn(model, device, dataloaders['valid'])
    # Decoded samples are only shown on even epochs, so with N_EPOCHS = 1
    # this branch never runs.
    if step % 2 == 0:
        print(f'  Trues: {trues[:7]}\n  Preds: {preds[:7]}')
    print(f'[Epoch [{step}/{N_EPOCHS}] |', end=' ')
    print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
end.record()
# Wait for queued GPU work so the end event has been recorded before reading it.
torch.cuda.synchronize()    
#--------------------------------------------------------------------------

# elapsed_time() is in milliseconds; convert to minutes for the report.
print(f'Tempo: {start.elapsed_time(end)/1000/60 :.3f} min.')

Deterministic experiment, seed: 0
[Epoch [1/1] | Train Loss: 0.686 -- Acc: 0.981
Tempo: 5.388 min.


In [183]:
# Input containing spelling errors (presumably deliberate, to probe how robust
# the fine-tuned model is to noisy text).
data = 'un do janro de mil novecentu e otenta y sete'

# Decode the model's prediction; relies on the `model` left by the training cell.
model.generate_seq(data,tokenizer)


['01/01/1987']