<a href="https://colab.research.google.com/github/textnorms/date_text_norm/blob/master/T5_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
! nvidia-smi

Mon May 25 21:48:19 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    34W / 250W |   4733MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [23]:
# Fetch a fresh copy of the project and expose its synthetic-data package
# next to the notebook so `from syntetic_data import ...` resolves.
! rm -rf date*
! git clone https://github.com/textnorms/date_text_norm.git
! cp -r date_text_norm/syntetic_data/ .

# Single install step (transformers was previously installed twice).
# NOTE(review): the notebook relies on the `lm_labels` and `pad_to_max_length`
# arguments; presumably a specific older transformers release is required —
# confirm the version and pin it here.
! pip install -q num2words transformers

Cloning into 'date_text_norm'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects:   1% (1/68)[Kremote: Counting objects:   2% (2/68)[Kremote: Counting objects:   4% (3/68)[Kremote: Counting objects:   5% (4/68)[Kremote: Counting objects:   7% (5/68)[Kremote: Counting objects:   8% (6/68)[Kremote: Counting objects:  10% (7/68)[Kremote: Counting objects:  11% (8/68)[Kremote: Counting objects:  13% (9/68)[Kremote: Counting objects:  14% (10/68)[Kremote: Counting objects:  16% (11/68)[Kremote: Counting objects:  17% (12/68)[Kremote: Counting objects:  19% (13/68)[Kremote: Counting objects:  20% (14/68)[Kremote: Counting objects:  22% (15/68)[Kremote: Counting objects:  23% (16/68)[Kremote: Counting objects:  25% (17/68)[Kremote: Counting objects:  26% (18/68)[Kremote: Counting objects:  27% (19/68)[Kremote: Counting objects:  29% (20/68)[Kremote: Counting objects:  30% (21/68)[Kremote: Counting objects:  32% (22/68)[Kremote: Co

In [0]:
# Basics
import numpy as np
import pandas as pd
import random

# Synthetic data generator
from syntetic_data import DateTextGenerator

# PyTorch
import torch 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split

# Transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

### Deterministic experiments

In [25]:
manual_seed = 2357 # only primes, cuz I like
def deterministic(rep=True):
    """Seed every random number generator the notebook uses.

    Args:
        rep: when True, seed Python's ``random``, NumPy and PyTorch (CPU and,
            if available, CUDA) with ``manual_seed`` and set the cuDNN flags
            below. When False, nothing is seeded and only a message is printed.
    """
    if rep:
        # `random.randint` picks which examples get printed during training
        # and testing, so the stdlib generator has to be seeded as well.
        random.seed(manual_seed)
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Deterministic experiment, seed: {manual_seed}')
    else:
        print('Random experiment')

deterministic()

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Deterministic experiment, seed: 2357
Using device: cuda


## Config constants

In [0]:
# Model params
MODEL_SZ = 't5-small'  # pretrained checkpoint name used for the tokenizer and the model
TOK = T5Tokenizer.from_pretrained(MODEL_SZ)  # tokenizer shared by the datasets and the decoding steps
MAX_LEN_SRC  = 40  # max_length passed to the tokenizer for input texts
MAX_LEN_TRGT = 12  # max_length for tokenized targets and for generation

# Train params
BATCH_SZ = 128  # batch size of every DataLoader
N_EPOCHS = 3    # number of passes over the training loader
WINDOW   = 7    # how many true/predicted pairs are printed per report

# Dataset

In [27]:
# Synthetic date generator from the cloned `syntetic_data` package.
# NOTE(review): the arguments presumably select dates from 01/01/1900 through
# 31/12/2020 with roughly 20% of samples receiving text noise — confirm
# against DateTextGenerator.
datas = DateTextGenerator(start_date='01/01/1900',
                          end_date='31/12/2020',
                          text_noise_rate=0.2)
# Print the textual variants produced for a single canonical date.
datas.generate_demo(date='28/05/2020')

Gerando demostração dos formatos de datas geradas para a canônica: 28/05/2020
Método: 1 --- vinte e oito do cinco de dois mil e vinte
--------------------------------------------------
Método: 2 --- 28.05.2020
--------------------------------------------------
Método: 3 --- vinte e oito de maio de dois mil e vinte
--------------------------------------------------
Método: 4 --- vinte e oito de mai de dois mil e vinte
--------------------------------------------------


In [28]:
# Generate the full tabular dataset and display it (columns appear in the output below).
df = datas.generate_date_dataset(); df

Unnamed: 0,Tipo padrão,Ruído,Entrada,Canônico
0,1,,"um do um de mil, novecentos",01/01/1900
1,1,,"dois do um de mil, novecentos",02/01/1900
2,1,,"três do um de mil, novecentos",03/01/1900
3,1,lookalike_replace,"quatro do um de mil, n0veçentos",04/01/1900
4,1,lookalike_replace,"cinco do um de nll, novecentos",05/01/1900
...,...,...,...,...
176775,4,lookalike_replace,vlnte e sete de dez de d0is mil e vinte,27/12/2020
176776,4,,vinte e oito de dez de dois mil e vinte,28/12/2020
176777,4,lookalike_replace,vimte e nove de dez de dois nil e vinte,29/12/2020
176778,4,,trinta de dez de dois mil e vinte,30/12/2020


In [29]:
# Inspecting the noise rate: fraction of samples per noise type
df['Ruído'].value_counts()/len(df)

N/A                  0.800967
lookalike_replace    0.199033
Name: Ruído, dtype: float64

## Function to split the dataset

In [30]:
def split_data(data, labels, test_size=0.2):
    """Shuffle and split paired inputs and labels into two partitions.

    Args:
        data: sequence of input samples.
        labels: sequence of targets aligned with ``data``.
        test_size: value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The split is seeded with
        the module-level ``manual_seed``.
    """
    parts = train_test_split(
        data,
        labels,
        test_size=test_size,
        random_state=manual_seed,
        shuffle=True,
    )
    return tuple(parts)

# creating test set: hold out 20% of all samples
x_train, x_test, y_train, y_test = split_data(df.Entrada.values, 
                                              df.Canônico.values, 
                                              test_size=0.2)

# creating valid set: hold out 20% of the remaining training samples
x_train, x_val, y_train, y_val = split_data(x_train, 
                                            y_train, 
                                            test_size=0.2)

# checking that inputs and labels have matching sizes in every split
len(x_train), len(y_train), len(x_val), len(y_val), len(x_test), len(y_test)

(113139, 113139, 28285, 28285, 35356, 35356)

In [0]:
class DateDataset(Dataset):
    """Dataset pairing raw date texts with their canonical date strings.

    Items are tokenized lazily, on access.

    Args:
        data: indexable collection of source texts.
        label: indexable collection of target texts aligned with ``data``.
        tokenizer: object exposing ``eos_token`` and ``encode_plus``.
        source_max_length: ``max_length`` used when tokenizing sources.
        target_max_length: ``max_length`` used when tokenizing targets.
    """

    def __init__(self, data, label, tokenizer, source_max_length, target_max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.label = label
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` followed by the EOS token, padded to ``max_length``."""
        return self.tokenizer.encode_plus(
            f'{text} {self.tokenizer.eos_token}',
            max_length=max_length,
            pad_to_max_length=True,
            return_tensors='pt')

    def __getitem__(self, idx):
        """Return ``(source_token_ids, source_mask, target_token_ids)`` for item ``idx``."""
        source_tokenized = self._encode(self.data[idx], self.source_max_length)
        target_tokenized = self._encode(self.label[idx], self.target_max_length)

        # Drop only the leading axis (presumably the batch axis added by
        # return_tensors='pt'). A bare squeeze() would also collapse the
        # sequence axis whenever it has length 1, producing a 0-d tensor.
        source_token_ids = source_tokenized['input_ids'].squeeze(0)
        source_mask = source_tokenized['attention_mask'].squeeze(0)
        target_token_ids = target_tokenized['input_ids'].squeeze(0)

        return source_token_ids, source_mask, target_token_ids

## Checking the DateDataset class

In [32]:
# Build a dataset over the training split to inspect a single tokenized item.
dataset_debug = DateDataset(
    x_train, 
    y_train,
    TOK,
    MAX_LEN_SRC,
    MAX_LEN_TRGT,
    )

# batch_size=1 so exactly one (shuffled) example is pulled below.
dataloader_checking = DataLoader(
    dataset_debug, 
    batch_size=1, 
    shuffle=True, 
    num_workers=0
    )

# Fetch one batch and print each tensor together with its shape.
source_token_ids, source_mask, target_token_ids = next(iter(dataloader_checking))
print(f'source_token_ids:\n {source_token_ids} --- shape:{source_token_ids.shape}')
print(f'source_mask:\n {source_mask} --- shape:{source_mask.shape}')
print(f'target_token_ids:\n {target_token_ids} --- shape:{target_token_ids.shape}')

source_token_ids:
 tensor([[  103,   159,    20,     3, 17639,    40,    20, 15533,     6,   150,
           162,  3728,    32,     7,     3,    15,   150,  2169,     9,     3,
            15,   561,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]) --- shape:torch.Size([1, 40])
source_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) --- shape:torch.Size([1, 40])
target_token_ids:
 tensor([[11270, 24288, 13523,  4729,     1,     0,     0,     0,     0,     0,
             0,     0]]) --- shape:torch.Size([1, 12])


## Datasets and Dataloaders

In [33]:
# Datasets: one per split, plus a single-batch "debug" subset of the training data.
shared_ds_args = (TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_debug = DateDataset(x_train[:BATCH_SZ], y_train[:BATCH_SZ], *shared_ds_args)
ds_train = DateDataset(x_train, y_train, *shared_ds_args)
ds_valid = DateDataset(x_val, y_val, *shared_ds_args)
ds_test  = DateDataset(x_test, y_test, *shared_ds_args)

print('Datasets len:')
print(f'len ds_debug: {len(ds_debug)}')
print(f'len ds_train: {len(ds_train)}')
print(f'len ds_valid: {len(ds_valid)}')
print(f'len ds_test:  {len(ds_test)}')

# Dataloaders: every loader shares the same settings; only the training-type
# loaders (debug, train) shuffle their samples.
loader_specs = {
    'debug': (ds_debug, True),
    'train': (ds_train, True),
    'valid': (ds_valid, False),
    'test': (ds_test, False),
}
dataloaders = {
    split_name: DataLoader(
        split_ds,
        batch_size=BATCH_SZ,
        shuffle=do_shuffle,
        num_workers=2,
        pin_memory=True)
    for split_name, (split_ds, do_shuffle) in loader_specs.items()
}

# Sanity check: number of batches per loader.
print('\nDataloaders len (in batch):')
dl_sizes = {split_name: len(loader) for split_name, loader in dataloaders.items()}
dl_sizes

Datasets len:
len ds_debug: 128
len ds_train: 113139
len ds_valid: 28285
len ds_test:  35356

Dataloaders len (in batch):


{'debug': 1, 'test': 277, 'train': 884, 'valid': 221}

In [0]:
# Smoke-test the dataloader by pulling a single batch from the test loader
source_token_ids, source_mask, target_token_ids = next(iter(dataloaders['test']))

In [0]:
class Net(torch.nn.Module):
    """Thin wrapper around a pretrained T5 conditional-generation model.

    The checkpoint named by the module-level ``MODEL_SZ`` is loaded on
    construction. ``forward`` returns the training loss; ``generate`` and
    ``generate_example`` run decoding without tracking gradients.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_SZ)

    def forward(self, token_ids, att_mask, labels):
        """Return the loss (first model output) for a batch.

        Args:
            token_ids: source token ids.
            att_mask: attention mask matching ``token_ids``.
            labels: target token ids, forwarded as ``lm_labels``.
        """
        # NOTE(review): labels are passed through unchanged, so padded target
        # positions presumably contribute to the loss — confirm this is intended.
        # Call the module rather than .forward() so registered hooks run.
        outputs = self.model(
            input_ids=token_ids,
            attention_mask=att_mask,
            lm_labels=labels
            )
        return outputs[0] # loss

    @torch.no_grad()
    def generate(self, token_ids, att_mask, max_len_target):
        """Generate output token ids of at most ``max_len_target`` tokens."""
        return self.model.generate(
            input_ids=token_ids,
            attention_mask=att_mask,
            max_length=max_len_target
            )

    @torch.no_grad()
    def generate_example(self, text_input, tokenizer, max_len_source=MAX_LEN_SRC,
                         max_len_target=MAX_LEN_TRGT):
        """Tokenize one raw text, generate a prediction and decode it.

        Args:
            text_input: raw input text.
            tokenizer: tokenizer used both to encode the input and to decode
                the generated ids.
            max_len_source: ``max_length`` used when tokenizing the input.
            max_len_target: ``max_length`` passed to generation.

        Returns:
            List with one decoded string per generated sequence.
        """
        # Remember the current mode: the wrapped model is switched to eval for
        # generation and then put back exactly as it was, instead of always
        # being forced into train mode.
        was_training = self.model.training
        self.model.eval()
        try:
            example_tokenized = tokenizer.encode_plus(
                f'{text_input} {tokenizer.eos_token}',
                max_length=max_len_source,
                pad_to_max_length=True,
                return_tensors='pt')

            # Send the inputs to wherever the model's weights currently live.
            model_device = next(self.model.parameters()).device

            predicted_example = self.model.generate(
                input_ids=example_tokenized['input_ids'].to(model_device),
                attention_mask=example_tokenized['attention_mask'].to(model_device),
                max_length=max_len_target
                )
        finally:
            self.model.train(was_training)

        return [tokenizer.decode(text) for text in predicted_example]

## Train and evaluation functions

In [0]:
# Exact-match metric for text inputs
def acc_in_text(trues, preds):
    """Score each (true, predicted) pair as 1 on exact equality, else 0.

    Pairs are formed with ``zip``, so the result is as long as the shorter of
    the two inputs.

    Returns:
        List of ints (1 or 0), one per pair.
    """
    return [1 if expected == predicted else 0
            for expected, predicted in zip(trues, preds)]

def train(model, device, train_loader, optimizer):
    """Run one optimization pass over ``train_loader``.

    Args:
        model: callable taking (source ids, source mask, target ids) and
            returning a scalar loss tensor.
        device: device every batch tensor is moved to.
        train_loader: iterable of (source ids, source mask, target ids) batches.
        optimizer: optimizer stepped once per batch.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        src_ids, src_mask, tgt_ids = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        batch_loss = model(src_ids, src_mask, tgt_ids)
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(batch_losses)

def evaluate_fn(model, device, val_loader, max_len=MAX_LEN_TRGT):
    """Generate predictions for every batch and score them by exact match.

    Args:
        model: object exposing ``eval()`` and ``generate(ids, mask, max_len)``.
        device: device the source tensors are moved to.
        val_loader: iterable of (source ids, source mask, target ids) batches.
        max_len: maximum generation length.

    Returns:
        Tuple ``(accuracy, trues, preds)``: the mean of the per-example 0/1
        scores, the decoded targets and the decoded predictions. Decoding uses
        the module-level ``TOK`` tokenizer.
    """
    model.eval()
    scores, decoded_trues, decoded_preds = [], [], []
    for src_ids, src_mask, tgt_ids in val_loader:
        generated = model.generate(src_ids.to(device), src_mask.to(device), max_len)

        batch_preds = [TOK.decode(seq) for seq in generated]
        batch_trues = [TOK.decode(seq) for seq in tgt_ids]

        scores += acc_in_text(batch_trues, batch_preds)
        decoded_trues += batch_trues
        decoded_preds += batch_preds

    return np.array(scores).mean(), decoded_trues, decoded_preds

# Overfit on one batch
- uses the `debug` dataloader

In [0]:
# Flip to True to check that the model can memorize a single batch.
overfit = False

if overfit:

    # CUDA events time the whole loop.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    deterministic()

    model = Net().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # -----------------------------------------------------------------------------
    start.record()
    for step in range(1, 1001):
        samp = random.randint(0, BATCH_SZ-WINDOW) # offset of the examples shown below
        loss_t = train(model, device, dataloaders['debug'], optimizer)
        acc, trues, preds = evaluate_fn(model, device, dataloaders['debug'])

        # Metrics are printed on the first step and on every 100th step;
        # example predictions only accompany the every-100th reports.
        hundredth_step = step % 100 == 0
        if step == 1 or hundredth_step:
            print(f'[Epoch: {step}/{1000}] |', end=' ')
            print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
        if hundredth_step:
            print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')
    end.record()
    torch.cuda.synchronize()
    # -----------------------------------------------------------------------------

    print(f'Training time: {start.elapsed_time(end)/1000/60 :.3f} min.')
    del model

# Training 

In [38]:
# Full fine-tuning run: N_EPOCHS passes over the train loader, scoring the
# validation loader after each one. The run is timed with CUDA events.
# NOTE(review): torch.cuda.Event presumably needs a CUDA runtime even though
# `device` can fall back to CPU — confirm before running without a GPU.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
deterministic() 

model = Net().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# ---------------------------------------------------------------------------------
start.record()

for step in range(1, N_EPOCHS+1):
    # Offset of the WINDOW examples printed below; it is drawn from
    # [0, BATCH_SZ-WINDOW], so only examples from the first validation batch are shown.
    samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
    loss_t = train(model, device, dataloaders['train'], optimizer)
    acc, trues, preds = evaluate_fn(model, device, dataloaders['valid'])
    print(f'[Epoch: {step}/{N_EPOCHS}] |', end=' ')
    print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
    print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

# The elapsed time is divided by 1000 and then 60 to be reported in minutes.
print(f'Training time: {start.elapsed_time(end)/1000/60 :.3f} min.')

Deterministic experiment, seed: 2357
[Epoch: 1/3] | Train Loss: 0.796 -- Acc: 0.892
  Trues: ['08/02/2015', '31/01/1900', '24/11/2001', '26/03/1902', '25/05/1929', '03/10/1903', '18/05/1943']
  Preds: ['08/02/2015', '31/01/1900', '24/11/2001', '26/03/1902', '25/05/1929', '03/10/1903', '18/05/1943']
[Epoch: 2/3] | Train Loss: 0.075 -- Acc: 0.977
  Trues: ['31/12/1923', '20/05/1972', '09/09/1939', '09/08/1976', '31/03/1921', '08/10/1932', '06/03/1971']
  Preds: ['31/12/1923', '20/05/1972', '09/09/1939', '09/08/1976', '31/03/1921', '08/10/1932', '06/03/1971']
[Epoch: 3/3] | Train Loss: 0.027 -- Acc: 0.992
  Trues: ['26/08/1967', '11/07/1931', '21/11/1968', '01/07/1969', '19/10/1919', '14/09/1928', '22/03/1961']
  Preds: ['26/08/1967', '11/07/1931', '21/11/1968', '01/07/1969', '19/10/1919', '14/09/1928', '22/03/1961']
Training time: 10.876 min.


# Test

In [39]:
# Evaluate the trained model on the held-out test loader.
# `start`, `end` and `model` are not created here: this cell reuses the ones
# defined by the training cell, so that cell must have been run first.
# ---------------------------------------------------------------------------------
start.record()

samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
acc, trues, preds = evaluate_fn(model, device, dataloaders['test'])
print(f'Acc: {acc:.3f}')
print(f' Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

print(f'Test time: {start.elapsed_time(end)/1000/60 :.3f} min.')

Acc: 0.992
 Trues: ['28/05/2018', '20/06/1952', '26/04/1956', '11/06/1995', '21/01/1931', '01/09/1961', '01/02/1978']
  Preds: ['28/05/2018', '20/06/1952', '26/04/1956', '11/06/1995', '21/01/1931', '01/09/1961', '01/02/1978']
Test time: 0.635 min.


# Predict an example




In [43]:
# Hand-written, misspelled date in words, fed to the trained model.
data_0 = 'un do janro de mil novecentu e otenta y sete'
model.generate_example(data_0,TOK)

['01/01/1987']

In [48]:
# Heavily obfuscated input (digits and symbols substituted for letters).
# NOTE(review): the text appears to spell a 2009 date while the recorded
# output is 03/02/2011 — looks like a misprediction; confirm.
data_1 = 'tr3$ d$ fev$ir0 d3 doi m1ll e novi'
model.generate_example(data_1,TOK)

['03/02/2011']