In [1]:
# Show which GPU (if any) is attached to this runtime before doing any work.
! nvidia-smi

Mon May 25 13:06:27 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
# Start clean: remove anything in the working directory whose name starts with "date".
! rm -rf date*
# Fetch the project repository and copy its `syntetic_data/` directory next to the notebook.
! git clone https://github.com/textnorms/date_text_norm.git
! cp -r date_text_norm/syntetic_data/ .
# Python dependencies used by the cells below.
# NOTE(review): neither package is version-pinned, while later cells pass
# `lm_labels=` and `pad_to_max_length=` to transformers — confirm the installed
# release still accepts them, or pin the versions this notebook was run with.
! pip install -q transformers
! pip install -q num2words

Cloning into 'date_text_norm'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 43 (delta 16), reused 29 (delta 8), pack-reused 0[K
Unpacking objects: 100% (43/43), done.
[K     |████████████████████████████████| 665kB 2.8MB/s 
[K     |████████████████████████████████| 3.8MB 10.3MB/s 
[K     |████████████████████████████████| 1.1MB 31.9MB/s 
[K     |████████████████████████████████| 890kB 51.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 102kB 2.3MB/s 
[?25h

In [0]:
# Basics
import numpy as np
import pandas as pd
import random

# Num2words and dates
from num2words import num2words
from datetime import datetime, timedelta

# PyTorch
import torch 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split

# Transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

### Função para reproduzir resultados

In [0]:
manual_seed = 0  # single seed shared by every RNG seeded below
def deterministic(rep=True):
    """Seed the random number generators used in this notebook.

    Args:
        rep: when True (default) seed Python's ``random``, NumPy and PyTorch
            (CPU and, if available, every CUDA device) with ``manual_seed`` and
            switch cuDNN to its deterministic settings. When False nothing is
            seeded and the run is left non-reproducible.

    Prints which mode was selected; returns nothing.
    """
    if rep:
        # `random.randint` picks the sample window printed during training,
        # so the stdlib generator must be seeded as well.
        random.seed(manual_seed)
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Deterministic experiment, seed: {manual_seed}')
    else:
        print('Random experiment')

# Seed everything before any data or model object is created.
deterministic()

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Deterministic experiment, seed: 0
Using device: cuda


# Dataset

In [0]:
# Name of the pretrained T5 checkpoint; the same name is used when the model is loaded in `Net`.
model_size = 't5-small'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_size)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [0]:
# Helper lookup tables used to write dates out in full in Brazilian
# Portuguese (pt_BR).

# Month names written out in full, keyed by the zero-padded month number
# ('01' .. '12') exactly as it appears in a dd/mm/yyyy string.
extensive_months_dict = {
    f'{month_number:02d}': month_name
    for month_number, month_name in enumerate(
        (
            'janeiro',
            'fevereiro',
            'março',
            'abril',
            'maio',
            'junho',
            'julho',
            'agosto',
            'setembro',
            'outubro',
            'novembro',
            'dezembro',
        ),
        start=1,
    )
}

In [0]:
class DateTextGenerator():

    '''
    Synthetic text generator for date normalisation.

    Every calendar day between ``start_date`` and ``end_date`` (both
    inclusive) is kept in the canonical ``dd/mm/yyyy`` form as the label and
    rendered in three non-canonical spellings as the input.
    E.g. for 01/05/2020:
        - um de maio de dois mil e vinte      (``_text_gen``)
        - 01.05.2020                          (``_dot_as_sep``)
        - um do cinco de dois mil e vinte     (``_all_extensive_numbers``)

    Attributes:
        start_date, end_date: the parsed ``datetime`` bounds.
        date_range: list of ``dd/mm/yyyy`` strings, one per day in the range.
    '''
    def __init__(self,start_date='01/01/0001',end_date='31/12/2999'):
        '''
        Args:
            start_date: first day of the range, as a ``dd/mm/yyyy`` string.
            end_date: last day of the range, as a ``dd/mm/yyyy`` string.

        Raises:
            ValueError: if either string does not match ``%d/%m/%Y``.
        '''
        self.start_date = datetime.strptime(start_date, "%d/%m/%Y")
        self.end_date = datetime.strptime(end_date, "%d/%m/%Y")

        self.date_range = self.generate_date_range(self.start_date,self.end_date)


    def generate_date_dataset(self):
        '''
        Render every date of ``self.date_range`` in each supported style.

        Returns:
            pandas.DataFrame with columns ``inputs`` (rendered text) and
            ``labels`` (canonical ``dd/mm/yyyy``). Rows are grouped by style:
            all ``_text_gen`` rows first, then ``_dot_as_sep``, then
            ``_all_extensive_numbers``, each group in date order.
        '''
        renderers = (
            self._text_gen,
            self._dot_as_sep,
            self._all_extensive_numbers,
        )

        inputs, labels = [], []
        for render in renderers:
            for sample in self.date_range:
                day, month, year = sample.split('/')
                inputs.append(render(day, month, year))
                labels.append(sample)

        return pd.DataFrame(list(zip(inputs, labels)), columns=['inputs', 'labels'])

    @staticmethod
    def _all_extensive_numbers(day,month,year):
        '''Spell day, month and year as pt_BR number words: "<day> do <month> de <year>".'''
        input_day = num2words(int(day),lang='pt_BR')
        input_month = num2words(int(month),lang='pt_BR')
        input_year = num2words(int(year),lang='pt_BR')

        return f'{input_day} do {input_month} de {input_year}'

    @staticmethod
    def _dot_as_sep(day,month,year):
        '''Keep the digits but use "." instead of "/" as the separator.'''
        return f'{day}.{month}.{year}'

    @staticmethod
    def _text_gen(day,month,year):
        '''Spell day and year as pt_BR number words and the month by name.

        ``month`` must be a zero-padded key of ``extensive_months_dict``.
        '''
        input_day = num2words(int(day),lang='pt_BR')
        input_month = extensive_months_dict[month]
        input_year = num2words(int(year),lang='pt_BR')

        return f'{input_day} de {input_month} de {input_year}'

    @staticmethod
    def generate_date_range (start_date,end_date,step=1):
        '''
        List the days from ``start_date`` to ``end_date`` as ``dd/mm/yyyy``.

        Implementation inspired by:
            https://gist.github.com/ramhiser/989263a7a136601e3723
            and
            https://stackoverflow.com/questions/339007/how-to-pad-zeroes-to-a-string

        Args:
            start_date: ``datetime`` of the first day (always included when
                the range is not empty).
            end_date: ``datetime`` of the last day that may be included.
            step: number of days between consecutive entries.

        Returns:
            List of zero-padded ``dd/mm/yyyy`` strings; empty when
            ``end_date`` is before ``start_date``.
        '''
        total_days = (end_date - start_date).days

        dates = []
        # The stop is `total_days + 1` (not `+ step`): with step > 1 the
        # latter could produce a date that lies after end_date.
        for offset in range(0, total_days + 1, step):
            date_i = start_date + timedelta(days=offset)
            # Explicit zero padding: years below 1000 must still use 4 digits.
            dates.append(f'{date_i.day:02d}/{date_i.month:02d}/{date_i.year:04d}')

        return dates

In [0]:
# Build the synthetic corpus for every day from 01/01/1900 through 31/12/2020.
datas = DateTextGenerator(start_date='01/01/1900', end_date='31/12/2020')
df = datas.generate_date_dataset()
df

Unnamed: 0,inputs,labels
0,"um de janeiro de mil, novecentos",01/01/1900
1,"dois de janeiro de mil, novecentos",02/01/1900
2,"três de janeiro de mil, novecentos",03/01/1900
3,"quatro de janeiro de mil, novecentos",04/01/1900
4,"cinco de janeiro de mil, novecentos",05/01/1900
...,...,...
132580,vinte e sete do doze de dois mil e vinte,27/12/2020
132581,vinte e oito do doze de dois mil e vinte,28/12/2020
132582,vinte e nove do doze de dois mil e vinte,29/12/2020
132583,trinta do doze de dois mil e vinte,30/12/2020


In [0]:
# incluir exemplo
# datas incompletas 1/2020 -> 01/2020 , 5/2020 -> 05/2020
# 1/1/2010 -> 01/01/2010
# aos cinco primeiros dias de...
# data com erro de ocr: vint sete doze dois mil e vinte


In [0]:
# Hold out 30% of the (text, canonical date) pairs for validation; the shared
# seed makes the shuffle reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    df['inputs'].values,
    df['labels'].values,
    test_size=0.3,
    shuffle=True,
    random_state=manual_seed,
)

tuple(len(part) for part in (x_train, y_train, x_val, y_val))

(92809, 92809, 39776, 39776)

In [0]:
# -------------------
max_len_source = 40  # fixed token length of every encoded input text
max_len_target = 12  # fixed token length of every encoded canonical date
# -------------------

class DateDataset(Dataset):
    '''
    Torch dataset that tokenizes (text, canonical date) pairs on the fly.

    Args:
        data: indexable collection of input texts.
        label: indexable collection of target strings, aligned with ``data``.
        tokenizer: object exposing ``eos_token`` and ``encode_plus``.
        source_max_length: token length every source sequence is padded to.
        target_max_length: token length every target sequence is padded to.

    Each item is ``(source_token_ids, source_mask, target_token_ids)``, all
    1-D tensors.

    NOTE(review): padded target positions keep the tokenizer's pad id; whether
    the loss should ignore them depends on the model API — confirm.
    '''
    def __init__(self, data, label, tokenizer, source_max_length, target_max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.label = label
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        
    def __len__(self):
        return len(self.data)

    def _encode(self, text, max_length):
        # EOS is appended by hand and the result is padded to a fixed length
        # so that samples can be stacked into a batch.
        return self.tokenizer.encode_plus(
            f'{text} {self.tokenizer.eos_token}',
            max_length=max_length,
            pad_to_max_length=True,
            return_tensors='pt')
    
    def __getitem__(self, idx):
        source_tokenized = self._encode(self.data[idx], self.source_max_length)
        target_tokenized = self._encode(self.label[idx], self.target_max_length)

        # Drop only the leading size-1 axis. A bare squeeze() would also
        # remove the sequence axis whenever a max length of 1 is used.
        source_token_ids = source_tokenized['input_ids'].squeeze(0)
        source_mask = source_tokenized['attention_mask'].squeeze(0)
        target_token_ids = target_tokenized['input_ids'].squeeze(0)
        
        return source_token_ids, source_mask, target_token_ids

## Teste da classe

In [0]:
# Smoke test: wrap the training split and pull one randomly chosen sample.
dataset_debug = DateDataset(
    data=x_train,
    label=y_train,
    tokenizer=tokenizer,
    source_max_length=max_len_source,
    target_max_length=max_len_target,
)

dataloader_debug = DataLoader(dataset_debug, batch_size=1, shuffle=True, num_workers=0)

first_batch = next(iter(dataloader_debug))
source_token_ids, source_mask, target_token_ids = first_batch
print(f'source_token_ids:\n {source_token_ids} --- shape:{source_token_ids.shape}')
print(f'source_mask:\n {source_mask} --- shape:{source_mask.shape}')
print(f'target_token_ids:\n {target_token_ids} --- shape:{target_token_ids.shape}')

source_token_ids:
 tensor([[    3,   208,  2429,     3,    15,   561,    20,    91,    76,  5702,
            20, 15533,     6,   150,   162,  3728,    32,     7,     3,    15,
             3,    32,   155,   295,     9,     3,    15,   356,    15,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]) --- shape:torch.Size([1, 40])
source_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) --- shape:torch.Size([1, 40])
target_token_ids:
 tensor([[ 1401, 11476, 13523,  4225,     1,     0,     0,     0,     0,     0,
             0,     0]]) --- shape:torch.Size([1, 12])


## Datasets e Dataloaders

In [0]:
BATCH_SZ = 128

# datasets: same tokenizer and length limits everywhere, only the samples
# differ ('debug' is just the first BATCH_SZ training pairs).
ds_debug, ds_train, ds_valid = (
    DateDataset(texts, dates, tokenizer, max_len_source, max_len_target)
    for texts, dates in (
        (x_train[:BATCH_SZ], y_train[:BATCH_SZ]),
        (x_train, y_train),
        (x_val, y_val),
    )
)

print(f'len ds_train: {len(ds_train)}')
print(f'len ds_valid: {len(ds_valid)}')
print(f'len ds_debug: {len(ds_debug)}')

# dataloaders: only the validation loader keeps a fixed order
dataloaders = {
    split_name: DataLoader(
        split_ds,
        batch_size=BATCH_SZ,
        shuffle=reshuffle,
        num_workers=2,
        pin_memory=True)
    for split_name, split_ds, reshuffle in (
        ('debug', ds_debug, True),
        ('train', ds_train, True),
        ('valid', ds_valid, False),
    )
}

# sanity check: number of batches per split
dl_sizes = {split_name: len(loader) for split_name, loader in dataloaders.items()}
dl_sizes

len ds_train: 92809
len ds_valid: 39776
len ds_debug: 128


{'debug': 1, 'train': 726, 'valid': 311}

In [0]:
# Smoke-test the batched loader: fetch one batch from the small 'debug' split.
source_token_ids, source_mask, target_token_ids = next(iter(dataloaders['debug']))

In [0]:
class Net(torch.nn.Module):
    '''
    Thin wrapper around a pretrained T5 conditional-generation model.

    ``forward`` returns only the training loss; ``generate`` returns the
    predicted token ids without tracking gradients.

    NOTE(review): ``lm_labels`` is the keyword the original notebook passes
    for the targets — confirm it matches the installed transformers release.
    '''
    def __init__(self):
        super(Net, self).__init__()
        # Checkpoint name comes from the module-level `model_size`.
        self.model = T5ForConditionalGeneration.from_pretrained(model_size)
    
    def forward(self, token_ids, att_mask, labels):
        '''
        Args:
            token_ids: source token ids.
            att_mask: attention mask matching ``token_ids``.
            labels: target token ids.

        Returns:
            The first element of the wrapped model's output (the loss).
        '''
        # Call the module itself instead of `.forward(...)` so that any
        # registered hooks run as well.
        outputs = self.model(
            input_ids=token_ids, 
            attention_mask=att_mask,
            lm_labels=labels
            )
        return outputs[0] # loss
    
    @torch.no_grad()    
    def generate(self, token_ids, att_mask, max_len_target):
        '''
        Args:
            token_ids: source token ids.
            att_mask: attention mask matching ``token_ids``.
            max_len_target: forwarded as ``max_length`` to the wrapped
                model's ``generate``.

        Returns:
            Whatever the wrapped model's ``generate`` returns (predicted ids).
        '''
        predict = self.model.generate(
            input_ids=token_ids, 
            attention_mask=att_mask,
            max_length=max_len_target
            )
        return predict

## Funções de treino e eval

In [0]:
def acc_in_text(trues, preds):
    """Flag, pair by pair, whether the prediction equals the reference.

    Args:
        trues: iterable of reference values.
        preds: iterable of predicted values, aligned with ``trues``.

    Returns:
        List with 1 for every exact match and 0 otherwise; pairing stops at
        the shorter of the two inputs.
    """
    return [1 if expected == predicted else 0 for expected, predicted in zip(trues, preds)]

def train(model, device, train_loader, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: callable module returning the loss for
            ``(source_ids, source_mask, target_ids)``.
        device: device every batch tensor is moved to.
        train_loader: iterable of ``(source_ids, source_mask, target_ids)``.
        optimizer: optimizer exposing ``zero_grad`` and ``step``.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        src_ids, src_mask, tgt_ids = batch
        optimizer.zero_grad()
        batch_loss = model(src_ids.to(device), src_mask.to(device), tgt_ids.to(device))

        # Record the value before the parameters are updated.
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(batch_losses)

def evaluate_fn(model, device, val_loader, max_len=max_len_target):
    """Generate predictions for ``val_loader`` and score exact text matches.

    Args:
        model: object whose ``generate(ids, mask, max_len)`` returns token ids.
        device: device the source tensors are moved to.
        val_loader: iterable of ``(source_ids, source_mask, target_ids)``.
        max_len: maximum generated length (defaults to ``max_len_target``).

    Returns:
        Tuple ``(accuracy, all_trues, all_preds)``: the mean of the per-sample
        match flags and the decoded reference / predicted strings. Decoding
        uses the module-level ``tokenizer``.
    """
    model.eval()
    all_acc, all_preds, all_trues = [], [], []
    for source_token_ids, source_mask, target_token_ids in val_loader:
        generated_ids = model.generate(
            source_token_ids.to(device),
            source_mask.to(device),
            max_len,
        )

        batch_preds = [tokenizer.decode(ids) for ids in generated_ids]
        batch_trues = [tokenizer.decode(ids) for ids in target_token_ids]

        all_acc.extend(acc_in_text(batch_trues, batch_preds))
        all_trues.extend(batch_trues)
        all_preds.extend(batch_preds)

    return np.asarray(all_acc).mean(), all_trues, all_preds

# Overfit em 1 batch

In [0]:
overfit = False  # set to True to run the single-batch overfitting sanity check

if overfit:

    # A pair of CUDA events brackets the loop so its duration can be read afterwards.
    start, end = (torch.cuda.Event(enable_timing=True) for _ in range(2))
    deterministic() 

    model = Net().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    N_EPOCHS  = 1000
    WINDOW    = 7

    # -----------------------------------------------------------------------------
    start.record()
    for step in range(1, N_EPOCHS+1):
        # start index of the WINDOW-sized slice of trues/preds that gets printed
        samp = random.randint(0, BATCH_SZ-WINDOW)
        loss_t = train(model, device, dataloaders['debug'], optimizer)
        acc, trues, preds = evaluate_fn(model, device, dataloaders['debug'])
        # Metrics are reported on the first step and then every 50 steps ...
        if step == 1 or step % 50 == 0:
            print(f'[Epoch: {step}/{N_EPOCHS}] |', end=' ')
            print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
        # ... while sample predictions are shown only on the 50-step marks.
        if step % 50 == 0:
            print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')
    end.record()
    torch.cuda.synchronize()    
    # -----------------------------------------------------------------------------

    print(f'Tempo: {start.elapsed_time(end)/1000/60 :.3f} min.')
    # Drop the reference so the full training run starts from a fresh model.
    del model

Deterministic experiment, seed: 0
[Epoch: 1/1000] | Train Loss: 12.042 -- Acc: 0.000
[Epoch: 50/1000] | Train Loss: 1.969 -- Acc: 0.000
  Trues: ['23/11/1969', '13/12/1969', '24/11/1950', '09/03/2008', '05/05/2013', '26/04/1984', '05/12/2011']
  Preds: ['', '', '', '', '', '', '']
[Epoch: 100/1000] | Train Loss: 1.195 -- Acc: 0.133
  Trues: ['09/06/1984', '07/07/1910', '23/05/1958', '20/06/1986', '19/06/1926', '07/01/1917', '27/12/1937']
  Preds: [' ⁇   ⁇   ⁇   ⁇   ⁇  ', ' ⁇   ⁇   ⁇   ⁇   ⁇  ', ' ⁇   ⁇   ⁇   ⁇   ⁇  ', ' ⁇   ⁇   ⁇   ⁇   ⁇  ', '', ' ⁇   ⁇   ⁇   ⁇   ⁇  ', '']
[Epoch: 150/1000] | Train Loss: 0.873 -- Acc: 0.180
  Trues: ['21/10/1954', '13/04/1951', '26/04/1984', '20/08/1978', '07/07/1910', '23/11/1969', '05/12/2011']
  Preds: ['21/10/1954', '15/06/1988', '05/07/1983', '05/08/1999', '05/07/1999', '05/07/1988', '05/08/1920']
[Epoch: 200/1000] | Train Loss: 0.783 -- Acc: 0.195
  Trues: ['05/02/1918', '12/06/1976', '12/03/2020', '22/07/1996', '07/07/1910', '09/06/1984', '05/08

# Treino completo

In [0]:
# A pair of CUDA events brackets the whole run so its duration can be read afterwards.
start, end = (torch.cuda.Event(enable_timing=True) for _ in range(2))
deterministic() 

model = Net().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

N_EPOCHS  = 4
WINDOW    = 7

# ---------------------------------------------------------------------------------
start.record()

for step in range(1, N_EPOCHS+1):
    # start index of the WINDOW-sized slice of trues/preds that gets printed
    samp = random.randint(0, BATCH_SZ-WINDOW)

    # one pass over the training split, then exact-match accuracy on validation
    loss_t = train(model, device, dataloaders['train'], optimizer)
    acc, trues, preds = evaluate_fn(model, device, dataloaders['valid'])

    print(f'[Epoch: {step}/{N_EPOCHS}] |', end=' ')
    print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
    print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

print(f'Tempo: {start.elapsed_time(end)/1000/60 :.3f} min.')

Deterministic experiment, seed: 0
[Epoch: 1/4] | Train Loss: 0.852 -- Acc: 0.819
  Trues: ['20/04/1943', '22/02/1927', '01/09/1972', '02/01/1905', '17/04/2008', '28/03/1928', '17/07/1961']
  Preds: ['20/04/1943', '22/02/1927', '01/09/1972', '02/01/1905', '17/04/08', '28/03/1928', '17/07/1962']
[Epoch: 2/4] | Train Loss: 0.082 -- Acc: 0.998
  Trues: ['16/03/2006', '12/01/2019', '23/10/1972', '25/10/2004', '08/01/1986', '08/05/1953', '27/05/1961']
  Preds: ['16/03/2006', '12/01/2019', '23/10/1972', '25/10/2004', '08/01/1986', '08/05/1953', '27/05/1961']
[Epoch: 3/4] | Train Loss: 0.023 -- Acc: 0.999
  Trues: ['27/05/1961', '09/09/1969', '01/01/1975', '07/04/1912', '01/06/1901', '17/07/1992', '10/11/2010']
  Preds: ['27/05/1961', '09/09/1969', '01/01/1975', '07/04/1912', '01/06/1901', '17/07/1992', '10/11/2010']
[Epoch: 4/4] | Train Loss: 0.011 -- Acc: 1.000
  Trues: ['23/01/1994', '01/11/1914', '03/12/1977', '11/08/1968', '02/06/1935', '25/02/1934', '09/08/1915']
  Preds: ['23/01/1994', 

# Teste de predição

In [0]:
# Two hand-written inputs for a manual prediction check; only example_0 is
# tokenized in this cell.
example_0  = 'um de jan. de mil, dois mil e dez'
example_1 = '01.01.2050'

# Same preprocessing as DateDataset: append EOS, pad to max_len_source and
# request PyTorch tensors.
s = tokenizer.encode_plus(
    f'{example_0} {tokenizer.eos_token}',
    max_length=max_len_source,
    pad_to_max_length=True,
    return_tensors='pt')
s.keys()

In [0]:
model.eval()
# Net.generate takes the maximum output length as a required third argument;
# leaving it out raises TypeError. Use the same limit as evaluate_fn.
predicted_tokens = model.generate(
    s.input_ids.to(device), 
    s.attention_mask.to(device),
    max_len_target,
    )
predict = [tokenizer.decode(t) for t in predicted_tokens]
predict[0]