<a href="https://colab.research.google.com/github/textnorms/date_text_norm/blob/master/T5_V11_Pt_incomplete_dates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
! nvidia-smi

Wed Jun  3 13:42:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    76W / 149W |   2052MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [74]:
! rm -rf date*
! git clone https://github.com/textnorms/date_text_norm.git
! cp -r date_text_norm/syntetic_data_En/ .
! cp -r date_text_norm/syntetic_data_Pt/ .

! pip install -q num2words transformers
! pip install -q transformers

Cloning into 'date_text_norm'...
remote: Enumerating objects: 204, done.[K
remote: Counting objects:   0% (1/204)[Kremote: Counting objects:   1% (3/204)[Kremote: Counting objects:   2% (5/204)[Kremote: Counting objects:   3% (7/204)[Kremote: Counting objects:   4% (9/204)[Kremote: Counting objects:   5% (11/204)[Kremote: Counting objects:   6% (13/204)[Kremote: Counting objects:   7% (15/204)[Kremote: Counting objects:   8% (17/204)[Kremote: Counting objects:   9% (19/204)[Kremote: Counting objects:  10% (21/204)[Kremote: Counting objects:  11% (23/204)[Kremote: Counting objects:  12% (25/204)[Kremote: Counting objects:  13% (27/204)[Kremote: Counting objects:  14% (29/204)[Kremote: Counting objects:  15% (31/204)[Kremote: Counting objects:  16% (33/204)[Kremote: Counting objects:  17% (35/204)[Kremote: Counting objects:  18% (37/204)[Kremote: Counting objects:  19% (39/204)[Kremote: Counting objects:  20% (41/204)[Kremote: Counting objects: 

# Libs

### Choose Language

In [0]:
LANGUAGE = 'Pt'
# LANGUAGE = 'En'

In [0]:
# Basics
import numpy as np
import pandas as pd
import random

# Synthetic data generator

if LANGUAGE == 'En':
    from syntetic_data_En import DateTextGenerator 

if LANGUAGE == 'Pt':
    from syntetic_data_Pt import DateTextGenerator 

# PyTorch
import torch 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split

# Transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Matplot lib
import matplotlib.pyplot as plt

### Deterministic experiments

In [77]:
manual_seed = 2357 # only primes, cuz I like
def deterministic(rep=True):
    if rep:
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Deterministic experiment, seed: {manual_seed}')
    else:
        print('Random experiment')

deterministic()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Deterministic experiment, seed: 2357
Using device: cuda


# Config constants

In [0]:
# Model params
MODEL_SZ = 't5-small' # 't5-base'
TOK = T5Tokenizer.from_pretrained(MODEL_SZ)
MAX_LEN_SRC  = 48
MAX_LEN_TRGT = 12

# Train params
BATCH_SZ = 16
N_EPOCHS = 50
WINDOW   = 7

# Dataset

In [79]:
pd.set_option('display.max_rows',70)

datas = DateTextGenerator(start_date='01/01/1921',
                          end_date='31/12/2120',
                          text_noise_rate=0.3)

examples = datas.generate_demo(date='4/11/1983'); examples

Unnamed: 0,Input Pattern,Generated Text,Origin Sample
0,1,quatro de novembro de 1983,4/11/1983
1,2,"quatro de nov de mil, novecentos e oitenta e três",4/11/1983
2,3,"quatro de novembro de mil, novecentos e oitent...",4/11/1983
3,4,"quarto dia do mês onze de mil, novecentos e oi...",4/11/1983
4,5,4 de Novembro de 1983,4/11/1983
5,6,"4 de novembro de mil, novecentos e oitenta e três",4/11/1983
6,7,"4-11 de mil, novecentos e oitenta e três",4/11/1983
7,8,quatro - 11 - 1983,4/11/1983
8,9,quatro de novembro - 1983,4/11/1983
9,10,4º de novembro de 1983,4/11/1983


In [80]:
df = datas.generate_date_dataset() 
f'df: {df.shape}'

'df: (75815, 5)'

In [81]:
df.head()

Unnamed: 0,Input Pattern,Noise Type,Input,Target,Target Format
0,15,,"um janeiro mil, novecentos e vinte e um",01/01/1921,DD/MM/YYYY
1,24,,02-01-1921,02/01/1921,DD/MM/YYYY
2,43,[unexpected_space_noise],3 / jan / 1921,03/01/1921,DD/MM/YYYY
3,1,[remove_char_noise],uatro dejaneiro de 1921,04/01/1921,DD/MM/YYYY
4,31,"[unexpected_space_noise, lookalike_replace_noi...",o5 . 1 . 921,05/01/1921,DD/MM/YYYY


In [82]:
df.loc[df['Target Format'] == 'DD/MM']

Unnamed: 0,Input Pattern,Noise Type,Input,Target,Target Format
75449,21,"[remove_char_noise, unexpected_space_noise, lo...",0 1do ê s un,01/01,DD/MM
75450,43,"[remove_char_noise, lookalike_replace_noise, u...",2 j am,02/01,DD/MM
75451,36,"[unexpected_space_noise, remove_char_noise]",03.ja n,03/01,DD/MM
75452,28,,04-jan,04/01,DD/MM
75453,13,,05º / Janeiro,05/01,DD/MM
...,...,...,...,...,...
75810,38,[unexpected_space_noise],2 8/12,28/12,DD/MM
75811,4,"[unexpected_space_noise, remove_char_noise, lo...",u igésimo nono dia do mês d0e,29/12,DD/MM
75812,13,,30º / Dezembro,30/12,DD/MM
75813,44,[remove_char_noise],31/2,31/12,DD/MM


In [83]:
df.loc[df['Target Format'] == 'MM/YYYY']

Unnamed: 0,Input Pattern,Noise Type,Input,Target,Target Format
73049,13,,Janeiro / 1921,01/1921,MM/YYYY
73050,29,,fev - 1921,02/1921,MM/YYYY
73051,16,,"março mil, novecentos e vinte e um",03/1921,MM/YYYY
73052,19,,"04 mil, novecentos e vinte e um",04/1921,MM/YYYY
73053,34,,maio.1921,05/1921,MM/YYYY
...,...,...,...,...,...
75444,6,"[lookalike_replace_noise, remove_char_noise]","agosto de dolsnil, cento e vinte",08/2120,MM/YYYY
75445,21,,nove de 2120,09/2120,MM/YYYY
75446,4,,"mês dez de dois mil, cento e vinte",10/2120,MM/YYYY
75447,22,,11-2120,11/2120,MM/YYYY


In [0]:
# Removing dates in the defined Target Format
df = df.loc[df['Target Format'] != 'DD/MM/YYYY']

## Function to split the dataset

In [85]:
def split_data(df, test_size=0.2, verbose=True):
    l = list(set(df['Input Pattern'].values))
    num_test = int(len(l)*test_size)
    test_methods = [random.randint(1, len(l)) for _ in range(num_test)]
    print(test_methods)
    df_test = df[df['Input Pattern'].isin(test_methods)]
    print(df_test.shape)
    x_test = df_test.Input.values
    y_test = df_test.Target.values

    df_train = df[~df['Input Pattern'].isin(test_methods)]

    x_train, x_val, y_train, y_val = train_test_split(
        df_train.Input.values,
        df_train.Target.values,
        shuffle=True, 
        test_size=test_size,
        random_state=manual_seed
        )
    if verbose:
        print(f'Date types of test set: {test_methods} with len: {len(test_methods)}')
        print(f'x_train: {len(x_train)}  --  y_train: {len(y_train)}\n\
x_val:   {len(x_val)}  --  y_val:   {len(y_val)}\n\
x_test:  {len(x_test)}  --  y_test:  {len(y_test)}')

    return x_train, y_train, x_val, y_val, x_test, y_test

# creating sets
x_train, y_train, x_val, y_val, x_test, y_test = split_data(df, 
                                                            test_size=0.25, 
                                                            verbose=True)

[4, 12, 4, 6, 45, 18, 28, 13, 6, 26, 5]
(565, 5)
Date types of test set: [4, 12, 4, 6, 45, 18, 28, 13, 6, 26, 5] with len: 11
x_train: 1650  --  y_train: 1650
x_val:   551  --  y_val:   551
x_test:  565  --  y_test:  565


In [0]:
class DateDataset(Dataset):
    def __init__(self, data, label, tokenizer, source_max_length, target_max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.label = label
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        source = self.data[idx]
        target = self.label[idx]

        source_tokenized = self.tokenizer.encode_plus(
            f'{source} {self.tokenizer.eos_token}',
            max_length=self.source_max_length,
            pad_to_max_length=True,
            return_tensors='pt')

        target_tokenized = self.tokenizer.encode_plus(
            f'{target} {self.tokenizer.eos_token}',
            max_length=self.target_max_length,
            pad_to_max_length=True,
            return_tensors='pt')

        source_token_ids = source_tokenized['input_ids'].squeeze()
        source_mask = source_tokenized['attention_mask'].squeeze()
        target_token_ids = target_tokenized['input_ids'].squeeze()
        
        return source_token_ids, source_mask, target_token_ids

## Checking the DateDataset class

In [87]:
dataset_debug = DateDataset(
    x_train, 
    y_train,
    TOK,
    MAX_LEN_SRC,
    MAX_LEN_TRGT,
    )

dataloader_checking = DataLoader(
    dataset_debug, 
    batch_size=1, 
    shuffle=True, 
    num_workers=0
    )

source_token_ids, source_mask, target_token_ids = next(iter(dataloader_checking))
print(f'source_token_ids:\n {source_token_ids} --- shape:{source_token_ids.shape}')
print(f'source_mask:\n {source_mask} --- shape:{source_mask.shape}')
print(f'target_token_ids:\n {target_token_ids} --- shape:{target_token_ids.shape}')

source_token_ids:
 tensor([[10668,     3,    18, 14834,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]) --- shape:torch.Size([1, 48])
source_mask:
 tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) --- shape:torch.Size([1, 48])
target_token_ids:
 tensor([[10668, 13523,  3940,     1,     0,     0,     0,     0,     0,     0,
             0,     0]]) --- shape:torch.Size([1, 12])


## Datasets e Dataloaders

In [88]:
# datasets
ds_debug = DateDataset(x_train[:BATCH_SZ], y_train[:BATCH_SZ], TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_train = DateDataset(x_train, y_train, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_valid = DateDataset(x_val, y_val, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_test  = DateDataset(x_test, y_test, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)

print('Datasets len:')
print(f'len ds_debug: {len(ds_debug)}')
print(f'len ds_train: {len(ds_train)}')
print(f'len ds_valid: {len(ds_valid)}')
print(f'len ds_test:  {len(ds_test)}')

# dataloaders
dataloaders = {
    'debug': DataLoader(
         ds_debug,
         batch_size=BATCH_SZ,
         shuffle=True,
         num_workers=2,
         pin_memory=True),
    'train': DataLoader(
         ds_train,
         batch_size=BATCH_SZ,
         shuffle=True,
         num_workers=2,
         pin_memory=True),
    'valid': DataLoader(
         ds_valid,
         batch_size=BATCH_SZ,
         shuffle=False,
         num_workers=2,
         pin_memory=True),
    'test': DataLoader(
         ds_test,
         batch_size=BATCH_SZ,
         shuffle=False,
         num_workers=2,
         pin_memory=True),
               }
# sanity check
print('\nDataloaders len (in batch):')
dl_sizes = {x: len(dataloaders[x]) for x in dataloaders.keys()}; dl_sizes

Datasets len:
len ds_debug: 16
len ds_train: 1650
len ds_valid: 551
len ds_test:  565

Dataloaders len (in batch):


{'debug': 1, 'test': 36, 'train': 104, 'valid': 35}

In [0]:
# testando o dataloader 
source_token_ids, source_mask, target_token_ids = next(iter(dataloaders['debug']))

In [0]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_SZ)
    
    def forward(self, token_ids, att_mask, labels):
        outputs = self.model.forward(
            input_ids=token_ids, 
            attention_mask=att_mask,
            lm_labels=labels
            )
        return outputs[0] # loss
    
    @torch.no_grad()    
    def generate(self, token_ids, att_mask, max_len_target):
        predict = self.model.generate(
            input_ids=token_ids, 
            attention_mask=att_mask,
            max_length=max_len_target
            )
        return predict
    
    @torch.no_grad()  
    def generate_example(self, text_input, tokenizer, max_len_source=MAX_LEN_SRC):

        self.model.eval()
        
        example_tokenized = tokenizer.encode_plus(
            f'{text_input} {tokenizer.eos_token}',
            max_length=max_len_source,
            pad_to_max_length=True,
            return_tensors='pt')
            
        example_token_ids = example_tokenized['input_ids']
        example_mask = example_tokenized['attention_mask']

        predicted_example = self.model.generate(
            input_ids=example_token_ids.to(device), 
            attention_mask=example_mask.to(device),
            max_length=MAX_LEN_TRGT
            )

        self.model.train()

        out_text = [tokenizer.decode(text) for text in predicted_example]
        
        return out_text

## Train and evaluation functions

In [0]:
# acc metric for text inputs
def acc_in_text(trues, preds): 
    acc = []
    for d in zip(trues, preds):
        if d[0] == d[1]:
            acc.append(1)
        else:
            acc.append(0)
    return acc # bool

def train(model, device, train_loader, optimizer):
    loss_train = []
    model.train()
    for source_token_ids, source_mask, target_token_ids in train_loader:
        optimizer.zero_grad()
        loss = model(
            source_token_ids.to(device), 
            source_mask.to(device), 
            target_token_ids.to(device)
            )
        
        loss_train.append(loss.item())
        loss.backward()
        optimizer.step()
    
    train_losses = sum(loss_train) / len(loss_train)
  
    return train_losses

def evaluate_fn(model, device, val_loader, max_len=MAX_LEN_TRGT):
    loss_val, all_acc, all_preds, all_trues = [], [], [], []
    model.eval()
    for source_token_ids, source_mask, target_token_ids in val_loader:
        predicted_ids = model.generate(
            source_token_ids.to(device), 
            source_mask.to(device),
            max_len
            )
        
        preds = [TOK.decode(t) for t in predicted_ids]
        trues = [TOK.decode(t) for t in target_token_ids]
        acc = acc_in_text(trues, preds)
        all_acc.extend(acc)
        all_trues.extend(trues)
        all_preds.extend(preds)
        
        # val loss   
        loss = model(
        source_token_ids.to(device), 
        source_mask.to(device), 
        target_token_ids.to(device)
        )
        loss_val.append(loss.item())
    
    val_losses = sum(loss_val) / len(loss_val)
    
    return val_losses, np.array(all_acc).mean(), all_trues, all_preds

# Overfit in one batch 
- dataloader debug

In [0]:
overfit = False

if overfit:

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    deterministic() 

    model = Net().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    
    # -----------------------------------------------------------------------------
    start.record()
    for step in range(1, 1001):
        samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
        loss_t = train(model, device, dataloaders['debug'], optimizer)
        acc, trues, preds = evaluate_fn(model, device, dataloaders['debug'])
        if step == 1:
            print(f'[Epoch: {step}/{1000}] |', end=' ')
            print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
        if step % 100 == 0:
            print(f'[Epoch: {step}/{1000}] |', end=' ')
            print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
            print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')
    end.record()
    torch.cuda.synchronize()    
    # -----------------------------------------------------------------------------

    print(f'Training time: {start.elapsed_time(end)/1000/60 :.3f} min.')
    del model

# Training 

In [93]:
# del model
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
deterministic() 

model = Net().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# ---------------------------------------------------------------------------------
start.record()
for step in range(1, N_EPOCHS+1):
    samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
    loss_t = train(model, device, dataloaders['train'], optimizer)
    loss_v, acc, trues, preds = evaluate_fn(model, device, dataloaders['valid'])
    print(f'[Epoch: {step}/{N_EPOCHS}] |', end=' ')
    print(f'Train Loss: {loss_t:.3f} -- Valid Loss: {loss_v:.3f} -- Acc: {acc:.3f}')
    print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

print(f'Training time: {start.elapsed_time(end)/1000/60 :.3f} min.')

Deterministic experiment, seed: 2357
[Epoch: 1/50] | Train Loss: 3.472 -- Valid Loss: 0.913 -- Acc: 0.025
  Trues: ['07/2007', '10/1969', '29/01', '06/1951', '04/1970', '10/2049', '07/2051']
  Preds: ['', '', '', '', '', '', '']
[Epoch: 2/50] | Train Loss: 1.051 -- Valid Loss: 0.522 -- Acc: 0.149
  Trues: ['29/01', '06/1951', '04/1970', '10/2049', '07/2051', '10/2029', '24/04']
  Preds: ['29/09', '01/1951', '04/1999', '', '07/2051', '2029', '24/24']
[Epoch: 3/50] | Train Loss: 0.622 -- Valid Loss: 0.413 -- Acc: 0.236
  Trues: ['09/09', '07/2007', '10/1969', '29/01', '06/1951', '04/1970', '10/2049']
  Preds: ['09/1999', '07/2007', '10/1969', '29/02', '07/1951', '04/1999', '02/1994']
[Epoch: 4/50] | Train Loss: 0.498 -- Valid Loss: 0.372 -- Acc: 0.290
  Trues: ['08/2064', '09/09', '07/2007', '10/1969', '29/01', '06/1951', '04/1970']
  Preds: ['02/2064', '09/1999', '07/2007', '10/1969', '29/02', '07/1951', '04/1999']
[Epoch: 5/50] | Train Loss: 0.438 -- Valid Loss: 0.340 -- Acc: 0.307
  T

# Test

In [94]:
# ---------------------------------------------------------------------------------
start.record()

samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
loss, acc, trues, preds = evaluate_fn(model, device, dataloaders['test'])
print(f'Loss: {loss:.3f} -- Acc: {acc:.3f}')
print(f' Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

print(f'Test time: {start.elapsed_time(end)/1000/60 :.3f} min.')

Loss: 0.127 -- Acc: 0.717
 Trues: ['03/1922', '09/1922', '01/1923', '04/1923', '02/1924', '11/1924', '11/1925']
  Preds: ['03/1922', '09/1922', '01/1923', '04/1933', '02/1924', '11/1924', '06/1925']
Test time: 0.092 min.


# Evaluating types for a same date

Given a sample date, this section evaluates wich is the accuracy.




In [0]:
def evaluate_for_a_same_date(date,model=model,tokenizer=TOK,verbose=True):
  '''
    Given a specific date, returns the accuracy in all evalueated types.
    Also prints results per sample.
  '''

  results = []
  
  examples = datas.generate_demo(date=date)

  for x,target in zip(examples['Generated Text'],examples['Origin Sample']):

    prediction = model.generate_example(x,TOK)[0]

    results.append(prediction == target)

    if verbose:
      print(f'Entrada: {x} -- Target: {target} --- Previsto: {prediction} --- {prediction == target}')

  if verbose:
    print(f'Total accuracy: {np.mean(results)}')

  return np.mean(results)


In [96]:
'''
  A date in the century with more dates occuring
'''
evaluate_for_a_same_date('11/07/1988')

Entrada: onze de julho de 1988 -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: onze de jul de mil, novecentos e oitenta e oito -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: onze de julho de mil, novecentos e oitenta e oito -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: décimo primeiro dia do mês sete de mil, novecentos e oitenta e oito -- Target: 11/07/1988 --- Previsto: 12/1988 --- False
Entrada: 11 de Julho de 1988 -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: 11 de julho de mil, novecentos e oitenta e oito -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: 11-07 de mil, novecentos e oitenta e oito -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: onze - 07 - 1988 -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: onze de julho - 1988 -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: 11º de julho de 1988 -- Target: 11/07/1988 --- Previsto: 11/1988 --- False
Entrada: 11º 

0.0

In [97]:
'''
  A date in a century with less dates occurring, but that is inside the
  generated dataset
'''
evaluate_for_a_same_date('20/12/2015')

Entrada: vinte de dezembro de 2015 -- Target: 20/12/2015 --- Previsto: 21/2015 --- False
Entrada: vinte de dez de dois mil e quinze -- Target: 20/12/2015 --- Previsto: 22/2015 --- False
Entrada: vinte de dezembro de dois mil e quinze -- Target: 20/12/2015 --- Previsto: 21/2015 --- False
Entrada: vigésimo dia do mês doze de dois mil e quinze -- Target: 20/12/2015 --- Previsto: 21/2015 --- False
Entrada: 20 de Dezembro de 2015 -- Target: 20/12/2015 --- Previsto: 20/2015 --- False
Entrada: 20 de dezembro de dois mil e quinze -- Target: 20/12/2015 --- Previsto: 20/2015 --- False
Entrada: 20-12 de dois mil e quinze -- Target: 20/12/2015 --- Previsto: 20/12 --- False
Entrada: vinte - 12 - 2015 -- Target: 20/12/2015 --- Previsto: 12/2015 --- False
Entrada: vinte de dezembro - 2015 -- Target: 20/12/2015 --- Previsto: 21/2015 --- False
Entrada: 20º de dezembro de 2015 -- Target: 20/12/2015 --- Previsto: 20/2015 --- False
Entrada: 20º - 12 - 2015 -- Target: 20/12/2015 --- Previsto: 20/2015 --- F

0.0

In [98]:
'''
  Evaluating for a date in a century out of the training range gives the worst
  acc possible. (0)
'''
evaluate_for_a_same_date('25/12/2141')

Entrada: vinte e cinco de dezembro de 2141 -- Target: 25/12/2141 --- Previsto: 25/241 --- False
Entrada: vinte e cinco de dez de dois mil, cento e quarenta e um -- Target: 25/12/2141 --- Previsto: 25/2114 --- False
Entrada: vinte e cinco de dezembro de dois mil, cento e quarenta e um -- Target: 25/12/2141 --- Previsto: 21/2104 --- False
Entrada: vigésimo quinto dia do mês doze de dois mil, cento e quarenta e um -- Target: 25/12/2141 --- Previsto: 21/2104 --- False
Entrada: 25 de Dezembro de 2141 -- Target: 25/12/2141 --- Previsto: 25/241 --- False
Entrada: 25 de dezembro de dois mil, cento e quarenta e um -- Target: 25/12/2141 --- Previsto: 25/2114 --- False
Entrada: 25-12 de dois mil, cento e quarenta e um -- Target: 25/12/2141 --- Previsto: 25/2111 --- False
Entrada: vinte e cinco - 12 - 2141 -- Target: 25/12/2141 --- Previsto: 25/241 --- False
Entrada: vinte e cinco de dezembro - 2141 -- Target: 25/12/2141 --- Previsto: 25/241 --- False
Entrada: 25º de dezembro de 2141 -- Target: 25

0.044444444444444446

In [99]:
'''
  An earlier date than the beggining of the generated dataset
'''
evaluate_for_a_same_date('27/05/1920')

Entrada: vinte e sete de maio de 1920 -- Target: 27/05/1920 --- Previsto: 27/1920 --- False
Entrada: vinte e sete de mai de mil, novecentos e vinte -- Target: 27/05/1920 --- Previsto: 26/1927 --- False
Entrada: vinte e sete de maio de mil, novecentos e vinte -- Target: 27/05/1920 --- Previsto: 26/1927 --- False
Entrada: vigésimo sétimo dia do mês cinco de mil, novecentos e vinte -- Target: 27/05/1920 --- Previsto: 25/1925 --- False
Entrada: 27 de Maio de 1920 -- Target: 27/05/1920 --- Previsto: 27/2020 --- False
Entrada: 27 de maio de mil, novecentos e vinte -- Target: 27/05/1920 --- Previsto: 27/1927 --- False
Entrada: 27-05 de mil, novecentos e vinte -- Target: 27/05/1920 --- Previsto: 27/1927 --- False
Entrada: vinte e sete - 05 - 1920 -- Target: 27/05/1920 --- Previsto: 25/1920 --- False
Entrada: vinte e sete de maio - 1920 -- Target: 27/05/1920 --- Previsto: 27/1920 --- False
Entrada: 27º de maio de 1920 -- Target: 27/05/1920 --- Previsto: 27/2020 --- False
Entrada: 27º - 05 - 192

0.0

# Accuracy in dataset dates

## Inside dataset

Dates into the interval that was used to built the synthetic dataset used for test and eval.

In [100]:
accs = []
dates = []

sampled_test = random.sample(list(df['Target'].values),50)

print('acc test set: ',sampled_test)

for date_sample in sampled_test:
  accs.append(evaluate_for_a_same_date(date_sample,verbose=False))
  dates.append(date_sample)
  

acc test set:  ['10/2081', '05/2003', '12/2107', '01/2031', '09/2077', '11/2096', '09/1965', '01/2091', '03/1955', '02/2009', '05/2085', '10/2024', '03/2051', '02/1933', '12/2087', '08/2067', '11/2074', '11/2076', '05/1931', '01/2102', '05/2110', '10/09', '06/2047', '09/1932', '06/1933', '17/02', '11/1936', '27/08', '08/2023', '07/2096', '01/1964', '01/2041', '01/2113', '04/2058', '02/2081', '06/2109', '01/1973', '01/2066', '05/1929', '08/2003', '07/1998', '02/1934', '02/2052', '08/2039', '10/1956', '01/1982', '02/2032', '09/1946', '06/1998', '04/1959']


ValueError: ignored

In [0]:
plt.plot(dates,accs)
plt.xticks(rotation=45)
plt.grid()
plt.ylim([0.8, 1.01])
plt.ylabel('Average accuracy on 45 formats')
plt.xlabel('Canonical target')
plt.title('Average accuracy on dates inside synthetic training dataset')
print('Average of average accuracies: ',np.mean(accs))

## Below dataset

Dates lower than the synthetic dataset used for test and eval.

In [0]:
accs = []
dates = []

new_dataset = DateTextGenerator('01/01/1900','31/12/1920')

new_df = new_dataset.generate_date_dataset()
sampled_test = random.sample(list(new_df['Target'].values),20)

print('acc test set: ',sampled_test)

for date_sample in sampled_test:
  accs.append(evaluate_for_a_same_date(date_sample,verbose=False))
  dates.append(date_sample)

In [0]:
plt.plot(dates,accs)
plt.xticks(rotation=45)
plt.grid()
plt.ylim([-0.01, 1.01])
plt.ylabel('Average accuracy on 45 formats')
plt.xlabel('Canonical target')
plt.title('Average accuracy on dates below synthetic training dataset')
print('Average of average accuracies: ',np.mean(accs))

## Above dataset

Dates greater than the synthetic dataset used for test and eval.

In [0]:
accs = []
dates = []

new_dataset = DateTextGenerator('01/01/2121','31/12/2140')


new_df = new_dataset.generate_date_dataset()
sampled_test = random.sample(list(new_df['Target'].values),20)

print('acc test set: ',sampled_test)

for date_sample in sampled_test:
  accs.append(evaluate_for_a_same_date(date_sample,verbose=False))
  dates.append(date_sample)

In [0]:
plt.plot(dates,accs)
plt.xticks(rotation=45)
plt.grid()
plt.ylim([-0.01, 1.01])
plt.ylabel('Average accuracy on 45 formats')
plt.xlabel('Canonical target')
plt.title('Average accuracy on dates above synthetic training dataset')
print('Average of average accuracies: ',np.mean(accs))

# The End