<a href="https://colab.research.google.com/github/textnorms/address_text_norm/blob/master/T5_V2_adresses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
! nvidia-smi

Sun Jun 21 18:00:54 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    39W / 250W |  12103MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [66]:
! rm -rf address*
! git clone https://github.com/textnorms/address_text_norm.git
! cp -r address_text_norm/syntetic_data/ .

! pip install -q num2words transformers
! pip install -q transformers

Cloning into 'address_text_norm'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects:   1% (1/56)[Kremote: Counting objects:   3% (2/56)[Kremote: Counting objects:   5% (3/56)[Kremote: Counting objects:   7% (4/56)[Kremote: Counting objects:   8% (5/56)[Kremote: Counting objects:  10% (6/56)[Kremote: Counting objects:  12% (7/56)[Kremote: Counting objects:  14% (8/56)[Kremote: Counting objects:  16% (9/56)[Kremote: Counting objects:  17% (10/56)[Kremote: Counting objects:  19% (11/56)[Kremote: Counting objects:  21% (12/56)[Kremote: Counting objects:  23% (13/56)[Kremote: Counting objects:  25% (14/56)[Kremote: Counting objects:  26% (15/56)[Kremote: Counting objects:  28% (16/56)[Kremote: Counting objects:  30% (17/56)[Kremote: Counting objects:  32% (18/56)[Kremote: Counting objects:  33% (19/56)[Kremote: Counting objects:  35% (20/56)[Kremote: Counting objects:  37% (21/56)[Kremote: Counting objects:  39% (22/56)[Kremote:

In [67]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [68]:
addresss_db_path = '/gdrive/My Drive/12 Semestre/IA376/IA376 - Atividades/Projeto Final/address_db.csv'

!cp '/gdrive/My Drive/12 Semestre/IA376/IA376 - Atividades/Projeto Final/address_db.csv' ./syntetic_data/address_csv/br_address_db.csv

# Libs

In [70]:
# Basics
import numpy as np
import pandas as pd
import random

# Synthetic data generator
from syntetic_data import AddressTextGenerator

# PyTorch
import torch 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split

# Transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Matplot lib
import matplotlib.pyplot as plt


import unicodedata

### Deterministic experiments

In [71]:
manual_seed = 2357 # only primes, cuz I like
def deterministic(rep=True):
    if rep:
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Deterministic experiment, seed: {manual_seed}')
    else:
        print('Random experiment')

deterministic()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Deterministic experiment, seed: 2357
Using device: cuda


# Config constants

In [80]:
# Dataset params
LANGUAGE = 'br'

# Model params
MODEL_SZ = 't5-small' # 't5-base'
TOK = T5Tokenizer.from_pretrained(MODEL_SZ)
MAX_LEN_SRC  = 150
MAX_LEN_TRGT = 150


# Train params
BATCH_SZ = 16
N_EPOCHS = 50
WINDOW   = 7
NOISE    = 0.0

# Dataset

In [81]:
pd.set_option('display.max_rows',70)

print('Generating addresses dataset')
addresses_gen = AddressTextGenerator(occurences_per_sample=10,
                                     text_noise_rate=NOISE,
                                     max_noise_occurences_per_sample=10,
                                     language=LANGUAGE)

df = addresses_gen.generate_address_dataset()

Generating addresses dataset


In [82]:
df.tail()

Unnamed: 0,Input Pattern,Noise Type,Input,Target
16935,17,,Torre 2 número 3839 da Rua das Matas no Reside...,"Rua das Matas, 3839, Torre 2, Residencial Nort..."
16936,22,,"Rua das Matas, Bloco 9 - Sala 2,Residencial No...","Rua das Matas, S/N, Bloco 9 - Sala 2,Residenci..."
16937,1,,"3839, Rua das Matas, Residencial Norte Sul, Ap...","Rua das Matas, 3839, Residencial Norte Sul, Ap..."
16938,19,,"Goiás, Aparecida de Goiânia, Residencial Norte...","Rua das Matas, 3839, Residencial Norte Sul, Ap..."
16939,15,,"Rua das Matas, 3839, Torre 2, Residencial Nort...","Rua das Matas, 3839, Torre 2, Residencial Nort..."


In [83]:
df['Input Pattern'].value_counts()

22    801
15    800
18    792
20    789
19    788
13    787
5     787
1     780
16    775
9     775
11    774
10    770
2     769
17    767
7     761
12    761
4     760
6     751
3     751
8     750
14    737
21    715
Name: Input Pattern, dtype: int64

In [84]:
# Inspecting the relative dates dataset
df.head()

Unnamed: 0,Input Pattern,Noise Type,Input,Target
0,19,,"Amazonas, Manaus, Coroado, Avenida Rodrigo Otá...","Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma..."
1,2,,"Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma...","Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma..."
2,14,,Bloco 9 - Sala 2 na altura do 4757 na Avenida ...,"Avenida Rodrigo Otávio 6200, 4757, Bloco 9 - S..."
3,4,,"Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma...","Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma..."
4,5,,"Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma...","Avenida Rodrigo Otávio 6200, 4757, Coroado, Ma..."


## Function to split the dataset

In [85]:
def split_data(df, test_size=0.2, verbose=True):
    l = list(set(df['Input Pattern'].values))
    num_test = int(len(l)*test_size)
    test_methods = [random.randint(1, len(l)) for _ in range(num_test)]
    print(test_methods)
    df_test = df[df['Input Pattern'].isin(test_methods)]
    print(df_test.shape)
    x_test = df_test.Input.values
    y_test = df_test.Target.values

    df_train = df[~df['Input Pattern'].isin(test_methods)]

    x_train, x_val, y_train, y_val = train_test_split(
        df_train.Input.values,
        df_train.Target.values,
        shuffle=True, 
        test_size=test_size,
        random_state=manual_seed
        )
    if verbose:
        print(f'Address types of test set: {test_methods} with len: {len(test_methods)}')
        print(f'x_train: {len(x_train)}  --  y_train: {len(y_train)}\n\
x_val:   {len(x_val)}  --  y_val:   {len(y_val)}\n\
x_test:  {len(x_test)}  --  y_test:  {len(y_test)}')

    return x_train, y_train, x_val, y_val, x_test, y_test

# creating sets
x_train, y_train, x_val, y_val, x_test, y_test = split_data(df, 
                                                            test_size=0.25, 
                                                            verbose=True)

[8, 5, 1, 18, 14]
(3846, 4)
Address types of test set: [8, 5, 1, 18, 14] with len: 5
x_train: 9820  --  y_train: 9820
x_val:   3274  --  y_val:   3274
x_test:  3846  --  y_test:  3846


In [86]:
class AddressDataset(Dataset):
    def __init__(self, data, label, tokenizer, source_max_length, target_max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.label = label
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        source = self.data[idx]
        target = self.label[idx]

        source = unicodedata.normalize('NFD', source).encode('latin-1', 'xmlcharrefreplace').decode('latin-1')
        target = unicodedata.normalize('NFD', target).encode('latin-1', 'xmlcharrefreplace').decode('latin-1')


        source_tokenized = self.tokenizer.encode_plus(
            f'{source} {self.tokenizer.eos_token}',
            max_length=self.source_max_length,
            pad_to_max_length=True,
            return_tensors='pt')

        target_tokenized = self.tokenizer.encode_plus(
            f'{target} {self.tokenizer.eos_token}',
            max_length=self.target_max_length,
            pad_to_max_length=True,
            return_tensors='pt')

        source_token_ids = source_tokenized['input_ids'].squeeze()
        source_mask = source_tokenized['attention_mask'].squeeze()
        target_token_ids = target_tokenized['input_ids'].squeeze()
        
        return source_token_ids, source_mask, target_token_ids

## Checking the AddressDataset class

In [87]:
dataset_debug = AddressDataset(
    x_train, 
    y_train,
    TOK,
    MAX_LEN_SRC,
    MAX_LEN_TRGT,
    )

dataloader_checking = DataLoader(
    dataset_debug, 
    batch_size=1, 
    shuffle=True, 
    num_workers=0
    )

source_token_ids, source_mask, target_token_ids = next(iter(dataloader_checking))
print(f'source_token_ids:\n {source_token_ids} --- shape:{source_token_ids.shape}')
print(f'source_mask:\n {source_mask} --- shape:{source_mask.shape}')
print(f'target_token_ids:\n {target_token_ids} --- shape:{target_token_ids.shape}')

source_token_ids:
 tensor([[  901,     9,  2726,     9,   211,  2740,  7446,     9,     7,     6,
          9668,  3959,     6,   205, 15644,    15,  2766,     6,   242, 11686,
          1629,     6,  1064,  2551,   184,  4663,   940,  3951,   117,     6,
          1640, 11776, 16975,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,    

## Datasets e Dataloaders

In [88]:
# datasets
ds_debug = AddressDataset(x_train[:BATCH_SZ], y_train[:BATCH_SZ], TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_train = AddressDataset(x_train, y_train, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_valid = AddressDataset(x_val, y_val, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)
ds_test  = AddressDataset(x_test, y_test, TOK, MAX_LEN_SRC, MAX_LEN_TRGT)

print('Datasets len:')
print(f'len ds_debug: {len(ds_debug)}')
print(f'len ds_train: {len(ds_train)}')
print(f'len ds_valid: {len(ds_valid)}')
print(f'len ds_test:  {len(ds_test)}')

# dataloaders
dataloaders = {
    'debug': DataLoader(
         ds_debug,
         batch_size=BATCH_SZ,
         shuffle=True,
         num_workers=2,
         pin_memory=True),
    'train': DataLoader(
         ds_train,
         batch_size=BATCH_SZ,
         shuffle=True,
         num_workers=2,
         pin_memory=True),
    'valid': DataLoader(
         ds_valid,
         batch_size=BATCH_SZ,
         shuffle=False,
         num_workers=2,
         pin_memory=True),
    'test': DataLoader(
         ds_test,
         batch_size=BATCH_SZ,
         shuffle=False,
         num_workers=2,
         pin_memory=True),
               }
# sanity check
print('\nDataloaders len (in batch):')
dl_sizes = {x: len(dataloaders[x]) for x in dataloaders.keys()}; dl_sizes

Datasets len:
len ds_debug: 16
len ds_train: 9820
len ds_valid: 3274
len ds_test:  3846

Dataloaders len (in batch):


{'debug': 1, 'test': 241, 'train': 614, 'valid': 205}

In [89]:
# testando o dataloader 
source_token_ids, source_mask, target_token_ids = next(iter(dataloaders['debug']))

In [90]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_SZ)
    
    def forward(self, token_ids, att_mask, labels):
        outputs = self.model.forward(
            input_ids=token_ids, 
            attention_mask=att_mask,
            lm_labels=labels
            )
        return outputs[0] # loss
    
    @torch.no_grad()    
    def generate(self, token_ids, att_mask, max_len_target):
        predict = self.model.generate(
            input_ids=token_ids, 
            attention_mask=att_mask,
            max_length=max_len_target
            )
        return predict
    
    @torch.no_grad()  
    def generate_example(self, text_input, tokenizer, max_len_source=MAX_LEN_SRC):

        self.model.eval()
        
        example_tokenized = tokenizer.encode_plus(
            f'{text_input} {tokenizer.eos_token}',
            max_length=max_len_source,
            pad_to_max_length=True,
            return_tensors='pt')
            
        example_token_ids = example_tokenized['input_ids']
        example_mask = example_tokenized['attention_mask']

        predicted_example = self.model.generate(
            input_ids=example_token_ids.to(device), 
            attention_mask=example_mask.to(device),
            max_length=MAX_LEN_TRGT
            )

        self.model.train()

        out_text = [tokenizer.decode(text) for text in predicted_example]
        
        return out_text

## Train and evaluation functions

In [91]:
# acc metric for text inputs
def acc_in_text(trues, preds): 
    acc = []
    for d in zip(trues, preds):
        if d[0] == d[1]:
            acc.append(1)
        else:
            acc.append(0)
    return acc # bool

def train(model, device, train_loader, optimizer):
    loss_train = []
    model.train()
    for source_token_ids, source_mask, target_token_ids in train_loader:
        optimizer.zero_grad()
        loss = model(
            source_token_ids.to(device), 
            source_mask.to(device), 
            target_token_ids.to(device)
            )
        
        loss_train.append(loss.item())
        loss.backward()
        optimizer.step()
    
    train_losses = sum(loss_train) / len(loss_train)
  
    return train_losses

def evaluate_fn(model, device, val_loader, max_len=MAX_LEN_TRGT):
    loss_val, all_acc, all_preds, all_trues = [], [], [], []
    model.eval()
    for source_token_ids, source_mask, target_token_ids in val_loader:
        predicted_ids = model.generate(
            source_token_ids.to(device), 
            source_mask.to(device),
            max_len
            )
        
        preds = [TOK.decode(t) for t in predicted_ids]
        trues = [TOK.decode(t) for t in target_token_ids]
        acc = acc_in_text(trues, preds)
        all_acc.extend(acc)
        all_trues.extend(trues)
        all_preds.extend(preds)
        
        # val loss   
        loss = model(
        source_token_ids.to(device), 
        source_mask.to(device), 
        target_token_ids.to(device)
        )
        loss_val.append(loss.item())
    
    val_losses = sum(loss_val) / len(loss_val)
    
    return val_losses, np.array(all_acc).mean(), all_trues, all_preds

# Overfit in one batch 
- dataloader debug

In [92]:
overfit = True

if overfit:

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    deterministic() 

    model = Net().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    
    # -----------------------------------------------------------------------------
    start.record()
    for step in range(1, 1001):
        samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
        loss_t = train(model, device, dataloaders['debug'], optimizer)
        loss,acc, trues, preds = evaluate_fn(model, device, dataloaders['debug'])
        if step == 1:
            print(f'[Epoch: {step}/{1000}] |', end=' ')
            print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
        if step % 100 == 0:
            print(f'[Epoch: {step}/{1000}] |', end=' ')
            print(f'Train Loss: {loss_t:.3f} -- Acc: {acc:.3f}')
            print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')
        if acc>= 0.99:
          print('The model has overfitted! Breaking the loop :)')
          break
    end.record()
    torch.cuda.synchronize()    
    # -----------------------------------------------------------------------------

    print(f'Training time: {start.elapsed_time(end)/1000/60 :.3f} min.')
    del model

Deterministic experiment, seed: 2357
[Epoch: 1/1000] | Train Loss: 10.578 -- Acc: 0.062
[Epoch: 100/1000] | Train Loss: 0.407 -- Acc: 0.000
  Trues: ['Rua 12A, 1056, Apartamento 100, Jardim Ame&#769;rica Prolongamento A, Rio Verde', 'Viela Alfredo Soares da Silva, 1036, Box 80, Parque Residencial Francisco Belo Galindo, Presidente Prudente, SP, 19097608', 'Avenida Senador Ati&#769;lio Fontana, S/N, Distrito Industrial, Rondono&#769;polis, MT', 'Rua Sa&#771;o Urbano, 5135, Sala 60, Vila Yara, Sa&#771;o Paulo, SP', 'Rua Duas Irma&#771;s, 8679, Jardim Almeida, Sa&#771;o Paulo, SP', 'Rua Jose&#769; Rodrigues de Oliveira, 6087, Bloco 9 - Sala 2, do Quadro, Vito&#769;ria, ES', 'Rua Sa&#771;o Pedro, 6312, Vila Merce&#770;s, Carapicui&#769;ba, SP, 6380170']
  Preds: ['', '', '', '', '', '', '']
[Epoch: 200/1000] | Train Loss: 0.208 -- Acc: 0.000
  Trues: ['Avenida Senador Ati&#769;lio Fontana, S/N, Distrito Industrial, Rondono&#769;polis, MT', 'Rua Maria Adelaide Vieira, 1435, Bloco 9 - Sala 2

# Training 

In [93]:
try:
  del model
except:
  print('Model already erased, starting a new one!')
  
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
deterministic() 

model = Net().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

N_EPOCHS = 10

# ---------------------------------------------------------------------------------
start.record()
for step in range(1, N_EPOCHS+1):
    samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
    loss_t = train(model, device, dataloaders['train'], optimizer)
    loss_v, acc, trues, preds = evaluate_fn(model, device, dataloaders['valid'])
    print(f'[Epoch: {step}/{N_EPOCHS}] |', end=' ')
    print(f'Train Loss: {loss_t:.3f} -- Valid Loss: {loss_v:.3f} -- Acc: {acc:.3f}')
    print(f'  Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

print(f'Training time: {start.elapsed_time(end)/1000/60 :.3f} min.')

Model already erased, starting a new one!
Deterministic experiment, seed: 2357
[Epoch: 1/10] | Train Loss: 0.460 -- Valid Loss: 0.041 -- Acc: 0.346
  Trues: ['Rua Santa Cruz, 1027, Bom Sucesso, Gravatai&#769;, RS', 'Rua Professor Eduardo de Martini, 7508, Torre 6 - Sala 192, Nu&#769;cleo Residencial Castelo Branco, Sa&#771;o Carlos, SP, 13571100', 'Rua Francisco de Souza Pinheiro, S/N, Torre 2,Parai&#769;so, Cataguases, MG', "Rua da Borracha, S/N, Box 80,Jardim Pe&#769;rola, Santa Ba&#769;rbara D'Oeste, SP", 'Rua Dezesseis, 9697, Cidade Oli&#769;mpica, Sa&#771;o Lui&#769;s, MA, 65058536', 'Estrada Silvio Pelicer Filho, 2013, Sala 60, Cha&#769;cara Jockey Club (Zona Rural), Sa&#771;o Jose&#769; do Rio Preto, SP', 'Rua Serra da Mantiqueira, 9914, Torre 2, Parque Ponte Grande, Mogi das Cruzes, SP']
  Preds: ['Rua Santa Cruz, 1027, Gravatai&#769;, Bom Sucesso, Gravatai&#769;, Gravatai&#769;, Gravatai&#769;, Gravatai&#769;, Gravatai&#', 'Rua Professor Eduardo de Martini, 7508, Torre 6 - Sal

# Test

In [94]:
# ---------------------------------------------------------------------------------
start.record()

samp = random.randint(0, BATCH_SZ-WINDOW) # to show random trues and preds
loss, acc, trues, preds = evaluate_fn(model, device, dataloaders['test'])
print(f'Loss: {loss:.3f} -- Acc: {acc:.3f}')
print(f' Trues: {trues[samp:samp+WINDOW]}\n  Preds: {preds[samp:samp+WINDOW]}')

end.record()
torch.cuda.synchronize()    
# ---------------------------------------------------------------------------------

print(f'Test time: {start.elapsed_time(end)/1000/60 :.3f} min.')

Loss: 0.061 -- Acc: 0.774
 Trues: ['Rua Jose&#769; Raunheitti, 4364, Parque Horizonte, Nova Iguac&#807;u, RJ', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ, 25745010', 'Estrada do Gentil, 6412, Torre 8 - Apartamento 107,Itaipava, Petro&#769;polis, RJ', 'Avenida Alberto Giovannini, 130, Betha&#770;nia, Ipatinga, MG', 'Quadra 726, 6751, Loja 79,Parque Estrela Dalva X, Luzia&#770;nia, GO']
  Preds: ['Rua Jose&#769; Raunheitti, 4364, Parque Horizonte, Nova Iguac&#807;u, RJ', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ, 25745010', 'Estrada do Gentil, 6412, Itaipava, Petro&#769;polis, RJ', 'Avenida Alberto Giovannini, 130, Betha&#770;nia, Ipatinga, MG', 'Quadra 726, 6751, Parque Estrela Dalva X, Luzia&#770;nia, GO']
Test time: 2.469 mi

# Evaluating types for a same address

Given a sample address, this section evaluates wich is the accuracy considering all the formats.




In [99]:
def evaluate_for_a_same_address(logradouro,numero,complemento,
                bairro,cidade,uf,cep, model=model,tokenizer=TOK,
                verbose=True):
  '''
    Given a specific address, returns the accuracy in all evaluated types.
    Also prints results per sample.
  '''

  results = []
  
  examples = addresses_gen.generate_demo(logradouro,numero,complemento,
                bairro,cidade,uf,cep)


  for x,target in zip(examples['Generated Text'],examples['Origin Sample']):

    prediction = model.generate_example(x,TOK)[0]

    results.append(prediction == target)

    if verbose:
      print(f'Entrada: {x} -- Target: {target} --- Previsto: {prediction} --- {prediction == target}')

  if verbose:
    print(f'Total accuracy: {np.mean(results)}')

  return np.mean(results)

In [102]:
'''
  using our faculty address as a sample
  Avenida Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083-852
'''
evaluate_for_a_same_address('Av. Albert Einstein', '400', '','Cidade Universitária',
                         'Campinas', 'SP', '13083852')

Entrada: 400, Av. Albert Einstein, Cidade Universitária, Campinas, SP, 13083852 -- Target: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 --- Previsto: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 --- True
Entrada: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 -- Target: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 --- Previsto: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 --- True
Entrada: Altura do 400 na Av. Albert Einstein no Cidade Universitária de Campinas -- Target: Av. Albert Einstein, 400, Cidade Universitária, Campinas --- Previsto: Av. Albert Einstein, 400, Cidade Universitária, Campinas --- True
Entrada: Av. Albert Einstein, 400, Cidade Universitária, Campinas, São Paulo, 13083852 -- Target: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 --- Previsto: Av. Albert Einstein, 400, Cidade Universitária, Campinas, SP, 13083852 --- 

0.9545454545454546

# The End