In [1]:
import sys
sys.path.append('../')
from transformer.utils import load_config, get_dataset, get_tokenizer, calculate_max_lengths

## Configuration
# The path is relative to the directory this notebook is run from.
config_file_path = "../config.json"
config = load_config(config_file_path)

## Dataset
dataset = get_dataset(config)
# Show the first two translation pairs as a quick sanity check.
first_two_pairs = dataset['translation'][:2]
print(first_two_pairs)

## Tokenizers
# One tokenizer per language; the language codes are read from the config.
source_language = config['language_source']
tokenizer_src = get_tokenizer(config, dataset, source_language)
target_language = config['language_target']
tokenizer_tgt = get_tokenizer(config, dataset, target_language)

## Explore tokenizer
# Check vocabulary size
print("Source vocabulary size: ", tokenizer_src.get_vocab_size())
print("Target vocabulary size: ", tokenizer_tgt.get_vocab_size())

# Check source tokenizer encode/decode round trip.
# decode() is fed the ids that encode() just produced (instead of hard-coded
# ids), so the check stays meaningful if the vocabulary is rebuilt and the
# ids shift.
src_ids = tokenizer_src.encode('I love you').ids
print("Source tokenizer encodes 'I love you': ", src_ids)
print(f"Source tokenizer decodes {src_ids}: ", tokenizer_src.decode(src_ids))

# Check target tokenizer encode/decode round trip (same approach as above).
tgt_ids = tokenizer_tgt.encode('Je vais bien').ids
print("Target tokenizer encodes 'Je vais bien': ", tgt_ids)
print(f"Target tokenizer decodes {tgt_ids}: ", tokenizer_tgt.decode(tgt_ids))

# Comparing uppercase and lowercase words
print("Source tokenizer encodes'Love':", tokenizer_src.encode('Love').ids)
print("Source tokenizer encodes 'love': ", tokenizer_src.encode('love').ids)

# Check token_to_id method (looks the word up as a single token)
love_upper_id = tokenizer_src.token_to_id('Love')
print("Source tokenizer encodes'Love': ", love_upper_id)
love_lower_id = tokenizer_src.token_to_id('love')
print("Source tokenizer encodes'love': ", love_lower_id)

# Check id_to_token method, mapping the ids found above back to tokens.
# NOTE(review): token_to_id presumably returns None when the word is not a
# single vocabulary entry (Hugging Face `tokenizers` behaviour) -- confirm
# against what get_tokenizer actually returns.
for word, token_id in (('Love', love_upper_id), ('love', love_lower_id)):
    if token_id is None:
        # Nothing to map back; report it rather than passing None on.
        print(f"Source tokenizer has no vocabulary entry for '{word}'")
        continue
    print(f"Source tokenizer decodes {token_id}:", tokenizer_src.id_to_token(token_id))

## Longest source and target sequences
# NOTE(review): the recorded output shows an "Execution time: ..." line right
# before the two maximum-length lines; it is presumably printed inside
# calculate_max_lengths -- confirm in transformer.utils.
max_src_len, max_tgt_len = calculate_max_lengths(
    dataset,
    tokenizer_src,
    tokenizer_tgt,
    config,
)
print("Maximum length of source sentences: ", max_src_len)
print("Maximum length of target sentences: ", max_tgt_len)

# Ids the source tokenizer produces for the special-token strings.
sos_encoding = tokenizer_src.encode('[SOS]')
print("SOS token id: ", sos_encoding.ids)
eos_encoding = tokenizer_src.encode('[EOS]')
print("EOS token id: ", eos_encoding.ids)

[{'en': '$10,000 Gold?', 'fr': 'L’or à 10.000 dollars l’once\xa0?'}, {'en': 'SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.', 'fr': 'SAN FRANCISCO – Il n’a jamais été facile d’avoir une discussion rationnelle sur la valeur du métal jaune.'}]
Source vocabulary size:  30000
Target vocabulary size:  30000
Source tokenizer encodes 'I love you':  [131, 3181, 345]
Source tokenizer decodes [131, 3181, 345]:  I love you
Target tokenizer encodes 'Je vais bien':  [783, 11957, 70]
Target tokenizer decodes [783, 11957, 70]:  Je vais bien
Source tokenizer encodes'Love': [17854]
Source tokenizer encodes 'love':  [3181]
Source tokenizer encodes'Love':  17854
Source tokenizer encodes'love':  3181
Source tokenizer decodes 17854: Love
Source tokenizer decodes 3181: love
Execution time: 41.480934619903564 seconds
Maximum length of source sentences:  222
Maximum length of target sentences:  348
SOS token id:  [2]
EOS token id:  [3]
