In [1]:
import sys
sys.path.append('../')
from transformer.utils import load_config, get_dataset, get_tokenizer, timer, calculate_max_lengths
from pathlib import Path

## Load config

In [2]:
# Relative path: the config file sits one directory above this notebook,
# the same parent directory that was appended to sys.path for the imports.
config_file_path = "../config.json"
# `config` is handed to the dataset, tokenizer and max-length helpers in later cells.
config = load_config(config_file_path)

## Load dataset

In [3]:
# Build the dataset object from the loaded config; later cells inspect it
# and pass it to the tokenizer and max-length helpers.
dataset = get_dataset(config)

In [4]:
# Bare expression: the notebook renders the dataset's repr (feature names and row count).
dataset

Dataset({
    features: ['id', 'translation'],
    num_rows: 209479
})

In [5]:
# Preview the first 10 translation pairs.
# Slice the rows first, then select the column: `dataset['translation'][:10]`
# addresses the entire 'translation' column (209,479 rows in the recorded run)
# before slicing, whereas `dataset[:10]` only reads the ten rows being shown.
# The displayed result is the same list of {'en': ..., 'fr': ...} dicts.
dataset[:10]['translation']

[{'en': '$10,000 Gold?', 'fr': 'L’or à 10.000 dollars l’once\xa0?'},
 {'en': 'SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.',
  'fr': 'SAN FRANCISCO – Il n’a jamais été facile d’avoir une discussion rationnelle sur la valeur du métal jaune.'},
 {'en': 'Lately, with gold prices up more than 300% over the last decade, it is harder than ever.',
  'fr': 'Et aujourd’hui, alors que le cours de l’or a augmenté de 300 pour cent au cours de la dernière décennie, c’est plus difficile que jamais.'},
 {'en': 'Just last December, fellow economists Martin Feldstein and Nouriel Roubini each penned op-eds bravely questioning bullish market sentiment, sensibly pointing out gold’s risks.',
  'fr': 'En décembre dernier, mes collègues économistes Martin Feldstein et Nouriel Roubini ont chacun publié une tribune libre dans laquelle ils doutaient courageusement du marché haussier, soulignant de manière sensée les risques liés à l’or.'},
 {'en': 'Wouldn’t you

## Load tokenizer

In [6]:
# One tokenizer per language; which language each one handles is read from the config.
# NOTE(review): whether get_tokenizer trains a new tokenizer or loads a saved one
# is not visible from this notebook — confirm in transformer.utils.
tokenizer_src = get_tokenizer(config, dataset, config['language_source'])

# Same helper and inputs, but for the target-language side.
tokenizer_tgt = get_tokenizer(config, dataset, config['language_target'])

## Explore tokenizer

In [7]:
# Report how many entries each tokenizer's vocabulary contains.
for side, tok in (("Source", tokenizer_src), ("Target", tokenizer_tgt)):
    print(f"{side} vocabulary size: ", tok.get_vocab_size())

Source vocabulary size:  30000
Target vocabulary size:  30000


In [8]:
# Round-trip check for the source tokenizer: encode a sentence, then decode
# the ids that were just produced. Decoding the freshly encoded ids (rather
# than a hard-coded id list) keeps this cell valid if the tokenizer is
# retrained and the vocabulary ids change.
sample_src = 'I love you'
sample_src_ids = tokenizer_src.encode(sample_src).ids
print(f"Source tokenizer encodes '{sample_src}': ", sample_src_ids)
print(f"Source tokenizer decodes {sample_src_ids}: ", tokenizer_src.decode(sample_src_ids))

Source tokenizer encodes 'I love you':  [131, 3181, 345]
Source tokenizer decodes [131, 3181, 345]:  I love you


In [9]:
# Round-trip check for the target tokenizer: encode a sentence, then decode
# the ids that were just produced. Decoding the freshly encoded ids (rather
# than a hard-coded id list) keeps this cell valid if the tokenizer is
# retrained and the vocabulary ids change.
sample_tgt = 'Je vais bien'
sample_tgt_ids = tokenizer_tgt.encode(sample_tgt).ids
print(f"Target tokenizer encodes '{sample_tgt}': ", sample_tgt_ids)
print(f"Target tokenizer decodes {sample_tgt_ids}: ", tokenizer_tgt.decode(sample_tgt_ids))

Target tokenizer encodes 'Je vais bien':  [783, 11957, 70]
Target tokenizer decodes [783, 11957, 70]:  Je vais bien


In [10]:
# Compare capitalized and lowercase spellings of the same word.
# In the recorded output the two spellings map to different ids,
# i.e. the source tokenizer treats them as distinct tokens.
print("Source tokenizer encodes'Love':", tokenizer_src.encode('Love').ids)
print("Source tokenizer encodes 'love': ", tokenizer_src.encode('love').ids)

Source tokenizer encodes'Love': [17854]
Source tokenizer encodes 'love':  [3181]


In [11]:
# token_to_id: look up the vocabulary id of a single token string.
for token in ('Love', 'love'):
    print(f"Source tokenizer encodes'{token}': ", tokenizer_src.token_to_id(token))

Source tokenizer encodes'Love':  17854
Source tokenizer encodes'love':  3181


In [12]:
# id_to_token: map a vocabulary id back to its token string.
for token_id in (17854, 3181):
    print(f"Source tokenizer decodes {token_id}:", tokenizer_src.id_to_token(token_id))

Source tokenizer decodes 17854: Love
Source tokenizer decodes 3181: love


## Check maximum length of source and target sequences

In [13]:
# Get the maximum source and target sequence lengths over the dataset.
# NOTE(review): lengths are presumably counted in tokens (both tokenizers are
# passed in) — confirm in transformer.utils.
# The recorded run reports ~50 s of execution time for this call, so expect
# this cell to be the slow step when re-running the notebook.
max_src_len, max_tgt_len = calculate_max_lengths(dataset, tokenizer_src, tokenizer_tgt, config)
print("Maximum length of source sentences: ", max_src_len)
print("Maximum length of target sentences: ", max_tgt_len)

Execution time: 50.593788385391235 seconds
Maximum length of source sentences:  222
Maximum length of target sentences:  348


In [27]:
# Vocabulary id of the '[SOS]' special token in the source tokenizer
# (presumably the start-of-sequence marker — confirm against the tokenizer setup).
tokenizer_src.token_to_id('[SOS]')

2

In [28]:
# Vocabulary id of the '[EOS]' special token in the source tokenizer
# (presumably the end-of-sequence marker — confirm against the tokenizer setup).
tokenizer_src.token_to_id('[EOS]')

3