In [1]:
import spacy
from datasets import load_dataset

In [2]:
dataset = load_dataset("bentrevett/multi30k")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [4]:
dataset["train"]["en"][0]

'Two young, White males are outside near many bushes.'

In [5]:
spacy_de = spacy.load("de_core_news_sm")

In [6]:
spacy_de.tokenizer("zwei").text

'zwei'

In [66]:
def tokenize_data(example, tokenizer, bos_token, eos_token, max_length=10000):
    
    en_tokens = [bos_token] + [token.text for token in tokenizer.tokenizer(example['en'])][:max_length] + [eos_token]
    de_tokens = [bos_token] + [token.text for token in tokenizer.tokenizer(example['de'])][:max_length] + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

In [67]:
from torchtext.vocab import build_vocab_from_iterator

In [68]:
dataset = load_dataset("bentrevett/multi30k")

In [69]:
spacy_en = spacy.load("en_core_web_sm")

In [70]:
max_length = 100000
lower = True
bos_token = "<bos>"
eos_token = "<eos>"

fn_kwargs = {
    "tokenizer": spacy_en,
    "max_length": max_length,
    "bos_token": bos_token,
    "eos_token": eos_token,
}

train_dataset = dataset['train'].map(tokenize_data, fn_kwargs=fn_kwargs)
valid_dataset = dataset['validation'].map(tokenize_data, fn_kwargs=fn_kwargs)
test_dataset = dataset['test'].map(tokenize_data, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [71]:
vocab_en = build_vocab_from_iterator(train_dataset["en_tokens"] + valid_dataset["en_tokens"] + test_dataset["en_tokens"])
vocab_de = build_vocab_from_iterator(train_dataset["de_tokens"] + valid_dataset["de_tokens"] + test_dataset["de_tokens"])

31014lines [00:00, 885418.50lines/s]
31014lines [00:00, 813191.29lines/s]


In [72]:
vocab_en.itos[:10]

['<unk>', '<pad>', 'a', '<bos>', '<eos>', '.', 'A', 'in', 'the', 'on']

In [73]:
unk_token = "<unk>"
pad_token = "<pad>"

In [74]:
assert vocab_en[unk_token] == vocab_de[unk_token]

unk_index = vocab_en[unk_token]
pad_index = vocab_en[pad_token]

In [75]:
tokens = ["i", "love", "watching", "crime", "shows"]
vocab_en[tokens[0]]

2695

In [76]:
def tokenize(example, vocab_en, vocab_de):

    return {
        "en_ids": [vocab_en[x] for x in example["en_tokens"]],
        "de_ids": [vocab_de[x] for x in example["de_tokens"]],
    }

In [77]:
fn_kwargs = {"vocab_en": vocab_en, "vocab_de": vocab_de}

train_data = train_dataset.map(tokenize, fn_kwargs=fn_kwargs)
valid_data = valid_dataset.map(tokenize, fn_kwargs=fn_kwargs)
test_data = test_dataset.map(tokenize, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [78]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<bos>',
  'Two',
  'young',
  ',',
  'White',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<bos>',
  'Zwei',
  'junge',
  'weiße',
  'Männer',
  'sind',
  'i',
  'm',
  'Freien',
  'in',
  'der',
  'Nähe',
  'vieler',
  'Büsche',
  '.',
  '<eos>'],
 'en_ids': [3, 19, 25, 15, 1138, 822, 17, 58, 84, 335, 1359, 5, 4],
 'de_ids': [2, 21, 87, 225, 33, 91, 23, 22, 95, 7, 16, 121, 8239, 3403, 4, 3]}

In [None]:
def get_collate_fn(pad_idx=0):

    def collate_fn(batch_dict):

        token_ids = [x['en_ids'] for x in batch_dict]
        max_len = [len(x) for x in token_ids]