## Data Loading

In [None]:
import polars as pl
import requests
import io


# JSON file names of the training / test splits, grouped by the
# three-letter language prefix used in the file name (deu, fra, spa).
DE_TRAIN_FILE = 'deu_training.json'
DE_TEST_FILE = 'deu_test.json'

FR_TRAIN_FILE = 'fra_training.json'
FR_TEST_FILE = 'fra_test.json'

ES_TRAIN_FILE = 'spa_training.json'
ES_TEST_FILE = 'spa_test.json'

def read_json_from_git(fname, timeout=30):
    """Download a JSON data file from the GitHub repository and parse it with polars.

    Args:
        fname: File name appended to the repository's raw ``data/`` URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The result of ``pl.read_json`` on the downloaded, UTF-8 decoded body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f'https://raw.githubusercontent.com/wortcook/LLMs-from-scratch/refs/heads/main/data/{fname}'
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page
    # (e.g. "404: Not Found") to the JSON parser.
    response.raise_for_status()

    return pl.read_json(io.StringIO(response.content.decode('utf-8')))

# Fetch the 'deu_training.json' split and keep it as the working frame.
de_train = read_json_from_git(fname=DE_TRAIN_FILE)


#tokenize
def tokenize(sentence):
    """Split ``sentence`` on runs of whitespace and return the pieces as a list."""
    pieces = sentence.split()
    return pieces

def build_vocab(vocab, df, col):
    """Extend ``vocab`` with every unseen token found in ``df[col]``.

    Each element of ``df[col]`` is iterated as a sequence of tokens. A token
    that is not yet a key of ``vocab`` is assigned the next free index, i.e.
    the size of ``vocab`` at the moment it is first seen.

    Args:
        vocab: Mapping from token to index; it is updated in place.
        df: Object indexable by ``col`` that yields token sequences.
        col: Key/column name holding the token sequences.

    Returns:
        The same ``vocab`` object that was passed in.
    """
    for token_seq in df[col]:
        for token in token_seq:
            # setdefault only inserts when the token is missing; len(vocab)
            # is evaluated before the insertion, so indices stay contiguous.
            vocab.setdefault(token, len(vocab))
    return vocab

def index_tokens(tokens, vocab, unk_index=None):
    """Map each token to its index in ``vocab``.

    Args:
        tokens: Iterable of tokens to look up.
        vocab: Mapping from token to index.
        unk_index: Optional index to use for tokens missing from ``vocab``.
            When left as ``None`` (the default) a missing token raises
            ``KeyError``, exactly as a plain ``vocab[token]`` lookup does.

    Returns:
        A list with one index per input token, in input order.

    Raises:
        KeyError: If a token is not in ``vocab`` and ``unk_index`` is ``None``.
    """
    if unk_index is None:
        return [vocab[token] for token in tokens]
    # Fallback path: lets a vocabulary built on one split index another split
    # that contains tokens it has never seen.
    return [vocab.get(token, unk_index) for token in tokens]

# Whitespace-tokenize both text columns into new list[str] columns.
de_train = de_train.with_columns([
    pl.col("source").map_elements(tokenize, return_dtype=pl.List(pl.Utf8)).alias("source_tokens"),
    pl.col("target").map_elements(tokenize, return_dtype=pl.List(pl.Utf8)).alias("target_tokens"),
])

# One vocabulary per column; index 0 is pre-assigned to the empty-string token.
# NOTE(review): the sample rows displayed for this frame show English text in
# "source" and German text in "target", so `de_vocab` appears to hold English
# words and `en_vocab` German ones — confirm and consider renaming.
de_vocab = build_vocab({"": 0}, de_train, "source_tokens")
en_vocab = build_vocab({"": 0}, de_train, "target_tokens")

# The index columns below are stored as UInt16 (max value 65_535). Fail loudly
# here rather than risk overflow if either vocabulary has outgrown that range.
largest_index = max(len(de_vocab), len(en_vocab)) - 1
if largest_index > 65_535:
    raise ValueError(
        f"vocabulary index {largest_index} does not fit in UInt16; "
        "widen the return_dtype of the index columns"
    )

# Replace every token by its vocabulary index.
de_train = de_train.with_columns([
    pl.col("source_tokens").map_elements(lambda x: index_tokens(x, de_vocab), return_dtype=pl.List(pl.UInt16)).alias("source_indices"),
    pl.col("target_tokens").map_elements(lambda x: index_tokens(x, en_vocab), return_dtype=pl.List(pl.UInt16)).alias("target_indices"),
])

# Longest sequence (in tokens) on each side.
max_source_len = de_train["source_indices"].list.len().max()
max_target_len = de_train["target_indices"].list.len().max()

print(de_train.head())



********************************************************************************
101
76
********************************************************************************
shape: (5, 6)
┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐
│ source         ┆ target         ┆ source_tokens  ┆ target_tokens ┆ source_indice ┆ target_indice │
│ ---            ┆ ---            ┆ ---            ┆ ---           ┆ s             ┆ s             │
│ str            ┆ str            ┆ list[str]      ┆ list[str]     ┆ ---           ┆ ---           │
│                ┆                ┆                ┆               ┆ list[u16]     ┆ list[u16]     │
╞════════════════╪════════════════╪════════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Tom didn't     ┆ Tom hat nicht  ┆ ["Tom",        ┆ ["Tom",       ┆ [1, 2, … 4]   ┆ [1, 2, … 5]   │
│ even smile.    ┆ mal gelächelt. ┆ "didn't", …    ┆ "hat", …      ┆               ┆               │
│        

In [38]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint whose tokenizer is loaded.
# NOTE(review): the id reads "de-en", while the sample rows displayed for
# `de_train` show English in "source" and German in "target" — confirm that
# this checkpoint matches the intended translation direction.
TOKENIZER_CHECKPOINT = "Helsinki-NLP/opus-mt-de-en"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_with_transformers(sentence):
    """Return the result of ``tokenizer.tokenize`` for ``sentence``.

    Uses the module-level ``tokenizer`` object.
    """
    pieces = tokenizer.tokenize(sentence)
    return pieces

def encode_with_transformers(sentence):
    """Return the result of ``tokenizer.encode`` for ``sentence``.

    Uses the module-level ``tokenizer`` object.
    """
    token_ids = tokenizer.encode(sentence)
    return token_ids

# Column dtypes shared by the four expressions below.
TOKEN_LIST_DTYPE = pl.List(pl.Utf8)
TOKEN_ID_LIST_DTYPE = pl.List(pl.UInt16)

# Add the pretrained tokenizer's tokens and ids next to the whitespace-based
# columns, for both the "source" and the "target" text.
de_train = de_train.with_columns(
    pl.col("source")
    .map_elements(tokenize_with_transformers, return_dtype=TOKEN_LIST_DTYPE)
    .alias("source_tokens_transformers"),
    pl.col("target")
    .map_elements(tokenize_with_transformers, return_dtype=TOKEN_LIST_DTYPE)
    .alias("target_tokens_transformers"),
    pl.col("source")
    .map_elements(encode_with_transformers, return_dtype=TOKEN_ID_LIST_DTYPE)
    .alias("source_indices_transformers"),
    pl.col("target")
    .map_elements(encode_with_transformers, return_dtype=TOKEN_ID_LIST_DTYPE)
    .alias("target_indices_transformers"),
)

print(de_train.head())

shape: (5, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ source    ┆ target    ┆ source_to ┆ target_to ┆ … ┆ source_to ┆ target_to ┆ source_in ┆ target_i │
│ ---       ┆ ---       ┆ kens      ┆ kens      ┆   ┆ kens_tran ┆ kens_tran ┆ dices_tra ┆ ndices_t │
│ str       ┆ str       ┆ ---       ┆ ---       ┆   ┆ sformers  ┆ sformers  ┆ nsformers ┆ ransform │
│           ┆           ┆ list[str] ┆ list[str] ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ers      │
│           ┆           ┆           ┆           ┆   ┆ list[str] ┆ list[str] ┆ list[u16] ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ list[u16 │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ ]        │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Tom       ┆ Tom hat   ┆ ["Tom",   ┆ ["Tom",   ┆ … ┆ ["▁Tom",  ┆ ["▁Tom",  