## Data Loading

In [23]:
import polars as pl
import requests
import io


# File names of the test/training JSON splits, one pair per language code
# (deu, fra, spa). read_json_from_git() appends these names to the URL of the
# repository's data/ directory.
DE_TEST_FILE  = 'deu_test.json'
DE_TRAIN_FILE = 'deu_training.json'
FR_TEST_FILE  = 'fra_test.json'
FR_TRAIN_FILE = 'fra_training.json'
ES_TEST_FILE  = 'spa_test.json'
ES_TRAIN_FILE = 'spa_training.json'

def read_json_from_git(fname, timeout=30):
    """Download a JSON data file from the GitHub repository and parse it.

    Args:
        fname: File name inside the repository's ``data/`` directory.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Whatever ``pl.read_json`` produces for the downloaded document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    url = f'https://raw.githubusercontent.com/wortcook/LLMs-from-scratch/refs/heads/main/data/{fname}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on a missing file or server error instead of handing the
    # error page to the JSON parser, which would produce a confusing failure.
    response.raise_for_status()

    return pl.read_json(io.StringIO(response.content.decode('utf-8')))

# Download deu_training.json and print its first rows as a sanity check.
de_train  = read_json_from_git(DE_TRAIN_FILE)
print(de_train.head())


#tokenize
def tokenize(sentence):
    """Split ``sentence`` into tokens on runs of whitespace.

    Punctuation stays attached to the neighbouring word, and an empty or
    all-whitespace string yields an empty list.
    """
    tokens = sentence.split()
    return tokens

def build_vocab(df, col):
    """Map every distinct token found in ``df[col]`` to an integer id.

    Ids are handed out in order of first appearance, starting at 0.

    Args:
        df: Object indexable by ``col`` that yields token sequences.
        col: Key of the column holding the token sequences.

    Returns:
        A dict from token to its id.
    """
    token_ids = {}
    for tokens in df[col]:
        for token in tokens:
            # setdefault leaves a known token's id alone; a new token gets
            # the current dict size, i.e. the next unused id.
            token_ids.setdefault(token, len(token_ids))
    return token_ids

def index_tokens(tokens, vocab):
    """Translate a token sequence into the list of its vocabulary ids.

    Args:
        tokens: Iterable of tokens to look up.
        vocab: Mapping from token to id.

    Returns:
        A list with one id per token, in the same order.

    Raises:
        KeyError: If a token is not present in ``vocab``.
    """
    indices = []
    for token in tokens:
        indices.append(vocab[token])
    return indices

# Split each sentence on whitespace and keep the pieces in the new list
# columns "source_tokens" and "target_tokens".
de_train = de_train.with_columns([
    pl.col("source").map_elements(tokenize, return_dtype=pl.List(pl.Utf8)).alias("source_tokens"),
    pl.col("target").map_elements(tokenize, return_dtype=pl.List(pl.Utf8)).alias("target_tokens"),
])

print(de_train.head())

# NOTE(review): the printed preview shows English sentences in "source" and
# German sentences in "target", so `de_vocab` appears to hold the English
# vocabulary and `en_vocab` the German one. The names are kept unchanged
# because other cells may depend on them -- confirm and rename if intended.
de_vocab = build_vocab(de_train, "source_tokens")

print(de_vocab)

en_vocab = build_vocab(de_train, "target_tokens")

# Ids are stored as UInt32: UInt16 tops out at 65535, which a vocabulary of
# whitespace-separated tokens can exceed.
de_train = de_train.with_columns([
    pl.col("source_tokens").map_elements(lambda x: index_tokens(x, de_vocab), return_dtype=pl.List(pl.UInt32)).alias("source_indices"),
    pl.col("target_tokens").map_elements(lambda x: index_tokens(x, en_vocab), return_dtype=pl.List(pl.UInt32)).alias("target_indices"),
])

print(de_train.head())



shape: (5, 2)
┌────────────────────────────────┬─────────────────────────────────┐
│ source                         ┆ target                          │
│ ---                            ┆ ---                             │
│ str                            ┆ str                             │
╞════════════════════════════════╪═════════════════════════════════╡
│ Tom didn't even smile.         ┆ Tom hat nicht mal gelächelt.    │
│ Tom is a really bad singer.    ┆ Tom ist ein wirklich schlechte… │
│ I don't want to fail my exams. ┆ Ich will nicht durch meine Prü… │
│ There were many wounded.       ┆ Es gab viele Verletzte.         │
│ We became very good friends.   ┆ Wir wurden sehr gute Freunde.   │
└────────────────────────────────┴─────────────────────────────────┘
shape: (5, 4)
┌────────────────────────┬────────────────────────┬────────────────────────┬───────────────────────┐
│ source                 ┆ target                 ┆ source_tokens          ┆ target_tokens         │
│ ---      