# Dataset Building

In [None]:
from nltk.tokenize.punkt import PunktLanguageVars
import mysql.connector
import pandas as pd
import csv
import random
from sklearn.model_selection import train_test_split
from typing import Iterable
from huggingface_hub import notebook_login, Repository, HfApi
from datasets import Dataset, concatenate_datasets

## Config

In [None]:
# Authenticate against the Hugging Face Hub from inside the notebook.
# The upload cell at the end of this notebook talks to the Hub through HfApi /
# Repository, so the login is performed up front.

notebook_login()

In [None]:
# --- Hugging Face target repository ---
HF_USERNAME = 'RodrigoSalazar-U'
HF_REPO = 'ANG-dataset'
# "<user>/<repo>" identifier built from the two values above
HF_REPO_PATH = f"{HF_USERNAME}/{HF_REPO}"

# --- Train / test / validation proportions of the parallel dataset ---
SPLIT_TRAIN_RATIO = 0.7
SPLIT_TEST_RATIO = 0.2
SPLIT_VAL_RATIO = 0.1

# --- Pretraining: share of parallel rows that get a Chain-of-Dictionary hint ---
PRETRAINING_COD_RATE = 0.5

# --- Seeds, one per randomized step, so runs are reproducible ---
RANDOM_SEEDS = dict(
    split0=68,
    split1=41,
    split2=53,
    cod0=35,
    corpus0=7,
)

### === FILE LOCATIONS === ###
FILES_ROOT = "data"
FILES_TRANSLATE_FRAGS = FILES_ROOT + "/TranslatedByFragment.tab"
FILES_TRANSLATE_WORDS = FILES_ROOT + "/TranslatedByWord.tab"
FILES_DOEC = FILES_ROOT + "/DOECByFragmentTextOnly.txt"

### === CONSTANTS === ###
# Language codes used as prompt tags
LANG_CODE_ANG = "ANG"
LANG_CODE_EN = "EN"
# DataFrame column names shared by loaders and dataset builders
COL_ANG_TEXT = "ANG_text"
COL_EN_TEXT = "EN_text"
COL_ANG_WORD = "ANG_word"
COL_EN_WORD = "EN_word"
COL_EN_DEF = "EN_def"
COL_COD_EN2ANG = "COD_EN2ANG"
COL_COD_ANG2EN = "COD_ANG2EN"

# Silence the pandas warning emitted when read_sql is handed a raw DB-API
# connection instead of an SQLAlchemy connectable.
import warnings
warnings.filterwarnings(
    "ignore",
    message="pandas only supports SQLAlchemy connectable",
)

## Utils

In [3]:
### === PRINT UTILS === ###
def getDatasetStats(df_dict : dict[str,pd.DataFrame]) -> pd.DataFrame:
    """
    Summarize a collection of named dataframes.

    Args:
        df_dict: Mapping from a display name to a dataframe.

    Returns:
        A dataframe with one row per input entry (in mapping order) and the
        columns ``name``, ``size`` (row count), ``percentage`` (share of the
        combined row count, 0-100) and ``columns`` (sorted, comma-separated
        column names).
    """
    stat_columns = ["name", "size", "percentage", "columns"]
    total_size = sum(len(df) for df in df_dict.values())
    rows = [
        {
            "name": name,
            "size": len(df),
            # When every dataframe is empty there is nothing to divide by;
            # report 0% instead of raising ZeroDivisionError.
            "percentage": 100 * len(df) / total_size if total_size else 0.0,
            "columns": ", ".join(sorted(df.columns)),
        }
        for name, df in df_dict.items()
    ]
    # Explicit columns keep the schema stable even when df_dict is empty.
    return pd.DataFrame(rows, columns=stat_columns)

## Data Standardization

In [4]:


# Shared tokenizer instance used by word_tokenize() and standarize_raw().
WORD_TOKENIZER = PunktLanguageVars()

# Character-level replacements applied to Old English text by standarize_ang().
# Keys are the characters to replace, values are their replacements.
# Only lowercase keys are listed; standarize_raw() lowercases the text before
# applying this table.
NORMALIZE_ANG = {
    # OLD ENGLISH CHARACTERS
    ## Reference: from cltk.phonology.ang.transcription import Word
    ## These characters are not present in Modern English.
    ## If we wanted to standardize them to ASCII, we would use the mapping below.
    ## Nonetheless, experimental results show better performance when keeping these characters.
    ## It is hypothesized that this lets the model better differentiate between Old English and Modern English.
    #"Æ": "Ae",
    #"æ": "ae",
    #"Ƿ": "W",
    #"ƿ": "w",
    #"Þ": "Th",
    #"þ": "th",
    #"Ð": "D",
    #"ð": "d",

    # DIACRITICS
    ## Strip the diacritics identified in the DOEC
    "é": "e",
    "á": "a",
    "â": "a",
    "í": "i",
    "î": "i",
    "ó": "o",
    "ô": "o",
    "ú": "u",
    "û": "u",
    "ý": "y",
    "è": "e",
    "ê": "e",
    "ř": "r",

    # SPECIAL CHARACTERS
    ## Replace the special characters identified in the DOEC with ASCII letters
    "∂": "d",
    "ω": "w",
    "œ": "oe",
    "đ": "d",
}

def word_tokenize(text: str) -> list:
    """
    Split ``text`` into word tokens using the shared WORD_TOKENIZER.
    """
    tokens = WORD_TOKENIZER.word_tokenize(text)
    return tokens

def standarize_ang(text: str) -> str:
    """
    Apply every NORMALIZE_ANG replacement to ``text``.

    Replacements are applied one after another, in the table's own order,
    and the fully substituted string is returned.
    """
    normalized = text
    for original in NORMALIZE_ANG:
        normalized = normalized.replace(original, NORMALIZE_ANG[original])
    return normalized

def standarize_raw(text: str) -> str:
    """
    Standarize raw text by lowercasing and normalizing special characters.

    Steps, in order:
      1. lowercase the text;
      2. tokenize it with WORD_TOKENIZER and re-join the tokens with single spaces;
      3. apply the Old English replacements (standarize_ang);
      4. normalize soft hyphens, whitespace characters and typographic punctuation;
      5. collapse runs of whitespace and strip both ends.

    Args:
        text: Raw input text.

    Returns:
        The normalized text (possibly an empty string).
    """
    # Lowercase
    text = text.lower()
    # Split into tokens
    tokens = WORD_TOKENIZER.word_tokenize(text)
    # Join tokens
    text = " ".join(tokens)
    # Standarize Old English
    text = standarize_ang(text)
    # Special unicode characters.
    # The invisible characters are written as escapes so they stay visible in
    # the source (presumably U+00AD SOFT HYPHEN and U+2013 EN DASH - confirm
    # against the raw data).
    # The "space + soft hyphen + en dash" sequence has to be rewritten BEFORE
    # the soft hyphens are stripped; in the previous order the soft hyphen was
    # already gone, so this replacement could never match.
    text = text.replace(" \u00ad\u2013", "-")
    text = text.replace("\u00ad", "")
    # Remove whitespaces
    for whitespace in ("\n", "\t", "\r"):
        text = text.replace(whitespace, " ")
    # Standarize punctuation
    for fancy, plain in (
        ("—", "-"),
        ("–", "-"),
        ("“", "\""),
        ("”", "\""),
        ("‘", "'"),
        ("’", "'"),
        ("·", "."),
        ("´", "'"),
        ("¸", "'"),
    ):
        text = text.replace(fancy, plain)
    # Remove extra spaces (split/join also trims both ends)
    text = " ".join(text.split())
    return text

## Data Sources

The dataset is compiled from the following sources:

- [Dictionary of Old English Corpus (DOEC)](https://varieng.helsinki.fi/CoRD/corpora/DOEC/)  
  A collection of approximately 3 million words of Old English texts dating from 600–1150 AD. The text is not annotated.

- **Translated DOEC**  
  A subset of the DOEC corpus that includes translations into Modern English. This serves as a parallel corpus at both the word and sentence levels.

- [Bosworth-Toller Anglo-Saxon Dictionary](http://www.bosworthtoller.com/)  
  A comprehensive dictionary of Old English words and their meanings.

The following code makes use of the definitions view. This view is created by the following SQL query:

```sql

CREATE VIEW definitions_view AS
SELECT 
    e.normalized AS word,
    GROUP_CONCAT(DISTINCT wc.name ORDER BY wc.name SEPARATOR '; ') AS wordclass,
    GROUP_CONCAT(DISTINCT wca.name ORDER BY wca.name SEPARATOR '; ') AS wordcategory,
    GROUP_CONCAT(DISTINCT d.normalized ORDER BY d.normalized SEPARATOR '; ') AS definitions
FROM 
    entries e
-- Wordclasses and wordcategories
LEFT JOIN
    (
        SELECT 
            entries_wordclass.entry_id,
            wordclasses.name
        FROM 
            entries_wordclass
        JOIN
            wordclasses ON entries_wordclass.wordclass_id = wordclasses.id
    ) wc
    ON e.id = wc.entry_id
LEFT JOIN
    (
        SELECT 
            entries_wordcategory.entry_id,
            wordcategories.name
        FROM 
            entries_wordcategory
        JOIN
            wordcategories ON entries_wordcategory.wordcategory_id = wordcategories.id
    ) wca
    ON e.id = wca.entry_id
-- Include definitions (MUST exist)
JOIN 
    definitions d ON e.id = d.entry_id
WHERE
    -- Filter out prefixes and suffixes
    e.is_prefix = False
    AND
    e.is_suffix = False
    AND 
    -- Filter out tiny words. Usually these are hard to replace
    LENGTH(e.orthographic) > 2
    AND
    -- Either wordclass or wordcategory must be present
    (wc.name IS NOT NULL OR wca.name IS NOT NULL)
GROUP BY 
    e.normalized
HAVING
    -- Only include words with a single wordclass and wordcategory
    COUNT(DISTINCT wc.name) = 1
    AND
    COUNT(DISTINCT wca.name) = 1
``` 

In [8]:
def load_defintions(
    db_uri: str = "localhost",
    db_user: str = "root",
    db_pass: str = "admin",
    db_database: str = "dictionary",
    db_view: str = "definitions_view",
  ):
    """
    Read dictionary data from MySQL database.

    Args:
        db_uri: Host name of the MySQL server.
        db_user: User name used to connect.
        db_pass: Password used to connect.
        db_database: Database (schema) that holds the dictionary.
        db_view: Name of the view/table to read. It is interpolated into the
            SQL text as an identifier, so it must come from trusted code,
            never from user input.

    Returns:
        A dataframe indexed by the standarized Old English word (COL_ANG_WORD)
        with the columns COL_EN_DEF, "wordclass" and "wordcategory".
    """
    db = mysql.connector.connect(
        host=db_uri,
        user=db_user,
        password=db_pass,
        database=db_database
    )
    try:
        # Get only uniquely identifiable words
        df : pd.DataFrame = pd.read_sql(f"SELECT * FROM {db_view}", db) # type: ignore
    finally:
        # Always release the connection, even when the query fails;
        # previously it was left open.
        db.close()
    df = df[["word", "definitions", "wordclass", "wordcategory"]]
    # Rename columns
    df = df.rename(
        columns={
            "word": COL_ANG_WORD,
            "definitions": COL_EN_DEF
        })
    # Apply standarization
    df[COL_ANG_WORD] = df[COL_ANG_WORD].apply(standarize_raw)
    # Index using ANG WORD
    df = df.set_index(COL_ANG_WORD)
    return df


def load_parallel(
  filepath: str,
  headers: list = ["text", "translation"],
  headers_mapping: dict = {
      "text": COL_ANG_TEXT,
      "translation": COL_EN_TEXT
  }) -> pd.DataFrame:
    """
    Load a tab-separated parallel corpus (Old English fragment + translation).

    The file is read without a header row and without quote handling. Rows
    with missing values are dropped, every column is passed through
    standarize_raw, the columns are renamed via ``headers_mapping`` and rows
    whose Old English or English text ended up empty are discarded.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        frame = pd.read_csv(
            handle,
            sep="\t",
            header=None,
            names=headers,
            quoting=csv.QUOTE_NONE,
        )
    # Rows with a missing cell cannot form a sentence pair
    frame = frame.dropna()
    # Normalize every column in place
    for column in headers:
        frame[column] = frame[column].apply(standarize_raw)
    # text -> ANG, translation -> EN
    frame = frame.rename(columns=headers_mapping)
    # Keep only pairs where neither side is blank ("" or a lone space)
    blank_values = ["", " "]
    ang_blank = frame[COL_ANG_TEXT].isin(blank_values)
    en_blank = frame[COL_EN_TEXT].isin(blank_values)
    return frame[~ang_blank & ~en_blank]

def load_parallel_words(
  filepath: str,
  headers: list = ["word", "translation", "text", "translation_text"],
  headers_mapping: dict = {
      "word": COL_ANG_WORD,
      "translation": COL_EN_WORD,
      "text": COL_ANG_TEXT,
      "translation_text": COL_EN_TEXT
  }):
    """
    Load the tab-separated word-level parallel data.

    Each row is expected to hold a word, its translation, and the fragment and
    translated fragment they belong to. Rows with missing values are dropped;
    then, column by column, the values are passed through standarize_raw and
    rows left blank in that column are removed. Finally the columns are
    renamed via ``headers_mapping``.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        frame = pd.read_csv(
            handle,
            sep="\t",
            header=None,
            names=headers,
            quoting=csv.QUOTE_NONE,
        )
    frame = frame.dropna()
    blank_values = ["", " "]
    for column in headers:
        # Normalize this column, then discard rows that became blank in it
        frame[column] = frame[column].apply(standarize_raw)
        frame = frame[~frame[column].isin(blank_values)]
    # word/text -> ANG columns, translation/translation_text -> EN columns
    return frame.rename(columns=headers_mapping)


def load_corpus(
      filepath: str,
      header: str = "text"
  ):
    """
    Load a monolingual corpus with one fragment per line.

    Every line becomes one row. The text is passed through standarize_raw,
    rows that end up blank are removed and the single column is renamed to
    COL_ANG_TEXT.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()
    # One row per line of the file
    frame = pd.DataFrame(raw_lines, columns=[header]).dropna()
    # Normalize the text
    frame[header] = frame[header].apply(standarize_raw)
    # Discard rows that are blank after normalization
    frame = frame[~frame[header].isin(["", " "])]
    # Rename to ANG
    return frame.rename(columns={header: COL_ANG_TEXT})

In [9]:

# READ FILES
# Loads the four raw sources into dataframes: the dictionary view (MySQL),
# the fragment-level and word-level parallel files, and the plain DOEC corpus.
print("Reading dictionary...")
dictionary_df = load_defintions()
print("Reading translation files...")
translate_df = load_parallel(FILES_TRANSLATE_FRAGS)
word_translate_df = load_parallel_words(FILES_TRANSLATE_WORDS)
print("Reading DOEC files...")
doec_df = load_corpus(FILES_DOEC)
print("Done reading files.")

# Show stats
# (word_translate_df is not part of this summary table)
print("========= Initial datasets =========")
getDatasetStats({"Dictionary": dictionary_df, "Translate": translate_df, "DOEC": doec_df})

Reading dictionary...
Reading translation files...
Reading DOEC files...
Done reading files.


Unnamed: 0,name,size,percentage,columns
0,Dictionary,8929,4.209727,"EN_def, wordcategory, wordclass"
1,Translate,14358,6.769321,"ANG_text, EN_text"
2,DOEC,188817,89.020952,ANG_text


## Split

In [12]:
# Split the dataset into train, val, test
def split_ttv(df: pd.DataFrame, train_size: float, val_size: float, test_size: float, random_state: int):
  """
  Split ``df`` into train, validation and test dataframes.

  The split is done in two stages: first ``val_size + test_size`` of the rows
  are held out from the training set, then the held-out rows are divided
  between validation and test in proportion to ``val_size`` and ``test_size``.

  Args:
      df: Dataframe to split.
      train_size: Accepted for readability at the call site but not used;
          the train share is implicitly ``1 - (val_size + test_size)``.
      val_size: Fraction of ``df`` for the validation split.
      test_size: Fraction of ``df`` for the test split.
      random_state: Seed from which the two per-stage seeds are derived.

  Returns:
      A ``(train_df, val_df, test_df)`` tuple, in that order.
  """
  # Derive the per-stage seeds from a private generator so the process-wide
  # `random` state is not reseeded as a side effect. The derived seeds are the
  # same ones `random.seed(random_state)` + `random.randint` would produce.
  rng = random.Random(random_state)
  seed_1 = rng.randint(0, 1000)
  seed_2 = rng.randint(0, 1000)

  # Stage 1: train vs. everything held out
  holdout_size = val_size + test_size
  train_df, holdout_df = train_test_split(df, test_size=holdout_size, random_state=seed_1)
  # Stage 2: divide the held-out rows; test_size is rescaled to be relative
  # to the held-out portion rather than to the full dataframe
  val_df, test_df = train_test_split(holdout_df, test_size=test_size / holdout_size, random_state=seed_2)
  return train_df, val_df, test_df

def split_tt(df: pd.DataFrame, train_size: float, random_state: int):
  """
  Split ``df`` in two: a part holding ``train_size`` of the rows and the rest.

  Returns a ``(train_df, test_df)`` tuple.
  """
  parts = train_test_split(df, train_size=train_size, random_state=random_state)
  kept_df, remainder_df = parts
  return kept_df, remainder_df

In [15]:
# Split translate dataset
# NOTE: split_ttv takes (train_size, val_size, test_size) and returns
# (train, val, test). This call feeds the TEST ratio through `val_size` and
# the VAL ratio through `test_size`, and then binds the second returned frame
# to the test variable and the third to the val variable. The two swaps cancel
# out: translate_test_df receives SPLIT_TEST_RATIO of the rows and
# translate_val_df receives SPLIT_VAL_RATIO.
# NOTE(review): un-swapping both would keep the sizes but change WHICH rows
# land in test vs. val, so it is kept as-is to preserve the existing splits.
translate_train_df, translate_test_df, translate_val_df = split_ttv(
  translate_df,
  train_size=SPLIT_TRAIN_RATIO,
  val_size=SPLIT_TEST_RATIO,
  test_size=SPLIT_VAL_RATIO,
  random_state=RANDOM_SEEDS["split0"],
)

# Summary of the three resulting splits
print("========= Translate datasets =========")
getDatasetStats({"Train": translate_train_df, "Test": translate_test_df, "Val": translate_val_df})



Unnamed: 0,name,size,percentage,columns
0,Train,10050,69.995821,"ANG_text, EN_text"
1,Test,2872,20.002786,"ANG_text, EN_text"
2,Val,1436,10.001393,"ANG_text, EN_text"


## Data Preparation

### Translation Enrichment

To smooth the model's learning process, we enrich the dataset with the Chain-of-Dictionary (CoD) technique. This technique incorporates word-level translations into the fragments of the DOEC corpus that have been translated into Modern English. In order to help tackle the problem of Out-of-Vocabulary (OOV) words, the words to explain are selected by their frequency in the corpus, prioritizing the least frequent words.


In [None]:

### === FREQUENCY UTILS === ###
def word_frecuency(dataset: Iterable[str]) -> pd.Series:
    """
    Count how often each word token occurs across all texts in ``dataset``.

    Returns a Series indexed by token whose values are the occurrence counts.
    """
    tokens = []
    for text in dataset:
        # Collect the tokens of every text into one flat list
        tokens.extend(word_tokenize(text))
    return pd.Series(tokens).value_counts()

def top_k_infrequent(
        frequency_table: pd.Series,
        query: Iterable[str],
        k: int) -> list[str]:
    """
    Pick the ``k`` least frequent words of ``query``.

    Words that are missing from ``frequency_table`` are ignored, so the result
    holds AT MOST ``k`` words and may hold fewer (or none). Words with equal
    frequency keep the order in which they first appear in ``query``.
    """
    # Frequency of every query word that the table knows about
    known = {
        word: frequency_table[word]
        for word in query
        if word in frequency_table
    }
    # Ascending by frequency; the sort is stable, so ties keep query order
    ranked = sorted(known, key=known.get)
    return ranked[:k]

In [None]:

def apply_COD(
  translation_dataset : pd.DataFrame,
  word_dataset : pd.DataFrame,
  frequency_dataset : pd.Series,
  COD_split : float = 0.5,
  COD_words_ratio : float = 0.5, # Ratio of words to be explained
  COD_words_max : int = 10, # Maximum number of words to be explained
  random_state = 51,
):
  """
  Split a parallel dataset into Chain-of-Dictionary (CoD) rows and plain rows.

  The dataset is shuffled, then rows are visited in order. A visited row
  becomes a CoD row when word-level translations exist for its Old English
  fragment and at least one of its words is selected for explanation;
  otherwise it becomes a vanilla row. Once ``COD_split`` of the dataset has
  been turned into CoD rows, all remaining rows become vanilla rows.

  Args:
      translation_dataset: Parallel fragments with COL_ANG_TEXT / COL_EN_TEXT.
      word_dataset: Word-level translations with COL_ANG_TEXT, COL_ANG_WORD
          and COL_EN_WORD; matched to fragments by exact COL_ANG_TEXT equality.
      frequency_dataset: Word -> corpus frequency, used to prefer rare words.
      COD_split: Target fraction of rows that should receive a CoD hint.
      COD_words_ratio: Fraction of a fragment's unique words to explain.
      COD_words_max: Upper bound on the number of explained words per fragment.
      random_state: Seed for the shuffle.

  Returns:
      ``(COD_output, VANILLA_output)`` dataframes. CoD rows carry the columns
      COL_ANG_TEXT, COL_EN_TEXT, COL_COD_ANG2EN and COL_COD_EN2ANG; vanilla
      rows carry COL_ANG_TEXT and COL_EN_TEXT. Every input row ends up in
      exactly one of the two.
  """
  # Set random seed (kept for parity with the previous behaviour; the shuffle
  # below takes its seed explicitly)
  random.seed(random_state)

  # Init COD and VANILLA datasets
  COD_output = []
  VANILLA_output = []

  # Shuffle the dataset
  input_dataset = translation_dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
  total_rows = input_dataset.shape[0]
  cod_target = COD_split * total_rows

  # Index the word-level translations by fragment ONCE instead of scanning the
  # whole word dataset for every fragment. Row order inside each group is the
  # original row order, so "first translation in context" is unchanged.
  words_by_text = {
    ang_text: group
    for ang_text, group in word_dataset.groupby(COL_ANG_TEXT, sort=False)
  }

  # Position of the first row NOT handled by the CoD loop. It defaults to
  # "no rows left": if the loop finishes without reaching the CoD target,
  # every row has already been emitted and must not be emitted again.
  # (It used to default to 0, which re-appended the whole dataset to the
  # vanilla output in that case.)
  offset_vanilla = total_rows
  for i, (_, row) in enumerate(input_dataset.iterrows()):
    if i % 1000 == 0:
      print(f"C: Processing {i}th row from {total_rows} rows")
    # Enough CoD rows collected: this row and all later ones go to vanilla
    if len(COD_output) >= cod_target:
      offset_vanilla = i
      break

    # Get the data
    ANG_text = row[COL_ANG_TEXT]
    EN_text = row[COL_EN_TEXT]
    vanilla_row = {
      COL_ANG_TEXT: ANG_text,
      COL_EN_TEXT: EN_text
    }

    # Search in the word dataset; if not found, push to VANILLA
    ANG_words = words_by_text.get(ANG_text)
    if ANG_words is None:
      VANILLA_output.append(vanilla_row)
      continue

    # Get unique words
    ANG_words_unique = ANG_words[COL_ANG_WORD].unique()

    # Get the number of words to be explained
    n_words = min(COD_words_max, int(COD_words_ratio * len(ANG_words_unique)))
    # Select the words by frequency (rarest first)
    ANG_words_selected = top_k_infrequent(frequency_dataset, ANG_words_unique, n_words)

    # If no words are selected, push to VANILLA
    if len(ANG_words_selected) == 0:
      VANILLA_output.append(vanilla_row)
      continue

    # First in-context translation of every word of this fragment
    first_translation = {}
    for ANG_word, EN_word in zip(ANG_words[COL_ANG_WORD], ANG_words[COL_EN_WORD]):
      first_translation.setdefault(ANG_word, EN_word)

    # Build the CoD: one sentence per selected word, space-separated
    explanation_ang_en = " ".join(
      f"\"{ANG_word}\" means \"{first_translation[ANG_word]}\"."
      for ANG_word in ANG_words_selected
    )
    explanation_en_ang = " ".join(
      f"\"{first_translation[ANG_word]}\" means \"{ANG_word}\"."
      for ANG_word in ANG_words_selected
    )

    # Append to COD_output
    COD_output.append({
      COL_ANG_TEXT: ANG_text,
      COL_EN_TEXT: EN_text,
      COL_COD_ANG2EN: explanation_ang_en,
      COL_COD_EN2ANG: explanation_en_ang
    })

  # Extend VANILLA_output with the rows the CoD loop did not reach
  VANILLA_input = input_dataset.iloc[offset_vanilla:]
  for i, (_, row) in enumerate(VANILLA_input.iterrows()):
    if i % 1000 == 0:
      print(f"V: Processing {i}th row from {VANILLA_input.shape[0]} rows")
    VANILLA_output.append({
      COL_ANG_TEXT: row[COL_ANG_TEXT],
      COL_EN_TEXT: row[COL_EN_TEXT]
    })

  # Turn the output into dataframes. Explicit columns keep the schema stable
  # even when one of the two outputs is empty.
  COD_output = pd.DataFrame(
    COD_output,
    columns=[COL_ANG_TEXT, COL_EN_TEXT, COL_COD_ANG2EN, COL_COD_EN2ANG]
  )
  VANILLA_output = pd.DataFrame(
    VANILLA_output,
    columns=[COL_ANG_TEXT, COL_EN_TEXT]
  )

  # Return the output
  return COD_output, VANILLA_output

In [None]:
# Word frequencies over the whole (unfiltered) DOEC corpus
word_frecuency_lookup = word_frecuency(doec_df[COL_ANG_TEXT])

# Enrich the training split using CoD.
# NOTE(review): apply_COD runs with its own defaults for COD_split and
# random_state here; PRETRAINING_COD_RATE and RANDOM_SEEDS["cod0"] from the
# config cell are not referenced in the visible code - confirm whether they
# were meant to be passed.
translate_cod_df, translate_vanilla_df = apply_COD(
  translation_dataset=translate_train_df,
  word_dataset=word_translate_df,
  frequency_dataset=word_frecuency_lookup,
)

print("========= Translation datasets =========")
getDatasetStats({"COD": translate_cod_df, "Vanilla": translate_vanilla_df})

### Corpus filtering
Due to the nature of the DOEC corpus, it contains a lot of noise and irrelevant information. To filter out the noise, we use the following criteria:
- Remove fragments with 5 or fewer words.
- Remove fragments with 20 or fewer characters.
- Of the remaining fragments, keep only those whose word count and character count both lie strictly between the first (Q1) and third (Q3) quartiles.

In [None]:

def corpus_filter(corpus: pd.DataFrame) -> pd.DataFrame:
  """
  Removes low quality rows from the corpus.

  Filtering happens in two stages on the COL_ANG_TEXT column:
    1. rule-based: keep rows with more than 5 whitespace-separated words and
       more than 20 characters;
    2. quartile-based: of the rows that survive stage 1, keep those whose
       character count AND word count are strictly between the first and
       third quartiles (quartiles are computed after stage 1).

  A one-line summary of how many rows were removed is printed.

  Args:
      corpus: Dataframe holding the text in COL_ANG_TEXT.

  Returns:
      The filtered dataframe (original index values are preserved).
  """
  # Measure initial size
  initial_size = corpus.shape[0]

  # Remove NaN
  corpus = corpus.dropna()

  if not corpus.empty:
    # Compute the two length measures once and reuse them for both stages
    char_length = corpus[COL_ANG_TEXT].str.len()
    word_count = corpus[COL_ANG_TEXT].str.split().str.len()

    # Stage 1: drop rows that are too short to be useful
    long_enough = (word_count > 5) & (char_length > 20)
    corpus = corpus[long_enough]
    char_length = char_length[long_enough]
    word_count = word_count[long_enough]

  if not corpus.empty:
    # Stage 2: keep only rows strictly inside the interquartile range
    char_length_q1 = char_length.quantile(0.25)
    char_length_q3 = char_length.quantile(0.75)
    word_count_q1 = word_count.quantile(0.25)
    word_count_q3 = word_count.quantile(0.75)

    inside_iqr = (
      (char_length > char_length_q1) &
      (char_length < char_length_q3) &
      (word_count > word_count_q1) &
      (word_count < word_count_q3)
    )
    corpus = corpus[inside_iqr]

  # Measure final size
  final_size = corpus.shape[0]
  removed = initial_size - final_size
  # An empty input has nothing to take a percentage of; avoid dividing by zero
  removed_pct = removed / initial_size * 100 if initial_size else 0.0

  print(f"Filtered {removed} rows from {initial_size} to {final_size} (- {removed_pct:.2f}%)")

  return corpus

In [None]:
# Apply the rule-based and quartile-based quality filters to the DOEC corpus
corpus_filtered_df = corpus_filter(doec_df)

### Corpus splitting

The corpus is used in different steps of training. To prevent the model from memorizing the translations, we split the corpus into two parts: train and unseen. The train part is used for pretraining the model, while the unseen part is used for data augmentation.



In [None]:
# Split the filtered corpus into two halves: Train (used to build the training
# dataset) and Unseen (used to build the prompt-only "presynth" dataset)
train_corpus_df, unseen_corpus_df = split_tt(corpus_filtered_df, train_size=0.5, random_state=RANDOM_SEEDS["corpus0"])
# Show stats
print("=== Corpus Split ===")
getDatasetStats({"Train": train_corpus_df, "Unseen": unseen_corpus_df})

In [None]:
# Show stats after splitting: the four sources that feed the training dataset
print("=== Train dataset after sampling ===")
getDatasetStats({"Dictionary": dictionary_df, "Translate CoD": translate_cod_df, "Translate Vanilla": translate_vanilla_df, "Corpus": train_corpus_df}) # type: ignore

## Format

In [None]:


# Language code (used as the prompt tag, e.g. "[ANG]") -> language name
# written out in the natural-language instruction of the prompt
language_codes = {
    "ANG": "Anglo-Saxon",
    "EN": "English"
}


def display_dataset_sample(title, ds, sample_index=0):
    """
    Print the prompt, answer and full text of one sample of ``ds``.

    ``ds`` must support ``ds[column][index]`` for the columns "prompt",
    "answer" and "text". An error line is printed when the sample's text is
    not exactly its prompt followed by its answer.
    """
    print(f"=== {title} ===")
    prompt = ds['prompt'][sample_index]
    print(f"Prompt: {prompt}")
    answer = ds['answer'][sample_index]
    print(f"Answer: {answer}")
    text = ds['text'][sample_index]
    print(f"Text: {text}")
    # Sanity check: the text must be the concatenation prompt + answer
    is_consistent = text == prompt + answer
    if not is_consistent:
        print("ERROR: Text is not prompt + answer")

def display_dataset_sample_prompt_only(title, ds, sample_index=0):
    """
    Print the prompt of one sample of a prompt-only dataset.

    ``ds`` must support ``ds["prompt"][index]``.
    """
    header = f"=== {title} ==="
    print(header)
    sample_prompt = ds['prompt'][sample_index]
    print(f"Prompt: {sample_prompt}")


def get_translation_prompt_only(src, tgt, text):
    """
    Build a translation prompt without an answer.

    ``src`` and ``tgt`` are language codes (keys of language_codes). The
    prompt consists of the instruction, the source text wrapped in source
    tags and an opening target tag. Returns ``{"prompt": ...}``.
    """
    target_language = language_codes[tgt]
    source_language = language_codes[src]
    instruction = f"[INST]Translate the following {source_language} fragment to {target_language}[/INST]"
    source_block = f"[{src}]{text}[/{src}]"
    return {"prompt": f"{instruction}\n{source_block}\n[{tgt}]"}

def get_definition_prompt(word, definition):
    """
    Build a dictionary-lookup training sample for an Anglo-Saxon word.

    Returns a dict with "prompt" (question + tagged word + opening [EN] tag),
    "answer" (definition + closing [/EN] tag) and "text" (prompt + answer).
    """
    question = "[INST]What is the English definition of the following word in Anglo-Saxon?[/INST]"
    prompt = f"{question}\n[ANG]{word}[/ANG]\n[EN]"
    answer = f"{definition}[/EN]"
    return {"prompt": prompt, "answer": answer, "text": prompt + answer}

def get_translation_prompt(src, tgt, text, translation):
    """
    Build a translation training sample.

    The prompt is the one produced by get_translation_prompt_only; the answer
    is the translation followed by the closing target tag. Returns a dict
    with "prompt", "answer" and "text" (prompt + answer).
    """
    # Reuse the prompt-only builder so both prompts are guaranteed identical
    prompt = get_translation_prompt_only(src, tgt, text)["prompt"]
    answer = f"{translation}[/{tgt}]"
    return {"prompt": prompt, "answer": answer, "text": prompt + answer}

def get_cod_prompt(src, tgt, text, translation, cod):
    """
    Build a translation training sample that carries a Chain-of-Dictionary hint.

    Same layout as the plain translation sample, with the ``cod`` explanation
    inserted on its own line between the source text and the opening target
    tag. Returns a dict with "prompt", "answer" and "text" (prompt + answer).
    """
    target_language = language_codes[tgt]
    source_language = language_codes[src]
    instruction = f"[INST]Translate the following {source_language} fragment to {target_language}[/INST]"
    source_block = f"[{src}]{text}[/{src}]"
    # NOTE(review): the dictionary block is closed with "[DICT]" rather than
    # "[/DICT]", unlike every other tag. Kept byte-for-byte because it is part
    # of the emitted dataset format - confirm whether this is intentional.
    dictionary_block = f"[DICT]{cod}[DICT]"
    prompt = f"{instruction}\n{source_block}\n{dictionary_block}\n[{tgt}]"
    answer = f"{translation}[/{tgt}]"
    return {"prompt": prompt, "answer": answer, "text": prompt + answer}

def get_corpus_prompt(src, text):
    """
    Build a text-completion training sample from a monolingual fragment.

    The fragment is split on whitespace; the first half of the words (rounded
    down) forms the prompt and the rest forms the answer. Returns a dict with
    "prompt", "answer" and "text" (prompt + answer).
    """
    # The looked-up name is not used in the prompt; the lookup is kept so an
    # unknown language code still raises KeyError.
    _ = language_codes[src]
    words = text.split()
    midpoint = len(words) // 2
    opening = " ".join(words[:midpoint])
    closing = " ".join(words[midpoint:])
    # The prompt ends with the space that separates the two halves
    prompt = f"[{src}]{opening} "
    answer = f"{closing}[/{src}]"
    return {"prompt": prompt, "answer": answer, "text": prompt + answer}

def build_presynth_dataset(
    corpus_df: pd.DataFrame,
):
    """
    Turn a monolingual corpus into a prompt-only ANG -> EN translation dataset.

    Every row of ``corpus_df`` becomes one translation prompt without an
    answer. Row and character totals plus the first sample are printed.
    Returns the resulting dataset with a single "prompt" column.
    """
    def as_prompt(x):
        return get_translation_prompt_only(LANG_CODE_ANG, LANG_CODE_EN, x[COL_ANG_TEXT])

    ## Format using translation prompt only
    source_ds = Dataset.from_pandas(corpus_df)
    corpus_ds = source_ds.map(as_prompt, remove_columns=source_ds.column_names)

    ## Print stats
    total_rows = len(corpus_ds)
    # Count characters in prompts
    total_chars = sum(len(row['prompt']) for row in corpus_ds) #type: ignore
    print(f"Total rows: {total_rows}")
    print(f"Total characters: {total_chars}")
    # Display a sample
    display_dataset_sample_prompt_only("Presynth", corpus_ds)
    return corpus_ds

def build_test_dataset(
    parallel_df : pd.DataFrame,
):
    """
    Build evaluation datasets in both translation directions.

    Returns a dict with two datasets built from the same rows: "Forward"
    (ANG -> EN) and "Backward" (EN -> ANG). The first sample of each is
    printed.
    """
    def forward(x):
        return get_translation_prompt(LANG_CODE_ANG, LANG_CODE_EN, x[COL_ANG_TEXT], x[COL_EN_TEXT])

    def backward(x):
        return get_translation_prompt(LANG_CODE_EN, LANG_CODE_ANG, x[COL_EN_TEXT], x[COL_ANG_TEXT])

    parallel_ds = Dataset.from_pandas(parallel_df)
    source_columns = parallel_ds.column_names

    test_datasets = {
        # ANG -> EN
        "Forward": parallel_ds.map(forward, remove_columns=source_columns),
        # EN -> ANG
        "Backward": parallel_ds.map(backward, remove_columns=source_columns),
    }

    # Display samples
    for name in test_datasets:
        display_dataset_sample(name, test_datasets[name])

    return test_datasets

def build_train_dataset(
    dictionary_df: pd.DataFrame,
    corpus_df: pd.DataFrame,
    parallel_vanilla_df: pd.DataFrame,
    parallel_cod_df: pd.DataFrame
):
    """
    Format all training sources as prompt/answer/text datasets and combine them.

    Six component datasets are built: dictionary definitions, corpus
    completion, and the vanilla and CoD parallel data in both translation
    directions. Size statistics and one sample per component are printed.

    Returns a dict with two shuffled (seed 42) datasets:
      - "v1": every component except the dictionary;
      - "v2": every component.
    """
    def formatted(source_ds, formatter):
        # Replace the source columns with the formatter's output columns
        return source_ds.map(formatter, remove_columns=source_ds.column_names)

    ## Dictionary dataset
    dictionary_ds = formatted(
        Dataset.from_pandas(dictionary_df),
        lambda x: get_definition_prompt(x[COL_ANG_WORD], x[COL_EN_DEF]),
    )

    ## Corpus dataset
    corpus_ds = formatted(
        Dataset.from_pandas(corpus_df),
        lambda x: get_corpus_prompt(LANG_CODE_ANG, x[COL_ANG_TEXT]),
    )

    ## Parallel Vanilla dataset (both directions from the same rows)
    vanilla_source = Dataset.from_pandas(parallel_vanilla_df)
    vanilla_forward_ds = formatted(
        vanilla_source,
        lambda x: get_translation_prompt(LANG_CODE_ANG, LANG_CODE_EN, x[COL_ANG_TEXT], x[COL_EN_TEXT]),
    )
    vanilla_backward_ds = formatted(
        vanilla_source,
        lambda x: get_translation_prompt(LANG_CODE_EN, LANG_CODE_ANG, x[COL_EN_TEXT], x[COL_ANG_TEXT]),
    )

    ## Parallel COD dataset (both directions from the same rows)
    cod_source = Dataset.from_pandas(parallel_cod_df)
    cod_forward_ds = formatted(
        cod_source,
        lambda x: get_cod_prompt(LANG_CODE_ANG, LANG_CODE_EN, x[COL_ANG_TEXT], x[COL_EN_TEXT], x[COL_COD_ANG2EN]),
    )
    cod_backward_ds = formatted(
        cod_source,
        lambda x: get_cod_prompt(LANG_CODE_EN, LANG_CODE_ANG, x[COL_EN_TEXT], x[COL_ANG_TEXT], x[COL_COD_EN2ANG]),
    )

    train_datasets = {
        "Dictionary": dictionary_ds,
        "Corpus": corpus_ds,
        "Parallel Vanilla Forward": vanilla_forward_ds,
        "Parallel Vanilla Backward": vanilla_backward_ds,
        "Parallel COD Forward": cod_forward_ds,
        "Parallel COD Backward": cod_backward_ds,
    }

    ## Print stats: rows and prompt characters per component, with their share
    prompt_chars = {
        name: sum(len(row['prompt']) for row in ds) #type: ignore
        for name, ds in train_datasets.items()
    }
    total_rows = sum(len(ds) for ds in train_datasets.values())
    total_chars = sum(prompt_chars.values())
    print(f"Total rows: {total_rows}")
    print(f"Total characters: {total_chars}")
    for name, ds in train_datasets.items():
        rows = len(ds)
        chars = prompt_chars[name]
        print(f"{name}: {rows} ({rows / total_rows * 100:.2f}%)")
        print(f"  - Characters: {chars} ({chars / total_chars * 100:.2f}%)")

    # Print a sample of each dataset
    for name, ds in train_datasets.items():
        display_dataset_sample(name, ds)

    # v1: Parallel + Corpus (every component whose name lacks "Dictionary")
    without_dictionary = [
        ds for name, ds in train_datasets.items() if "Dictionary" not in name
    ]
    train_ds_v1 = concatenate_datasets(without_dictionary).shuffle(seed=42)

    # v2: Parallel + Corpus + Dictionary
    train_ds_v2 = concatenate_datasets(list(train_datasets.values())).shuffle(seed=42)

    return {
        "v1": train_ds_v1,
        "v2": train_ds_v2
    }


In [None]:
# Training data: dictionary + train half of the corpus + both parallel flavours
train_datasets = build_train_dataset(
  dictionary_df=dictionary_df,
  corpus_df=train_corpus_df,
  parallel_vanilla_df=translate_vanilla_df,
  parallel_cod_df=translate_cod_df,
)
# Prompt-only data built from the unseen half of the corpus
presynth_dataset = build_presynth_dataset(corpus_df=unseen_corpus_df)
# Evaluation data, forward and backward, for the test and validation splits
test_datasets = build_test_dataset(parallel_df=translate_test_df)
val_datasets = build_test_dataset(parallel_df=translate_val_df)

## Upload

In [None]:
# Create repository (private; no error if it already exists)
# NOTE(review): the bare repo name HF_REPO is used as repo_id here and below;
# HF_REPO_PATH ("<user>/<repo>") is defined in the config cell but is not
# referenced in the visible code - confirm which one is intended.
# NOTE(review): no repo_type is passed - confirm whether this should be
# created as a dataset repository.
api = HfApi()
repo_url = api.create_repo(repo_id=HF_REPO,private=True,exist_ok=True)
# Upload readme (init); expects a README.md in the current working directory
api.upload_file(path_or_fileobj="README.md", path_in_repo="README.md",      repo_id=HF_REPO, commit_message="Add README.md")
# Create repository object: local clone of the remote repo in ./hf-repo
repo = Repository(local_dir="hf-repo", clone_from=repo_url)

# Save datasets into the local clone, one directory per dataset
## Train
train_datasets["v1"].save_to_disk("hf-repo/train_v1")
train_datasets["v2"].save_to_disk("hf-repo/train_v2")
## Pre-Synthetic
presynth_dataset.save_to_disk("hf-repo/presynth")
## Test
test_datasets["Forward"].save_to_disk("hf-repo/test_forward")
test_datasets["Backward"].save_to_disk("hf-repo/test_backward")
## Val
val_datasets["Forward"].save_to_disk("hf-repo/val_forward")
val_datasets["Backward"].save_to_disk("hf-repo/val_backward")
## Push to repo: commit everything saved above and push it to the Hub
repo.push_to_hub(commit_message="Add datasets")