In [68]:
!pip3 install datasets
!pip3 install transformers



In [73]:
import pandas as pd
import numpy as np
import datasets
from transformers import T5Tokenizer

In [52]:
raw_data = pd.read_csv("Sentence pairs in French-English - 2025-04-13.tsv", sep='\t', header=None,
                       names=[0,'input_to_translate',2,'label'])

In [58]:
def generate_prompt(x):
    language_mapping = {"en":"English", "es":"Spanish", "de":"German", "fr": "French", "it":"Italian", "pt":"Portuguese"}
    source_text = x["input_to_translate"]
    language = x["language_pair"].split('_')[1]
    input_text = f"Translate French to {language_mapping[language]}: {source_text}"
    return input_text

In [59]:
training_features = pd.concat([raw_data[['input_to_translate']],raw_data[['label']]],axis=1)
training_features["language_pair"] = "fr_en"
training_features["ID"] = range(len(training_features))
training_features["prompt"] = training_features.apply(generate_prompt, axis=1)

Unnamed: 0,input_to_translate,label,language_pair,ID,prompt
0,Lorsqu'il a demandé qui avait cassé la fenêtre...,"When he asked who had broken the window, all t...",fr_en,0,Translate French to English: Lorsqu'il a deman...
1,Lorsqu'il a demandé qui avait cassé la fenêtre...,"Then, when he asked who had broken the window,...",fr_en,1,Translate French to English: Lorsqu'il a deman...


In [60]:
training_features.head(2)

"Translate French to English: Lorsqu'il a demandé qui avait cassé la fenêtre, tous les garçons ont pris un air innocent."

In [61]:
training_features.iloc[0]["prompt"]

"Translate French to English: Lorsqu'il a demandé qui avait cassé la fenêtre, tous les garçons ont pris un air innocent."

In [70]:
train_ds_raw = datasets.Dataset.from_pandas(training_features)


In [71]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [75]:
tokenized_source_training = train_ds_raw.map(
    lambda x: tokenizer(x["prompt"], truncation=True),
    batched=True, remove_columns=['ID', 'input_to_translate', 'label', 'language_pair', 'prompt'])

source_lengths_training = [len(x) for x in tokenized_source_training["input_ids"]]

Map:   0%|          | 0/417144 [00:00<?, ? examples/s]

In [76]:
tokenized_target_training = train_ds_raw.map(
    lambda x: tokenizer(x["label"], truncation=True),
    batched=True, remove_columns=['ID', 'input_to_translate', 'label', 'language_pair', 'prompt'])
target_lengths_training = [len(x) for x in tokenized_target_training["input_ids"]]

Map:   0%|          | 0/417144 [00:00<?, ? examples/s]

In [77]:
print(f"Max source length: {max(source_lengths_training)}")
print(f"95% source length: {int(np.percentile(source_lengths_training, 95))}")
print(f"Max target length: {max(target_lengths_training)}")
print(f"95% target length: {int(np.percentile(target_lengths_training, 95))}")

Max source length: 512
95% source length: 32
Max target length: 443
95% target length: 17


In [78]:
# reference: https://www.philschmid.de/fine-tune-flan-t5-deepspeed
def preprocess_function(sample, padding="max_length"):

    # tokenize inputs
    model_inputs = tokenizer(sample["prompt"], max_length=128, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["label"], max_length=128, padding=padding, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [82]:
tokenized_train_ds = train_ds_raw.map(
    preprocess_function, batched=True,
    remove_columns=['ID', 'input_to_translate', 'language_pair'])

Map:   0%|          | 0/417144 [00:00<?, ? examples/s]

In [83]:
tokenized_train_ds[0]

{'label': 'When he asked who had broken the window, all the boys put on an air of innocence.',
 'prompt': "Translate French to English: Lorsqu'il a demandé qui avait cassé la fenêtre, tous les garçons ont pris un air innocent.",
 'input_ids': [30355,
  15,
  2379,
  12,
  1566,
  10,
  15591,
  31,
  173,
  3,
  9,
  21088,
  285,
  3,
  3925,
  212,
  17230,
  50,
  25301,
  6,
  1739,
  110,
  27746,
  7,
  30,
  17,
  8060,
  73,
  799,
  16679,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [84]:
tokenized_train_ds.save_to_disk("tokenized_train_ds")

Saving the dataset (0/2 shards):   0%|          | 0/417144 [00:00<?, ? examples/s]

In [85]:
!zip -r tatoeba_tokenized.zip tokenized_train_ds

  adding: tokenized_train_ds/ (stored 0%)
  adding: tokenized_train_ds/dataset_info.json (deflated 72%)
  adding: tokenized_train_ds/state.json (deflated 48%)
  adding: tokenized_train_ds/data-00000-of-00002.arrow (deflated 95%)
  adding: tokenized_train_ds/data-00001-of-00002.arrow (deflated 95%)
