### Machine Translation (MT) Experiments

This notebook was executed on Google Colab using a T4 GPU. Please note that the document names in the outputs may differ from those in the GitHub repository. For consistency, refer to the file names in the code rather than those in the output, as the names were updated later for clarity.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers sacrebleu ctranslate2 unbabel-comet
!ct2-transformers-converter --model Helsinki-NLP/opus-mt-tc-big-en-lt --output_dir enlt_ctranslate2

Mounted at /content/drive
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctranslate2
  Downloading ctranslate2-4.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting unbabel-comet
  Downloading unbabel_comet-2.2.4-py3-none-any.whl.metadata (19 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.2 MB/s

In [2]:
!pip install sacremoses



In [12]:
import ctranslate2
from transformers import AutoTokenizer
import pandas as pd

In [13]:
# Figuring out the inner workings

src_lang = "en"
tgt_lang = "lt"

translator = ctranslate2.Translator("enlt_ctranslate2", device = "cuda")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-lt")

input_text = "Hello, how are you?"

input_tokens = tokenizer.encode(input_text, return_tensors = "pt", add_special_tokens = True)
input_tokens_str = tokenizer.convert_ids_to_tokens(input_tokens[0].tolist())

results = translator.translate_batch([input_tokens_str], beam_size=1)
output_tokens = results[0].hypotheses[0]

output_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens))
print("Translated text:", output_text)

!echo "Labas, kaip sekasi?" > wmt23_example-ref.en-lt.lt  # Reference translation
!echo "Sveiki, kaip esate?" > wmt23_example-sys.en-lt.en  # System output
!sacrebleu wmt23_example-ref.en-lt.lt -i wmt23_example-sys.en-lt.en -m bleu chrf ter

!echo "Hello, how are you?" > wmt23_example.en-lt.en  # Source text
!comet-score -s wmt23_example.en-lt.en -t wmt23_example-sys.en-lt.en -r wmt23_example-ref.en-lt.lt

Translated text: Labas, kaip sekasi?




[
{
 "name": "BLEU",
 "score": 23.6,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3",
 "verbose_score": "60.0/25.0/16.7/12.5 (BP = 1.000 ratio = 1.000 hyp_len = 5 ref_len = 5)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.3"
},
{
 "name": "chrF2",
 "score": 21.9,
 "signature": "nrefs:1|case:mixed|eff:yes|nc:6|nw:0|space:no|version:2.4.3",
 "nrefs": "1",
 "case": "mixed",
 "eff": "yes",
 "nc": "6",
 "nw": "0",
 "space": "no",
 "version": "2.4.3"
},
{
 "name": "TER",
 "score": 66.7,
 "signature": "nrefs:1|case:lc|tok:tercom|norm:no|punct:yes|asian:no|version:2.4.3",
 "nrefs": "1",
 "case": "lc",
 "tok": "tercom",
 "norm": "no",
 "punct": "yes",
 "asian": "no",
 "version": "2.4.3"
}
]
[0m2024-12-10 02:41:16.474286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-10 02:4

In [43]:
# Using my whole dataset - 1st Experiment

dataset_path = "train-00000-of-00001.csv"
data = pd.read_csv(dataset_path)

translator = ctranslate2.Translator("enlt_ctranslate2", device="cuda")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-lt")

def translate_text(input_text, src_lang, tgt_lang):
    input_tokens = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens = True)
    input_tokens_str = tokenizer.convert_ids_to_tokens(input_tokens[0].tolist())

    results = translator.translate_batch([input_tokens_str], beam_size = 1)
    output_tokens = results[0].hypotheses[0]

    output_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens))
    return output_text

translated_sentences = []
for index, row in data.iterrows():
    source_text = row.iloc[2]  # English is in column 2
    target_text = row.iloc[1]  # Lithuanian is in column 1

    translated_text = translate_text(source_text, "en", "lt")
    translated_sentences.append(translated_text)

data['translated_text'] = translated_sentences

output_path = "translated_dataset_1.csv"
data.to_csv(output_path, index=False)
print(f"Translated dataset saved to {output_path}")



Translated dataset saved to translated_dataset.csv


In [44]:
# Evaluation

from sacrebleu import corpus_bleu, corpus_chrf
from comet import download_model, load_from_checkpoint

reference_texts = data["lt"].tolist()
system_outputs = data['translated_text'].tolist()

# Compute BLEU score
bleu_score = corpus_bleu(system_outputs, [reference_texts])
print(f"BLEU score: {bleu_score.score}")

# Compute CHRF score
chrf_score = corpus_chrf(system_outputs, [reference_texts])
print(f"CHRF score: {chrf_score.score}")

# The COMET model
model = download_model("wmt20-comet-da")
model = load_from_checkpoint(model)

comet_inputs = [{"src": source, "mt": mt, "ref": ref}
                for source, mt, ref in zip(data["en"], system_outputs, reference_texts)]

# Compute COMET scores
comet_scores = model.predict(comet_inputs, batch_size=8)
print(f"Average COMET score: {sum(comet_scores.scores) / len(comet_scores.scores)}")

BLEU score: 51.045566714196475
CHRF score: 75.19829112934362


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 66/66 [00:09<00:00,  6.99it/s]


Average COMET score: 0.702174056086326


In [45]:
data['BLEU_Score'] = bleu_score.score
data['CHRF_Score'] = chrf_score.score
data['COMET_Score'] = comet_scores.scores

data.to_csv("evaluation_results_1.csv", index = False)

In [19]:
# Starting again (improving the model)

dataset_path = "train-00000-of-00001.csv"
data = pd.read_csv(dataset_path)

# Creating batches
ranges = [
    (1, 17), (18, 33), (34, 53), (54, 71), (72, 77), (78, 94), (95, 107),
    (108, 128), (129, 139), (140, 148), (149, 160), (161, 171), (172, 184),
    (185, 191), (192, 197), (198, 204), (205, 218), (219, 229), (230, 248),
    (249, 261), (262, 273), (274, 283), (284, 294), (295, 306), (307, 315),
    (316, 325), (326, 341), (342, 354), (355, 369), (370, 376), (377, 386),
    (387, 397), (398, 409), (410, 419), (420, 431), (432, 441), (442, 458),
    (459, 483), (484, 498), (499, 523)]

batch_data = []

for start, end in ranges:
    batch = data[(data.iloc[:, 0] >= start) & (data.iloc[:, 0] <= end)]

    batch_info = {'range': f"{start}-{end}",
                  'en': batch.iloc[:, 1].tolist(),
                  'lt': batch.iloc[:, 2].tolist()}
    batch_data.append(batch_info)

batch_df = pd.DataFrame(batch_data)

final_data = []
for i, batch in enumerate(batch_data, start = 1):
    for en_sentence, lt_sentence in zip(batch['en'], batch['lt']):
        final_data.append({'range': batch['range'],
                           'en': en_sentence,
                           'lt': lt_sentence})

final_df = pd.DataFrame(final_data)

output_path = "batches_translated_2.csv"
final_df.to_csv(output_path, index = False)
print(f"Batches saved to {output_path}")

Batches saved to batches_translated.csv


In [26]:
# Translating again - 2nd Experiment

batch_file_path = "batches_translated.csv"
batch_data = pd.read_csv(batch_file_path)

translator = ctranslate2.Translator("enlt_ctranslate2", device="cuda")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-lt")

def translate_batch(batch_texts):
    if isinstance(batch_texts, pd.Series):
        batch_texts = batch_texts.tolist()

    tokenized_texts = [tokenizer.encode(text, return_tensors = "pt", add_special_tokens = True) for text in batch_texts]
    tokenized_texts_str = [tokenizer.convert_ids_to_tokens(tokens[0].tolist()) for tokens in tokenized_texts]

    results = translator.translate_batch(tokenized_texts_str, beam_size=1)
    translated_texts = [result.hypotheses[0] for result in results]

    translated_texts_decoded = []
    for tokens in translated_texts:
        decoded_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens), skip_special_tokens=True)
        translated_texts_decoded.append(decoded_text)

    return translated_texts_decoded

translated_batches = []
for _, batch in batch_data.iterrows():
    sentence_range = batch['range']
    english_sentences = batch['en']

    if isinstance(english_sentences, str):
        english_sentences = [english_sentences]

    translated_sentences = translate_batch(english_sentences)

    for en, lt in zip(english_sentences, translated_sentences):
        translated_batches.append(list(batch) + [lt])

translated_df = pd.DataFrame(translated_batches, columns=batch_data.columns.tolist() + ['translated_text'])
translated_df.to_csv("translated_sentences_2.csv", index=False)
print("Translation complete and saved to 'translated_sentences_2.csv'")



Translation complete and saved to 'translated_sentences_batches.csv'


In [31]:
# Evaluation

reference_texts = translated_df["lt"].tolist()
system_outputs = translated_df['translated_text'].tolist()

# Compute BLEU score
bleu_score = corpus_bleu(system_outputs, [reference_texts])
print(f"BLEU score: {bleu_score.score}")

# Compute CHRF score
chrf_score = corpus_chrf(system_outputs, [reference_texts])
print(f"CHRF score: {chrf_score}")

# The COMET model
model = download_model("wmt20-comet-da")
model = load_from_checkpoint(model)

comet_inputs = [{"src": source, "mt": mt, "ref": ref}
                for source, mt, ref in zip(translated_df["en"], system_outputs, reference_texts)]

# Compute COMET scores
comet_scores = model.predict(comet_inputs, batch_size=8)
average_comet_score = sum(comet_scores.scores) / len(comet_scores.scores) if comet_scores.scores else None
print(f"Average COMET score: {average_comet_score}")

BLEU score: 23.217024112151968
CHRF score: chrF2 = 51.11


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 66/66 [00:08<00:00,  7.57it/s]


Average COMET score: 0.678733026295416


In [32]:
translated_df['BLEU_Score'] = bleu_score.score
translated_df['CHRF_Score'] = chrf_score.score
translated_df['COMET_Score'] = comet_scores.scores

translated_df.to_csv("evaluation_results_2.csv", index = False)

In [40]:
# ... Translating again - 3rd Experiment

dataset_path = "train-00000-of-00001.csv"
data = pd.read_csv(dataset_path)

translator = ctranslate2.Translator("enlt_ctranslate2", device="cuda")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-lt")

def translate_text(input_text, src_lang, tgt_lang):
    input_text = input_text.lower()
    input_tokens = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens = True)
    input_tokens_str = tokenizer.convert_ids_to_tokens(input_tokens[0].tolist())

    results = translator.translate_batch([input_tokens_str], beam_size = 1)
    output_tokens = results[0].hypotheses[0]

    output_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens))
    return output_text

translated_sentences = []
for index, row in data.iterrows():
    source_text = row.iloc[2]  # English is in column 2
    target_text = row.iloc[1]  # Lithuanian is in column 1

    translated_text = translate_text(source_text, "en", "lt")
    translated_sentences.append(translated_text)

data['translated_text'] = translated_sentences

output_path = "translated_dataset_3.csv"
data.to_csv(output_path, index=False)
print(f"Translated dataset saved to {output_path}")



Translated dataset saved to translated_dataset_2.csv


In [41]:
# Evaluation

from sacrebleu import corpus_bleu, corpus_chrf
from comet import download_model, load_from_checkpoint

reference_texts = data["lt"].tolist()
system_outputs = data['translated_text'].tolist()

# Compute BLEU score
bleu_score = corpus_bleu(system_outputs, [reference_texts])
print(f"BLEU score: {bleu_score.score}")

# Compute CHRF score
chrf_score = corpus_chrf(system_outputs, [reference_texts])
print(f"CHRF score: {chrf_score.score}")

# The COMET model
model = download_model("wmt20-comet-da")
model = load_from_checkpoint(model)

comet_inputs = [{"src": source, "mt": mt, "ref": ref}
                for source, mt, ref in zip(data["en"], system_outputs, reference_texts)]

# Compute COMET scores
comet_scores = model.predict(comet_inputs, batch_size=8)
print(f"Average COMET score: {sum(comet_scores.scores) / len(comet_scores.scores)}")

BLEU score: 15.575873330891495
CHRF score: 57.73777628176294


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 66/66 [00:08<00:00,  7.69it/s]


Average COMET score: 0.32297127624442995


In [42]:
data['BLEU_Score'] = bleu_score.score
data['CHRF_Score'] = chrf_score.score
data['COMET_Score'] = comet_scores.scores

data.to_csv("evaluation_results_3.csv", index = False)

In [47]:
# ... Translating again and again... - 4th Experiment

dataset_path = "train-00000-of-00001.csv"
data = pd.read_csv(dataset_path)

translator = ctranslate2.Translator("enlt_ctranslate2", device = "cuda")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-lt")

def translate_text(input_text, src_lang, tgt_lang):
    input_tokens = tokenizer.encode(input_text, return_tensors = "pt", add_special_tokens = True)
    input_tokens_str = tokenizer.convert_ids_to_tokens(input_tokens[0].tolist())

    results = translator.translate_batch([input_tokens_str], beam_size = 5)
    output_tokens = results[0].hypotheses[0]

    output_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(output_tokens))
    return output_text

translated_sentences = []
for index, row in data.iterrows():
    source_text = row.iloc[2]  # English is in column 2
    target_text = row.iloc[1]  # Lithuanian is in column 1

    translated_text = translate_text(source_text, "en", "lt")
    translated_sentences.append(translated_text)

data['translated_text'] = translated_sentences

output_path = "translated_dataset_4.csv"
data.to_csv(output_path, index=False)
print(f"Translated dataset saved to {output_path}")



Translated dataset saved to translated_dataset_3.csv


In [48]:
# Evaluation

from sacrebleu import corpus_bleu, corpus_chrf
from comet import download_model, load_from_checkpoint

reference_texts = data["lt"].tolist()
system_outputs = data['translated_text'].tolist()

# Compute BLEU score
bleu_score = corpus_bleu(system_outputs, [reference_texts])
print(f"BLEU score: {bleu_score.score}")

# Compute CHRF score
chrf_score = corpus_chrf(system_outputs, [reference_texts])
print(f"CHRF score: {chrf_score.score}")

# The COMET model
model = download_model("wmt20-comet-da")
model = load_from_checkpoint(model)

comet_inputs = [{"src": source, "mt": mt, "ref": ref}
                for source, mt, ref in zip(data["en"], system_outputs, reference_texts)]

# Compute COMET scores
comet_scores = model.predict(comet_inputs, batch_size=8)
print(f"Average COMET score: {sum(comet_scores.scores) / len(comet_scores.scores)}")

BLEU score: 58.45374263757147
CHRF score: 80.04940936491896


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 66/66 [00:08<00:00,  8.00it/s]


Average COMET score: 0.8725779490014735


In [49]:
data['BLEU_Score'] = bleu_score.score
data['CHRF_Score'] = chrf_score.score
data['COMET_Score'] = comet_scores.scores

data.to_csv("evaluation_results_4.csv", index = False)