# Statistical Machine Translation

### Plan
- Preprocess dataset
- Train Model
- Evaluate Performance

In [49]:
# Install the packages listed in requirements.txt; the %pip magic installs into
# the environment of the running kernel.
%pip install -r requirements.txt

Collecting eflomal (from -r requirements.txt (line 5))
Collecting eflomal (from -r requirements.txt (line 5))
  Downloading eflomal-2.0.0.tar.gz (132 kB)
  Downloading eflomal-2.0.0.tar.gz (132 kB)
  Installing build dependencies ... [?25l  Installing build dependencies ... [?25l-done
[?25h  Getting requirements to build wheel ... [?25done
[?25h  Getting requirements to build wheel ... [?25l-done
[?25h  Preparing metadata (pyproject.toml) ... [?25done
[?25h  Preparing metadata (pyproject.toml) ... [?25l-done
done
Collecting Cython (from eflomal->-r requirements.txt (line 5))
  Using cached cython-3.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (6.7 kB)
Using cached cython-3.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.4 MB)
Building wheels for collected packages: eflomal
  Building wheel for eflomal (pyproject.toml) ... [?25lCollecting Cython (from eflomal->-r requirements.txt (line 5

Based on the language clustering done in MCO2, we have chosen to train a machine translation model for each of the following language pairs.

In [2]:
from pathlib import Path

# Maps each "<Source>_<Target>" language-pair key to the name of its parallel
# TSV file, which is always the key followed by "_Parallel.tsv".
language_pairs_file = {
    pair_name: f"{pair_name}_Parallel.tsv"
    for pair_name in (
        "Bikolano_Pangasinan",
        "Bikolano_Tagalog",
        "Ivatan_Pangasinan",
        "Ivatan_Yami",
    )
}

Clean up the verses, tokenize the words, and create the aligned verse pairs for the source and target languages.

In [6]:
from utils.preprocess import preprocess_parallel_tsv, save_aligned_corpus

# Download the NLTK punkt_tab tokenizer data only if it is not already present.
import nltk
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

data_path = Path("data/dataset")
aligned_dir = "data/aligned"

# Parallel TSV file for the language pair being processed.
tsv_file = language_pairs_file["Bikolano_Tagalog"]

# File names follow "<Source>_<Target>_Parallel.tsv"; the two language names are
# also used as the source and target column names. Path(...).stem drops both the
# extension and any leading directory component.
source_col, target_col = Path(tsv_file).stem.split("_Parallel")[0].split("_")

# One prefix for every artifact written for this pair. Previously the
# preprocessing call used a hard-coded "en_tl_" prefix that did not match the
# pair being processed or the prefix passed to save_aligned_corpus below.
pair_prefix = f"{source_col}_{target_col}_"

print(f"Preprocessing {tsv_file}...")
print(f"Source language: {source_col}")
print(f"Target language: {target_col}\n")

# Returns two index-aligned sequences of tokenized verses (source, target).
# NOTE(review): what preprocess_parallel_tsv itself writes to output_dir is not
# visible from this notebook - confirm against utils/preprocess.py.
source_tokens, target_tokens = preprocess_parallel_tsv(
    data_path / tsv_file,
    source_col,
    target_col,
    output_dir=aligned_dir,
    prefix=pair_prefix
)

print(f"\nLoaded {len(source_tokens)} valid aligned pairs")
if source_tokens:
    print(f"First source sentence tokens: {source_tokens[0]}")
    print(f"First target sentence tokens: {target_tokens[0]}")

# Save aligned corpus
save_aligned_corpus(source_tokens, target_tokens, output_dir=aligned_dir, prefix=pair_prefix)

Preprocessing Bikolano_Tagalog_Parallel.tsv...
Source language: Bikolano
Target language: Tagalog

Processed 33900 verses from data/dataset/Bikolano_Tagalog_Parallel.tsv
  Valid aligned pairs: 33873
  Skipped (missing/empty verses): 27

Loaded 33873 valid aligned pairs
First source sentence tokens: ['si', 'adan', 'iyo', 'an', 'ama', 'ni', 'set', 'asin', 'si', 'set', 'iyo', 'an', 'ama', 'ni', 'enos', 'na', 'ama', 'ni', 'kenan']
First target sentence tokens: ['sina', 'adan', 'set', 'enos']
Saved 33873 aligned pairs to:
  Source: data/aligned/Bikolano_Tagalog_source.txt
  Target: data/aligned/Bikolano_Tagalog_target.txt


## Training the Machine Translation Model
For this, we will use NLTK's IBM models, trained on the parallel corpus that we have.

### Training IBMModel1

In [7]:
from nltk.translate import AlignedSent
# Pair every source verse with the target verse at the same index, skipping any
# pair in which either side has no tokens.
# Argument order matters: AlignedSent(target_tokens, source_tokens).
aligned_verses = [
    AlignedSent(target_tokens[verse_idx], source_verse)
    for verse_idx, source_verse in enumerate(source_tokens)
    if source_verse and target_tokens[verse_idx]
]

print(f"Created {len(aligned_verses)} AlignedSent objects.")

Created 33873 AlignedSent objects.


In [39]:
from nltk.translate import  IBMModel1
from pathlib import Path
import dill as pickle

print("\nTraining IBM Model 1...")

# Second argument is the number of training iterations.
ibm1 = IBMModel1(aligned_verses, 5)

# Persist only the learned translation table, matching what the Model 2 and
# Model 3 cells save. dill is used in place of the stdlib pickle module.
model1_path = Path("data/models/ibm_model_1.pkl")
model1_path.parent.mkdir(parents=True, exist_ok=True)

# Write to model1_path so the file on disk matches the message printed below;
# previously the data went to "t_table.pkl" in the working directory and the
# whole model object was dumped instead of the translation table.
with open(model1_path, "wb") as f:
    pickle.dump(ibm1.translation_table, f)
print(f"IBM Model 1 translation table saved to {model1_path}")


Training IBM Model 1...
IBM Model 1 translation table saved to data/models/ibm_model_1.pkl
IBM Model 1 translation table saved to data/models/ibm_model_1.pkl


Let's test some common words and their equivalents in the other language to see the probability that the source word translates to the target word.

In [None]:
# Spot-check the learned probability for a few common word pairs. The table is
# indexed with the first word of each pair as the outer key and the second word
# as the inner key.
ibm1_probe_pairs = [
    ("sila", "sinda"),
    ("bahay", "harong"),
    ("apoy", "kalayo"),
    ("pagibig", "pagkamoot"),
    ("araw", "aldaw"),
    ("anak", "aki"),
]
for outer_word, inner_word in ibm1_probe_pairs:
    print(ibm1.translation_table[outer_word][inner_word])

0.24296565843459514
0.8265653837415294
0.8772253051267173
0.6107352610070307
0.8147684148212581


### Training IBMModel2

In [46]:
from nltk.translate import IBMModel2
import dill as pickle

print("\nTraining IBM Model 2...")
ibm2 = IBMModel2(aligned_verses, 5)

# Serialize the translation table learned by IBM Model 2, creating the
# destination directory first if it does not exist yet.
model2_path = Path("data/models/ibm_model_2.pkl")
model2_dir = model2_path.parent
model2_dir.mkdir(parents=True, exist_ok=True)
with model2_path.open("wb") as model2_file:
    pickle.dump(ibm2.translation_table, model2_file)
print(f"IBM Model 2 saved to {model2_path}")



Training IBM Model 2...
IBM Model 2 saved to data/models/ibm_model_2.pkl
IBM Model 2 saved to data/models/ibm_model_2.pkl


Let's test the earlier words again, this time using IBMModel2.

As we can see, some words had a large increase in probability, while others increased by only a few percentage points.

In [None]:
# Repeat the same word-pair spot checks against the IBM Model 2 table. The
# first word of each pair is the outer key and the second word the inner key.
ibm2_probe_pairs = [
    ("sila", "sinda"),
    ("bahay", "harong"),
    ("apoy", "kalayo"),
    ("pagibig", "pagkamoot"),
    ("araw", "aldaw"),
    ("anak", "aki"),
]
for outer_word, inner_word in ibm2_probe_pairs:
    print(ibm2.translation_table[outer_word][inner_word])

0.30840326048144967
0.948142201752353
0.9363940892990702
0.6399685190164236
0.953474926632886


### Training IBMModel3

In [None]:
from nltk.translate import IBMModel3
import dill as pickle

print("\nTraining IBM Model 3...")
ibm3 = IBMModel3(aligned_verses, 5)

# Serialize the translation table learned by IBM Model 3, creating the
# destination directory first if it does not exist yet.
model3_path = Path("data/models/ibm_model_3.pkl")
model3_dir = model3_path.parent
model3_dir.mkdir(parents=True, exist_ok=True)
with model3_path.open("wb") as model3_file:
    pickle.dump(ibm3.translation_table, model3_file)
print(f"IBM Model 3 saved to {model3_path}")


Training IBM Model 3...
