<a href="https://colab.research.google.com/github/vriadi/CS614-Gen-AI-with-LLMs/blob/main/CS614_Individual_Assignment_NusaX_MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
%%capture
import os, re
import torch

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastModel

BASE_MODEL = "unsloth/gemma-3-4b-it"

model, tokenizer = FastModel.from_pretrained(
    model_name = BASE_MODEL,
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.3: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [None]:
model = FastModel.get_peft_model(
    model,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=16,              # increased expressiveness
    lora_alpha=16,
    lora_dropout=0.05, # small dropout to reduce overfitting
    bias="none"
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `base_model.model.model.vision_tower.vision_model` require gradients


# Dataset

https://huggingface.co/datasets/indonlp/NusaX-MT

https://github.com/IndoNLP/nusax/blob/main/datasets/mt/train.csv

In [None]:
from datasets import Dataset, DatasetDict, load_dataset, Dataset
import pandas as pd
from itertools import islice


# URLs for the CSV files
train_url = "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/mt/train.csv"
valid_url = "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/mt/valid.csv"
test_url  = "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/mt/test.csv"

# Load each as a Hugging Face Dataset
train_ds = Dataset.from_csv(train_url)
valid_ds = Dataset.from_csv(valid_url)
test_ds  = Dataset.from_csv(test_url)

print(train_ds)
print(valid_ds)
print(test_ds)

Downloading data:   0%|          | 0.00/935k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'indonesian', 'acehnese', 'banjarese', 'english', 'madurese', 'ngaju', 'sundanese', 'balinese', 'buginese', 'javanese', 'minangkabau', 'toba_batak'],
    num_rows: 500
})
Dataset({
    features: ['Unnamed: 0', 'indonesian', 'acehnese', 'banjarese', 'english', 'madurese', 'ngaju', 'sundanese', 'balinese', 'buginese', 'javanese', 'minangkabau', 'toba_batak'],
    num_rows: 100
})
Dataset({
    features: ['Unnamed: 0', 'indonesian', 'acehnese', 'banjarese', 'english', 'madurese', 'ngaju', 'sundanese', 'balinese', 'buginese', 'javanese', 'minangkabau', 'toba_batak'],
    num_rows: 400
})


In [None]:
# Keep only english and indonesian
train_ds_clean = valid_ds.remove_columns([col for col in valid_ds.column_names if col not in ["english", "indonesian"]])
valid_ds_clean = valid_ds.remove_columns([col for col in valid_ds.column_names if col not in ["english", "indonesian"]])
test_ds_clean = test_ds.remove_columns([col for col in test_ds.column_names if col not in ["english", "indonesian"]])


# Convert to DatasetDict
dataset_sft_dict = DatasetDict({
    "train": train_ds_clean,
    "validation": valid_ds_clean,
    "test": test_ds_clean
})

In [None]:
dataset_sft_dict

DatasetDict({
    train: Dataset({
        features: ['indonesian', 'english'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['indonesian', 'english'],
        num_rows: 100
    })
    test: Dataset({
        features: ['indonesian', 'english'],
        num_rows: 400
    })
})

In [None]:
dataset_sft_dict["train"][5]

{'indonesian': 'Restoran bali yang memiliki konsep makan di sawah. Pemandangannya seperti di desa. Makanannya enak',
 'english': 'A Balinese restaurant with the concept of eating in the ricefields. Scenery resemblant of the villages. The food is excellent.'}

# Test Baseline model

In [None]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=28206fd8ae87b152ee9b67da7fba3ce1c895a527cf40bb0cb2df24ce912ae0f6
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sacrebleu.metrics import CHRF

# Select test set
test_dataset = dataset_sft_dict["test"]

def generate_translation(prompt, max_new_tokens=128):
    model.eval() # Ensure model is in evaluation mode
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

    # Move tensors to GPU
    inputs = {key: value.cuda() for key, value in inputs.items()}

    if "attention_mask" in inputs:
        inputs["attention_mask"] = inputs["attention_mask"].to(model.dtype)


    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    # decode only the generated part
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()

In [None]:
def _get_text_tokenizer(tok):
    return getattr(tok, "tokenizer", tok)

def chat_generate_clean(
    model, tokenizer, prompt,
    system="You are an Indonesian language translator assistant.\n\nWhen the user asks a phrase, only reply with Indonesian:\n **<phrase>** (pronunciation: <...>)",
    max_new_tokens=128, do_sample=False, temperature=0.5, top_p=None
):
    model.eval()
    tok = _get_text_tokenizer(tokenizer)

    # Ensure pad token exists
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    # Prepare chat messages
    msgs = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

    # Apply chat template if available
    apply_ct = getattr(tok, "apply_chat_template", None)
    if callable(apply_ct):
        templated = tok.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
        enc = tok(templated, return_tensors="pt", padding=True, truncation=True)
    else:
        text = f"<|system|>\n{system}\n<|user|>\n{prompt}\n<|assistant|>\n"
        enc = tok(text, return_tensors="pt", padding=True, truncation=True)

    # Generate output
    with torch.no_grad():
        out = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature if do_sample else None,
            top_p=top_p if do_sample else None,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
            return_dict_in_generate=True,
        )

    prompt_len = enc["input_ids"].shape[1]
    new_tokens = out.sequences[0, prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
n_eval = 5

base_prompts = [f"Translate from English to Indonesian Language: {row['english']} Indonesian Result:" for row in test_dataset.select(range(n_eval))]
base_refs = [row["indonesian"] for row in test_dataset.select(range(n_eval))]
basegen_outputs = [generate_translation(p) for p in base_prompts]

In [None]:
pd.DataFrame({
    "Prompt": base_prompts,
    "Reference": base_refs,
    "Output": basegen_outputs
})

Unnamed: 0,Prompt,Reference,Output
0,Translate from English to Indonesian Language:...,"Dekat dengan hotel saya menginap, hanya ditemp...","Di dekat hotel tempat saya menginap, dapat dic..."
1,Translate from English to Indonesian Language:...,"Iya benar, dia sedang jaga warung.","Ya, betul, dia sekarang yang menjaga toko ters..."
2,Translate from English to Indonesian Language:...,Kangkungnya lumayan tapi kepiting saus padangn...,"Selada airnya lumayan, tetapi udang dengan sau..."
3,Translate from English to Indonesian Language:...,Bertempat di braga city walk yang satu gedung ...,"Terletak di dalam Braga City Walk, yang berada..."
4,Translate from English to Indonesian Language:...,Gianyar terima bantuan sosial 2018 sebesar rp ...,"Gianyar menerima total 44,9 miliar Rupiah dari..."


In [None]:
smooth_fn = SmoothingFunction().method1
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
chrf_scorer = CHRF()

# Compute BLEU, ROUGE, chrF
basebleu_scores, baserouge1_scores, baserougeL_scores, basechrF_scores = [], [], [], []

for gen, ref in zip(basegen_outputs, base_refs):
    basebleu_scores.append(sentence_bleu([ref.split()], gen.split(), smoothing_function=smooth_fn))
    baserouge = scorer.score(ref, gen)
    baserouge1_scores.append(baserouge['rouge1'].fmeasure)
    baserougeL_scores.append(baserouge['rougeL'].fmeasure)
    basechrF_scores.append(chrf_scorer.sentence_score(gen, [ref]).score / 100)

# Save results
base_eval = pd.DataFrame({
    "Base Output": basegen_outputs,
    "Reference": base_refs,
    "Base BLEU": basebleu_scores,
    "Base ROUGE-1": baserouge1_scores,
    "Base ROUGE-L": baserougeL_scores,
    "Base chrF": basechrF_scores
})

display(base_eval.describe().T)
# df_eval.to_csv("gemma_nusax_en_id_eval.csv", index=False)
# print("Saved gemma_nusax_en_id_eval.csv")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Base BLEU,5.0,0.019121,0.007249,0.010457,0.01316,0.020256,0.023957,0.027776
Base ROUGE-1,5.0,0.382259,0.146297,0.142857,0.35,0.434783,0.47619,0.507463
Base ROUGE-L,5.0,0.323403,0.12758,0.142857,0.25,0.347826,0.428571,0.447761
Base chrF,5.0,0.424982,0.08829,0.296033,0.381128,0.447528,0.479701,0.520519


# SFT

In [None]:
def format_for_sft(example):
    return {
        "prompt": f"Translate from English to Indonesian language: {example['english']}",
        "completion": example["indonesian"],
        "text": f"Translate {example['english']} to Indonesian: {example['indonesian']}"  # SFTTrainer field
    }

dataset_sft_dict = dataset_sft_dict.map(format_for_sft)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
dataset_sft_dict

DatasetDict({
    train: Dataset({
        features: ['indonesian', 'english', 'prompt', 'completion', 'text'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['indonesian', 'english', 'prompt', 'completion', 'text'],
        num_rows: 100
    })
    test: Dataset({
        features: ['indonesian', 'english', 'prompt', 'completion', 'text'],
        num_rows: 400
    })
})

In [None]:
dataset_sft_dict["train"][5]

{'indonesian': 'Restoran bali yang memiliki konsep makan di sawah. Pemandangannya seperti di desa. Makanannya enak',
 'english': 'A Balinese restaurant with the concept of eating in the ricefields. Scenery resemblant of the villages. The food is excellent.',
 'prompt': 'Translate from English to Indonesian language: A Balinese restaurant with the concept of eating in the ricefields. Scenery resemblant of the villages. The food is excellent.',
 'completion': 'Restoran bali yang memiliki konsep makan di sawah. Pemandangannya seperti di desa. Makanannya enak',
 'text': 'Translate A Balinese restaurant with the concept of eating in the ricefields. Scenery resemblant of the villages. The food is excellent. to Indonesian: Restoran bali yang memiliki konsep makan di sawah. Pemandangannya seperti di desa. Makanannya enak'}

In [None]:
print(dataset_sft_dict["train"].column_names)

['indonesian', 'english', 'prompt', 'completion', 'text']


In [None]:
# Load tokenizer
tok = getattr(tokenizer, "tokenizer", tokenizer)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.padding_side = "right"

In [None]:
# from trl import SFTTrainer, SFTConfig
# from transformers import DataCollatorForLanguageModeling

# # Data collator for causal LM
# collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

# # # SFTTrainer arguments
# # sft_args = SFTConfig(
# #     dataset_text_field="text",
# #     per_device_train_batch_size=4,
# #     gradient_accumulation_steps=4,
# #     warmup_steps=50,
# #     max_steps=500,
# #     learning_rate=1e-4,
# #     logging_steps=50,
# #     optim="adamw_8bit",
# #     weight_decay=0.01,
# #     lr_scheduler_type="linear",
# #     seed=3407,
# #     report_to="none",
# #     padding_free=False,
# #     packing=False,
# #     max_seq_length=512,
# #     remove_unused_columns=True,
# # )

# sft_args = SFTConfig(
#     dataset_text_field="prompt",  # <-- use correct field
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     num_train_epochs=20,
#     learning_rate=1e-4,
#     warmup_steps=50,
#     logging_steps=50,
#     optim="adamw_8bit",
#     weight_decay=0.01,
#     lr_scheduler_type="linear",
#     seed=3407,
#     report_to="none",
#     padding_free=False,
#     packing=False,
#     max_seq_length=512,
#     remove_unused_columns=True,
#     eval_strategy="epoch",
#     logging_strategy="epoch"
# )


# trainer = SFTTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     train_dataset=dataset_sft_dict["train"],
#     eval_dataset=dataset_sft_dict["validation"],
#     args=sft_args,
#     data_collator=collator,
#     formatting_func=None
# )

In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling

# Data collator for causal LM
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

sft_args = SFTConfig(
    dataset_text_field="text",  # <-- use correct field
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=1e-4,
    warmup_steps=50,
    logging_steps=50,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",
    padding_free=False,
    packing=False,
    max_seq_length=512,
    remove_unused_columns=True,
    eval_strategy="epoch",
    logging_strategy="epoch"
)


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_sft_dict["train"],
    eval_dataset=dataset_sft_dict["validation"],
    args=sft_args,
    data_collator=collator,
    formatting_func=None
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Train LoRA adapter
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 10 | Total steps = 70
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 38,497,792 of 4,338,577,264 (0.89% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,4.3608,4.384005
2,4.271,4.056793
3,3.7433,3.484424
4,3.2406,3.105966
5,2.9231,2.725834
6,2.6069,2.47882
7,2.3219,2.17587
8,2.0713,1.930417
9,1.8926,1.782894
10,1.7532,1.721951


Unsloth: Not an error, but Gemma3ForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=70, training_loss=2.918447576250349, metrics={'train_runtime': 285.1826, 'train_samples_per_second': 3.507, 'train_steps_per_second': 0.245, 'total_flos': 2551459375895808.0, 'train_loss': 2.918447576250349, 'epoch': 10.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

best_model_path = "./bestmodel"
trainer.save_model(best_model_path)
tokenizer.save_pretrained(best_model_path)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bestmodel")

model, tokenizer = FastModel.from_pretrained(
    model_name = "bestmodel",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    device_map="auto"
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.10.3: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


# Evaluation

In [None]:
# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# from rouge_score import rouge_scorer
# from sacrebleu.metrics import CHRF

# Select test set
test_dataset = dataset_sft_dict["test"]

def generate_translation(prompt, max_new_tokens=128):
    model.eval() # Ensure model is in evaluation mode
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

    # Move tensors to GPU
    inputs = {key: value.cuda() for key, value in inputs.items()}

    if "attention_mask" in inputs:
        inputs["attention_mask"] = inputs["attention_mask"].to(model.dtype)


    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    # decode only the generated part
    return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()

In [None]:
def _get_text_tokenizer(tok):
    return getattr(tok, "tokenizer", tok)

def chat_generate_clean(
    model, tokenizer, prompt,
    system="You are an Indonesian language translator assistant.\n\nWhen the user asks a phrase, only reply with Indonesian:\n **<phrase>** (pronunciation: <...>)",
    max_new_tokens=128, do_sample=False, temperature=0.5, top_p=None
):
    model.eval()
    tok = _get_text_tokenizer(tokenizer)

    # Ensure pad token exists
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    # Prepare chat messages
    msgs = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

    # Apply chat template if available
    apply_ct = getattr(tok, "apply_chat_template", None)
    if callable(apply_ct):
        templated = tok.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
        enc = tok(templated, return_tensors="pt", padding=True, truncation=True)
    else:
        text = f"<|system|>\n{system}\n<|user|>\n{prompt}\n<|assistant|>\n"
        enc = tok(text, return_tensors="pt", padding=True, truncation=True)

    # Generate output
    with torch.no_grad():
        out = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature if do_sample else None,
            top_p=top_p if do_sample else None,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
            return_dict_in_generate=True,
        )

    prompt_len = enc["input_ids"].shape[1]
    new_tokens = out.sequences[0, prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
n_eval = 5

prompts = [f"Translate from English to Indonesian Language: {row['english']} Indonesian Result:" for row in test_dataset.select(range(n_eval))]
refs = [row["indonesian"] for row in test_dataset.select(range(n_eval))]
gen_outputs = [generate_translation(p) for p in prompts]

In [None]:
pd.DataFrame({
    "Prompt": prompts,
    "Reference": refs,
    "Output": gen_outputs
})

Unnamed: 0,Prompt,Reference,Output
0,Translate from English to Indonesian Language:...,"Dekat dengan hotel saya menginap, hanya ditemp...","Dekat hotel yang saya tinggali, bisa dengan ka..."
1,Translate from English to Indonesian Language:...,"Iya benar, dia sedang jaga warung.","Ya betul, dia sekarang menjaga toko tersebut. ..."
2,Translate from English to Indonesian Language:...,Kangkungnya lumayan tapi kepiting saus padangn...,Sot kecombrang oke cuma saos udang padang yang...
3,Translate from English to Indonesian Language:...,Bertempat di braga city walk yang satu gedung ...,"Terletak di dalam braga city walk, yang sama d..."
4,Translate from English to Indonesian Language:...,Gianyar terima bantuan sosial 2018 sebesar rp ...,"Gianyar mendapatkan total 44,9 miliar rupiah d..."


In [None]:
smooth_fn = SmoothingFunction().method1
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
chrf_scorer = CHRF()

# Compute BLEU, ROUGE, chrF
bleu_scores, rouge1_scores, rougeL_scores, chrF_scores = [], [], [], []

for gen, ref in zip(gen_outputs, refs):
    bleu_scores.append(sentence_bleu([ref.split()], gen.split(), smoothing_function=smooth_fn))
    rouge = scorer.score(ref, gen)
    rouge1_scores.append(rouge['rouge1'].fmeasure)
    rougeL_scores.append(rouge['rougeL'].fmeasure)
    chrF_scores.append(chrf_scorer.sentence_score(gen, [ref]).score / 100)

# Save results
df_eval = pd.DataFrame({
    "Base Output": basegen_outputs,
    "Finetuned Output": gen_outputs,
    "Reference": refs,
    "BLEU": bleu_scores,
    "ROUGE-1": rouge1_scores,
    "ROUGE-L": rougeL_scores,
    "chrF": chrF_scores
})

display(df_eval.describe().T)
# df_eval.to_csv("gemma_nusax_en_id_eval.csv", index=False)
# print("Saved gemma_nusax_en_id_eval.csv")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BLEU,5.0,0.02397,0.038986,0.001862,0.003,0.006981,0.014898,0.093109
ROUGE-1,5.0,0.176872,0.120995,0.019417,0.114943,0.166667,0.25,0.333333
ROUGE-L,5.0,0.174308,0.116918,0.019417,0.114943,0.166667,0.25,0.320513
chrF,5.0,0.278839,0.156243,0.072262,0.158492,0.357079,0.361668,0.444694


In [None]:
# base_eval = pd.DataFrame({
#     "Base Output": basegen_outputs,
#     "Reference": base_refs,
#     "Base BLEU": basebleu_scores,
#     "Base ROUGE-1": baserouge1_scores,
#     "Base ROUGE-L": baserougeL_scores,
#     "Base chrF": basechrF_scores
# })

display(base_eval.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Base BLEU,5.0,0.019121,0.007249,0.010457,0.01316,0.020256,0.023957,0.027776
Base ROUGE-1,5.0,0.382259,0.146297,0.142857,0.35,0.434783,0.47619,0.507463
Base ROUGE-L,5.0,0.323403,0.12758,0.142857,0.25,0.347826,0.428571,0.447761
Base chrF,5.0,0.424982,0.08829,0.296033,0.381128,0.447528,0.479701,0.520519


# cendol-llama2-7b-inst

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# pipe = pipeline("text-generation", model="indonlp/cendol-llama2-7b-inst")

pipe = pipeline(
    "text-generation",
    model="indonlp/cendol-llama2-7b-inst",
    device_map="auto",
    load_in_8bit=True   # saves VRAM
)


config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("indonlp/cendol-llama2-7b-inst")
model = AutoModelForCausalLM.from_pretrained("indonlp/cendol-llama2-7b-inst")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
tokenizer.pad_token = tokenizer.eos_token  # set once

def translate(text, max_new_tokens=64):
    # Use the input text properly
    prompt = f"Terjemahkan kalimat bahasa Inggris ke bahasa Indonesia dan sertakan cara pengucapan fonetiknya:\n{text}\nIndonesian:"

    # ✅ Tokenize the *single* string, not 'prompts' list
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    # ✅ Decode the whole output sequence
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation.strip()


In [None]:
prompts

['Translate from English to Indonesian Language: Near the hotel I stayed in, reachable by foor, so many food choice here, the place is huge, and fun Indonesian Result:',
 "Translate from English to Indonesian Language: Yeah that's right, he's looking after the store now Indonesian Result:",
 'Translate from English to Indonesian Language: The water spinach was alright but the crab with Padang sauce was disappointing. We were given a hollow crab. In the end we decided not to eat the crab and returned it. Indonesian Result:',
 'Translate from English to Indonesian Language: Located inside the Braga City Walk, which is in the same building as Aston and Fave Hotel, this is the perfect hangout spot. The coffee-tea mix that I tried for the first time was actually amazing. Combined with a sunny-side up egg and you got yourself the perfect meal for chatting with your friends. The smoke-free zone just adds to the comforting feeling as you watch the view of people coming and going in this mall I

In [None]:
cendolbase = [translate(p) for p in prompts]

In [None]:
df_eval = pd.DataFrame({
    "Prompt": prompts,
    "Base Output": basegen_outputs,
    "Finetuned Output": gen_outputs,
    "Reference": refs,
    "Cendol": cendolbase
})

display(df_eval)

Unnamed: 0,Prompt,Base Output,Finetuned Output,Reference,Cendol
0,Translate from English to Indonesian Language:...,"Di dekat hotel tempat saya menginap, dapat dic...","Dekat hotel yang saya tinggali, bisa dengan ka...","Dekat dengan hotel saya menginap, hanya ditemp...",Terjemahkan kalimat bahasa Inggris ke bahasa I...
1,Translate from English to Indonesian Language:...,"Ya, betul, dia sekarang yang menjaga toko ters...","Ya betul, dia sekarang menjaga toko tersebut. ...","Iya benar, dia sedang jaga warung.",Terjemahkan kalimat bahasa Inggris ke bahasa I...
2,Translate from English to Indonesian Language:...,"Selada airnya lumayan, tetapi udang dengan sau...",Sot kecombrang oke cuma saos udang padang yang...,Kangkungnya lumayan tapi kepiting saus padangn...,Terjemahkan kalimat bahasa Inggris ke bahasa I...
3,Translate from English to Indonesian Language:...,"Terletak di dalam Braga City Walk, yang berada...","Terletak di dalam braga city walk, yang sama d...",Bertempat di braga city walk yang satu gedung ...,Terjemahkan kalimat bahasa Inggris ke bahasa I...
4,Translate from English to Indonesian Language:...,"Gianyar menerima total 44,9 miliar Rupiah dari...","Gianyar mendapatkan total 44,9 miliar rupiah d...",Gianyar terima bantuan sosial 2018 sebesar rp ...,Terjemahkan kalimat bahasa Inggris ke bahasa I...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

