# **Base Evaluation with Data Parallelism**

## **Import Libraries**

In [1]:
import os
import gc
import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import Accelerator
import evaluate

# Inisialisasi Accelerator
accelerator = Accelerator()
device = accelerator.device

# Token Hugging Face dan direktori cache
hf_token = "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"
cache_folder = "./model_cache"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Deteksi jumlah GPU
n_gpus = torch.cuda.device_count()
print(f"Detected {n_gpus} CUDA device(s).")

Detected 8 CUDA device(s).


## **Load Dataset**

In [3]:
dataset_path = "cqa_datasets.jsonl"
qa_df = pd.read_json(dataset_path, lines=True)
qa_df.sample(1)

Unnamed: 0,context,question,answer,file_url,regulation_number,title,filename,n_pairs_requested
2,Peraturan juga mencakup kewajiban bank untuk m...,Apa kewajiban bank terkait lembaga central cou...,Bank diwajibkan untuk memperhitungkan eksposur...,https://www.ojk.go.id/id/regulasi/Documents/Pa...,27 Tahun 2022,Perubahan Kedua Atas Peraturan Otoritas Jasa K...,ojk-peraturan_ojk-27_tahun_2022-28122022-perub...,3


## **Load Models and Metrics**

In [4]:
# Daftar model
model_names = {
    "Meta-Llama-3.1-8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Aya-23-8B":         "CohereLabs/aya-23-8B",
    "SeaLLMs-v3-7B":     "SeaLLMs/SeaLLMs-v3-7B",
    "Sahabat-AI-8B":     "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct"
}

# File untuk menyimpan metrik
metrics_file = "evaluation_metrics.csv"
if not os.path.exists(metrics_file):
    pd.DataFrame(columns=[
        "model", "exact_match", "rouge1_f1", "rouge2_f1", "rougeL_f1", "bleu", "meteor"
    ]).to_csv(metrics_file, index=False)

# Inisialisasi metrik
em = evaluate.load("exact_match")
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/llmsosmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/llmsosmed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/llmsosmed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## **Inference**

In [5]:
# Proses inferensi untuk setiap model
for model_key, model_id in model_names.items():
    print(f"\n▶ Evaluating {model_key}")

    # Load tokenizer dan model
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        cache_dir=cache_folder,
        use_fast=True,
        token=hf_token
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        cache_dir=cache_folder,
        token=hf_token
    )

    # Siapkan model dan tokenizer dengan Accelerator
    model, tokenizer = accelerator.prepare(model, tokenizer)
    model.eval()

    preds, refs, details = [], [], []
    batch_size = 8
    pbar = tqdm(total=len(qa_df), desc=model_key)

    for i in range(0, len(qa_df), batch_size):
        batch = qa_df.iloc[i: i + batch_size]
        prompts = [
            row.context.strip() + "\n\nPertanyaan: " + row.question.strip() + "\nJawaban:"
            for row in batch.itertuples()
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=False,
                temperature=1.0,
                top_p=1.0,
                pad_token_id=tokenizer.eos_token_id
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for prompt, full in zip(prompts, decoded):
            preds.append(full[len(prompt):].strip().split("\n")[0])

        refs.extend(batch.answer.str.strip().tolist())
        for idx, row in enumerate(batch.itertuples()):
            details.append({
                "context":      row.context,
                "question":     row.question,
                "ground_truth": row.answer,
                model_key:      preds[i + idx]
            })

        pbar.update(len(batch))
    pbar.close()

    # Hitung metrik
    r_em = em.compute(predictions=preds, references=refs)
    r_rouge = rouge.compute(predictions=preds, references=refs)
    r_bleu = bleu.compute(predictions=preds, references=[[r] for r in refs])
    r_meteor = meteor.compute(predictions=preds, references=refs)

    row = {
        "model":       model_key,
        "exact_match": r_em["exact_match"],
        "rouge1_f1":   r_rouge["rouge1"],
        "rouge2_f1":   r_rouge["rouge2"],
        "rougeL_f1":   r_rouge["rougeL"],
        "bleu":        r_bleu["bleu"],
        "meteor":      r_meteor["meteor"]
    }
    pd.DataFrame([row]).to_csv(metrics_file, mode="a", header=False, index=False)

    # Simpan detail hasil
    pd.DataFrame(details).to_json(f"detailed_{model_key}.jsonl", orient="records", lines=True)
    print(f"→ Saved detailed_{model_key}.jsonl")

    # Bersihkan VRAM
    del model, tokenizer, inputs, outputs, decoded, preds, refs, details
    torch.cuda.empty_cache()
    gc.collect()


▶ Evaluating Meta-Llama-3.1-8B


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.28it/s]
Meta-Llama-3.1-8B: 100%|██████████| 34/34 [00:13<00:00,  2.55it/s]


→ Saved detailed_Meta-Llama-3.1-8B.jsonl

▶ Evaluating Aya-23-8B


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 90.96it/s]
Aya-23-8B: 100%|██████████| 34/34 [00:12<00:00,  2.81it/s]


→ Saved detailed_Aya-23-8B.jsonl

▶ Evaluating SeaLLMs-v3-7B


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 7/7 [00:02<00:00,  2.84it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 130.00 MiB. GPU 0 has a total capacity of 39.39 GiB of which 14.81 MiB is free. Process 347256 has 2.31 GiB memory in use. Including non-PyTorch memory, this process has 37.05 GiB memory in use. Of the allocated memory 36.36 GiB is allocated by PyTorch, and 203.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## **Results**

In [None]:
# 1) Summary Metrics
df_metrics = pd.read_csv(metrics_file)
print("\n=== Summary Metrics ===")
print(df_metrics.to_markdown(index=False))

In [None]:
merged = None
for model_key in model_names:
    df = pd.read_json(f"detailed_{model_key}.jsonl", lines=True)
    cols = ["context","question","ground_truth",model_key]
    df = df[cols]
    merged = df if merged is None else merged.merge(
        df,
        on=["context","question","ground_truth"],
        how="outer"
    )
    
print("\n=== Combined Predictions (3 Examples) ===")
merged.head(3).rename(columns={"ground_truth":"ground_truth_answer"})