# **SFT Evaluation**

## **Import Libraries**

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"  # Sesuaikan dengan GPU yang tersedia

In [2]:
import gc
import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate
import time

# Konfigurasi lingkungan
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

pd.set_option("display.max_colwidth", None)

# Token Hugging Face dan direktori cache
hf_token = "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"
cache_folder = "../model_cache"

  from .autonotebook import tqdm as notebook_tqdm


[2025-05-03 04:54:17,617] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [3]:
# Deteksi perangkat
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    n_gpus = torch.cuda.device_count()
    print(f"Detected {n_gpus} CUDA device(s):")
    for i in range(n_gpus):
        name = torch.cuda.get_device_name(i)
        print(f"  • GPU {i}: {name}")

Using device: cuda
Detected 6 CUDA device(s):
  • GPU 0: NVIDIA A100-SXM4-40GB
  • GPU 1: NVIDIA A100-SXM4-40GB
  • GPU 2: NVIDIA A100-SXM4-40GB
  • GPU 3: NVIDIA A100-SXM4-40GB
  • GPU 4: NVIDIA A100-SXM4-40GB
  • GPU 5: NVIDIA A100-SXM4-40GB


## **Load Dataset**

In [4]:
dataset_path = "../datasets/cqa_test.jsonl"
qa_df = pd.read_json(dataset_path, lines=True)
qa_df.sample(1)

Unnamed: 0,context,question,answer,file_url,regulation_number,title,filename,n_pairs_requested
21,"Peraturan ini juga menetapkan sanksi administratif bagi pihak yang melanggar ketentuan yang berlaku. Jika terjadi pelanggaran, Otoritas Jasa Keuangan dapat memberikan sanksi berupa peringatan tertulis, denda, pembatasan kegiatan usaha, hingga pencabutan izin usaha. Sanksi ini bertujuan untuk menegakkan kepatuhan terhadap peraturan dan menjaga integritas pasar modal.",Apa saja sanksi yang dapat dijatuhkan kepada pihak yang melanggar peraturan ini?,"Sanksi yang dapat dijatuhkan termasuk peringatan tertulis, denda, pembatasan kegiatan usaha, hingga pencabutan izin usaha bagi pihak yang melanggar ketentuan.",https://www.ojk.go.id/id/regulasi/Documents/Pages/Transaksi-Efek/pojk%2022-2019.pdf,22/POJK.04/2019,Transaksi Efek,ojk-peraturan_ojk-22_pojk_04_2019-12092019-transaksi_efek.pdf,6


## **Load Models and Metrics**

In [5]:
# Inisialisasi metrik
em     = evaluate.load("exact_match")
rouge  = evaluate.load("rouge")
bleu   = evaluate.load("bleu")
meteor = evaluate.load("meteor")

# Load tokenizer & SFT-adapter model
model_id   = "sft_mp_final"  # folder di mana adapter tersimpan
# model_id   = "sft_dp_final"  # folder di mana adapter tersimpan
base_id    = "SeaLLMs/SeaLLMs-v3-7B"   # untuk tokenizer
tokenizer  = AutoTokenizer.from_pretrained(base_id, cache_dir=cache_folder, use_fast=True, token=hf_token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_folder,   # tambahkan ini
    torch_dtype=torch.float16,
    device_map="auto"
)
model.eval()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/llmsosmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/llmsosmed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/llmsosmed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00,  1.15it/s]


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=3584, out_features=3584, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3584, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=3584, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=3584, out_features=512, bi

## **Inference**

In [6]:
# Setup penyimpanan hasil
metrics_file = "sft_evaluation_metrics.csv"
if not os.path.exists(metrics_file):
    pd.DataFrame(columns=[
        "model", "exact_match", "rouge1_f1", "rouge2_f1",
        "rougeL_f1", "bleu", "meteor", "inference_time_sec"
    ]).to_csv(metrics_file, index=False)

preds, refs, details = [], [], []
batch_size = 16

# Warm-up (optional)
dummy = tokenizer("Warm up", return_tensors="pt", padding=True, truncation=True, max_length=32).to(model.device)
with torch.no_grad():
    _ = model.generate(**dummy, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)

# Mulai timer
start_time = time.time()

pbar = tqdm(total=len(qa_df), desc="SFT-Eval")
for i in range(0, len(qa_df), batch_size):
    batch = qa_df.iloc[i : i + batch_size]
    prompts = [
        f"{r.context.strip()}\n\nPertanyaan: {r.question.strip()}\nJawaban:"
        for r in batch.itertuples()
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=382)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            temperature=1.0,
            top_p=1.0,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    for prompt, full in zip(prompts, decoded):
        preds.append(full[len(prompt):].strip().split("\n")[0])

    refs.extend(batch.answer.str.strip().tolist())
    for idx, r in enumerate(batch.itertuples()):
        details.append({
            "context":       r.context,
            "question":      r.question,
            "ground_truth":  r.answer,
            "prediction":    preds[-len(batch) + idx]
        })

    pbar.update(len(batch))
pbar.close()

# Hitung waktu
inference_time = time.time() - start_time


SFT-Eval: 100%|██████████| 86/86 [00:18<00:00,  4.77it/s]


In [7]:
# 1) Compute
r_em     = em.compute(predictions=preds, references=refs)
r_rouge  = rouge.compute(predictions=preds, references=refs)
r_bleu   = bleu.compute(predictions=preds, references=[[r] for r in refs])
r_meteor = meteor.compute(predictions=preds, references=refs)

# 2) Append ke CSV
row = {
    "model":              "SFT-SeaLLMs-v3-7B-LoRA",
    "exact_match":        r_em["exact_match"],
    "rouge1_f1":          r_rouge["rouge1"],
    "rouge2_f1":          r_rouge["rouge2"],
    "rougeL_f1":          r_rouge["rougeL"],
    "bleu":               r_bleu["bleu"],
    "meteor":             r_meteor["meteor"],
    "inference_time_sec": inference_time
}
pd.DataFrame([row]).to_csv(metrics_file, mode="a", header=False, index=False)

# 3) Simpan detail prediksi
pd.DataFrame(details).to_json(
    "sft_detailed_predictions.jsonl",
    orient="records",
    lines=True
)

## **Results**

In [8]:
# 1) Summary Metrics
df_metrics = pd.read_csv(metrics_file)
print("\n=== Summary Metrics ===")
print(df_metrics.to_markdown(index=False))


=== Summary Metrics ===
| model                  |   exact_match |   rouge1_f1 |   rouge2_f1 |   rougeL_f1 |     bleu |   meteor |   inference_time_sec |
|:-----------------------|--------------:|------------:|------------:|------------:|---------:|---------:|---------------------:|
| SFT-SeaLLMs-v3-7B-LoRA |             0 |    0.681486 |    0.526719 |      0.6298 | 0.443512 | 0.665771 |              18.0179 |


In [29]:
# 2) Combine a few examples across models
# load base model’s detailed predictions
df_base = pd.read_json(
    "../base-evaluation/detailed_SeaLLMs-v3-7B.jsonl", 
    lines=True
)[["context","question","ground_truth","SeaLLMs-v3-7B"]]
df_base = df_base.rename(
    columns={"SeaLLMs-v3-7B": "prediction_base"}
)

# load your SFT predictions
df_sft = pd.read_json(
    "sft_detailed_predictions.jsonl", 
    lines=True
)[["context","question","ground_truth","prediction"]]
df_sft = df_sft.rename(
    columns={"prediction": "prediction_sft"}
)

# merge them
merged = df_base.merge(
    df_sft,
    on=["context","question","ground_truth"],
    how="inner"
)

print("\n=== Combined Predictions (3 Examples) ===")
display(
    merged[22:23]
    .rename(columns={"ground_truth": "ground_truth_answer"})
)


=== Combined Predictions (3 Examples) ===


Unnamed: 0,context,question,ground_truth_answer,prediction_base,prediction_sft
22,"Peraturan Otoritas Jasa Keuangan Republik Indonesia Nomor 32/POJK.04/2014 mengatur mengenai Rapat Umum Pemegang Saham (RUPS) untuk Perusahaan Terbuka. RUPS adalah organ penting dalam tata kelola perusahaan yang berperan untuk melindungi hak-hak pemegang saham. RUPS terdiri dari RUPS tahunan dan RUPS lainnya, dan wajib diselenggarakan sesuai dengan ketentuan yang berlaku dalam waktu tertentu setelah tahun buku berakhir.",Apa yang diatur dalam Peraturan Otoritas Jasa Keuangan Nomor 32/POJK.04/2014?,"Peraturan ini mengatur penyelenggaraan Rapat Umum Pemegang Saham (RUPS) untuk Perusahaan Terbuka, termasuk ketentuan mengenai RUPS tahunan dan lainnya serta perlindungan hak-hak pemegang saham.",Peraturan Otoritas Jasa Keuangan Nomor 32/POJK.04/2014 mengatur mengenai Rapat Umum Pemegang Saham (RUPS) untuk Perusahaan Terbuka. RUPS adalah organ penting dalam tata kelola,"Peraturan tersebut mengatur mengenai Rapat Umum Pemegang Saham (RUPS) untuk Perusahaan Terbuka, termasuk RUPS tahunan dan RUPS lainnya, serta waktu dan ketentuan penyelenggaraannya."
