In [None]:
%%capture
!pip install lm-eval==0.4.9.1
!pip install autoawq

In [None]:
import json
import os
import re

import lm_eval
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Ask the PyTorch CUDA caching allocator to use expandable segments for this process.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# mcq: multiple-choice question accuracy (lm-eval)

In [None]:
# lm-eval task definition for the multiple-choice subset, written to ./mcq.yaml so the
# `lm_eval --include_path ./ --tasks mcq` call can find it.
#
# The literal MUST be a raw string. Without the `r` prefix Python expands every `\n` before
# the text is written, so the YAML double-quoted scalar contains physical line breaks. YAML
# folds those (a single break becomes a space, a blank line becomes one newline), which
# silently puts all choices on one line. With the raw string the `\n` escapes reach the YAML
# parser intact and are decoded there as real newlines.
mcq_task = r"""
task: mcq
dataset_path: vohuutridung/Public-Test
dataset_name: mcq
output_type: multiple_choice
validation_split: train
doc_to_text: "Câu hỏi: {{question}}\n\nCác lựa chọn:\n{% for c in choices %}{{ loop.index0 }}. {{ c }}\n{% endfor %}\n\nĐáp án đúng là:"
doc_to_target: answer
doc_to_choice: choices
metric_list:
  - metric: acc

"""
# The prompt contains Vietnamese text, so do not rely on the platform's default encoding.
with open("mcq.yaml", "w", encoding="utf-8") as f:
    f.write(mcq_task)

In [None]:
# Run lm-eval's command-line entry point on the custom `mcq` task: the `hf` backend loads the
# base checkpoint with the PEFT adapter on top, and `--include_path ./` points lm-eval at the
# working directory, where mcq.yaml was written.
%env LOGLEVEL=INFO
!lm_eval \
    --model hf \
    --model_args pretrained=vohuutridung/qwen3-1.7b-legal-pretrain,peft=vohuutridung/merged3-v8 \
    --include_path ./ \
    --tasks mcq

# nli: natural language inference accuracy (lm-eval)

In [None]:
# lm-eval task definition for the NLI subset, written to ./nli.yaml so the
# `lm_eval --include_path ./ --tasks nli` call can find it.
#
# The literal MUST be a raw string. Without the `r` prefix Python expands every `\n` before
# the text is written, so the YAML double-quoted scalar contains physical line breaks that
# YAML folds (each intended blank line between prompt sections collapses to a single newline).
# With the raw string the `\n` escapes reach the YAML parser intact and are decoded there.
nli_task = r"""
task: nli
dataset_path: vohuutridung/Public-Test
dataset_name: nli
output_type: multiple_choice
validation_split: train
doc_to_text: "Đoạn luật sau đây: {{legal_document}}\n\nCâu hỏi chi tiết: {{specific_question}}\n\nCâu hỏi: {{question}}\n\nĐáp án đúng là:"
doc_to_target: answer
doc_to_choice: choices
metric_list:
  - metric: acc
"""
# The prompt contains Vietnamese text, so do not rely on the platform's default encoding.
with open('nli.yaml', 'w', encoding='utf-8') as f:
    f.write(nli_task)

In [None]:
# Run lm-eval's command-line entry point on the custom `nli` task: the `hf` backend loads the
# base checkpoint with the PEFT adapter on top, and `--include_path ./` points lm-eval at the
# working directory, where nli.yaml was written.
%env LOGLEVEL=INFO
!lm_eval \
    --model hf \
    --model_args pretrained=vohuutridung/qwen3-1.7b-legal-pretrain,peft=vohuutridung/merged3-v8 \
    --include_path ./ \
    --tasks nli

# sqa: free-form legal answers scored by an LLM judge

In [None]:
# Prompt sent to the judge model. `judge()` fills the three placeholders via str.format:
#   {question} - the legal question
#   {answer}   - the ground-truth expert answer
#   {response} - the answer generated by the model under evaluation
# The judge is instructed to reply with a single number from 1 to 10 and nothing else.
# The leading newline and the 4-space indentation are part of the text the judge receives.
TEMPLATE = """
    You are a neutral and highly reliable legal judge AI.
    Your responsibility is to evaluate a model-generated legal reasoning based on Vietnamese law using strict, unbiased, reference-grounded criteria.
    
    IMPORTANT RULES:
    - You MUST NOT hallucinate legal rules not present in the question or the ground truth.
    - You MUST NOT evaluate based on style, wording, or length.
    - Your evaluation must be based ONLY on legal correctness, logical reasoning, and factual alignment with the ground truth.
    - You must avoid position bias: do NOT favor the ground truth blindly; evaluate logically.
    - You must avoid verbosity bias: short or long answers are NOT penalized.
    - You must avoid semantic drift: stay anchored to the question and ground truth.

    You will be given:
    1. The legal question
    2. The ground-truth expert answer
    3. The model-generated answer

    ---QUESTION---
    {question}
    ---END QUESTION---

    --- GROUND TRUTH ANSWER ---
    {answer}
    --- END GROUND TRUTH ANSWER ---

    --- MODEL ANSWER ---
    {response}
    --- END MODEL ANSWER ---

    Your evaluation criteria (equal weight):
    1. **Major Premise Accuracy** – correct identification of relevant legal norms  
    2. **Minor Premise Accuracy** – correct interpretation of facts  
    3. **Logical Structure (Syllogism)** – correct linkage between premises and conclusion  
    4. **Legal Compliance** – aligns with Vietnamese law  
    5. **Faithfulness** – no contradictions with ground truth reasoning  
    6. **Absence of Hallucination** – does not invent legal facts, rules, or conclusions  
    7. **Completeness** – fully addresses the question  
    8. **Clarity** – reasoning is understandable and coherent 

    Provide a **score from 1 to 10**.
    Your response MUST be ONLY the number.
    """

In [None]:
# --------------------------------------------EVAL---------------------------------------------------
def prepare_eval_model(
    model_name='vohuutridung/qwen3-1.7b-legal-pretrain',
    adapter='vohuutridung/merged3-v8',
):
    """Load the model under evaluation together with its tokenizer.

    Args:
        model_name: Hub id or local path of the base causal LM. The tokenizer is
            loaded from the same location.
        adapter: Hub id or local path of a PEFT adapter to apply on top of the base
            model. Pass ``None`` or ``""`` to evaluate the base model alone.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to eval mode.
    """
    eval_tokenizer = AutoTokenizer.from_pretrained(model_name)
    eval_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="balanced",
    )
    if adapter:
        eval_model = PeftModel.from_pretrained(eval_model, adapter)
        # Fold the adapter into the base weights so generation runs on a plain model.
        eval_model = eval_model.merge_and_unload()
        print('Merged lora adapter to base model for faster inference.')

    eval_model.eval()
    return eval_model, eval_tokenizer


def prepare_eval_dataset():
    """Load the ``train`` split of the ``sqa`` config of ``vohuutridung/Public-Test``.

    Prints the dataset summary before returning the dataset.
    """
    sqa_split = load_dataset('vohuutridung/Public-Test', name='sqa', split='train')
    print(sqa_split)
    return sqa_split


def generate_response(model, tokenizer, question):
    """Generate the evaluated model's answer to a single legal question.

    The question is embedded in a fixed Vietnamese instruction prompt (plain text,
    no chat template) and decoded greedily for at most 1024 new tokens.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        question: The question / scenario text to answer.

    Returns:
        The generated continuation only (prompt tokens removed), with special
        tokens skipped and surrounding whitespace stripped.
    """
    instructions = "".join([
        "Bạn là một chuyên gia pháp luật Việt Nam.\n",
        "Nhiệm vụ của bạn là phân tích tình huống và áp dụng quy định pháp luật hiện hành ",
        "để suy luận theo kiểu suy luận logic (syllogism) và đưa ra kết luận rõ ràng.\n\n",
        "YÊU CẦU:\n",
        "- Trình bày lập luận theo từng ý rõ ràng (có thể chia thành: Tiền đề lớn, Tiền đề nhỏ, Kết luận).\n",
        "- Dẫn chiếu điều luật, nghị định, văn bản pháp luật liên quan nếu có.\n",
        "- Cuối cùng nêu KẾT LUẬN rõ ràng về quyền, nghĩa vụ hoặc kết quả pháp lý trong tình huống.\n\n",
    ])
    prompt = f"{instructions}TÌNH HUỐNG / CÂU HỎI:\n{question}\n\nTRẢ LỜI:\n"

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded.input_ids.shape[1]

    # Greedy decoding; EOS doubles as the padding token id.
    generation_kwargs = dict(
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    # Drop the echoed prompt tokens so only the newly generated part is decoded.
    completion_ids = sequences[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# --------------------------------------------JUDGE---------------------------------------------------
def prepare_judge_model():
    """Load the judge LLM (``Qwen/Qwen3-32B-AWQ``) and its tokenizer.

    Returns:
        A ``(judge_model, judge_tokenizer)`` tuple; the model is in eval mode.
    """
    checkpoint = "Qwen/Qwen3-32B-AWQ"
    load_kwargs = {"torch_dtype": "auto", "device_map": 'balanced'}

    tok = AutoTokenizer.from_pretrained(checkpoint)
    llm = AutoModelForCausalLM.from_pretrained(checkpoint, **load_kwargs)
    llm.eval()
    return llm, tok


def judge(model, tokenizer, q, a, r):
    """Ask the judge model to grade one answer and return its raw reply.

    Args:
        model: Judge causal LM.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        q: The legal question.
        a: The ground-truth answer.
        r: The answer produced by the model under evaluation.

    Returns:
        The judge's decoded reply (at most 16 new tokens, greedy), stripped of
        surrounding whitespace. The text is not parsed here.
    """
    chat = [{"role": "user", "content": TEMPLATE.format(question=q, answer=a, response=r)}]
    # Thinking mode is switched off so the reply starts directly with the score.
    chat_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    batch = tokenizer([chat_text], return_tensors="pt").to(model.device)
    prompt_len = batch.input_ids.shape[1]

    sequences = model.generate(**batch, max_new_tokens=16, do_sample=False)

    # Keep only the tokens generated after the prompt.
    reply_ids = sequences[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

In [None]:
# Load the model under evaluation and the sqa split, then generate one answer per question.
eval_model, eval_tokenizer = prepare_eval_model()
dataset = prepare_eval_dataset()

# Each record keeps the question, the reference answer and the model's response
# so the judging step can run without the evaluated model.
eval_dataset = []
for item in tqdm(dataset):
    question, answer = item['question'], item['answer']
    response = generate_response(eval_model, eval_tokenizer, question)
    eval_dataset.append(dict(question=question, answer=answer, response=response))

print(f'There are {len(eval_dataset)} eval samples.')

In [None]:
# Score every generated answer with the judge model.
judge_model, judge_tokenizer = prepare_judge_model()

# The judge is asked for a bare number but may still answer "8/10", "Score: 8", etc.
# Take the first standalone integer in 1..10 instead of calling int() on the raw reply,
# which would abort the whole run on the first malformed reply.
score_pattern = re.compile(r"(?<!\d)(10|[1-9])(?!\d)")

scores = []
unparsed = []  # (sample index, raw judge reply) for replies without a usable score
for idx, sample in enumerate(tqdm(eval_dataset)):
    q, a, r = sample['question'], sample['answer'], sample['response']
    output = judge(judge_model, judge_tokenizer, q, a, r)
    match = score_pattern.search(output)
    if match is None:
        # Leave the sample out of the average rather than inventing a score for it.
        unparsed.append((idx, output))
        continue
    scores.append(int(match.group(1)))

if unparsed:
    print(f'Skipped {len(unparsed)} judge replies without a score in 1-10, e.g. {unparsed[:5]}')

In [None]:
# Mean judge score (1-10 scale) rescaled by a factor of 10 and rounded to 4 decimals.
mean_score = sum(scores) / len(scores)
round(mean_score * 10, 4)