In [None]:
# ============================================================
# self_verification.py (v2)
# Performs 4-sample self-verification using Llama3
# ============================================================
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import pandas as pd
import torch
import transformers
from huggingface_hub import notebook_login
from tqdm import tqdm

In [None]:
# Log in to Hugging Face using notebook_login()

In [None]:
!hf auth whoami

In [None]:
# Evaluation datasets, shared as a Google Drive archive:
# https://drive.google.com/file/d/1Rc_vefQY5I_Ou4nxqv9S1tyshpjPPsBM/view?usp=sharing
# The archive is fetched with gdown and unpacked into ./dataset.
!pip install -q gdown
!gdown --id 1Rc_vefQY5I_Ou4nxqv9S1tyshpjPPsBM # Download the dataset
!unzip dataset.zip -d ./dataset

Downloading...
From: https://drive.google.com/uc?id=1Rc_vefQY5I_Ou4nxqv9S1tyshpjPPsBM
To: /content/dataset.zip
100% 6.95M/6.95M [00:00<00:00, 18.6MB/s]
Archive:  dataset.zip
  inflating: ./dataset/cnli_short.jsonl  
  inflating: ./dataset/coqa_short.jsonl  
  inflating: ./dataset/narrative_qa_short.jsonl  
  inflating: ./dataset/qasper_short.jsonl  
  inflating: ./dataset/quality_short.jsonl  


In [None]:
# Baseline model outputs, shared as a Google Drive archive:
# https://drive.google.com/file/d/1GSv7tfKn7x8F_YIDQChic-REldQvGJWS/view?usp=drive_link
# The archive is fetched with gdown and unpacked into ./outputs.
!pip install -q gdown
!gdown --id 1GSv7tfKn7x8F_YIDQChic-REldQvGJWS # Download the baseline outputs (outputs.zip)
!unzip outputs.zip -d ./outputs

Downloading...
From: https://drive.google.com/uc?id=1GSv7tfKn7x8F_YIDQChic-REldQvGJWS
To: /content/outputs.zip
100% 215k/215k [00:00<00:00, 126MB/s]
Archive:  outputs.zip
  inflating: ./outputs/baseline_output_cnli_short.json  
  inflating: ./outputs/baseline_output_coqa_short.json  
  inflating: ./outputs/baseline_output_narrative_qa_short.json  
  inflating: ./outputs/baseline_output_qasper_short.json  


In [None]:
# ============================================================
# Configurations
# ============================================================

# Dataset to verify; every path below is derived from this name.
DATASET_NAME = "narrative_qa_short"  # coqa_short / qasper_short / narrative_qa_short / cnli_short
DATASET_PATH = f"dataset/{DATASET_NAME}.jsonl"  # input examples (read as JSON Lines)
BASELINE_PATH = f"outputs/baseline_output_{DATASET_NAME}.json"  # baseline predictions to be verified
VERIF_OUTPUT_PATH = f"verification/self_ver_{DATASET_NAME}.json"  # destination of the confidence scores

# Create the output directory up front so the final save does not fail on a missing folder.
os.makedirs(os.path.dirname(VERIF_OUTPUT_PATH), exist_ok=True)


In [None]:
# ============================================================
# load original dataset and baseline outputs
# ============================================================

print(f"Loading dataset from {DATASET_PATH}")
# The dataset is a JSON Lines file: each line becomes one DataFrame row.
df = pd.read_json(DATASET_PATH, lines=True, orient="records")

print(f"Loading baseline predictions from {BASELINE_PATH}")
with open(BASELINE_PATH, "r", encoding="utf-8") as f:
    baseline = json.load(f)

# Self-verification is only run on the small language model's (Llama3) answers.
slm_answers = baseline["llama3_pred"]
# Alternative kept for reference: slm_answers = baseline["gpt_pred"]

# Answers are matched to dataset rows by index later on, so both must have the same length.
assert len(slm_answers) == len(df), \
    f"length inconsistent: dataset={len(df)}, llama3_pred={len(slm_answers)}"

Loading dataset from dataset/narrative_qa_short.jsonl
Loading baseline predictions from outputs/baseline_output_narrative_qa_short.json


In [None]:
# ============================================================
# Initialize Llama3 pipeline
# ============================================================

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Model identifier handed to transformers.pipeline

print(f"Loading Llama3 model: {model_id}")
# Text-generation pipeline; bfloat16 is requested for the model weights via model_kwargs.
# NOTE(review): device_map="auto" presumably leaves device placement to the library — confirm on the target runtime.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Cap on generated tokens per call; llama3_verify uses it as the default for max_new_tokens.
MAX_NEW_TOKENS = 64

In [None]:
# ============================================================
# Construct self-verification prompt
# ============================================================
def build_verification_prompt(context: str, question: str, answer: str) -> str:
  """Build the few-shot self-verification prompt for one answer.

  The prompt starts with four worked examples (two judged Correct, two judged
  Incorrect), each laid out as Context / Question / AI Generated Answer /
  Instruction / Evaluation / Verification Decision and separated by "---".
  The case to judge follows in the same layout and stops right after
  "Evaluation:", leaving the evaluation and the decision to the model.

  Every line after the first carries the two-space indent of this source
  layout; that indent is part of the returned text.

  :param context: Passage the answer must be supported by.
  :param question: Question that was asked about the context.
  :param answer: Model-generated answer to be judged.
  :return: The complete prompt string.
  """
  # Static demonstrations: identical for every call.
  few_shot_examples = """Context: The manuscript, discovered in 1980 in a dusty attic, turned out to be a lost work of Shakespeare.

  Question: Whose lost work was discovered in a dusty attic in 1980?

  AI Generated Answer: Shakespeare

  Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

  Evaluation: The context specifically mentions that a lost work of Shakespeare was discovered in 1980 in a dusty attic.

  Verification Decision: The AI generated answer is Correct.

  ---

  Context: The celestial event, known as the Pink Moon, is unique to the month of April and has cultural significance in many indigenous tribes.

  Question: In which month does the celestial event, the Pink Moon, occur?

  AI Generated Answer: July

  Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

  Evaluation: The context clearly states that the Pink Moon is unique to the month of April.

  Verification Decision: The AI generated answer is Incorrect.

  ---

  Context: The Mona Lisa, housed in the Louvre Museum, is believed to be a portrait of Lisa Gherardini, painted by Leonardo da Vinci in the early 16th century.

  Question: Who is believed to have painted the Mona Lisa in the early 16th century?

  AI Generated Answer: Vincent van Gogh

  Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

  Evaluation: The context specifies that the Mona Lisa was painted by Leonardo da Vinci in the early 16th century.

  Verification Decision: The AI generated answer is Incorrect.

  ---

  Context: The planet Kepler-442b, located 1,100 light-years away, is one of the most Earth-like planets ever discovered, having a similar size and orbiting within its star's habitable zone.

  Question: How far away is the planet Kepler-442b?

  AI Generated Answer: 1,100 light-years

  Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

  Evaluation: The context states that Kepler-442b is located 1,100 light-years away.

  Verification Decision: The AI generated answer is Correct.

  ---"""

  # The case to judge, in the same layout as the demonstrations above.
  query_block = f"""

  Context: {context}

  Question: {question}

  AI Generated Answer: {answer}

  Instruction: Your task is to evaluate if the AI Generated Answer is correct, based on the provided context and question. Provide the judgement and reasoning for each case. Choose between Correct or Incorrect.

  Evaluation:"""

  return few_shot_examples + query_block



In [None]:
# ============================================================
# Call Llama3 for verification
# ============================================================

def llama3_verify(
    prompt: str,
    temperature: float = 0.05,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> str:
    """
    Ask the Llama3 pipeline for one verification response.

    The prompt is sent as the user turn of a two-message chat whose system
    turn instructs the model to emit a 'Verification Decision:' line.

    :param prompt: The verification prompt.
    :param temperature: The temperature of the generation.
    :param max_new_tokens: The maximum number of new tokens to generate.
    :return: The content of the last generated message, stripped of
             surrounding whitespace.
    """
    system_instruction = (
        "You are an AI assistant that verifies if an answer is correct. "
        "You MUST output exactly one line starting with "
        "'Verification Decision:' followed by 'Correct' or 'Incorrect'."
    )
    chat = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": prompt},
    ]

    generations = pipeline(
        chat,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )

    # The reply is read from the last entry of the first result's generated_text.
    reply = generations[0]["generated_text"][-1]["content"]
    return reply.strip()


In [None]:
# ============================================================
# Binary Output
# ============================================================

# Marker that introduces the model's verdict (matched case-insensitively).
_DECISION_MARKER = "verification decision"

# First explicit verdict word after the marker. Word boundaries keep "correct"
# from matching inside "incorrect"; the leftmost match wins, so "not correct"
# is consumed as a negative verdict before the bare "correct" inside it.
_VERDICT_RE = re.compile(r"\b(incorrect|not\s+correct|correct)\b")


def parse_verification_score(text: str) -> int:
    """Convert one verification response into a binary score.

    The verdict is read from the text that follows the last
    "Verification Decision" marker, not from the last line of the response.
    Trailing commentary such as "The correct answer is ..." therefore cannot
    flip an "Incorrect" decision into a positive score.

    :param text: Raw response produced by the verifier model.
    :return: 1 if the decision is "Correct"; 0 if it is "Incorrect" or
             "not correct", if no verdict word follows the marker, or if the
             marker is missing altogether.
    """
    lowered = text.lower()

    marker_pos = lowered.rfind(_DECISION_MARKER)
    if marker_pos == -1:
        # No explicit decision: treat the response as unverified.
        return 0

    after_marker = lowered[marker_pos + len(_DECISION_MARKER):]
    verdict = _VERDICT_RE.search(after_marker)
    if verdict is None:
        return 0

    return 1 if verdict.group(1) == "correct" else 0


In [None]:

# ============================================================
# Conduct K verification and return confidence score
# ============================================================

K = 4  # Number of self-verification samples drawn per prompt


def verify_with_confidence(prompt: str, k: int = K, temperature: float = 0.05) -> float:
    """Estimate verification confidence as the share of positive verdicts.

    Calls llama3_verify ``k`` times on the same prompt, converts each response
    to 0/1 with parse_verification_score, and returns the mean. All ``k``
    samples are drawn, so the score spans the full [0, 1] range.

    :param prompt: The verification prompt to evaluate.
    :param k: Number of verification samples; must be at least 1.
    :param temperature: Sampling temperature forwarded to llama3_verify.
    :return: Fraction of the ``k`` samples judged "Correct".
    :raises ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    scores = [
        parse_verification_score(llama3_verify(prompt, temperature=temperature))
        for _ in range(k)
    ]
    return sum(scores) / k

In [None]:
# ============================================================
# Execute self-verification in batch
# ============================================================

def run_self_verification(df: pd.DataFrame,
                          slm_answers,
                          max_workers: int = 4):
    """Compute a self-verification confidence for every dataset row.

    One prompt is built per row from its context ("base_ctx" when present,
    otherwise "context", otherwise an empty string), its question, and the
    answer found in ``slm_answers`` under the row's index. The prompts are then
    scored with verify_with_confidence (k=K) on a thread pool.

    :param df: Dataset rows; each needs a "question" field.
    :param slm_answers: Answers to verify, indexable by the DataFrame index.
    :param max_workers: Size of the thread pool.
    :return: List of confidence scores, in the same order as the rows.
    """
    prompts = [
        build_verification_prompt(
            row.get("base_ctx", row.get("context", "")),
            row["question"],
            slm_answers[idx],
        )
        for idx, row in df.iterrows()
    ]

    score_prompt = partial(verify_with_confidence, k=K)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Results are collected in the order the prompts were submitted.
        confidences = list(
            tqdm(
                pool.map(score_prompt, prompts),
                total=len(prompts),
                desc=f"Self-verifying (K={K}) with Llama3 on {DATASET_NAME}",
            )
        )

    return confidences

In [None]:

# ============================================================
# Save the results
# ============================================================

print(f"Running {K}-sample self-verification on {DATASET_NAME} ...")
# Score every baseline answer; the result holds one confidence value per dataset row.
slm_ver_confidence = run_self_verification(df, slm_answers, max_workers=2)

# Wrap the scores under a named key so the output file is self-describing.
output_obj = {
    "slm_ver_confidence": slm_ver_confidence
}

# Write indented UTF-8 JSON; ensure_ascii=False keeps non-ASCII characters unescaped.
with open(VERIF_OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(output_obj, f, ensure_ascii=False, indent=2)

print(f"Self-verification confidence saved to {VERIF_OUTPUT_PATH}")