# Hallucination Detection

In [248]:
import json
import os
from openai import OpenAI
from tqdm import tqdm
import requests
import httpx
from RefChecker.refchecker.extractor import extractor_prompts
# import RefChecker
import numpy as np
import spacy
from scorer import recompute_hard_labels
import glob
import re
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [249]:
from openai import OpenAI

client = OpenAI(
    api_key="")


## 1. Extracting Claims (Extractor) - Each claim is a merger of triple-structured knowledge.

In [250]:
LLM_TRIPLET_EXTRACTION_PROMPT_Q = extractor_prompts.LLM_TRIPLET_EXTRACTION_PROMPT_Q
LLM_Triplet_To_Claim_PROMPT_Q = extractor_prompts.LLM_Triplet_To_Claim_PROMPT_Q
LLM_CLAIM_EXTRACTION_PROMPT_Q = extractor_prompts.LLM_CLAIM_EXTRACTION_PROMPT_Q

In [251]:
def extract_triplets_to_claims(question, model_output_text):
    prompt = LLM_CLAIM_EXTRACTION_PROMPT_Q.format(q=question, r=model_output_text)

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI assistant who extracts claims."},
                {"role": "user", "content": prompt}
            ],
            model="gpt-3.5-turbo",
        )

        response_content = chat_completion.choices[0].message.content

        if not response_content:
            print(f"No response for the prompt: {prompt}")
            return []

        return response_content

    except Exception as e:
        print(f"OpenAI API Error: {e}")
        return []


## 2. Obtain Complete References

In [252]:
def get_reference_for_claim(claim):
    prompt = f"""
    Please expand, provide additional relevant factual information and verify the following claim:
    Claims: {claim}

    If the claim is accurate, return the original claim.
    If the claim is inaccurate or incomplete, return a corrected, more detailed statement.
    """

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI assistant verifying claims."},
                {"role": "user", "content": prompt}
            ],
            model="gpt-3.5-turbo",
        )

        if not chat_completion.choices or len(chat_completion.choices) == 0:
            print(f"No response for the prompt: {prompt}")
            return []

        response_content = chat_completion.choices[0].message.content

        if not response_content.strip():
            print(f"No content in the response for the prompt: {prompt}")
            return []

        return response_content

    except Exception as e:
        print(f"OpenAI API Error: {e}")
        return []


In [253]:
def extract_and_get_references(claims, context):
    references = []
    for claim in claims:
        verified_reference = get_reference_for_claim(claim)
        references.append(verified_reference)

    final_reference = " ".join(references) + " " + context

    return final_reference

## 3. Validate claims, `model_input`, `model_output_text`, and References (Checker)

The validation results should be mapped back to the `model_output_text`, marking hallucination positions and probabilities, and outputting them as `soft_labels`.

In [254]:
def extract_hallucination_positions(model_output_text, hallucination_results):
    # parse JSON data
    try:
        hallucination_results = json.loads(hallucination_results)
    except json.JSONDecodeError:
        print("Failed to decode JSON. Returning empty labels.")
        return {"soft_labels": []}

    soft_labels = []

    # find the position in the original text
    for result in hallucination_results:
        word = result['word']
        prob = result['prob']

        start = 0
        while True:
            start = model_output_text.find(word, start)
            if start == -1:
                break
            end = start + len(word)

            # save soft_labels
            soft_labels.append({
                "start": start,
                "end": end,
                "prob": prob
            })
            start = end

    return {"soft_labels": soft_labels}


In [255]:
def triplets_and_references_checker(claims, model_output_text, references, question):
    prompt = f"""
   Evaluate hallucinations in the model output text using the question, claims, and references.

    ### Question (Model Input)
    {question}

    ### Claims
    {claims}

    ### References
    {references}

    ### Model Output Text
    {model_output_text}

    ### Instructions
    1. Compare each claim with the provided references, question, and existing knowledge.
    2. Mark unsupported claims in `model output text` and return hallucinated words with character offsets and probabilities.
    3. Assign probabilities based on:
    0.7-1.0: Fully fabricated content.
    0.4-0.7: Partially incorrect content.
    0.1-0.4: Minor inaccuracies.
    4. Merge or adjust overlapping hallucinated words appropriately.
    5. Include hallucinated words even with low probabilities and return them strictly in JSON format:
    [
        {{"word": <example_word>, "prob": <probability>}},
     {{"word": <another_word>, "prob": <probability>}}
    ]

    """
    prompt = truncate_text_to_max_tokens(prompt, MAX_TOKENS // 3)

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system",
                 "content": "You are an AI assistant for hallucination detection."},
                {"role": "user", "content": prompt}
            ],
            model="gpt-3.5-turbo",
        )

        if not chat_completion.choices or len(chat_completion.choices) == 0:
            print("Error during hallucination detection: No response choices")
            return {"soft_labels": []}

        raw_labels = chat_completion.choices[0].message.content

        return extract_hallucination_positions(model_output_text, raw_labels)

    except Exception as e:
        print(f"OpenAI API Error: {e}")
        return {"soft_labels": []}


## Main Logic

In [256]:
def hallucination_detect(question, model_output_text, context):
    claims = extract_triplets_to_claims(question, model_output_text)
    references = extract_and_get_references(claims, context)
    hallucination_results = triplets_and_references_checker(claims, model_output_text, references, question)

    soft_labels = hallucination_results.get("soft_labels", [])
    hard_labels = recompute_hard_labels(soft_labels)

    return soft_labels, hard_labels

## Apply on My Dataset

In [257]:
import tiktoken
MAX_TOKENS = 16385
def truncate_text_to_max_tokens(text, max_tokens):
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    return tokenizer.decode(tokens)

def process_data(question, context, model_output_text, prompt):
    """
    Process data and ensure token limits are respected by truncating text proportionally.
    """
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

    # Calculate token count for each part
    question_tokens = len(tokenizer.encode(question))
    context_tokens = len(tokenizer.encode(context))
    model_output_tokens = len(tokenizer.encode(model_output_text))
    prompt_tokens = len(tokenizer.encode(prompt))

    total_tokens = question_tokens + context_tokens + model_output_tokens + prompt_tokens

    if total_tokens > MAX_TOKENS:
        print(f"Warning: Total tokens ({total_tokens}) exceed the maximum limit ({MAX_TOKENS}). Adjusting inputs.")

    # Calculate the amount to truncate
    excess_tokens = total_tokens - MAX_TOKENS

    # Proportional length adjustment for each part
    proportion_question = question_tokens / total_tokens
    proportion_context = context_tokens / total_tokens
    proportion_model_output = model_output_tokens / total_tokens
    proportion_prompt = prompt_tokens / total_tokens

    # Truncate each part proportionally
    question = truncate_text_to_max_tokens(question, int(question_tokens - proportion_question * excess_tokens))
    context = truncate_text_to_max_tokens(context, int(context_tokens - proportion_context * excess_tokens))
    model_output_text = truncate_text_to_max_tokens(model_output_text, int(model_output_tokens - proportion_model_output * excess_tokens))
    prompt = truncate_text_to_max_tokens(prompt, int(prompt_tokens - proportion_prompt * excess_tokens))

    # Update token count
    total_tokens = len(tokenizer.encode(question)) + len(tokenizer.encode(context)) + len(tokenizer.encode(model_output_text)) + len(tokenizer.encode(prompt))

    return question, context, model_output_text, prompt, total_tokens

In [258]:
# process the dataset and save the results
def process_dataset(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    input_files = glob.glob(os.path.join(input_folder, "*.jsonl"))

    with tqdm(total=len(input_files), desc="Processing Files", unit="file") as file_progress:
        for file_path in input_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = [json.loads(line) for line in f]

            output_data = []

            with tqdm(total=len(data), desc=f"Processing {os.path.basename(file_path)}", unit="entry",
                      leave=False) as entry_progress:
                for entry in data:
                    try:
                        # question = truncate_text(entry.get("model_input", ""), MAX_TOKENS // 3)
                        # model_output_text = truncate_text(entry.get("model_output_text", ""), MAX_TOKENS // 3)
                        # context = truncate_text(entry.get("context_googlecse", ""), MAX_TOKENS // 3)

                        question = truncate_text_to_max_tokens(entry.get("model_input", ""), MAX_TOKENS // 3)
                        model_output_text = truncate_text_to_max_tokens(entry.get("model_output_text", ""), MAX_TOKENS // 3)
                        context = truncate_text_to_max_tokens(entry.get("context_googlecse", ""), MAX_TOKENS // 3)


                        soft_labels, hard_labels = hallucination_detect(
                            question, model_output_text, context
                        )

                        question = entry.get("model_input", "")
                        model_output_text = entry.get("model_output_text", "")
                        context = entry.get("context_googlecse", "")

                        soft_labels, hard_labels = hallucination_detect(
                            question, model_output_text, context
                        )

                        output_entry = {
                            "id": entry.get("id"),
                            "lang": entry.get("lang"),
                            "model_input": entry.get("model_input"),
                            "model_output_text": entry.get("model_output_text"),
                            "model_id": entry.get("model_id"),
                            "soft_labels": soft_labels,
                            "hard_labels": hard_labels,
                            "model_output_logits": entry.get("model_output_logits"),
                            "model_output_tokens": entry.get("model_output_tokens")
                        }

                        output_data.append(output_entry)

                    except Exception as e:
                        print(f"OpenAI API Error: {e}")
                        # return []
                        continue
                    entry_progress.update(1)

            output_file = os.path.join(output_folder, os.path.basename(file_path))
            with open(output_file, 'w', encoding='utf-8') as f:
                for item in output_data:
                    f.write(json.dumps(item, ensure_ascii=False) + '\n')

            file_progress.update(1)
            print(f"Processed and saved: {output_file}")

In [259]:
import json
import re

def extract_hallucination_positions(model_output_text, hallucination_results):
    print("hallucination_results:", hallucination_results)

    json_matches = re.findall(r'\[\s*\{.*?\}\s*\]', hallucination_results, re.DOTALL)

    if not json_matches:
        print("No valid JSON found. Returning empty labels.")
        return {"soft_labels": []}

    try:
        hallucination_results = json.loads(json_matches[0])
    except json.JSONDecodeError as e:
        print(f"Failed to decode extracted JSON. Error: {e}. Returning empty labels.")
        return {"soft_labels": []}

    soft_labels = []

    # find the position in the original text
    for result in hallucination_results:
        word = result['word']
        prob = result['prob']

        start = 0
        while True:
            start = model_output_text.find(word, start)
            if start == -1:
                break
            end = start + len(word)

            # save soft_labels
            soft_labels.append({
                "start": start,
                "end": end,
                "prob": prob
            })
            start = end

    return {"soft_labels": soft_labels}

In [260]:
import os

def get_project_root():
    return os.path.dirname(os.getcwd())

input_folder = os.path.join(get_project_root(), "data/exknowledge/")
output_folder = os.path.join(get_project_root(), "data/detect_gpt/")

print("Input Folder Absolute Path:", input_folder)
process_dataset(input_folder, output_folder)


Input Folder Absolute Path: /Users/wt/SemEvalTask3/NCL-UoR/Jalynn/Method1/data/exknowledge/


Processing Files:   0%|          | 0/10 [00:00<?, ?file/s]
Processing mushroom.ar-val.v2.jsonl:   0%|          | 0/50 [00:00<?, ?entry/s][A

hallucination_results: ```json
[
    {"word": "كوريا الجنوبية", "prob": 0.9},
    {"word": "16", "prob": 0.8}
]
```  

The hallucinated words in the model output text are "كوريا الجنوبية" and "16" with estimated probabilities of 0.9 and 0.8, respectively. These words do not align with the provided claim and references, indicating a likelihood of fabricated or partially incorrect content.



Processing mushroom.ar-val.v2.jsonl:   2%|▏         | 1/50 [00:49<40:09, 49.18s/entry][A

hallucination_results: ```json
[
    {"word": "كوريا الجنوبية", "prob": 0.9}
]
```
hallucination_results: ```json
[
    {"word": "هالوسينغ", "prob": 0.9},
    {"word": "هيلبي", "prob": 0.8}
]
```



Processing mushroom.ar-val.v2.jsonl:   4%|▍         | 2/50 [01:58<48:56, 61.17s/entry][A

hallucination_results: ```json
[
    {"word": "هالوسينغ", "prob": 1.0},
    {"word": "هيلبي", "prob": 1.0}
]
```
hallucination_results: Based on the provided text, it seems that there was an error or incompleteness in the input related to evaluating hallucinations in the model output text. The claims section includes information about a sports stadium in Austria rather than any content that would indicate hallucinations. For accurate evaluation of hallucinations, a different text or relevant information related to hallucinations is needed. If you have a specific text or claim related to hallucinations that you would like to analyze, please provide that for further evaluation.
No valid JSON found. Returning empty labels.



Processing mushroom.ar-val.v2.jsonl:   6%|▌         | 3/50 [10:41<3:33:05, 272.02s/entry][A

hallucination_results: Based on the model output text, it appears that there is an issue with the claim provided as it is incomplete and lacks specific information or context. The text mainly consists of repetitive prompts asking for a claim to verify, without a clear statement to evaluate for hallucinations. To accurately detect hallucinations, a complete and coherent claim related to hallucinations or false perceptions is required. If you have specific claims or statements related to hallucinations that you would like me to analyze, please provide them for further evaluation.
No valid JSON found. Returning empty labels.
hallucination_results: ```json
[
    {"word": "تشنسلر", "prob": 0.8},
    {"word": "جون ثان", "prob": 0.9},
    {"word": "بَينيت", "prob": 0.7}
]
```  

The model output text hallucinated on the words "تشنسلر", "جون ثان", and "بَينيت" with the provided probabilities based on the references and question. These words are not supported by the claims and references provid


Processing mushroom.ar-val.v2.jsonl:   8%|▊         | 4/50 [12:17<2:35:19, 202.59s/entry][A

hallucination_results: ```json
[
    {"word": "الكافي", "prob": 0.8},
    {"word": "بَينيت", "prob": 0.9}
]
``` 

In the model output text, the words "الكافي" and "بَينيت" are hallucinated with probabilities indicating partially incorrect content and fully fabricated content, respectively. These words do not align with the provided references, question, and existing knowledge.
hallucination_results: ```json
[
    {"word": "هميلتن", "prob": 0.9},
    {"word": "أكادميكل", "prob": 0.8}
]
```



Processing mushroom.ar-val.v2.jsonl:  10%|█         | 5/50 [13:27<1:56:04, 154.78s/entry][A

hallucination_results: ```json
[
    {"word": "هميلتن", "prob": 0.9},
    {"word": "أكادميكل", "prob": 0.8}
]
```
hallucination_results: The model output text seems to have repeated a set of responses in the Arabic language related to not receiving a specific claim to verify. There is no direct evaluation of hallucinations within the provided information. If you have a specific claim or information related to hallucinations that you would like to evaluate, please provide it, and I will be happy to assist you.
No valid JSON found. Returning empty labels.



Processing mushroom.ar-val.v2.jsonl:  12%|█▏        | 6/50 [21:53<3:21:00, 274.10s/entry][A

hallucination_results: It appears that there was a system error in the input text. Could you please rephrase your question or provide the specific claim you would like me to assess for hallucinations? Once you provide the claim or context, I will be able to evaluate it based on the information available. Thank you.
No valid JSON found. Returning empty labels.
hallucination_results: ```json
[
    {"word": "مونغا", "prob": 0.9},
    {"word": "يُلعب", "prob": 0.7},
    {"word": "يلَعب", "prob": 0.8}
]
```  



Processing mushroom.ar-val.v2.jsonl:  14%|█▍        | 7/50 [25:14<2:59:22, 250.29s/entry][A

hallucination_results: ```json
[
    {"word": "إديو مونغا", "prob": 0.8},
    {"word": "يُعب", "prob": 0.7},
    {"word": "بونجا", "prob": 0.6}
]
```  

In the model output text, the words "إديو مونغا" hallucinated into "إديو مونغا" and "بونجا." The word "يُعب" is also a hallucination from the correct word "يَلعب." The probabilities assigned indicate the level of inaccuracy in the output text based on the provided claims and references.
hallucination_results: ```json
[
    {"word": "دافوس", "prob": 0.9}
]
``` 

Explanation:
- The model output text contains the hallucinated word "دافوس" which is not supported by the provided references. The probability assigned is 0.9 indicating that this word is a fully fabricated content in the output.



Processing mushroom.ar-val.v2.jsonl:  16%|█▌        | 8/50 [34:38<4:05:00, 350.01s/entry][A

hallucination_results: ```json
[
    {"word": "دافوس", "prob": 0.9}
]
```
hallucination_results: ```json
[
    {"word": "بطريرك", "prob": 0.8},
    {"word": "قسطنطينية", "prob": 0.9},
    {"word": "1334.", "prob": 0.6}
]
```



Processing mushroom.ar-val.v2.jsonl:  18%|█▊        | 9/50 [1:10:26<10:23:12, 912.01s/entry][A

hallucination_results: ```json
[]
```
No valid JSON found. Returning empty labels.
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 51: expected str instance, list found
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 48: expected str instance, list found
hallucination_results: ```json
[
    {"word": "جوائز", "prob": 0.9}
]
```  

In the model output text, the word "جوائز" (awards) is hallucinated as it was not mentioned in the claims or references provided. The probability assigned to this hallucinated word is 0.9, indicating it is a fully fabricated content.



Processing mushroom.ar-val.v2.jsonl:  20%|██        | 10/50 [2:29:18<23:14:20, 2091.50s/entry][A

hallucination_results: ```json
[
    {"word": "جوائز", "prob": 0.8},
    {"word": "هوگو", "prob": 0.9}
]
```  

The model output text contains hallucinated words "جوائز" and "هوگو" with probabilities of 0.8 and 0.9 respectively. These words were not supported by the references provided and the content is considered partly incorrect.
hallucination_results: ```json
[
    {"word": "الكاش", "prob": 0.9},
    {"word": "بيلي", "prob": 0.8}
]
``` 

Explanation:
- "الكاش" is hallucinated as the correct name is "كاش" based on the references.
- "بيلي" is hallucinated, and the correct name is "جوني" based on the provided references.



Processing mushroom.ar-val.v2.jsonl:  22%|██▏       | 11/50 [2:42:13<18:17:35, 1688.60s/entry][A

hallucination_results: ```json
[
    {"word": "الكاش", "prob": 0.9},
    {"word": "بيلي هيل", "prob": 0.8}
]
```  
hallucination_results: ```json
[
    {"word": "هوففليشيم", "prob": 0.9}
]
``` 

The word "هوففليشيم" in the model output text is hallucinated with a probability of 0.9 as it does not match the correct term "هوففلسهايم" based on the reference provided.



Processing mushroom.ar-val.v2.jsonl:  24%|██▍       | 12/50 [2:56:35<15:10:07, 1437.03s/entry][A

hallucination_results: ```json
[
    {"word": "هوففليشيم", "prob": 0.9},
    {"word": "تقع", "prob": 0.6}
]
```
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 59: expected str instance, list found
hallucination_results: ```json
[
    {"word": "منطقة", "prob": 0.8},
    {"word": "فالكاوو", "prob": 0.9},
    {"word": "مترًا", "prob": 0.7}
]
```  

The model output text contains hallucinated words "منطقة", "فالكاوو", and "مترًا" with associated probabilities as mentioned above. These words do not align with the original claims and references provided.
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 45: expected str instance, list found
hallucination_results: ```json
[
    {"word": "يوم الأحد", "prob": 0.9}
]
```



Processing mushroom.ar-val.v2.jsonl:  26%|██▌       | 13/50 [3:34:01<17:17:21, 1682.20s/entry][A

hallucination_results: ```json
[
    {"word": "يوم", "prob": 1.0},
    {"word": ".", "prob": 1.0}
]
```
OpenAI API Error: Request timed out.
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 32: expected str instance, list found
hallucination_results: ```json
[
    {"word": "PHP", "prob": 0.8},
    {"word": "مبرمج", "prob": 0.9},
    {"word": "ومؤسس", "prob": 0.7}
]
```  

In the model output text, the words "PHP", "مبرمج" (programmer), and "ومؤسس" (and founder) are hallucinated with probabilities indicating partially incorrect or fully fabricated content. These words were not supported by the references provided or the given claims and question.
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 129: expected str instance, list found
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 67: expected str instance, list found
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 19: expected str instance, list found
hall


Processing mushroom.ar-val.v2.jsonl:  28%|██▊       | 14/50 [5:01:46<27:38:31, 2764.20s/entry][A

hallucination_results: ```json
[
    {"word": "الرمزى", "prob": 0.8},
    {"word": "النظم", "prob": 0.9},
    {"word": "الخاصه", "prob": 0.7}
]
```  
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 49: expected str instance, list found
OpenAI API Error: Request timed out.
OpenAI API Error: sequence item 52: expected str instance, list found
hallucination_results: ```json
[
    {"word": "بونزونى", "prob": 0.9},
    {"word": "بونزا", "prob": 0.8},
    {"word": "بوينت ديل ريو", "prob": 0.7}
]
```



Processing Files:   0%|          | 0/10 [5:44:49<?, ?file/s]                                  [A


KeyboardInterrupt: 

## Evaluation

In [64]:
import pandas as pd
import json
import os
from scorer import load_jsonl_file_to_records, score_iou, score_cor, main, recompute_hard_labels
import argparse as ap
import ast

In [65]:
def evaluate_iou_and_cor(val_dir, detect_dir, output_file):
    """
    Evaluate IoU and Spearman correlation between the reference (val) and detected (detect) files.

    :param val_dir: Directory containing the ground truth files (e.g., data/val/val/)
    :param detect_dir: Directory containing the detected files (e.g., data/detect/)
    :param output_file: Path to save the evaluation results (optional)
    """
    # List all files in the validation directory
    val_files = os.listdir(val_dir)
    detect_files = os.listdir(detect_dir)

    # Ensure that we are comparing the same files (same lang)
    for val_file in val_files:
        # Skip non-JSONL files
        if not val_file.endswith('.jsonl'):
            continue

        # Check if the corresponding detect file exists
        detect_file_path = os.path.join(detect_dir, val_file)

        if not os.path.exists(detect_file_path):
            print(f"Warning: {detect_file_path} not found, skipping.")
            continue

        # Load ground truth (val) and detected (detect) data
        ref_dicts = load_jsonl_file_to_records(os.path.join(val_dir, val_file))
        pred_dicts = load_jsonl_file_to_records(detect_file_path)

        # Calculate IoU and Spearman correlation
        try:
            ious, cors = main(ref_dicts, pred_dicts)
        except IndexError as e:
            print(f"IndexError occurred for file: {val_file}, skipping this file. Error: {e}")
            continue

        # Print or save the results
        print(f"Results for {val_file}:")
        print(f"  Mean IoU: {ious.mean():.8f}")
        print(f"  Mean Spearman Correlation: {cors.mean():.8f}")

        # Optionally, save the results to a file
        if output_file:
            with open(output_file, 'a', encoding='utf-8') as f:
                f.write(f"Results for {val_file}:\n")
                f.write(f"  Mean IoU: {ious.mean():.8f}\n")
                f.write(f"  Mean Spearman Correlation: {cors.mean():.8f}\n\n")


val_dir = 'data/val/val/'
detect_dir = 'data/detect_gpt/'
output_file = 'evaluation_results_gpt.txt'
evaluate_iou_and_cor(val_dir, detect_dir, output_file)

FileNotFoundError: [Errno 2] No such file or directory: 'data/val/val/'