In [1]:
# Notebook configuration: dataset locations, validation files, sampling count and credentials.
import os

main_folder = ""

# Plain strings: these paths contain no placeholders, so no f-string is needed.
train_folder   = "dataset/train/train"
val_folder     = "dataset/val/val"
sample_folder  = "dataset/sample/sample"

# One validation file per language code; the order here is the evaluation order.
val_languages = ["ar", "de", "en", "es", "fi", "fr", "hi", "it", "sv", "zh"]
val_filenames = [f"mushroom.{lang}-val.v2.jsonl" for lang in val_languages]

# Number of times the LLM is queried per datapoint (see the `range(N)` loop in `predict`).
N = 5

# Prefer the environment so the secret never has to be saved in the notebook;
# the placeholder is kept as a fallback for backward compatibility.
api_key = os.environ.get("OPENAI_API_KEY", "...")

In [2]:
import os
import wikipediaapi
import json
from yake import KeywordExtractor
from openai import OpenAI
import scorer
import pandas as pd


# Shared OpenAI client used by get_gpt_response; authenticated with the configured key.
client = OpenAI(api_key=api_key)


def load_json_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines instead of letting json.loads fail on them.
        return [json.loads(line) for line in file if line.strip()]


def get_gpt_response(message, model="gpt-4"):
    """Send a single user message to the chat completions API and return the reply text.

    Args:
        message: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to "gpt-4".

    Returns:
        str: Content of the first returned choice, or "" when the API returns
        no text content for that choice.

    Raises:
        Any exception raised by the module-level ``client`` (for example a rate
        limit error) is propagated unchanged.
    """
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": message}],
        model=model,
    )

    response_content = chat_completion.choices[0].message.content
    # The content field is optional in the response; normalise a missing value to
    # an empty string so callers can always treat the result as text.
    return response_content if response_content is not None else ""


def retrieve_context_from_wikipedia(keyword):
    """Look up ``keyword`` as an English Wikipedia page title and return its summary.

    Args:
        keyword: Page title to look up.

    Returns:
        The page's summary, or an empty string when the page does not exist.
    """
    client_options = {
        "language": 'en',
        "extract_format": wikipediaapi.ExtractFormat.WIKI,
        "user_agent": "MyPythonApp/1.0 (https://example.com; contact@example.com)",
    }
    article = wikipediaapi.Wikipedia(**client_options).page(keyword)

    return article.summary if article.exists() else ""


def merge_ranges(ranges):
    """Flatten several lists of (start, end) ranges and fuse the ones that touch.

    Args:
        ranges: Iterable of lists, each holding (start, end) pairs.

    Returns:
        list: Ranges ordered by start index, where any range that starts at or
        before the end of the previous one has been folded into it. Empty when
        there are no input ranges.
    """
    ordered = sorted(
        (span for group in ranges for span in group),
        key=lambda span: span[0],
    )

    merged = []
    for span in ordered:
        # A span starting at or before the previous end overlaps or abuts it.
        if merged and span[0] <= merged[-1][1]:
            previous = merged[-1]
            merged[-1] = (previous[0], max(previous[1], span[1]))
        else:
            merged.append(span)

    return merged
    

def _covered_length(spans, lower, upper):
    """Return how much of [lower, upper) is covered by the union of ``spans``."""
    clipped = sorted(
        (max(lower, start), min(upper, end))
        for (start, end) in spans
        if min(upper, end) > max(lower, start)
    )

    covered = 0
    cursor = lower  # everything left of the cursor has already been counted
    for start, end in clipped:
        if end > cursor:
            covered += end - max(start, cursor)
            cursor = end
    return covered


def compute_average_probability(merged_ranges, lists):
    """Score each merged range by the average fraction of it that each list covers.

    For every merged range, the covered fraction is computed per list and then
    averaged over all lists. Overlapping or duplicated ranges inside a single
    list are counted once, so every result stays within [0, 1].

    Args:
        merged_ranges: Sequence of (start, end) pairs to score.
        lists: Sequence of lists of (start, end) pairs, one list per prediction run.

    Returns:
        list: One value per merged range, in the same order. A zero-length
        merged range scores 0, and every range scores 0.0 when ``lists`` is empty.
    """
    if not lists:
        # Nothing to average over; avoid dividing by zero.
        return [0.0 for _ in merged_ranges]

    avg_probabilities = []

    for (ms, me) in merged_ranges:
        merged_length = me - ms
        total_probability = 0

        if merged_length > 0:
            for list_ranges in lists:
                total_probability += _covered_length(list_ranges, ms, me) / merged_length

        # Average across all lists
        avg_probabilities.append(total_probability / len(lists))

    return avg_probabilities

In [3]:
def predict(data, kw_extractor):
    """Predict hallucinated character spans in one datapoint's model output.

    Keywords are extracted from the question, a Wikipedia summary is fetched
    for each keyword, and the LLM is asked N times which tokens of the answer
    are not supported by that context. Every returned token is located in the
    answer text (first occurrence only) and the resulting spans are merged and
    scored across the N runs.

    Args:
        data: Mapping providing "model_input" (the question) and
            "model_output_text" (the answer to check).
        kw_extractor: Object whose ``extract_keywords(text)`` returns
            (keyword, score) pairs.

    Returns:
        tuple: ``(combined_context, all_responses, all_hall_tokens, soft_labels)``
        where ``combined_context`` is the newline-joined context text,
        ``all_responses`` holds the raw LLM reply of each run,
        ``all_hall_tokens`` holds each reply split on '|', and ``soft_labels``
        is a list of ``{'start', 'end', 'prob'}`` dicts, one per merged span.
    """
    question = data["model_input"]
    answer = data["model_output_text"]

    # Extract keywords
    keywords = kw_extractor.extract_keywords(question)
    print(question)
    print(answer)
    print(keywords)

    # Get contexts
    contexts = [retrieve_context_from_wikipedia(keyword) for keyword, _ in keywords]
    print('---------------------')

    # Prepare prompt
    combined_context = "\n".join(contexts)
    prompt = f"""Context: {combined_context}\nSentence: {answer}\n"""
    prompt = prompt + f"""Which tokens in the sentence are not supported by the context above?\n"""
    prompt = prompt + f"""Provide the answer in the form of a list of hallucination tokens sperated by '|' without accompanying texts."""

    # Ask LLM
    all_ranges = []
    all_responses = []
    all_hall_tokens = []
    for n in range(N):
        # Get response
        response = get_gpt_response(prompt)

        # Parse response
        hall_tokens = response.split("|")
        pred_ranges = []
        for hall_token in hall_tokens:
            hall_token = hall_token.strip()
            if not hall_token:
                # str.find("") returns 0, which would fabricate an empty span at
                # the start of the answer (e.g. for a trailing '|' or empty reply).
                continue

            pred_index = answer.find(hall_token)
            if pred_index >= 0:
                pred_ranges.append((pred_index, pred_index + len(hall_token)))

        all_ranges.append(pred_ranges)
        all_responses.append(response)
        all_hall_tokens.append(hall_tokens)
        print(f"{n} prediction:", pred_ranges)

    # Merge predicted ranges
    merged_ranges = merge_ranges(all_ranges)

    # Compute average probabilities for merged ranges
    average_probabilities = compute_average_probability(merged_ranges, all_ranges)

    # Display Results
    soft_labels = []
    for merged_range, prob in zip(merged_ranges, average_probabilities):
        print(f"Range {merged_range} Average Probability: {prob:.2f}")
        soft_labels.append({
            'start': merged_range[0],
            'end': merged_range[1],
            'prob': prob,
        })

    return combined_context, all_responses, all_hall_tokens, soft_labels


# Run prediction and evaluation over every validation file, writing per-file
# predictions and a cumulative summary CSV.

# Init keyword extractor
kw_extractor = KeywordExtractor()
avg_ious = []
avg_cors = []
filenames = []

# Create the output folder before any API call is made, so a missing directory
# cannot discard a whole file's worth of predictions at write time.
predictions_folder = "predictions"
os.makedirs(predictions_folder, exist_ok=True)

for filename in val_filenames:

    file_path = os.path.join(val_folder, filename)
    dataset = load_json_data(file_path)

    ref_dicts = []
    pred_dicts = []

    for data in dataset:

        answer = data["model_output_text"]
        combined_context, all_responses, all_hall_tokens, soft_labels = predict(data, kw_extractor)

        # Add soft and hard labels to the datapoint
        # (shallow copy, so the reference datapoint keeps its own labels)
        pred_data = {**data}
        pred_data["soft_labels"] = soft_labels
        pred_data["hard_labels"] = scorer.recompute_hard_labels(soft_labels)

        pred_data["text_len"] = len(answer)
        data["text_len"] = len(answer)

        pred_data["combined_context"] = combined_context
        pred_data["all_responses"] = all_responses
        pred_data["all_hall_tokens"] = all_hall_tokens

        pred_dicts.append(pred_data)
        ref_dicts.append(data)


        # Show prediction results
        print("++++++++++++++++++++++++++++++++++++++++++")
        print(answer)
        print(all_hall_tokens)

        print("GT:", data["hard_labels"])
        for label in data["hard_labels"]:
            print(answer[label[0]:label[1]])

        print("PD soft", soft_labels)
        print("PD hard", scorer.recompute_hard_labels(soft_labels))
        print("++++++++++++++++++++++++++++++++++++++++++")

        # Evaluate each iter (running averages over the datapoints seen so far)
        ious, cors = scorer.main(ref_dicts, pred_dicts)
        print("Avg IoU: ", sum(ious)/len(ious))
        print("Avg Cor:", sum(cors)/len(cors))
        print('')

    # Evaluate
    ious, cors = scorer.main(ref_dicts, pred_dicts)
    avg_ious.append(sum(ious)/len(ious))
    avg_cors.append(sum(cors)/len(cors))
    filenames.append(filename)
    print("Avg IoU: ", sum(ious)/len(ious))
    print("Avg Cor:", sum(cors)/len(cors))
    print('')

    # Rewritten after every file so partial results survive a later failure.
    eval_results = pd.DataFrame.from_dict({"filename": filenames, "iou": avg_ious, "cor":avg_cors})
    eval_results.to_csv("eval_results.csv", index=False)

    # NOTE: despite the .jsonl name, this writes one indented JSON array per file.
    pred_file_path = f"{predictions_folder}/{filename}"
    with open(pred_file_path, 'w', encoding='utf-8') as fp:
        json.dump(pred_dicts, fp, indent=4)


What did Petra van Staveren win a gold medal for?
Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.
[('Petra van Staveren', np.float64(0.00688415006041516)), ('van Staveren win', np.float64(0.015042304599106411)), ('Petra van', np.float64(0.04498862876540802)), ('van Staveren', np.float64(0.04498862876540802)), ('Staveren win', np.float64(0.04498862876540802)), ('win a gold', np.float64(0.09700399286574239)), ('gold medal', np.float64(0.09700399286574239)), ('Petra', np.float64(0.1447773057422032)), ('Staveren', np.float64(0.1447773057422032)), ('van', np.float64(0.29736558256021506)), ('win', np.float64(0.29736558256021506)), ('gold', np.float64(0.29736558256021506)), ('medal', np.float64(0.29736558256021506))]
---------------------


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}