In [12]:
!pip install networkx matplotlib pyvis openai python-dotenv numpy torch transformers tqdm huggingface-hub



In [13]:
from typing import List, Optional
import json
from openai import OpenAI
import numpy as np
import json
import networkx as nx
import matplotlib.pyplot as plt

from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# The OpenAI API key is read from the OPENAI_API_KEY environment variable, so it never
# has to be written into the notebook itself: put it in .env or export it in your shell.
# - If you don't have an API key, you can get one by signing up at https://platform.openai.com/signup
API_KEY = os.getenv("OPENAI_API_KEY")

# Module-level OpenAI client; `ask_question` sends its requests through it.
client = OpenAI(api_key=API_KEY)



def _extract_json_payload(raw: str) -> str:
    """Return the JSON text embedded in a model reply.

    The system prompt asks for a ```json fenced block while the user prompt asks
    for <json></json> tags, so a reply may arrive in either wrapper, in both, or
    bare. Every wrapper is peeled off and surrounding whitespace is ignored.
    """
    import re  # local import keeps this helper self-contained inside the cell

    payload = raw.strip()
    tagged = re.search(r"<json>(.*?)</json>", payload, flags=re.DOTALL | re.IGNORECASE)
    if tagged:
        payload = tagged.group(1).strip()
    fenced = re.search(r"```(?:json)?(.*?)```", payload, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        payload = fenced.group(1).strip()
    # A bare leading "json" language marker (no fence) is also tolerated.
    if payload.lower().startswith("json"):
        payload = payload[4:].strip()
    return payload


def _is_complete_triple(candidate) -> bool:
    """True when `candidate` is a dict with non-empty subject, verb and object."""
    return isinstance(candidate, dict) and all(
        candidate.get(field) not in (None, "") for field in ("subject", "verb", "object")
    )


def extract_information(text: str) -> list[dict]:
    """Extract subject-verb-object triples from `text` with an OpenAI chat model.

    Args:
        text: Raw text to turn into knowledge-graph triples.

    Returns:
        A list of dicts, each with non-empty "subject", "verb" and "object"
        entries. A single JSON object in the reply is wrapped into a one-element
        list. Entries that are not complete triples are dropped, and an empty
        list is returned when the API sends back no choices.

    Raises:
        json.JSONDecodeError: If the reply does not contain parseable JSON.
    """

    client = OpenAI()

    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # Use an appropriate model
        messages=[
            {"role": "system", "content": """You are an expert at extracting information in structured formats to build a knowledge graph.

    Step 1 - Entity detection: Identify all entities in the raw text. Make sure not to miss any out. Entities should be basic and simple, they are akin to Wikipedia nodes.

    Step 2 - Coreference resolution: Find all expressions in the text that refer to the same entity. Make sure entities are not duplicated. In particular do not include entities that are more specific versions themselves, e.g. "a detailed view of jupiter's atmosphere" and "jupiter's atmosphere", only include the most specific version of the entity.

    Step 3 - Relation extraction: Identify semantic relationships between the entities you have identified.

    Format your response as a JSON array of objects, where each object must have exactly these three fields:
    - "subject": The first entity
    - "verb": The relationship between entities
    - "object": The second entity

    Important Tips:
    1. Make sure all information is included in the knowledge graph.
    2. Each triple must have exactly three non-empty strings.
    3. Do not split up related information into separate triples because this could change the meaning.
    4. Before adding a triple to the knowledge graph, check if concatenating subject+verb+object makes sense as a sentence. If not, discard it.
    5. Keep entities and relationships concise but meaningful.
    6. Convert pronouns to their proper noun references when possible.
    7. Keep everything lowercase and in present tense when appropriate.
    8. The output should be a JSON array of objects, each object containing the fields "subject", "verb", and "object", with the starting and ending tags ```json and ``` respectively.
    """},
            {"role": "user", "content": f"Use the given format to extract information from the following input: <input>{text}</input>. Skip the preamble and output the result as a JSON array within <json></json> tags."}
        ]
    )

    if not completion.choices:
        print("No response received.")
        return []

    # `content` can be None; an empty payload then fails in json.loads with the
    # same JSONDecodeError callers already handle for unparseable replies.
    content = completion.choices[0].message.content
    parsed = json.loads(_extract_json_payload(content or ""))

    # A lone JSON object is treated as a one-triple result.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []

    # Downstream code indexes entry['subject'] / ['verb'] / ['object'] directly,
    # so incomplete entries are removed here instead of failing later.
    return [triple for triple in parsed if _is_complete_triple(triple)]




In [14]:
from pyvis.network import Network
import json

def visualize_json(json_data):
    """Draw subject-verb-object triples as a pyvis network.

    `json_data` may be a JSON string or an already-parsed iterable of mappings
    carrying 'subject', 'verb' and 'object' keys. Each subject is added as a
    'skyblue' node, each object as a 'lightgreen' node, and the verb becomes the
    title of the edge joining them. The result is written to
    "knowledge_graph.html" through `show(..., notebook=False)`.
    """
    graph = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")

    triples = json.loads(json_data) if isinstance(json_data, str) else json_data

    for triple in triples:
        head = triple.get('subject')
        relation = triple.get('verb')
        tail = triple.get('object')  # called `tail` so the built-in `object` is not shadowed

        graph.add_node(head, title=head, color='skyblue')
        graph.add_node(tail, title=tail, color='lightgreen')
        graph.add_edge(head, tail, title=relation)

    graph.show("knowledge_graph.html", notebook=False)



# Example text to analyze
text = "According to my knowledge, Mihai Surdeanu is an Associate Professor in the departments of Cognitive Science - GIDP, Computer Science, and BIO5 Institute at the University of Arizona. He earned his Ph.D. in Computer Science from Southern Methodist University in 2001 and has over 15 years of experience in building systems driven by natural language processing (NLP) and machine learning. Surdeanu has published over 80 peer-reviewed articles and has been a leader or member of teams that ranked in the top three at seven highly competitive international evaluations of end-user NLP systems such as question answering and information extraction. His work has been funded by several government organizations and private foundations. Surdeanu's research focuses on NLP and machine learning."
# Extract subject-verb-object triples from the text (this sends a request to the OpenAI chat API)
json_data = extract_information(text)
print(json_data)

# Visualize the extracted triples as a knowledge graph (written to knowledge_graph.html)
visualize_json(json_data)


[{'subject': 'mihai surdeanu', 'verb': 'is', 'object': 'associate professor'}, {'subject': 'mihai surdeanu', 'verb': 'works in', 'object': 'cognitive science - gidp'}, {'subject': 'mihai surdeanu', 'verb': 'works in', 'object': 'computer science'}, {'subject': 'mihai surdeanu', 'verb': 'works in', 'object': 'bio5 institute'}, {'subject': 'mihai surdeanu', 'verb': 'works at', 'object': 'university of arizona'}, {'subject': 'mihai surdeanu', 'verb': 'earned', 'object': 'ph.d. in computer science'}, {'subject': 'ph.d. in computer science', 'verb': 'is from', 'object': 'southern methodist university'}, {'subject': 'mihai surdeanu', 'verb': 'has', 'object': 'over 15 years of experience'}, {'subject': 'mihai surdeanu', 'verb': 'focuses on', 'object': 'natural language processing'}, {'subject': 'mihai surdeanu', 'verb': 'focuses on', 'object': 'machine learning'}, {'subject': 'mihai surdeanu', 'verb': 'has published', 'object': 'over 80 peer-reviewed articles'}, {'subject': 'mihai surdeanu', 

In [15]:
from pyvis.network import Network
import json

def visualize_comparison(source_json_data, output_json_data, display='both'):
    """Draw the source and output knowledge graphs in one pyvis network.

    Args:
        source_json_data: Triples extracted from the source, as a JSON string or
            an already-parsed list of dicts with 'subject', 'verb' and 'object'.
        output_json_data: Triples extracted from the output, same format.
        display: 'source', 'output' or 'both' - which graph(s) to draw.

    Nodes that appear in both graphs are coloured 'yellow'; the remaining nodes
    are 'blue' when added from the source graph and 'green' when added from the
    output graph. The graph is written to "knowledge_graph_comparison.html".

    Raises:
        ValueError: If `display` is not one of the three supported modes
            (previously an unknown mode silently produced an empty graph).
    """
    valid_modes = ('source', 'output', 'both')
    if display not in valid_modes:
        raise ValueError(f"display must be one of {valid_modes}, got {display!r}")

    net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")

    # Accept either JSON text or parsed triples.
    source_data = json.loads(source_json_data) if isinstance(source_json_data, str) else source_json_data
    output_data = json.loads(output_json_data) if isinstance(output_json_data, str) else output_json_data

    def node_names(triples):
        """Every subject and object mentioned by `triples`."""
        return {triple[role] for triple in triples for role in ('subject', 'object')}

    # Common nodes are computed from both graphs regardless of `display`,
    # so shared entities are highlighted even when only one graph is drawn.
    common_nodes = node_names(source_data) & node_names(output_data)

    def add_nodes_edges(data, node_color):
        """Add each triple's two nodes and its verb-labelled edge to the network."""
        for entry in data:
            # `tail` rather than `object`, to avoid shadowing the built-in.
            head, verb, tail = entry['subject'], entry['verb'], entry['object']
            head_color = 'yellow' if head in common_nodes else node_color
            tail_color = 'yellow' if tail in common_nodes else node_color
            net.add_node(head, title=head, color=head_color)
            net.add_node(tail, title=tail, color=tail_color)
            net.add_edge(head, tail, title=verb)

    if display in ('source', 'both'):
        add_nodes_edges(source_data, 'blue')

    if display in ('output', 'both'):
        add_nodes_edges(output_data, 'green')

    # Save and show the graph
    net.show("knowledge_graph_comparison.html", notebook=False)

# Example JSON data for source and output.
# Both are serialized to JSON strings here, so this call exercises the
# string-input path of visualize_comparison (it parses strings with json.loads).
source_json = json.dumps([
    {"subject": "Fox", "verb": "jumps", "object": "Dog"},
    {"subject": "Dog", "verb": "barks", "object": "loudly"}
])

output_json = json.dumps([
    {"subject": "Fox", "verb": "runs", "object": "fast"},
    {"subject": "Dog", "verb": "barks", "object": "quietly"}
])

# Visualize comparison allowing the user to choose which parts to display
# ('source', 'output' or 'both'); the graph is written to knowledge_graph_comparison.html
visualize_comparison(source_json, output_json, display='both')


knowledge_graph_comparison.html


In [16]:
# Generate source text
source_text = "There was once a boy named Jack who lived in a small village. Jack was known for his bravery and kindness. One day, Jack decided to climb a giant beanstalk that had grown in his backyard. At the top of the beanstalk, he found a castle inhabited by a giant. The giant had a magical hen that laid golden eggs. Jack managed to steal the hen and escape from the giant. The giant chased Jack down the beanstalk, but Jack managed to cut it down, causing the giant to fall to his doom. Jack returned to his village with the magical hen and lived happily ever after."

# Generate output text with subtle hallucinations
# (for example, it names the town "Kingman, Arizona", which the source text never mentions)
output_text = "In the town of Kingman, Arizona, there lived a courageous and kind-hearted boy named Jack. One adventurous day, he ascended a massive beanstalk sprouting in his backyard. At its peak, Jack discovered a castle, home to a formidable giant. This giant possessed a hen with the ability to produce golden eggs. Jack cleverly captured the hen and fled from the giant's clutches. In a thrilling chase, the giant pursued Jack along the beanstalk. In a daring move, Jack chopped down the beanstalk, leading to the giant's tragic fall and demise. Triumphantly, Jack returned to his village, where he and the magical hen enjoyed a prosperous and joyful life."

# Extract triples from both texts (one OpenAI request each).
# Note: this rebinds source_json / output_json, which an earlier cell set to JSON strings.
source_json = extract_information(source_text)
output_json = extract_information(output_text)

print("Source JSON:", source_json)
print("Output JSON:", output_json)

# Compare the graphs
visualize_comparison(source_json, output_json, display='both')


Source JSON: [{'subject': 'jack', 'verb': 'lives in', 'object': 'a small village'}, {'subject': 'jack', 'verb': 'is known for', 'object': 'his bravery and kindness'}, {'subject': 'jack', 'verb': 'decides to climb', 'object': 'a giant beanstalk'}, {'subject': 'the giant beanstalk', 'verb': 'has grown in', 'object': 'his backyard'}, {'subject': 'jack', 'verb': 'finds', 'object': 'a castle'}, {'subject': 'the castle', 'verb': 'is inhabited by', 'object': 'a giant'}, {'subject': 'the giant', 'verb': 'has', 'object': 'a magical hen'}, {'subject': 'the magical hen', 'verb': 'lays', 'object': 'golden eggs'}, {'subject': 'jack', 'verb': 'manages to steal', 'object': 'the hen'}, {'subject': 'the giant', 'verb': 'chases', 'object': 'jack'}, {'subject': 'jack', 'verb': 'cuts down', 'object': 'the beanstalk'}, {'subject': 'the giant', 'verb': 'falls to', 'object': 'his doom'}, {'subject': 'jack', 'verb': 'returns to', 'object': 'his village'}, {'subject': 'jack', 'verb': 'lives', 'object': 'happil

In [17]:
# RAG prompt question-answering examples



def ask_question(question: str, context: str, model="gpt-4-turbo") -> Optional[str]:
    """Answer `question` from `context` using the module-level OpenAI `client`.

    The instructions and the context are sent as two system messages and the
    question as the user message.

    Args:
        question: The question to answer.
        context: Text the model is instructed to base its answer on.
        model: Chat model name passed to the API.

    Returns:
        The model's answer text, or None (after printing a notice) when the API
        returns no choices or a choice without text content.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an assistant specialized in answering questions based on a given context. Your task is to provide accurate and concise answers to the questions asked. If the answer is not present in the context, you should respond with 'The answer is not present in the given context.'"},
            {"role": "system", "content": "Context: " + context},
            {"role": "user", "content": f"Question: {question}"}
        ]
    )

    if not completion.choices:
        print("No response received.")
        return None

    content = completion.choices[0].message.content
    # A missing content must not be stringified into the literal answer "None".
    if content is None:
        print("No response received.")
        return None
    return str(content)
    
# Example context and questions
context = """

In the town of Quillhaven—a community renowned for its historic libraries and scholarly traditions—a rare astronomical event was predicted to occur for the first time in over a century. The event, a total solar eclipse, was shrouded in both scientific intrigue and centuries-old folklore. Local legends, drawing on influences from ancient Greek philosophy to indigenous spiritual practices, suggested that eclipses held the power to influence human thought and communal harmony.

At the heart of the community’s preparations was Dr. Lila Montrose, a respected astrophysicist whose career had been dedicated to unraveling the mysteries of cosmic events. Dr. Montrose saw the eclipse as an opportunity to blend modern scientific inquiry with the town’s rich cultural heritage. She proposed a series of public lectures and interactive exhibitions designed to educate residents on the mechanics of the eclipse, while also acknowledging its historical and psychological significance. Her balanced approach aimed to respect both empirical evidence and the symbolic narratives that had long captivated the community.

Meanwhile, Mr. Edmund Blackwell, the town’s dedicated historian, uncovered an ancient manuscript in the dusty archives of Quillhaven’s old library. The manuscript, penned in a mix of archaic English and Latin, detailed elaborate rituals and communal activities that had been performed during similar eclipses in bygone eras. Blackwell argued that these practices were more than mere superstition—they were intrinsic to the town’s identity and had once fostered unity and prosperity. His findings ignited a fervent debate about whether these ancient rituals should be revived as a way to reconnect with Quillhaven’s storied past.

This emerging debate quickly divided the community into two factions. The progressive wing, led by Dr. Montrose, advocated for a balanced, modern approach that combined scientific exploration with cultural respect. In contrast, the conservative faction, inspired by Mr. Blackwell’s manuscript, pushed for a return to the traditional rituals that they believed had been the cornerstone of the town’s former unity and success. As the day of the eclipse drew near, these conflicting views not only highlighted an ideological rift but also symbolized a broader struggle between embracing modernity and preserving historical identity.

In a final effort to bridge these divergent perspectives, town leaders organized a public forum. The forum featured spirited debates and passionate speeches, ultimately concluding with a proposal for collaboration. Both factions agreed that the eclipse could serve as a catalyst for dialogue, urging the community to forge a common path that honored both its scientific curiosity and its deep-rooted traditions.

"""



question = "Explain how the contrasting views of Dr. Lila Montrose and Mr. Edmund Blackwell regarding the upcoming eclipse illustrate the broader tension between modern scientific inquiry and traditional cultural practices. What solution did the town of Quillhaven ultimately propose to address this conflict?"

# Get the answer to the question
# NOTE(review): ask_question returns None when the API sends back no choices;
# that case is not handled before `answer` is passed on below.
answer = ask_question(question, context)
print("Answer:", answer)

# Extract triples from the reference context and from the model's answer.
source_json = extract_information(context)
output_json = extract_information(answer)

print("Source JSON:", source_json)
print("Output JSON:", output_json)

# Separate client for embeddings: its key and base URL come from the
# EMBEDDING_API_KEY and EMBEDDING_BASE_URL environment variables.
client_embed = OpenAI(api_key=os.getenv("EMBEDDING_API_KEY"), base_url=os.getenv("EMBEDDING_BASE_URL"))
# Find semantically similar claims between source and output JSON: every triple is turned
# back into "subject verb object" text, embedded, and for each claim in output_json the
# top_k most similar claims in source_json are kept.
def find_similar_claims(source_json, output_json, top_k: int = 3, model: str = "text-embedding-3-small"):
    """Match every output claim with its most similar source claims.

    Args:
        source_json: Triples (dicts with 'subject', 'verb', 'object') from the source.
        output_json: Triples from the text being checked.
        top_k: How many source claims to keep per output claim (default 3).
        model: Embedding model name passed to `client_embed`.

    Returns:
        A dict mapping each output claim, rendered as "subject verb object", to a
        list of up to `top_k` `(source_triple, cosine_similarity)` pairs ordered
        from most to least similar. With no output claims the dict is empty; with
        no source claims every output claim maps to an empty list. In both cases
        the embeddings API is not called.

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    def as_sentence(triple):
        return f"{triple['subject']} {triple['verb']} {triple['object']}"

    source_entries = list(source_json)
    output_claims = [as_sentence(triple) for triple in output_json]

    # Empty inputs are answered locally instead of being sent to the embeddings endpoint.
    if not output_claims:
        return {}
    if not source_entries:
        return {claim: [] for claim in output_claims}

    source_claims = [as_sentence(triple) for triple in source_entries]

    # create embeddings for source and output graphs
    source_response = client_embed.embeddings.create(input=source_claims, model=model)
    output_response = client_embed.embeddings.create(input=output_claims, model=model)

    # One row per claim; convert once rather than inside the pairwise loop.
    source_matrix = np.asarray([item.embedding for item in source_response.data], dtype=float)
    output_matrix = np.asarray([item.embedding for item in output_response.data], dtype=float)

    # Cosine similarity of every output claim (rows) against every source claim (columns).
    norm_products = np.outer(
        np.linalg.norm(output_matrix, axis=1),
        np.linalg.norm(source_matrix, axis=1),
    )
    similarity_matrix = (output_matrix @ source_matrix.T) / norm_products

    similar_claims = {}
    for claim, row in zip(output_claims, similarity_matrix):
        # A stable sort on the negated scores keeps source order among ties.
        best = np.argsort(-row, kind="stable")[:top_k]
        similar_claims[claim] = [(source_entries[j], row[j]) for j in best]

    return similar_claims

similar_claims = find_similar_claims(source_json, output_json)

# Report, for every claim found in the answer, the source claims ranked closest to it.
print("Similar Claims:")
for claim_text in similar_claims:
    print(f"Output Claim: {claim_text}")
    for matched_triple, score in similar_claims[claim_text]:
        print(f"Similar Source Claim: {matched_triple} (Similarity: {score})")
    print()


Answer: The contrasting views of Dr. Lila Montrose and Mr. Edmund Blackwell regarding the upcoming eclipse in Quillhaven illustrate the broader tension between modern scientific inquiry and traditional cultural practices. Dr. Montrose believed in a balanced approach that integrated scientific understanding with a respect for cultural heritage. She focused on educating the community about the scientific mechanics of the eclipse while appreciating its historical and symbolic significance. On the other hand, Mr. Blackwell, who uncovered ancient rituals in a manuscript, advocated for a revival of these traditions, arguing that they were essential to the town's identity and past unity. His stance highlighted a preference for honoring and preserving historical practices over modern scientific methods.

To address this conflict, the town of Quillhaven proposed a collaborative solution during a public forum. Both factions—those supporting a modern scientific approach and those favoring traditi

In [18]:
from huggingface_hub import login  
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Authenticate with the Hugging Face Hub using the HUGGINGFACE_TOKEN environment variable.
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# Generate pairs from similar claims: each pair is
# (";"-joined matched source claims, output claim text).
claim_pairs = []
for output_entry, source_entries in similar_claims.items():
    source_claims = [f"{source_entry['subject']} {source_entry['verb']} {source_entry['object']}" for source_entry, _ in source_entries]
    source_claims = ";".join(source_claims)
    claim_pairs.append((source_claims, output_entry))

for pair in claim_pairs:
    # Print each pair: the joined source claims as "Input", the output claim as "Output"
    print("Input:", pair[0])
    print("Output:", pair[1])
    print()

# Step 1: Load the model
# NOTE(review): `model` is not passed to the pipeline below, which is given the model id
# string instead; this separate load looks redundant - confirm before removing.
model = AutoModelForSequenceClassification.from_pretrained('vectara/hallucination_evaluation_model', trust_remote_code=True)

# Step 2: Evaluate the hallucination model
# Wrap every (premise, hypothesis) pair in the prompt template
prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
input_pairs = [prompt.format(text1=pair[0], text2=pair[1]) for pair in claim_pairs]
print(input_pairs[0])

# Use text-classification pipeline to predict
classifier = pipeline(
            "text-classification",
            model='vectara/hallucination_evaluation_model',
            tokenizer=AutoTokenizer.from_pretrained('google/flan-t5-base'),
            trust_remote_code=True
        )
# top_k=None asks the pipeline for the score of every label, not only the best one
full_scores = classifier(input_pairs, top_k=None) # List[List[Dict[str, float]]]

# Optional: Extract the scores for the 'consistent' label (one per input pair)
simple_scores = [score_dict['score'] for score_for_both_labels in full_scores for score_dict in score_for_both_labels if score_dict['label'] == 'consistent']

print(simple_scores)

Input: dr. lila montrose aims to respect empirical evidence;dr. lila montrose is a respected astrophysicist;dr. lila montrose aims to respect symbolic narratives
Output: dr. lila montrose believes in a balanced approach

Input: dr. lila montrose aims to respect symbolic narratives;dr. lila montrose aims to respect empirical evidence;dr. lila montrose sees the eclipse as an opportunity to blend cultural heritage
Output: dr. lila montrose integrates scientific understanding with cultural heritage

Input: dr. lila montrose sees the eclipse as an opportunity to blend modern scientific inquiry;dr. lila montrose sees the eclipse as an opportunity to blend cultural heritage;dr. lila montrose has dedicated her career to unraveling mysteries of cosmic events
Output: dr. lila montrose focuses on educating the community about the scientific mechanics of the eclipse

Input: dr. lila montrose sees the eclipse as an opportunity to blend cultural heritage;dr. lila montrose sees the eclipse as an oppo

You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


<pad> Determine if the hypothesis is true given the premise?

Premise: dr. lila montrose aims to respect empirical evidence;dr. lila montrose is a respected astrophysicist;dr. lila montrose aims to respect symbolic narratives

Hypothesis: dr. lila montrose believes in a balanced approach


You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
Device set to use mps:0


[0.012414200231432915, 0.03012670949101448, 0.014904123730957508, 0.5892153382301331, 0.951510488986969, 0.9311442971229553, 0.20835311710834503, 0.5713628530502319, 0.9764062762260437, 0.8465678691864014, 0.8210894465446472, 0.08822452276945114]


In [57]:
import pandas as pd

# Read the test parquet file of the mteb/summeval dataset through the hf:// filesystem.
# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_parquet("hf://datasets/mteb/summeval/data/test-00000-of-00001-35901af5f6649399.parquet")
# Bare expression: displays the frame as the cell output
df

Unnamed: 0,machine_summaries,human_summaries,relevance,coherence,fluency,consistency,text,id
0,"[donald sterling , nba team last year . sterli...",[V. Stiviano must pay back $2.6 million in gif...,"[1.6666666666666667, 1.6666666666666667, 2.333...","[1.3333333333333333, 3.0, 1.0, 2.6666666666666...","[1.0, 4.666666666666667, 4.333333333333333, 4....","[1.0, 2.3333333333333335, 4.666666666666667, 5...",(CNN)Donald Sterling's racist remarks cost him...,cnn-test-404f859482d47c127868964a9a39d1a7645dd2e9
1,[north pacific gray whale has earned a spot in...,"[The whale, Varvara, swam a round trip from Ru...","[2.3333333333333335, 4.666666666666667, 3.6666...","[1.3333333333333333, 4.666666666666667, 3.6666...","[1.0, 5.0, 4.666666666666667, 3.66666666666666...","[1.3333333333333333, 5.0, 5.0, 4.3333333333333...",(CNN)A North Pacific gray whale has earned a s...,cnn-test-4761dc6d8bdf56b9ada97104113dd1bcf4aed3f1
2,[russian fighter jet intercepted a u.s. reconn...,[The incident occurred on April 7 north of Pol...,"[4.0, 4.0, 4.0, 3.3333333333333335, 3.33333333...","[3.3333333333333335, 4.333333333333333, 1.6666...","[3.6666666666666665, 4.333333333333333, 5.0, 4...","[5.0, 5.0, 4.666666666666667, 5.0, 5.0, 5.0, 5...",(CNN)After a Russian fighter jet intercepted a...,cnn-test-5139ccfabee55ddb83e7937f5802c0a67aee8975
3,[michael barnett captured the fire on intersta...,[Country band Lady Antebellum's bus caught fir...,"[2.0, 3.0, 2.6666666666666665, 3.3333333333333...","[2.0, 3.0, 2.6666666666666665, 3.3333333333333...","[2.6666666666666665, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[2.3333333333333335, 5.0, 5.0, 5.0, 5.0, 5.0, ...",(CNN)Lady Antebellum singer Hillary Scott's to...,cnn-test-88c2481234e763c9bbc68d0ab1be1d2375c1349a
4,[deep reddish color caught seattle native tim ...,[Smoke from massive fires in Siberia created f...,"[1.6666666666666667, 3.6666666666666665, 3.333...","[1.6666666666666667, 3.6666666666666665, 1.666...","[5.0, 5.0, 5.0, 5.0, 4.666666666666667, 5.0, 5...","[2.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",(CNN)A fiery sunset greeted people in Washingt...,cnn-test-a02e362c5b8f049848ce718b37b96117485461cf
...,...,...,...,...,...,...,...,...
95,[chelsea have made an offer for fc tokyo 's du...,[Naoki Ogane says that Chelsea have made an of...,"[3.0, 4.333333333333333, 4.0, 4.0, 4.333333333...","[2.0, 4.0, 4.333333333333333, 3.66666666666666...","[3.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[2.0, 5.0, 5.0, 4.333333333333333, 5.0, 5.0, 5...",Chelsea have made an offer for FC Tokyo's 22-y...,dm-test-f26d8400ae49b90d109c165d0f44b8f6ca253c08
96,[christopher lawler said he was pinned to a ch...,[Christopher Lawler claims he was pinned to a ...,"[5.0, 4.333333333333333, 4.333333333333333, 4....","[3.0, 4.333333333333333, 3.6666666666666665, 4...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 3.0, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 1.333...",Police are investigating claims by a former ro...,dm-test-f37fd6e9b6cc18a7132568e307ef3b130931e809
97,[eden hazard scored a 1-0 lead against manches...,[Eden Hazard scored the opening goal for Chels...,"[3.3333333333333335, 3.3333333333333335, 3.666...","[2.0, 2.0, 1.6666666666666667, 1.6666666666666...","[3.3333333333333335, 4.666666666666667, 5.0, 5...","[1.6666666666666667, 5.0, 5.0, 5.0, 4.0, 5.0, ...",After Chelsea forward Eden Hazard had scored g...,dm-test-f468efac7b3c54f8c42c2c81dff108c52ebe0d7d
98,[evangelos patoulidis is regarded as one of th...,[Evangelos Patoulidis also attracted interest ...,"[3.6666666666666665, 4.333333333333333, 4.0, 4...","[2.6666666666666665, 5.0, 5.0, 4.3333333333333...","[4.333333333333333, 5.0, 5.0, 4.66666666666666...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.666666666666667, 5...",Manchester City are keen to sign Anderlecht te...,dm-test-f5fead94ee884800e84a212cc0edc78b11c4ba9f


In [61]:
# Each row of `df` holds one source article (`text`) together with list-valued columns.
# Used here: `machine_summaries` (one string per machine-generated summary) and
# `consistency` (one score per machine summary).
# Build a new dataframe with one row per (text, machine_summary, consistency) triple,
# which is easier to work with. `human_summaries` is not carried over.

# Verify that every row has exactly one consistency score per machine summary;
# otherwise the flattened columns below would be misaligned or of unequal length.
_summaries_per_row = df['machine_summaries'].map(len)
assert (_summaries_per_row == df['consistency'].map(len)).all(), \
    "machine_summaries and consistency must have the same length in every row"

# Expand: repeat each text once per summary and flatten the two list columns in row order.
df_expanded = pd.DataFrame({
    'text': df['text'].repeat(_summaries_per_row).reset_index(drop=True),
    'machine_summary': [summary for summaries in df['machine_summaries'] for summary in summaries],
    'consistency': [score for scores in df['consistency'] for score in scores]
})


df_expanded

Unnamed: 0,text,machine_summary,consistency
0,(CNN)Donald Sterling's racist remarks cost him...,"donald sterling , nba team last year . sterlin...",1.000000
1,(CNN)Donald Sterling's racist remarks cost him...,donald sterling accused stiviano of targeting ...,2.333333
2,(CNN)Donald Sterling's racist remarks cost him...,a los angeles judge has ordered v. stiviano to...,4.666667
3,(CNN)Donald Sterling's racist remarks cost him...,donald sterling 's wife sued stiviano of targe...,5.000000
4,(CNN)Donald Sterling's racist remarks cost him...,donald sterling 's racist remarks cost him an ...,5.000000
...,...,...,...
1595,A 23-year-old mother-of-two is at risk of bein...,a 23-year-old mother-of-two is at risk of bein...,5.000000
1596,A 23-year-old mother-of-two is at risk of bein...,"Gemma , 23 , has two children under five by tw...",5.000000
1597,A 23-year-old mother-of-two is at risk of bein...,"gemma , named only as gemma , has two children...",5.000000
1598,A 23-year-old mother-of-two is at risk of bein...,"the woman , named only as gemma , has two chil...",5.000000


In [64]:
# Turn the consistency score into two binary labels:
#   weak_consistency   = 1 when the score is at least 4, otherwise 0
#   strong_consistency = 1 when the score is at least 5, otherwise 0
def _at_least(cutoff):
    """Build an element-wise 0/1 labeller for scores that reach `cutoff`."""
    return lambda score: 1 if float(score) >= cutoff else 0

df_expanded['weak_consistency'] = df_expanded['consistency'].map(_at_least(4))
df_expanded['strong_consistency'] = df_expanded['consistency'].map(_at_least(5))


In [65]:
# Class balance of each binary label (strong first, then weak).
for _label in ('strong_consistency', 'weak_consistency'):
    print(df_expanded[_label].value_counts())

strong_consistency
1    1306
0     294
Name: count, dtype: int64
weak_consistency
1    1439
0     161
Name: count, dtype: int64


In [None]:
import json
from json import JSONDecodeError
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer

# 1️⃣ One‐time model + tokenizer init
# Built once at cell execution and reused by every perform_grapheval call.
_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
_grapheval_clf = pipeline(
    "text-classification",
    model='vectara/hallucination_evaluation_model',
    tokenizer=_tokenizer,
    trust_remote_code=True
)

# Prompt template filled with the matched source claims (premise) and one output
# claim (hypothesis), plus the default score cut-off used by perform_grapheval.
_PROMPT = (
    "<pad> Determine if the hypothesis is true given the premise?\n\n"
    "Premise: {premise}\n\n"
    "Hypothesis: {hypothesis}"
)
_THRESHOLD = 0.5

# Cache of extract_information results, keyed by the full source text, so a text
# shared by several rows is only extracted once.
_source_cache: dict[str, list] = {}

def perform_grapheval(text: str, machine_output: str, threshold: float = _THRESHOLD) -> int:
    """Check whether `machine_output` is consistent with `text`, claim by claim.

    Triples are extracted from both texts, each output claim is paired with its
    most similar source claims, and the hallucination classifier scores every
    (premise, hypothesis) pair.

    Args:
        text: Source document. Its extracted triples are cached in `_source_cache`.
        machine_output: Generated text (e.g. a summary) to verify against `text`.
        threshold: Minimum 'consistent' score a claim needs to count as supported.

    Returns:
        1 if every output claim scores at least `threshold` (or there are no
        claims to check), 0 if at least one claim falls below it, and -1 if the
        evaluation could not be carried out (triple extraction returned
        unparseable JSON, or the classifier gave no 'consistent' score for some
        claim).
    """
    # Get or compute the source triples; only successful extractions are cached.
    try:
        if text not in _source_cache:
            _source_cache[text] = extract_information(text)
        source_json = _source_cache[text]
    except JSONDecodeError as e:
        print(f"[Warning] extract_information on source failed: {e}")
        return -1

    # The generated text is always re-extracted.
    try:
        output_json = extract_information(machine_output)
    except JSONDecodeError as e:
        print(f"[Warning] extract_information on summary failed: {e}")
        return -1

    # No claims means nothing can contradict the source. Returning here also keeps
    # an empty claim list from being passed to find_similar_claims.
    if not output_json:
        return 1

    similar = find_similar_claims(source_json, output_json)
    if not similar:
        return 1

    # One prompt per output claim: premise = its matched source claims joined by ";".
    prompts = [
        _PROMPT.format(
            premise=";".join(f"{s['subject']} {s['verb']} {s['object']}" for s, _ in sources),
            hypothesis=hyp
        )
        for hyp, sources in similar.items()
    ]

    # top_k=None returns the score of every label for each prompt.
    results = _grapheval_clf(prompts, top_k=None)
    scores = [
        entry['score']
        for res in results
        for entry in res
        if entry.get('label', '').lower() == 'consistent'
    ]

    # Without one 'consistent' score per prompt, all() could pass on missing data.
    if len(scores) != len(prompts):
        print(f"[Warning] expected {len(prompts)} 'consistent' scores, got {len(scores)}")
        return -1

    return 1 if all(score >= threshold for score in scores) else 0

# 4️⃣ Apply across your DataFrame with a safe wrapper
# Register tqdm with pandas so DataFrame.progress_apply is available below.
tqdm.pandas()

def safe_eval(row):
    """Run perform_grapheval on one row; report any failure and yield -1 instead of raising."""
    try:
        verdict = int(perform_grapheval(row['text'], row['machine_summary']))
    except Exception as e:
        print(f"[Error] Row {row.name} failed evaluation: {e}")
        verdict = -1
    return verdict

df_expanded['grapheval_consistency'] = df_expanded.progress_apply(safe_eval, axis=1)


You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
Device set to use mps:0
  1%|          | 11/1600 [01:51<3:17:48,  7.47s/it]



  7%|▋         | 105/1600 [16:53<3:17:59,  7.95s/it]



  9%|▊         | 137/1600 [21:24<3:31:16,  8.66s/it]



 11%|█▏        | 182/1600 [33:21<4:02:31, 10.26s/it] 



 11%|█▏        | 183/1600 [33:27<3:27:35,  8.79s/it]



 12%|█▏        | 189/1600 [34:29<3:13:56,  8.25s/it]



 17%|█▋        | 276/1600 [58:47<5:47:45, 15.76s/it] 



 20%|█▉        | 317/1600 [1:10:26<4:24:14, 12.36s/it] 



 20%|██        | 327/1600 [1:12:53<5:02:09, 14.24s/it]



 33%|███▎      | 522/1600 [2:25:05<11:45:32, 39.27s/it]



 49%|████▉     | 782/1600 [3:45:44<4:39:17, 20.49s/it] 



 49%|████▉     | 790/1600 [3:48:19<3:51:35, 17.16s/it]



 53%|█████▎    | 845/1600 [4:01:25<2:36:22, 12.43s/it]



 59%|█████▊    | 939/1600 [4:22:29<1:44:54,  9.52s/it]



 62%|██████▏   | 997/1600 [4:40:10<3:01:13, 18.03s/it]



 66%|██████▌   | 1058/1600 [4:55:55<1:06:21,  7.35s/it]



 66%|██████▌   | 1059/1600 [4:56:01<1:04:30,  7.15s/it]



 67%|██████▋   | 1066/1600 [4:57:22<1:23:12,  9.35s/it]



 67%|██████▋   | 1070/1600 [4:58:28<1:59:58, 13.58s/it]



 70%|███████   | 1122/1600 [5:18:03<2:18:02, 17.33s/it]



 71%|███████▏  | 1143/1600 [5:22:38<43:45,  5.75s/it]  



 72%|███████▏  | 1146/1600 [5:23:07<1:06:53,  8.84s/it]



 74%|███████▍  | 1188/1600 [5:33:09<1:12:31, 10.56s/it]



 77%|███████▋  | 1225/1600 [5:44:41<1:30:16, 14.44s/it]



 77%|███████▋  | 1229/1600 [5:45:27<1:10:35, 11.42s/it]



 77%|███████▋  | 1234/1600 [5:46:33<1:18:14, 12.83s/it]



 81%|████████▏ | 1303/1600 [6:05:06<57:33, 11.63s/it]  



 82%|████████▏ | 1316/1600 [6:07:49<1:08:54, 14.56s/it]



 85%|████████▌ | 1361/1600 [6:20:22<52:00, 13.06s/it]  



 86%|████████▌ | 1378/1600 [6:25:02<1:11:50, 19.42s/it]



 89%|████████▉ | 1421/1600 [6:37:10<31:21, 10.51s/it]  



 90%|█████████ | 1444/1600 [6:43:39<30:19, 11.66s/it]  



 92%|█████████▏| 1467/1600 [6:51:13<31:21, 14.15s/it]  



 93%|█████████▎| 1485/1600 [6:55:36<23:20, 12.18s/it]



 96%|█████████▋| 1540/1600 [7:08:22<09:16,  9.27s/it]



 98%|█████████▊| 1575/1600 [7:13:13<02:39,  6.39s/it]



100%|██████████| 1600/1600 [7:17:02<00:00, 16.39s/it]


In [None]:
# Get the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

# Rows whose evaluation failed are stored as -1 in 'grapheval_consistency'.
# They are not predictions, so leave them out of the metric instead of letting
# them be scored as a third class, and report how many were dropped.
scored_rows = df_expanded[df_expanded['grapheval_consistency'] != -1]
failed_row_count = len(df_expanded) - len(scored_rows)
if failed_row_count:
    print(f"Excluded {failed_row_count} rows with failed evaluation (-1) from the score")

# Calculate the balanced accuracy score on successfully evaluated rows
balanced_accuracy = balanced_accuracy_score(scored_rows['consistency'], scored_rows['grapheval_consistency'])
print(f"Balanced Accuracy Score: {balanced_accuracy:.2f}")


# Find the number of each values in grapheval_consistency (failures included)
grapheval_counts = df_expanded['grapheval_consistency'].value_counts()
print(grapheval_counts)


Balanced Accuracy Score: 0.53
grapheval_consistency
0    1471
1     129
Name: count, dtype: int64


In [None]:
import json
from json import JSONDecodeError
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
import numpy as np
from openai import OpenAI
import os

# 1️⃣ One-time model + tokenizer init
# Built once per cell execution and reused by every perform_grapheval call.
# The tokenizer comes from 'google/flan-t5-base' and is passed to the pipeline
# explicitly.
_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
_grapheval_clf = pipeline(
    "text-classification",
    model='vectara/hallucination_evaluation_model',
    tokenizer=_tokenizer,
    # NOTE(review): this allows code from the model repository to be executed.
    trust_remote_code=True
)

# 2️⃣ Prompt template & threshold
# Filled via str.format(premise=..., hypothesis=...), one prompt per summary sentence.
_PROMPT = (
    "<pad> Determine if the hypothesis is true given the premise?\n\n"
    "Premise: {premise}\n\n"
    "Hypothesis: {hypothesis}"
)
# Default for perform_grapheval's `threshold` parameter.
_THRESHOLD = 0.5

# 3️⃣ Cache for extracted source sentences
# Maps a full source text to the sentence list extract_sentences produced for it.
_source_cache: dict[str, list] = {}

# Initialize the OpenAI client for embeddings
# Credentials and endpoint are read from the EMBEDDING_API_KEY and
# EMBEDDING_BASE_URL environment variables; os.getenv yields None when unset.
client_embed = OpenAI(api_key=os.getenv("EMBEDDING_API_KEY"), base_url=os.getenv("EMBEDDING_BASE_URL"))

def extract_sentences(text: str) -> list:
    """Split ``text`` into sentences.

    A boundary is sentence-ending punctuation ('.', '!' or '?') followed by
    whitespace, so periods inside tokens such as "3.5" or "example.com" no
    longer cut a sentence in half. The trailing full stop of each sentence is
    removed, as before; '!' and '?' are kept.

    This is still a heuristic: abbreviations followed by a space ("U.S. ")
    are treated as a boundary.

    Args:
        text: Raw text to split.

    Returns:
        List of non-empty, whitespace-stripped sentences (empty list for
        empty or whitespace-only input).
    """
    import re  # local import: this cell's import block does not bring in `re`

    sentences = []
    for chunk in re.split(r'(?<=[.!?])\s+', text):
        cleaned = chunk.strip().rstrip('.').strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

def find_similar_sentences(source_sentences, output_sentences):
    """Pair each output sentence with its most similar source sentences.

    Both lists are embedded with ``client_embed`` (model
    "text-embedding-3-small") and compared by cosine similarity.

    Args:
        source_sentences: Sentences from the source document.
        output_sentences: Sentences from the text being checked.

    Returns:
        Dict mapping each output sentence to a list of up to three
        ``(source_sentence, similarity)`` tuples, most similar first.
        Empty ``output_sentences`` gives ``{}``; empty ``source_sentences``
        maps every output sentence to ``[]``.
    """
    # Never send an empty input list to the embeddings endpoint (it is expected
    # to reject it); return what the loops below would have produced instead.
    if not output_sentences:
        return {}
    if not source_sentences:
        return {output_sentence: [] for output_sentence in output_sentences}

    # Create embeddings for source and output sentences
    source_embeddings = client_embed.embeddings.create(input=source_sentences, model="text-embedding-3-small")
    output_embeddings = client_embed.embeddings.create(input=output_sentences, model="text-embedding-3-small")

    # Convert source vectors and compute their norms once, not once per output sentence.
    source_vectors = [np.asarray(item.embedding) for item in source_embeddings.data]
    source_norms = [np.linalg.norm(vector) for vector in source_vectors]

    similar_sentences = {}

    for i, output_sentence in enumerate(output_sentences):
        output_vector = np.asarray(output_embeddings.data[i].embedding)
        output_norm = np.linalg.norm(output_vector)
        similarities = [
            # Cosine similarity between the output and source embeddings
            (source_sentence, np.dot(output_vector, source_vectors[j]) / (output_norm * source_norms[j]))
            for j, source_sentence in enumerate(source_sentences)
        ]
        similarities.sort(key=lambda x: x[1], reverse=True)
        similar_sentences[output_sentence] = similarities[:3]

    return similar_sentences

def perform_grapheval(text: str, machine_output: str, threshold: float = _THRESHOLD) -> int:
    """Check whether ``machine_output`` is consistent with the source ``text``.

    Both texts are split with ``extract_sentences``. Each output sentence is
    paired with its most similar source sentences by ``find_similar_sentences``
    and scored by the ``_grapheval_clf`` classifier.

    Args:
        text: Source document; its sentence list is cached in ``_source_cache``.
        machine_output: Generated summary to verify.
        threshold: Minimum 'consistent' score each output sentence must reach.

    Returns:
        ``1`` if every output sentence scores at least ``threshold`` (or there
        is nothing to check), ``0`` if any sentence scores below it, and ``-1``
        if the classifier did not return a 'consistent' score for every prompt.
    """
    # — get or compute source sentences (cached)
    if text not in _source_cache:
        _source_cache[text] = extract_sentences(text)
    source_sentences = _source_cache[text]

    # — extract sentences from machine output
    output_sentences = extract_sentences(machine_output)

    # Nothing to verify: same verdict as the empty-`similar` case below, but
    # reached without sending an empty list to the embedding service.
    if not output_sentences:
        return 1

    # — find similar sentences
    similar = find_similar_sentences(source_sentences, output_sentences)

    # No pairs to check: the summary is reported as consistent (1).
    if not similar:
        return 1

    # — build the prompts: premise is the matched source sentences only,
    #   similarity scores are dropped
    prompts = [
        _PROMPT.format(premise=";".join([s for s, _ in sources]), hypothesis=output_sent)
        for output_sent, sources in similar.items()
    ]

    # — run the hallucination classifier (top_k=None returns all label scores)
    results = _grapheval_clf(prompts, top_k=None)
    scores = [
        entry['score']
        for res in results
        for entry in res
        if entry.get('label', '').lower() == 'consistent'
    ]

    # all() over an empty list is True, so missing 'consistent' scores would
    # otherwise be reported as consistent. Treat that as a failure.
    if len(scores) != len(prompts):
        print(f"[Warning] classifier returned {len(scores)} 'consistent' scores for {len(prompts)} prompts")
        return -1

    # Consistent (1) only if every sentence meets the threshold, else inconsistent (0)
    return 1 if all(score >= threshold for score in scores) else 0

# 4️⃣ Apply across your DataFrame with a safe wrapper
# Registers DataFrame.progress_apply (progress-bar variant of apply).
tqdm.pandas()

def safe_eval(row):
    """Run ``perform_grapheval`` on one row; return -1 if it raises.

    Uses ``row['text']`` as the source and ``row['machine_summary']`` as the
    text to verify. Exceptions are printed with ``row.name`` and replaced by
    the sentinel -1 so the apply over the whole frame keeps going.
    """
    try:
        return int(perform_grapheval(row['text'], row['machine_summary']))
    except Exception as e:
        # Broad catch is intentional here: any failure becomes the -1 sentinel.
        print(f"[Error] Row {row.name} failed evaluation: {e}")
        return -1

# Overwrites the 'grapheval_consistency' column with this cell's results.
df_expanded['grapheval_consistency'] = df_expanded.progress_apply(safe_eval, axis=1)

You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
Device set to use mps:0
 22%|██▏       | 347/1600 [19:06<1:25:43,  4.11s/it]

In [None]:
import json
from json import JSONDecodeError
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import numpy as np
import os
import re
from sentence_transformers import SentenceTransformer

# 1️⃣ One-time model + tokenizer init
# Built once per cell execution and reused by every perform_grapheval call.
# The tokenizer comes from 'google/flan-t5-base' and is passed to the pipeline
# explicitly.
_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
_grapheval_clf = pipeline(
    "text-classification",
    model='vectara/hallucination_evaluation_model',
    tokenizer=_tokenizer,
    # NOTE(review): this allows code from the model repository to be executed.
    trust_remote_code=True
)

# Load NER model for entity detection
# aggregation_strategy="simple" makes the pipeline return grouped entities; the
# code in replace_pronouns_with_nouns reads their "entity_group" and "word" fields.
ner_tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

# Load Sentence Transformer model for embeddings
# Used by find_similar_sentences via embedding_model.encode(...).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller, efficient model

# 2️⃣ Prompt template & threshold
# Filled via str.format(premise=..., hypothesis=...), one prompt per summary sentence.
_PROMPT = (
    "<pad> Determine if the hypothesis is true given the premise?\n\n"
    "Premise: {premise}\n\n"
    "Hypothesis: {hypothesis}"
)
# Default for perform_grapheval's `threshold` parameter.
_THRESHOLD = 0.5

# 3️⃣ Cache for extracted source sentences
# Maps a full source text to the sentence list extract_sentences produced for it.
_source_cache: dict[str, list] = {}

def replace_pronouns_with_nouns(text):
    """Replace pronouns with a nearby named entity (rule-based, NER-assisted).

    ``ner_pipeline`` is run over ``text`` and only PER/ORG entities are kept.
    The text is split on '.', '!' and '?'. In each sentence every listed
    pronoun (matched case-insensitively as a whole word) is replaced by the
    first entity found in that sentence, or, failing that, the first entity
    of the immediately preceding sentence. Sentences with no candidate entity
    are left unchanged.

    Limitations of the heuristic: possessive pronouns are replaced by the bare
    entity name, and only one sentence of look-back is used.

    Args:
        text: Text whose pronouns should be resolved.

    Returns:
        ``text`` unchanged when no PER/ORG entity is detected; otherwise the
        processed sentences re-joined, each terminated with ". " (original
        terminal punctuation is not preserved), with surrounding whitespace
        stripped.
    """
    # Define common pronouns we want to replace
    pronouns = ["he", "she", "it", "they", "his", "her", "its", "their", "them", "himself", "herself", "itself", "themselves"]
    pronoun_regex = re.compile(r'\b(' + '|'.join(pronouns) + r')\b', flags=re.IGNORECASE)

    # Extract entities using NER
    entities = ner_pipeline(text)

    # Filter to focus on persons and organizations (the most common antecedents for pronouns)
    persons_orgs = [entity for entity in entities if entity["entity_group"] in ["PER", "ORG"]]

    if not persons_orgs:
        return text  # No suitable entities found

    # Map each sentence to the entities whose surface form occurs in it
    sentences = [s.strip() for s in re.split(r'[.!?]', text) if s.strip()]
    sentence_entities = [
        [entity for entity in persons_orgs if entity["word"] in sentence]
        for sentence in sentences
    ]

    processed_parts = []
    for i, sentence in enumerate(sentences):
        # Prefer an entity from this sentence, else fall back to the previous one
        candidates = sentence_entities[i] or (sentence_entities[i - 1] if i > 0 else [])
        entity_to_use = candidates[0]["word"] if candidates else None

        if entity_to_use:
            # A callable replacement inserts the entity text literally; passing
            # the string directly would make re.sub interpret any backslash
            # in the entity name as an escape or group reference.
            sentence = pronoun_regex.sub(lambda _match: entity_to_use, sentence)

        processed_parts.append(sentence + ". ")

    return "".join(processed_parts).strip()

def extract_sentences(text: str) -> list:
    """Return the sentences of ``text`` after pronoun replacement.

    The text is first passed through ``replace_pronouns_with_nouns``; the
    result is split on '.', '!' and '?', and each piece is stripped of
    surrounding whitespace. Empty pieces are dropped.
    """
    resolved_text = replace_pronouns_with_nouns(text)

    sentences = []
    for fragment in re.split(r'[.!?]', resolved_text):
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment)
    return sentences

def find_similar_sentences(source_sentences, output_sentences):
    """Rank source sentences by cosine similarity to each output sentence.

    Both lists are embedded with ``embedding_model.encode``. The result maps
    every output sentence to its three highest-scoring
    ``(source_sentence, similarity)`` tuples, best first.
    """
    source_vectors = embedding_model.encode(source_sentences)
    output_vectors = embedding_model.encode(output_sentences)

    def _cosine(output_vec, source_vec):
        # Dot product normalised by the product of the two vector lengths
        return np.dot(output_vec, source_vec) / (np.linalg.norm(output_vec) * np.linalg.norm(source_vec))

    matches = {}
    for out_idx, claim in enumerate(output_sentences):
        claim_vec = output_vectors[out_idx]
        ranked = sorted(
            (
                (candidate, _cosine(claim_vec, source_vectors[src_idx]))
                for src_idx, candidate in enumerate(source_sentences)
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # Keep only the top three matches per output sentence
        matches[claim] = ranked[:3]

    return matches

def perform_grapheval(text: str, machine_output: str, threshold: float = _THRESHOLD) -> int:
    """Check whether ``machine_output`` is consistent with the source ``text``.

    Both texts go through ``extract_sentences``. Each output sentence is
    paired with its most similar source sentences by ``find_similar_sentences``
    and scored by the ``_grapheval_clf`` classifier.

    Args:
        text: Source document; its sentence list is cached in ``_source_cache``.
        machine_output: Generated summary to verify.
        threshold: Minimum 'consistent' score each output sentence must reach.

    Returns:
        ``1`` if every output sentence scores at least ``threshold`` (or there
        is nothing to check), ``0`` if any sentence scores below it, and ``-1``
        if the classifier did not return a 'consistent' score for every prompt.
    """
    # — get or compute source sentences (cached)
    if text not in _source_cache:
        _source_cache[text] = extract_sentences(text)
    source_sentences = _source_cache[text]

    # — extract sentences from machine output
    output_sentences = extract_sentences(machine_output)

    # Nothing to verify: same verdict as the empty-`similar` case below,
    # without encoding empty input.
    if not output_sentences:
        return 1

    # — find similar sentences
    similar = find_similar_sentences(source_sentences, output_sentences)

    # No pairs to check: the summary is reported as consistent (1).
    if not similar:
        return 1

    # — build the prompts: premise is the matched source sentences only,
    #   similarity scores are dropped
    prompts = [
        _PROMPT.format(premise=";".join([s for s, _ in sources]), hypothesis=output_sent)
        for output_sent, sources in similar.items()
    ]

    # — run the hallucination classifier (top_k=None returns all label scores)
    results = _grapheval_clf(prompts, top_k=None)
    scores = [
        entry['score']
        for res in results
        for entry in res
        if entry.get('label', '').lower() == 'consistent'
    ]

    # all() over an empty list is True, so missing 'consistent' scores would
    # otherwise be reported as consistent. Treat that as a failure.
    if len(scores) != len(prompts):
        print(f"[Warning] classifier returned {len(scores)} 'consistent' scores for {len(prompts)} prompts")
        return -1

    # Consistent (1) only if every sentence meets the threshold, else inconsistent (0)
    return 1 if all(score >= threshold for score in scores) else 0

# 4️⃣ Apply across your DataFrame with a safe wrapper
# Registers DataFrame.progress_apply (progress-bar variant of apply).
tqdm.pandas()

def safe_eval(row):
    """Evaluate a single row with ``perform_grapheval``, never raising.

    ``row['text']`` is the source and ``row['machine_summary']`` the text to
    verify. The result is converted to ``int``; if anything raises, the error
    is printed with ``row.name`` and -1 is returned instead.
    """
    try:
        return int(perform_grapheval(row['text'], row['machine_summary']))
    except Exception as e:
        # Broad catch is intentional here: any failure becomes the -1 sentinel.
        print(f"[Error] Row {row.name} failed evaluation: {e}")
        return -1

# Overwrites the 'grapheval_consistency' column with this cell's results.
df_expanded['grapheval_consistency'] = df_expanded.progress_apply(safe_eval, axis=1)