In [None]:
!pip install networkx matplotlib pyvis openai python-dotenv numpy torch transformers tqdm huggingface-hub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [40]:
from typing import List, Optional
import json
from openai import OpenAI
import numpy as np
import json
import networkx as nx
import matplotlib.pyplot as plt

from dotenv import load_dotenv
import os

# load_dotenv() (python-dotenv) runs before the key is read from the environment below.
load_dotenv()

# The API key is read from the OPENAI_API_KEY environment variable; it is not
# written into the notebook.
# NOTE(review): presumably the key is kept in a local .env file picked up by
# load_dotenv() above — confirm, and do not paste the key into this cell.
# - If you don't have an API key, you can get one by signing up at https://platform.openai.com/signup
API_KEY = os.getenv("OPENAI_API_KEY")

# Module-level OpenAI client built with that key; ask_question() below uses it.
client = OpenAI(api_key=API_KEY)



def _unwrap_json_payload(raw: str) -> str:
    """Return the JSON text inside a model reply, without its wrappers.

    Handles replies wrapped in <json>...</json> tags, in ``` / ```json code
    fences, in both, or in neither. Surrounding whitespace and any text before
    the opening wrapper or after the closing one is dropped.
    """
    payload = raw.strip()
    for opener, closer in (("<json>", "</json>"), ("```", "```")):
        start = payload.find(opener)
        end = payload.rfind(closer)
        # Only unwrap when a distinct closing marker follows the opening one.
        if start != -1 and end >= start + len(opener):
            payload = payload[start + len(opener):end].strip()
    # A fenced block may still carry its "json" language label.
    if payload[:4].lower() == "json":
        payload = payload[4:].lstrip()
    return payload


def extract_information(text: str):
    """Extract subject-verb-object triples from ``text`` with a chat model.

    Args:
        text: Raw text to turn into knowledge-graph triples.

    Returns:
        The parsed JSON answer, normally a list of dicts with "subject",
        "verb" and "object" keys. A single JSON object is wrapped in a list.
        Returns [] when the completion has no choices or no content.

    Raises:
        json.JSONDecodeError: If the unwrapped reply is not valid JSON.
    """
    # Uses the module-level `client` instead of building a new OpenAI() per call.
    # NOTE: the system prompt asks for ```json fences while the user message
    # asks for <json></json> tags; _unwrap_json_payload accepts either.
    completion = client.chat.completions.create(
        model="gpt-4o",  # Use an appropriate model
        messages=[
            {"role": "system", "content": """You are an expert at extracting information in structured formats to build a knowledge graph.

    Step 1 - Entity detection: Identify all entities in the raw text. Make sure not to miss any out. Entities should be basic and simple, they are akin to Wikipedia nodes.

    Step 2 - Coreference resolution: Find all expressions in the text that refer to the same entity. Make sure entities are not duplicated. In particular do not include entities that are more specific versions themselves, e.g. "a detailed view of jupiter's atmosphere" and "jupiter's atmosphere", only include the most specific version of the entity.

    Step 3 - Relation extraction: Identify semantic relationships between the entities you have identified.

    Format your response as a JSON array of objects, where each object must have exactly these three fields:
    - "subject": The first entity
    - "verb": The relationship between entities
    - "object": The second entity

    Important Tips:
    1. Make sure all information is included in the knowledge graph.
    2. Each triple must have exactly three non-empty strings.
    3. Do not split up related information into separate triples because this could change the meaning.
    4. Before adding a triple to the knowledge graph, check if concatenating subject+verb+object makes sense as a sentence. If not, discard it.
    5. Keep entities and relationships concise but meaningful.
    6. Convert pronouns to their proper noun references when possible.
    7. Keep everything lowercase and in present tense when appropriate.
    8. The output should be a JSON array of objects, each object containing the fields "subject", "verb", and "object", with the starting and ending tags ```json and ``` respectively.
    """},
            {"role": "user", "content": f"Use the given format to extract information from the following input: <input>{text}</input>. Skip the preamble and output the result as a JSON array within <json></json> tags."}
        ]
    )

    if not completion.choices:
        print("No response received.")
        return []

    content = completion.choices[0].message.content
    if content is None:
        # str(None) would become the text "None" and fail JSON parsing.
        print("No response received.")
        return []

    response_message = str(content)
    print("DEBUG: response_message = ", response_message)

    triples = json.loads(_unwrap_json_payload(response_message))
    print(triples)

    # A single JSON object is normalised to a one-element list.
    if isinstance(triples, dict):
        triples = [triples]
    return triples




In [22]:
from pyvis.network import Network
import json

def visualize_json(json_data, output_path="knowledge_graph.html"):
    """Render subject-verb-object triples as a pyvis network saved to HTML.

    Args:
        json_data: A list of dicts with "subject", "verb" and "object" keys,
            a single such dict, or a JSON string encoding either.
        output_path: File the graph is written to. Defaults to
            "knowledge_graph.html".

    Subjects are drawn in sky blue and objects in light green; each edge
    carries the verb as its title. Triples lacking a subject or an object are
    skipped and the number skipped is printed.
    """
    net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")

    triples = json.loads(json_data) if isinstance(json_data, str) else json_data
    if isinstance(triples, dict):
        triples = [triples]

    skipped = 0
    for entry in triples:
        subject = entry.get('subject')
        verb = entry.get('verb')
        target = entry.get('object')  # not named `object`, to avoid shadowing the builtin

        # A triple without both endpoints cannot be drawn as an edge; skip it
        # instead of handing None to add_node.
        if not subject or not target:
            skipped += 1
            continue

        net.add_node(subject, title=subject, color='skyblue')
        net.add_node(target, title=target, color='lightgreen')
        net.add_edge(subject, target, title=verb)

    if skipped:
        print(f"Skipped {skipped} triple(s) without a subject or object.")

    net.show(output_path, notebook=False)



# Example passage used to exercise extraction and visualization end to end.
text = "According to my knowledge, Mihai Surdeanu is an Associate Professor in the departments of Cognitive Science - GIDP, Computer Science, and BIO5 Institute at the University of Arizona. He earned his Ph.D. in Computer Science from Southern Methodist University in 2001 and has over 15 years of experience in building systems driven by natural language processing (NLP) and machine learning. Surdeanu has published over 80 peer-reviewed articles and has been a leader or member of teams that ranked in the top three at seven highly competitive international evaluations of end-user NLP systems such as question answering and information extraction. His work has been funded by several government organizations and private foundations. Surdeanu's research focuses on NLP and machine learning."
# Extract subject/verb/object triples from the text.
# extract_information() also prints the raw model reply itself, so the output appears twice.
json_data = extract_information(text)
print(json_data)

# Draw the extracted triples as a knowledge graph (saved as knowledge_graph.html).
visualize_json(json_data)


DEBUG: response_message =  ```json
[
    {
        "subject": "mihai surdeanu",
        "verb": "is",
        "object": "associate professor"
    },
    {
        "subject": "mihai surdeanu",
        "verb": "is",
        "object": "department"
    },
    {
        "subject": "unidentified",
        "verb": "include",
        "object": "cognitive science - gidp, computer science, bio5 institute"
    },
    {
        "subject": "university of arizona",
        "verb": "have",
        "object": "unidentified"
    },
    {
        "subject": "mihai surdeanu",
        "verb": "earn",
        "object": "ph.d"
    },
    {
        "subject": "ph.d.",
        "verb": "from",
        "object": "southern methodist university"
    },
    {
        "subject": "ph.d.",
        "verb": "in",
        "object": "computer science"
    },
    {
        "subject": "ph.d.",
        "verb": "in",
        "object": "2001"
    },
    {
        "subject": "mihai surdeanu",
        "verb": "have",
        "ob

In [23]:
from pyvis.network import Network
import json

def visualize_comparison(source_json_data, output_json_data, display='both'):
    """Draw the source and output triple sets together in one pyvis graph.

    Args:
        source_json_data: Source triples, as a list of dicts with "subject",
            "verb" and "object" keys or as a JSON string encoding that list.
        output_json_data: Output triples, in the same form.
        display: 'source', 'output' or 'both' - which triple set(s) to draw.

    Nodes that occur in both sets are yellow; the remaining source nodes are
    blue and the remaining output nodes are green. The graph is written to
    "knowledge_graph_comparison.html".
    """
    net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")

    def _as_triples(payload):
        # Accept either an already-parsed list or its JSON string form.
        if isinstance(payload, str):
            return json.loads(payload)
        return payload

    source_data = _as_triples(source_json_data)
    output_data = _as_triples(output_json_data)

    # Every entity mentioned on either side of a triple counts as a node.
    source_nodes = {t['subject'] for t in source_data} | {t['object'] for t in source_data}
    output_nodes = {t['subject'] for t in output_data} | {t['object'] for t in output_data}
    shared_nodes = source_nodes.intersection(output_nodes)

    # Source triples are drawn first, then output triples.
    layers = []
    if display in ('source', 'both'):
        layers.append((source_data, 'blue'))
    if display in ('output', 'both'):
        layers.append((output_data, 'green'))

    for triples, base_color in layers:
        for triple in triples:
            head, relation, tail = triple['subject'], triple['verb'], triple['object']
            head_color = base_color if head not in shared_nodes else 'yellow'
            tail_color = base_color if tail not in shared_nodes else 'yellow'
            net.add_node(head, title=head, color=head_color)
            net.add_node(tail, title=tail, color=tail_color)
            net.add_edge(head, tail, title=relation)

    # Save and show the graph
    net.show("knowledge_graph_comparison.html", notebook=False)

# Two toy triple sets that share the "Fox" and "Dog" nodes but differ in their relations.
example_source_triples = [
    {"subject": "Fox", "verb": "jumps", "object": "Dog"},
    {"subject": "Dog", "verb": "barks", "object": "loudly"},
]
example_output_triples = [
    {"subject": "Fox", "verb": "runs", "object": "fast"},
    {"subject": "Dog", "verb": "barks", "object": "quietly"},
]

# Serialised to JSON strings here; visualize_comparison parses string input itself.
source_json = json.dumps(example_source_triples)
output_json = json.dumps(example_output_triples)

# `display` selects which graph(s) to draw: 'source', 'output' or 'both'.
visualize_comparison(source_json, output_json, display='both')


knowledge_graph_comparison.html


In [24]:
# Reference (source) story.
source_text = "There was once a boy named Jack who lived in a small village. Jack was known for his bravery and kindness. One day, Jack decided to climb a giant beanstalk that had grown in his backyard. At the top of the beanstalk, he found a castle inhabited by a giant. The giant had a magical hen that laid golden eggs. Jack managed to steal the hen and escape from the giant. The giant chased Jack down the beanstalk, but Jack managed to cut it down, causing the giant to fall to his doom. Jack returned to his village with the magical hen and lived happily ever after."

# Rewritten version with subtle hallucinations, i.e. details absent from the
# source (for example the town "Kingman, Arizona").
output_text = "In the town of Kingman, Arizona, there lived a courageous and kind-hearted boy named Jack. One adventurous day, he ascended a massive beanstalk sprouting in his backyard. At its peak, Jack discovered a castle, home to a formidable giant. This giant possessed a hen with the ability to produce golden eggs. Jack cleverly captured the hen and fled from the giant's clutches. In a thrilling chase, the giant pursued Jack along the beanstalk. In a daring move, Jack chopped down the beanstalk, leading to the giant's tragic fall and demise. Triumphantly, Jack returned to his village, where he and the magical hen enjoyed a prosperous and joyful life."

# Extract triples from both texts (one extract_information call each).
# Unlike the previous cell, source_json / output_json hold parsed lists here,
# not JSON strings; visualize_comparison accepts either form.
source_json = extract_information(source_text)
output_json = extract_information(output_text)

print("Source JSON:", source_json)
print("Output JSON:", output_json)

# Draw both graphs together; nodes present in both are highlighted by visualize_comparison.
visualize_comparison(source_json, output_json, display='both')


DEBUG: response_message =  ```json
[
    {
        "subject": "jack",
        "verb": "live",
        "object": "village"
    },
    {
        "subject": "jack",
        "verb": "know",
        "object": "bravery"
    },
    {
        "subject": "jack",
        "verb": "know",
        "object": "kindness"
    },
    {
        "subject": "jack",
        "verb": "decide",
        "object": "climb"
    },
    {
        "subject": "beanstalk",
        "verb": "grow",
        "object": "backyard"
    },
    {
        "subject": "jack",
        "verb": "find",
        "object": "castle"
    },
    {
        "subject": "castle",
        "verb": "inhabit",
        "object": "giant"
    },
    {
        "subject": "giant",
        "verb": "have",
        "object": "hen"
    },
    {
        "subject": "hen",
        "verb": "lay",
        "object": "eggs"
    },
    {
        "subject": "eggs",
        "verb": "be",
        "object": "golden"
    },
    {
        "subject": "jack",
        "ver

In [44]:
# RAG prompt question-answering examples



def ask_question(question: str, context: str, model="gpt-4-turbo"):
    """Ask a chat model to answer ``question`` using only ``context``.

    Args:
        question: The question to answer.
        context: Text the answer must be based on; sent as a second system message.
        model: Name of the chat model to call.

    Returns:
        The content of the first choice converted with ``str``, or None
        (after printing a notice) when the completion has no choices.
    """
    instructions = "You are an assistant specialized in answering questions based on a given context. Your task is to provide accurate and concise answers to the questions asked. If the answer is not present in the context, you should respond with 'The answer is not present in the given context.'"
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "system", "content": "Context: " + context},
        {"role": "user", "content": f"Question: {question}"},
    ]

    # Uses the module-level `client`.
    completion = client.chat.completions.create(model=model, messages=conversation)

    if not completion.choices:
        print("No response received.")
        return None

    return str(completion.choices[0].message.content)
    
# Example context: a fictional passage about the town of Quillhaven. It is
# passed to ask_question() and extract_information() further down in this cell.
context = """

In the town of Quillhaven—a community renowned for its historic libraries and scholarly traditions—a rare astronomical event was predicted to occur for the first time in over a century. The event, a total solar eclipse, was shrouded in both scientific intrigue and centuries-old folklore. Local legends, drawing on influences from ancient Greek philosophy to indigenous spiritual practices, suggested that eclipses held the power to influence human thought and communal harmony.

At the heart of the community’s preparations was Dr. Lila Montrose, a respected astrophysicist whose career had been dedicated to unraveling the mysteries of cosmic events. Dr. Montrose saw the eclipse as an opportunity to blend modern scientific inquiry with the town’s rich cultural heritage. She proposed a series of public lectures and interactive exhibitions designed to educate residents on the mechanics of the eclipse, while also acknowledging its historical and psychological significance. Her balanced approach aimed to respect both empirical evidence and the symbolic narratives that had long captivated the community.

Meanwhile, Mr. Edmund Blackwell, the town’s dedicated historian, uncovered an ancient manuscript in the dusty archives of Quillhaven’s old library. The manuscript, penned in a mix of archaic English and Latin, detailed elaborate rituals and communal activities that had been performed during similar eclipses in bygone eras. Blackwell argued that these practices were more than mere superstition—they were intrinsic to the town’s identity and had once fostered unity and prosperity. His findings ignited a fervent debate about whether these ancient rituals should be revived as a way to reconnect with Quillhaven’s storied past.

This emerging debate quickly divided the community into two factions. The progressive wing, led by Dr. Montrose, advocated for a balanced, modern approach that combined scientific exploration with cultural respect. In contrast, the conservative faction, inspired by Mr. Blackwell’s manuscript, pushed for a return to the traditional rituals that they believed had been the cornerstone of the town’s former unity and success. As the day of the eclipse drew near, these conflicting views not only highlighted an ideological rift but also symbolized a broader struggle between embracing modernity and preserving historical identity.

In a final effort to bridge these divergent perspectives, town leaders organized a public forum. The forum featured spirited debates and passionate speeches, ultimately concluding with a proposal for collaboration. Both factions agreed that the eclipse could serve as a catalyst for dialogue, urging the community to forge a common path that honored both its scientific curiosity and its deep-rooted traditions.

"""



# Two-part question to be answered from `context`.
question = "Explain how the contrasting views of Dr. Lila Montrose and Mr. Edmund Blackwell regarding the upcoming eclipse illustrate the broader tension between modern scientific inquiry and traditional cultural practices. What solution did the town of Quillhaven ultimately propose to address this conflict?"

# Get the answer to the question
# NOTE(review): ask_question() returns None when the API yields no choices, and
# that None would be passed to extract_information() below — confirm whether a
# guard is wanted here.
answer = ask_question(question, context)
print("Answer:", answer)

# Triples extracted from the context (source) and from the model's answer (output).
source_json = extract_information(context)
output_json = extract_information(answer)

print("Source JSON:", source_json)
print("Output JSON:", output_json)

# Separate client for embeddings, configured from the EMBEDDING_API_KEY and
# EMBEDDING_BASE_URL environment variables; find_similar_claims() uses it.
client_embed = OpenAI(api_key=os.getenv("EMBEDDING_API_KEY"), base_url=os.getenv("EMBEDDING_BASE_URL"))
# For each claim in output_json, find the source_json claims that are most
# semantically similar to it. Claims are flattened to "subject verb object"
# text, embedded, and compared by cosine similarity.
def find_similar_claims(source_json, output_json, top_k=3):
    """Match every output claim with its most similar source claims.

    Args:
        source_json: List of dicts with "subject", "verb" and "object" keys
            (the reference claims).
        output_json: List of dicts in the same form (the claims to check).
        top_k: How many source claims to keep per output claim. Defaults to 3.

    Returns:
        A dict mapping each output claim, rendered as "subject verb object",
        to a list of ``(source_entry, similarity)`` tuples sorted from most to
        least similar. Output claims that render to the same text share one
        entry. Returns {} when there are no output claims, and maps every
        claim to [] when there are no source claims; no embedding request is
        made in either case.
    """
    def as_sentence(entry):
        return f"{entry['subject']} {entry['verb']} {entry['object']}"

    source_claims = [as_sentence(entry) for entry in source_json]
    output_claims = [as_sentence(entry) for entry in output_json]

    # Nothing to compare: skip the embedding requests entirely.
    if not output_claims:
        return {}
    if not source_claims:
        return {claim: [] for claim in output_claims}

    # Create embeddings for the source and output claims (one request each).
    source_response = client_embed.embeddings.create(input=source_claims, model="text-embedding-3-small")
    output_response = client_embed.embeddings.create(input=output_claims, model="text-embedding-3-small")

    # Assumes response.data is in the same order as the input list, as the
    # index-based lookup in the original code did.
    source_matrix = np.array([item.embedding for item in source_response.data], dtype=float)
    output_matrix = np.array([item.embedding for item in output_response.data], dtype=float)

    # Cosine similarity: normalise each row once, then a single matrix product
    # gives the similarity of every output claim to every source claim.
    source_unit = source_matrix / np.linalg.norm(source_matrix, axis=1, keepdims=True)
    output_unit = output_matrix / np.linalg.norm(output_matrix, axis=1, keepdims=True)
    similarity_matrix = output_unit @ source_unit.T

    keep = max(int(top_k), 0)
    similar_claims = {}
    for claim, row in zip(output_claims, similarity_matrix):
        # Stable sort keeps the original source order among equal similarities.
        ranked = np.argsort(-row, kind="stable")[:keep]
        similar_claims[claim] = [(source_json[j], float(row[j])) for j in ranked]

    return similar_claims

# Match every output claim with its closest source claims.
similar_claims = find_similar_claims(source_json, output_json)

# Print one paragraph per output claim, followed by a blank line.
print("Similar Claims:")
for output_entry, source_entries in similar_claims.items():
    report_lines = [f"Output Claim: {output_entry}"]
    report_lines.extend(
        f"Similar Source Claim: {source_entry} (Similarity: {similarity})"
        for source_entry, similarity in source_entries
    )
    print("\n".join(report_lines))
    print()


Answer: Dr. Lila Montrose and Mr. Edmund Blackwell represent contrasting viewpoints regarding the eclipse in Quillhaven. Dr. Montrose, an astrophysicist, champions a modern scientific approach, emphasizing education on the mechanics of the eclipse while also integrating cultural relevance into her teachings. Her perspective values empirical evidence and scientific understanding as tools for enhancing the community's appreciation of the event. In contrast, Mr. Blackwell, a historian, underscores the importance of ancient rituals and traditions, accessing antiquated manuscripts suggesting that these practices were integral to the town’s cultural identity and historical continuity. He posits that reviving these traditions could restore unity and prosperity within the community, reminiscent of earlier times.

The broader tension between these viewpoints mirrors the ideological struggle between embracing scientific advancements and preserving cultural heritage. Dr. Montrose's approach is fo

In [45]:
from huggingface_hub import login  
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Authenticate with the Hugging Face Hub using the HUGGINGFACE_TOKEN environment variable.
login(token=os.getenv("HUGGINGFACE_TOKEN"))
# Build (premise, hypothesis) pairs from similar_claims: the premise is the
# matched source claims joined with ";", the hypothesis is the output claim text.
claim_pairs = []
for output_entry, source_entries in similar_claims.items():
    # Each source entry is a triple dict; flatten it back to "subject verb object".
    source_claims = [f"{source_entry['subject']} {source_entry['verb']} {source_entry['object']}" for source_entry, _ in source_entries]
    source_claims = ";".join(source_claims)
    claim_pairs.append((source_claims, output_entry))

for pair in claim_pairs:
    # Print each pair: premise as "Input", hypothesis as "Output"
    print("Input:", pair[0])
    print("Output:", pair[1])
    print()

# Step 1: Load the model
# NOTE(review): `model` is not referenced again in this cell; the pipeline below
# is given the model name and presumably loads its own copy — confirm whether
# this load is still needed.
# NOTE(review): trust_remote_code=True presumably lets transformers run code
# shipped with the model repository — confirm the repository is trusted.
model = AutoModelForSequenceClassification.from_pretrained('vectara/hallucination_evaluation_model', trust_remote_code=True)

# Step 2: Evaluate the hallucination model
# Wrap every pair in the premise/hypothesis prompt template.
prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
input_pairs = [prompt.format(text1=pair[0], text2=pair[1]) for pair in claim_pairs]
# Show the first formatted prompt (raises IndexError if claim_pairs is empty).
print(input_pairs[0])

# Use text-classification pipeline to predict
classifier = pipeline(
            "text-classification",
            model='vectara/hallucination_evaluation_model',
            tokenizer=AutoTokenizer.from_pretrained('google/flan-t5-base'),
            trust_remote_code=True
        )
full_scores = classifier(input_pairs, top_k=None) # List[List[Dict[str, float]]]

# Optional: Extract the scores for the 'consistent' label
# One score per pair whose label list contains 'consistent'.
simple_scores = [score_dict['score'] for score_for_both_labels in full_scores for score_dict in score_for_both_labels if score_dict['label'] == 'consistent']

print(simple_scores)

Input: dr. lila montrose respected as astrophysicist;dr. montrose sees the eclipse as opportunity to blend modern scientific inquiry with cultural heritage;dr. montrose dedicates career to unraveling mysteries of cosmic events
Output: dr. lila montrose represent modern scientific approach

Input: mr. edmund blackwell uncovered ancient manuscript in quillhaven’s library;conservative faction inspired by mr. blackwell’s manuscript;dr. montrose sees the eclipse as opportunity to blend modern scientific inquiry with cultural heritage
Output: mr. edmund blackwell represent ancient rituals and traditions

Input: dr. lila montrose respected as astrophysicist;dr. montrose sees the eclipse as opportunity to blend modern scientific inquiry with cultural heritage;dr. montrose proposes series of public lectures and interactive exhibitions
Output: dr. lila montrose advocate for integration of cultural relevance into education

Input: mr. edmund blackwell uncovered ancient manuscript in quillhaven’s 

You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


<pad> Determine if the hypothesis is true given the premise?

Premise: dr. lila montrose respected as astrophysicist;dr. montrose sees the eclipse as opportunity to blend modern scientific inquiry with cultural heritage;dr. montrose dedicates career to unraveling mysteries of cosmic events

Hypothesis: dr. lila montrose represent modern scientific approach


You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
Device set to use mps:0


[0.675209641456604, 0.020859278738498688, 0.10728395730257034, 0.3614426255226135, 0.8728690147399902, 0.46430009603500366, 0.8681468367576599, 0.709900438785553, 0.044571880251169205]
