In [1]:
!pip install wikipedia-api
!pip install torch

Collecting wikipedia-api
  Downloading wikipedia_api-0.7.1.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.7.1-py3-none-any.whl size=14347 sha256=13101a122760eea143ffdf668178b353e1a16bf2e4378a7c2435a3d8181a40f5
  Stored in directory: /root/.cache/pip/wheels/4c/96/18/b9201cc3e8b47b02b510460210cfd832ccf10c0c4dd0522962
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.7.1


In [32]:
import wikipediaapi
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import re

# Pick the execution device up front: GPU when CUDA is available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Qwen2-7B-Instruct tokenizer and weights (FP16 to reduce memory usage),
# then place the model on the selected device.
model_name = "Qwen/Qwen2-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
model.to(device)
print(f"Model is using: {device}")

# Reuse the EOS token for padding and pad on the left side of the prompt.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

def generate(messages, **gen_kwargs):
    """
    Run one chat-formatted generation and return the decoded completion.

    Parameters:
    messages: Chat messages handed to ``tokenizer.apply_chat_template``.
    **gen_kwargs: Extra keyword arguments forwarded to ``model.generate``.
        ``temperature`` falls back to 0.01 and ``max_new_tokens`` to 512
        when the caller does not supply them.

    Returns:
    str: The generated continuation with the prompt tokens sliced off and
    special tokens skipped. The text is also printed.
    """
    # Fill in defaults only for options the caller left out.
    gen_kwargs.setdefault("temperature", 0.01)
    gen_kwargs.setdefault("max_new_tokens", 512)

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer([prompt], return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        output_batch = model.generate(
            encoded.input_ids,
            attention_mask=encoded.attention_mask,
            **gen_kwargs,
        )

    # Each output row starts with its prompt; keep only what follows it.
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, output_batch):
        completions.append(full_ids[len(prompt_ids):])

    response = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
    print(f"Generated Response:\n{response}\n")
    return response

def fetch_wikipedia_page(title):
    """
    Look up ``title`` on English Wikipedia through ``wikipediaapi``.

    Returns:
    The page's ``text`` attribute when ``page.exists()`` is true, otherwise None.
    """
    client = wikipediaapi.Wikipedia(language='en', user_agent='docvis/1.0 (bhatnagar007vidit@gmail.com)')
    page = client.page(title)
    if not page.exists():
        return None
    return page.text

def clean_and_validate_json(response):
    """
    Extract the JSON object embedded in a model response and keep only
    well-formed relationship entries.

    Parameters:
    response (str): Raw model output that may wrap a JSON object in extra text.

    Returns:
    dict: ``{"Relationships": [...]}`` containing only entries that are JSON
    objects with all of "Person1", "Person2" and "Relationship". An empty
    list is returned when no JSON object is found, parsing fails, or the
    "Relationships" value is not a list. Errors are printed, never raised.
    """
    required_keys = ("Person1", "Person2", "Relationship")
    try:
        # Greedy match: spans from the first '{' to the last '}' across lines.
        json_content = re.search(r'\{.*\}', response, re.DOTALL)
        if not json_content:
            return {"Relationships": []}
        cleaned_response = json_content.group(0)

        data = json.loads(cleaned_response)

        if not isinstance(data, dict) or not isinstance(data.get("Relationships"), list):
            return {"Relationships": []}

        # Skip non-object entries (null, strings, numbers) individually, so a
        # single malformed item cannot discard every valid relationship.
        valid_relationships = [
            rel for rel in data["Relationships"]
            if isinstance(rel, dict) and all(key in rel for key in required_keys)
        ]

        return {"Relationships": valid_relationships}
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Problematic JSON:\n{cleaned_response}")
        return {"Relationships": []}
    except Exception as e:
        print(f"Unexpected Error: {e}")
        print(f"Failed to parse the response. Original Response:\n{response}")
        return {"Relationships": []}

import json

import re
import json

def extract_relationships_with_transformer(text):
    """
    Function to extract relationships between people from the given text.

    The first 4000 characters of ``text`` are sent to the chat model together
    with a system prompt describing the expected JSON schema. The reply is
    cleaned with ``clean_model_response`` and parsed as JSON.

    Parameters:
    text (str): The input text from which to extract relationships.

    Returns:
    dict: ``{"Relationships": [...]}`` on every path. Each entry is a dict;
    the keys "Person1Name"/"Person2Name" are renamed to "Person1"/"Person2"
    when the model emits them, so the result matches what the rest of the
    notebook reads. The list is empty when the reply cannot be parsed or
    does not have the expected structure.
    """

    # System prompt with the extraction rules, the output schema and a worked
    # example. The schema uses "Person1"/"Person2", the keys every consumer in
    # this notebook reads.
    system_prompt = """
    You are a helpful assistant that extracts relationships between people based on the provided text.

    *Conditions for Extraction*:
    1. *Extract only relationships between two distinct people*. Ignore any relationships involving organizations, places, countries, or objects.
    2. *Ensure the names of people are properly identified*. If a name appears with a title (e.g., "President John F. Kennedy"), use only the person's full name ("John F. Kennedy").
    3. *The relationship must be a meaningful human relationship* (e.g., "colleagues", "friends", "family members", "rivals"). Avoid vague or non-human descriptions.
    4. *Exclude non-person entities completely* (e.g., organizations like NASA, countries like the United States, or events like Apollo 11).
    5. *Return relationships in the following structured format*:
    json
    {
      "Relationships": [
        {
          "Person1": "<Person's Name>",
          "Person2": "<Another Person's Name>",
          "Relationship": "<Description of their relationship>"
        }
      ]
    }

    *Example Paragraph for Context*:
    During World War II, several key figures had notable interactions. Winston Churchill and Franklin D. Roosevelt
    maintained a strong alliance, planning many wartime strategies together. Adolf Hitler often communicated directly
    with Joseph Goebbels to discuss propaganda. General Eisenhower worked closely with Bernard Montgomery during the
    invasion of Normandy. Neil Armstrong and Buzz Aldrin worked together during the Apollo 11 mission. John F. Kennedy
    and Robert F. Kennedy, being brothers, collaborated on many political issues.

    *Example Output*:
    The extracted relationships should look similar to the following:

    json
    {
      "Relationships": [
        {
          "Person1": "Winston Churchill",
          "Person2": "Franklin D. Roosevelt",
          "Relationship": "Maintained a strong alliance and collaborated on wartime strategies"
        },
        {
          "Person1": "Adolf Hitler",
          "Person2": "Joseph Goebbels",
          "Relationship": "Communicated directly to discuss propaganda strategies"
        },
        {
          "Person1": "General Eisenhower",
          "Person2": "Bernard Montgomery",
          "Relationship": "Worked closely during the invasion of Normandy"
        },
        {
          "Person1": "Neil Armstrong",
          "Person2": "Buzz Aldrin",
          "Relationship": "Worked together during the Apollo 11 mission"
        },
        {
          "Person1": "John F. Kennedy",
          "Person2": "Robert F. Kennedy",
          "Relationship": "Brothers who collaborated on various political strategies"
        }
      ]
    }

    Only include relationships similar to these examples. Ensure that both names are proper people’s names and not organizations, countries, or events.
    """

    # User prompt: the instruction plus (at most) the first 4000 characters of the text
    user_prompt = f"Extract relationships between people from the given text:\n\nBased on the example paragraph and relationships provided, extract relationships from the following text:\n\n{text[:4000]}"

    # Generate the response using the structured prompt
    response = generate([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ])

    print(f"Raw Model Response:\n{response}")  # Print the raw response for debugging

    # Clean the response to fix common JSON formatting issues
    cleaned_response = clean_model_response(response)
    print(f"Cleaned Model Response:\n{cleaned_response}")  # Print cleaned response

    # Parse the cleaned response as JSON
    try:
        relationships_data = json.loads(cleaned_response)
    except json.JSONDecodeError:
        print("Failed to decode JSON. The model response might be incorrectly formatted.")
        return {"Relationships": []}

    # Guard against valid JSON that is not the expected {"Relationships": [...]} object
    raw_relationships = (
        relationships_data.get("Relationships")
        if isinstance(relationships_data, dict) else None
    )
    if not isinstance(raw_relationships, list):
        return {"Relationships": []}

    final_relationships = []
    for rel in raw_relationships:
        if not isinstance(rel, dict):
            continue  # ignore entries that are not JSON objects
        normalized = dict(rel)
        # Accept the older "Person1Name"/"Person2Name" spelling as well.
        for legacy_key, key in (("Person1Name", "Person1"), ("Person2Name", "Person2")):
            if key not in normalized and legacy_key in normalized:
                normalized[key] = normalized.pop(legacy_key)
        final_relationships.append(normalized)

    # Same return shape as the failure path above (a dict, as documented),
    # so callers can use .get("Relationships") unconditionally.
    return {"Relationships": final_relationships}



def clean_model_response(response):
    """
    Clean the model response to ensure it can be parsed as JSON.

    Keeps only the span from the first '{' to the last '}' (dropping any
    leading label such as a "json" fence marker and any trailing prose or
    closing fence), flattens newlines to spaces, and removes trailing commas
    before '}' and ']'. Text inside the JSON values is left untouched.

    Parameters:
    response (str): The raw response from the model.

    Returns:
    str: The cleaned candidate JSON text, or an empty string when the
    response is empty or contains no '{'.
    """
    if not response:
        return ""

    start = response.find("{")
    if start == -1:
        return ""

    # Cut after the last closing brace; if there is none, keep the tail so the
    # caller's debug print still shows what the model produced.
    end = response.rfind("}")
    candidate = response[start:end + 1] if end > start else response[start:]

    candidate = candidate.replace("\n", " ").strip()
    candidate = re.sub(r",\s*}", "}", candidate)  # Remove trailing commas before closing braces
    candidate = re.sub(r",\s*\]", "]", candidate)  # Remove trailing commas before closing brackets

    return candidate


# Utility function to check if a given string is likely to be a person's name
def is_person_name(name):
    """
    Heuristically decide whether ``name`` could be a person's name.

    The check is a case-insensitive substring test against a fixed list of
    terms associated with organizations, countries and similar entities.
    Because it matches substrings, a name that merely contains one of the
    terms (for example "force" inside a surname) is rejected as well.

    Returns:
    bool: False when any listed term occurs in ``name``, True otherwise.
    """
    blocked_terms = (
        "Company", "Corporation", "Organization", "Country", "Government", "Party",
        "United States", "Soviet Union", "NASA", "Apollo", "Alliance", "Axis", "Force",
        "Empire", "Republic", "State", "Ministry", "Agency", "Group", "Union",
    )

    lowered_name = name.lower()
    return not any(term.lower() in lowered_name for term in blocked_terms)


def process_long_text(text, chunk_size=4000, overlap=1000, extractor=None):
    """
    Extract relationships from a long text by processing overlapping chunks.

    Parameters:
    text (str): The full text to process.
    chunk_size (int): Number of characters per chunk.
    overlap (int): Number of characters shared by consecutive chunks.
        Must be smaller than ``chunk_size``.
    extractor (callable, optional): Function called once per chunk. It may
        return either ``{"Relationships": [...]}`` or a tuple whose first
        item is such a dict. Defaults to
        ``extract_relationships_with_transformer``.

    Returns:
    dict: ``{"Relationships": [...]}`` with de-duplicated entries, each having
    exactly the keys "Person1", "Person2" and "Relationship". Entries with a
    missing or empty field are dropped.

    Raises:
    ValueError: If ``overlap`` is not smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    if step <= 0:
        # A zero step makes range() raise; a negative one silently yields no chunks.
        raise ValueError("overlap must be smaller than chunk_size")
    if extractor is None:
        extractor = extract_relationships_with_transformer

    all_relationships = []
    for start in range(0, len(text), step):
        chunk_result = extractor(text[start:start + chunk_size])
        # The extractor may hand back (relationships_dict, raw_text); keep the dict.
        if isinstance(chunk_result, tuple):
            chunk_result = chunk_result[0] if chunk_result else {}
        if isinstance(chunk_result, dict):
            found = chunk_result.get("Relationships", [])
            if isinstance(found, list):
                all_relationships.extend(found)

    # Remove duplicates and handle malformed relationships
    unique_relationships = []
    seen = set()
    for rel in all_relationships:
        if not isinstance(rel, dict):
            print(f"Error processing relationship: {rel}. Error: entry is not a JSON object")
            continue

        # Accept both key spellings: "Person1" and the "Person1Name" variant.
        person1 = rel.get("Person1") or rel.get("Person1Name") or ""
        person2 = rel.get("Person2") or rel.get("Person2Name") or ""
        relationship = rel.get("Relationship", "")
        if not (person1 and person2 and relationship):
            continue  # skip entries with a missing or empty field

        rel_tuple = (person1, person2, relationship)
        try:
            already_seen = rel_tuple in seen
        except TypeError as e:
            # Unhashable field values (e.g. a list) cannot be de-duplicated.
            print(f"Error processing relationship: {rel}. Error: {e}")
            continue
        if already_seen:
            continue

        seen.add(rel_tuple)
        unique_relationships.append({
            "Person1": person1,
            "Person2": person2,
            "Relationship": relationship,
        })

    return {"Relationships": unique_relationships}

if __name__ == "__main__":
    # Fetch one article and run the chunked extraction over it. The result is
    # kept in `result`; nothing is printed on success.
    title = "Ramayana"
    text = fetch_wikipedia_page(title)
    if not text:
        print("Failed to retrieve the Wikipedia page.")
    else:
        result = process_long_text(text)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model is using: cuda
Generated Response:
The given text does not contain any relationships between specific individuals. It provides information about the Ramayana, an ancient Indian epic, its authorship, historical context, and cultural influences. Therefore, there are no relationships to extract according to the guidelines provided.

Raw Model Response:
The given text does not contain any relationships between specific individuals. It provides information about the Ramayana, an ancient Indian epic, its authorship, historical context, and cultural influences. Therefore, there are no relationships to extract according to the guidelines provided.
Cleaned Model Response:

Failed to decode JSON. The model response might be incorrectly formatted.
Generated Response:
The given text does not contain any specific relationships between two distinct people. Therefore, no JSON output can be generated based on the conditions provided.

Raw Model Response:
The given text does not contain any speci

In [34]:
import json
import networkx as nx
import plotly.graph_objects as go
from google.colab import files

# Function to load relationships from JSON data
def load_relationships(json_data):
    """Return the value stored under the "Relationships" key of ``json_data``.

    Raises KeyError when the key is absent.
    """
    relationships = json_data["Relationships"]
    return relationships

# Function to remove duplicate relationships from JSON data
def remove_duplicates(relationships):
    """
    Drop repeated relationships while preserving first-seen order.

    Two entries are duplicates when their "Person1", "Person2" and
    "Relationship" values are all equal; the first occurrence is kept.
    Raises KeyError when an entry lacks one of those keys.
    """
    identity_fields = ("Person1", "Person2", "Relationship")
    deduped = []
    seen_identities = set()
    for record in relationships:
        identity = tuple(record[field] for field in identity_fields)
        if identity in seen_identities:
            continue
        seen_identities.add(identity)
        deduped.append(record)
    return deduped

# Function to create a NetworkX graph from relationships
def create_graph(relationships):
    """
    Build an undirected NetworkX graph with one edge per relationship.

    Each edge joins the entry's "Person1" and "Person2" and carries a
    ``relationship`` attribute taken from "Relationship" ("Unknown" when the
    key is absent).
    """
    graph = nx.Graph()
    graph.add_edges_from(
        (rel["Person1"], rel["Person2"], {"relationship": rel.get("Relationship", "Unknown")})
        for rel in relationships
    )
    return graph

# Function to create and display a radial graph using Plotly
def create_radial_graph(G, title='<b>Apollo 11 Relationships - Radial Graph</b>'):
    """
    Build a Plotly figure of ``G`` laid out with NetworkX's spring layout.

    Parameters:
    G: NetworkX graph whose edges may carry a ``relationship`` attribute.
    title (str): Figure title; defaults to the original hard-coded title.

    Returns:
    plotly.graph_objects.Figure: One line trace for the edges and one marker
    trace for the nodes, coloured by each node's number of neighbours.
    """
    # A higher `k` in spring_layout increases node spacing
    pos = nx.spring_layout(G, k=1.5, iterations=100)

    edge_x = []
    edge_y = []
    edge_text = []
    for source, target, attrs in G.edges(data=True):
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        # Each edge contributes three points (start, end, gap), and hovertext is
        # matched to points by index, so it needs three entries per edge too.
        label = f"Relationship: {attrs.get('relationship', 'Unknown')}<br>{source} ↔ {target}"
        edge_text.extend([label, label, ""])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2.5, color='#555'),
        hoverinfo='text',
        hovertext=edge_text,
        mode='lines'
    )

    node_x = []
    node_y = []
    node_text = []
    node_adjacencies = []
    for node, (x, y) in pos.items():
        node_x.append(x)
        node_y.append(y)

        # Collect information about connections for each node
        connections = []
        for neighbor in G.neighbors(node):
            relationship = G.edges[node, neighbor].get('relationship', 'Unknown')
            connections.append(f"Connected to {neighbor} for relationship - {relationship}")

        connections_info = "<br>".join(connections)
        node_text.append(f"Name: {node}<br>{connections_info}")
        # Counted in the same loop so colours always line up with positions.
        node_adjacencies.append(len(connections))

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_adjacencies,  # colour by number of connections
            size=18,
            colorbar=dict(
                thickness=15,
                # title.text / title.side replace the deprecated title + titleside pair
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2)
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated titlefont_size argument
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text="Radial Graph - Entity Relationships",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002)],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    return fig

# Function to create and display a scatter plot using Plotly
def create_scatter_plot(relationships, title='<b>Apollo 11 Relationships - Scatter Plot</b>'):
    """
    Build a two-row Plotly figure of the relationships.

    Entities are given x positions in order of first appearance, so the
    layout is the same on every run. An entity that appears as "Person1" in
    any relationship is drawn on row y=0; an entity that only ever appears
    as "Person2" is drawn on row y=1. Every edge ends on the marker of the
    entity it refers to.

    Parameters:
    relationships (list[dict]): Entries with "Person1", "Person2" and an
        optional "Relationship" ("Unknown" when absent).
    title (str): Figure title; defaults to the original hard-coded title.

    Returns:
    plotly.graph_objects.Figure: One line trace for the edges and one marker
    trace for the nodes, coloured by number of connections.
    """
    entity_to_index = {}       # entity -> x position, in first-seen order
    entity_connections = {}    # entity -> hover lines describing its relationships
    person1_entities = set()   # entities that appear as "Person1" at least once

    # Single pass over the relationships to collect everything per entity.
    for rel in relationships:
        person1, person2 = rel["Person1"], rel["Person2"]
        label = rel.get('Relationship', 'Unknown')
        for entity in (person1, person2):
            if entity not in entity_to_index:
                entity_to_index[entity] = len(entity_to_index)
                entity_connections[entity] = []
        person1_entities.add(person1)
        entity_connections[person1].append(f"Connected to {person2} for relationship - {label}")
        if person2 != person1:
            entity_connections[person2].append(f"Connected to {person1} for relationship - {label}")

    entity_to_row = {
        entity: 0 if entity in person1_entities else 1
        for entity in entity_to_index
    }

    edge_x = []
    edge_y = []
    edge_text = []
    for rel in relationships:
        person1, person2 = rel["Person1"], rel["Person2"]
        edge_x.extend([entity_to_index[person1], entity_to_index[person2], None])
        edge_y.extend([entity_to_row[person1], entity_to_row[person2], None])
        # Three hover entries per edge, matching the three points (start, end, gap).
        label = f"Relationship: {rel.get('Relationship', 'Unknown')}<br>{person1} ↔ {person2}"
        edge_text.extend([label, label, ""])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1.5, color='#888'),
        hoverinfo='text',
        hovertext=edge_text,
        mode='lines'
    )

    node_x = list(entity_to_index.values())
    node_y = [entity_to_row[entity] for entity in entity_to_index]
    node_text = [
        f"Name: {entity}<br>{'<br>'.join(entity_connections[entity])}"
        for entity in entity_to_index
    ]
    node_adjacencies = [len(entity_connections[entity]) for entity in entity_to_index]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_adjacencies,  # colour by number of connections
            size=18,
            colorbar=dict(
                thickness=15,
                # title.text / title.side replace the deprecated title + titleside pair
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2)
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated titlefont_size argument
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    return fig

# Function to generate and save both graphs
def generate_and_display_graphs(relationships):
    """
    De-duplicate ``relationships``, then build, show, save and download the
    radial graph and the scatter plot.

    Both figures are written as HTML files in the working directory and
    handed to ``files.download`` (the Colab download helper imported by this cell).
    """
    deduped = remove_duplicates(relationships)

    # Build both figures before anything is displayed.
    radial_fig = create_radial_graph(create_graph(deduped))
    scatter_fig = create_scatter_plot(deduped)
    figures_by_file = {
        "radial_relationship_graph.html": radial_fig,
        "scatter_relationship_graph.html": scatter_fig,
    }

    for figure in figures_by_file.values():
        figure.show()

    for filename, figure in figures_by_file.items():
        figure.write_html(filename)
    print("Graphs saved as 'radial_relationship_graph.html' and 'scatter_relationship_graph.html'")

    for filename in figures_by_file:
        files.download(filename)

# Example usage
# Hand-written sample payload in the {"Relationships": [...]} shape that
# load_relationships reads; each entry has "Person1", "Person2", "Relationship".
# NOTE(review): several "Person2" values (e.g. "Marathi", "Telugu") look like
# languages or titles of works rather than people — confirm this is intended.
json_data = {
    "Relationships": [
        {"Person1": "Balarama Dasa", "Person2": "Narahari", "Relationship": "Poet of Torave Ramayana in Kannada"},
        {"Person1": "Thunchaththu Ramanujan Ezhuthachan", "Person2": "Adhyathmaramayanam", "Relationship": "Writer of Malayalam version"},
        {"Person1": "Sridhara", "Person2": "Marathi", "Relationship": "Writer of Ramayana in Marathi"},
        {"Person1": "Chanda Jha", "Person2": "Maithili", "Relationship": "Writer of Ramayana in Maithili"},
        {"Person1": "Rashtrakavi Kuvempu", "Person2": "Srimadramayana Kalpavrukshamu", "Relationship": "Writer of Ramayana in Kannada"},
        {"Person1": "Viswanatha Satyanarayana", "Person2": "Telugu", "Relationship": "Writer of Srimadramayana Kalpavrukshamu in Telugu"},
        {"Person1": "Adbhuta Ramayana", "Person2": "Valmiki", "Relationship": "Supplementary version attributed to Valmiki"},
        {"Person1": "Gondi people", "Person2": "Gond Ramayani", "Relationship": "Version of Ramayana"},
        {"Person1": "Adiya tribe of Wayanad", "Person2": "Adiya Ramayana", "Relationship": "Oral version of Ramayana"},
        {"Person1": "Kambar", "Person2": "Ramavataram", "Relationship": "Writer of Ramavataram in Tamil"},
        {"Person1": "Arunachala Kavirayar", "Person2": "Rama Natakam", "Relationship": "Composer of Rama Natakam"}
    ]
}

# Pull the relationship list out of the payload, then render, save and
# download both graphs for it.
relationships = load_relationships(json_data)
generate_and_display_graphs(relationships)


Graphs saved as 'radial_relationship_graph.html' and 'scatter_relationship_graph.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
import json
import networkx as nx
import plotly.graph_objects as go
from google.colab import files
import math  # Import the math module for trigonometric functions

# Function to load relationships from JSON data
def load_relationships(json_data):
    """Look up and return ``json_data["Relationships"]``.

    A missing key propagates as KeyError.
    """
    loaded = json_data["Relationships"]
    return loaded

# Function to remove duplicate relationships from JSON data
def remove_duplicates(relationships):
    """
    Return the relationships with exact repeats removed, in original order.

    An entry is a repeat when an earlier entry has the same "Person1",
    "Person2" and "Relationship" values. Entries missing any of those keys
    raise KeyError.
    """
    kept = []
    already_seen = set()
    for entry in relationships:
        signature = (entry["Person1"], entry["Person2"], entry["Relationship"])
        if signature in already_seen:
            continue
        already_seen.add(signature)
        kept.append(entry)
    return kept

# Function to create a NetworkX graph with a central "Ramayana" node
def create_centralized_graph(relationships):
    """
    Build an undirected graph in which every person is also linked to a
    central "Ramayana" node.

    For each relationship, both endpoints get an edge to the central node
    (``relationship="Related to Ramayana"``), and the two endpoints are joined
    by an edge carrying the entry's "Relationship" ("Unknown" when absent).
    The central node exists even when ``relationships`` is empty.
    """
    hub = "Ramayana"
    graph = nx.Graph()
    graph.add_node(hub)

    for rel in relationships:
        first, second = rel["Person1"], rel["Person2"]
        # Spokes from the hub to each endpoint, then the original relationship.
        for endpoint in (first, second):
            graph.add_edge(hub, endpoint, relationship="Related to Ramayana")
        graph.add_edge(first, second, relationship=rel.get("Relationship", "Unknown"))

    return graph

# Function to create and display a spiral radial graph using Plotly
def create_spiral_radial_graph(G, central_node="Ramayana"):
    """
    Build a Plotly figure with ``central_node`` at the origin and every other
    node placed on an outward spiral around it.

    Parameters:
    G: NetworkX graph whose edges may carry a ``relationship`` attribute.
    central_node: Node drawn at (0, 0); defaults to "Ramayana". It also
        appears in the figure title and annotation.

    Returns:
    plotly.graph_objects.Figure: One line trace for the edges and one marker
    trace for the nodes, coloured by each node's number of neighbours.
    """
    outer_nodes = [node for node in G.nodes if node != central_node]
    # Guard the division: a graph holding only the central node has no outer nodes.
    angle_step = 360 / len(outer_nodes) if outer_nodes else 0
    radius = 2.5  # Base radius for spiral layout

    # Positions are assigned directly, in graph node order; no force-directed
    # layout is needed because every node gets an explicit coordinate.
    pos = {}
    spiral_index = 0
    for node in G.nodes:
        if node == central_node:
            pos[node] = (0, 0)
            continue
        angle = math.radians(spiral_index * angle_step)
        # (spiral_index + 1) keeps the first outer node off the origin, where it
        # would otherwise sit exactly on top of the central node.
        distance = radius * (spiral_index + 1) * 0.1
        pos[node] = (distance * math.cos(angle), distance * math.sin(angle))
        spiral_index += 1

    edge_x = []
    edge_y = []
    edge_text = []
    for source, target, attrs in G.edges(data=True):
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        # Three hover entries per edge, matching the three points (start, end, gap).
        label = f"Relationship: {attrs.get('relationship', 'Unknown')}<br>{source} ↔ {target}"
        edge_text.extend([label, label, ""])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2.5, color='#555'),
        hoverinfo='text',
        hovertext=edge_text,
        mode='lines'
    )

    node_x = []
    node_y = []
    node_text = []
    node_adjacencies = []
    for node, (x, y) in pos.items():
        node_x.append(x)
        node_y.append(y)

        # Collect information about connections for each node
        connections = []
        for neighbor in G.neighbors(node):
            relationship = G.edges[node, neighbor].get('relationship', 'Unknown')
            connections.append(f"Connected to {neighbor} for relationship - {relationship}")

        connections_info = "<br>".join(connections)
        node_text.append(f"Name: {node}<br>{connections_info}")
        # Counted in the same loop so colours always line up with positions.
        node_adjacencies.append(len(connections))

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_adjacencies,  # colour by number of connections
            size=18,
            colorbar=dict(
                thickness=15,
                # title.text / title.side replace the deprecated title + titleside pair
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2)
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # title.font replaces the deprecated titlefont_size argument
                        title=dict(
                            text=f'<b>{central_node} - Centralized Spiral Radial Graph</b>',
                            font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text=f"Spiral Radial Graph - Central Node: {central_node}",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002)],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    return fig

# Function to generate and save the spiral radial graph
def generate_and_display_centralized_graph(relationships):
    """
    De-duplicate ``relationships``, build the graph centred on "Ramayana",
    then show, save and download its spiral radial figure.

    The figure is written to "spiral_radial_relationship_graph.html" in the
    working directory and handed to ``files.download`` (the Colab download
    helper imported by this cell).
    """
    output_file = "spiral_radial_relationship_graph.html"

    graph = create_centralized_graph(remove_duplicates(relationships))
    spiral_radial_fig = create_spiral_radial_graph(graph)

    spiral_radial_fig.show()
    spiral_radial_fig.write_html(output_file)
    print("Graph saved as 'spiral_radial_relationship_graph.html'")
    files.download(output_file)

# Example usage
# Hand-written sample payload in the {"Relationships": [...]} shape that
# load_relationships reads; each entry has "Person1", "Person2", "Relationship".
# NOTE(review): several "Person2" values (e.g. "Marathi", "Telugu") look like
# languages or titles of works rather than people — confirm this is intended.
json_data = {
    "Relationships": [
        {"Person1": "Balarama Dasa", "Person2": "Narahari", "Relationship": "Poet of Torave Ramayana in Kannada"},
        {"Person1": "Thunchaththu Ramanujan Ezhuthachan", "Person2": "Adhyathmaramayanam", "Relationship": "Writer of Malayalam version"},
        {"Person1": "Sridhara", "Person2": "Marathi", "Relationship": "Writer of Ramayana in Marathi"},
        {"Person1": "Chanda Jha", "Person2": "Maithili", "Relationship": "Writer of Ramayana in Maithili"},
        {"Person1": "Rashtrakavi Kuvempu", "Person2": "Srimadramayana Kalpavrukshamu", "Relationship": "Writer of Ramayana in Kannada"},
        {"Person1": "Viswanatha Satyanarayana", "Person2": "Telugu", "Relationship": "Writer of Srimadramayana Kalpavrukshamu in Telugu"},
        {"Person1": "Adbhuta Ramayana", "Person2": "Valmiki", "Relationship": "Supplementary version attributed to Valmiki"},
        {"Person1": "Gondi people", "Person2": "Gond Ramayani", "Relationship": "Version of Ramayana"},
        {"Person1": "Adiya tribe of Wayanad", "Person2": "Adiya Ramayana", "Relationship": "Oral version of Ramayana"},
        {"Person1": "Kambar", "Person2": "Ramavataram", "Relationship": "Writer of Ramavataram in Tamil"},
        {"Person1": "Arunachala Kavirayar", "Person2": "Rama Natakam", "Relationship": "Composer of Rama Natakam"}
    ]
}

# Pull the relationship list out of the payload, then render, save and
# download the centralized spiral graph for it.
relationships = load_relationships(json_data)
generate_and_display_centralized_graph(relationships)


Graph saved as 'spiral_radial_relationship_graph.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
import wikipediaapi
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import re

# Pick the execution device up front: GPU when CUDA is available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Qwen2-7B-Instruct tokenizer and weights (FP16 to reduce memory usage),
# then place the model on the selected device.
model_name = "Qwen/Qwen2-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
model.to(device)
print(f"Model is using: {device}")

# Reuse the EOS token for padding and pad on the left side of the prompt.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

def generate(messages, **gen_kwargs):
    """
    Run one chat-style generation and return only the newly generated text.

    Parameters:
    messages (list): Chat messages (dicts with "role" and "content") that are
        rendered through the tokenizer's chat template.
    **gen_kwargs: Extra keyword arguments forwarded to ``model.generate``.
        ``temperature`` falls back to 0.01 and ``max_new_tokens`` to 512 when
        the caller does not supply them.

    Returns:
    str: The decoded completion with the prompt tokens and special tokens removed.
    """
    # Render the conversation into a single prompt string that ends with the
    # generation prompt, then tokenize it as a batch of one.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    encoded = tokenizer([prompt], return_tensors="pt", padding=True).to(device)

    # Fill in defaults without overriding anything the caller passed.
    gen_kwargs.setdefault("temperature", 0.01)
    gen_kwargs.setdefault("max_new_tokens", 512)  # Increased from 256

    with torch.no_grad():
        sequences = model.generate(
            encoded.input_ids,
            attention_mask=encoded.attention_mask,
            **gen_kwargs
        )

    # The returned sequences start with the prompt tokens; drop them so that
    # only the continuation is decoded.
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
        completions.append(full_ids[len(prompt_ids):])

    response = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
    print(f"Generated Response:\n{response}\n")
    return response

def fetch_wikipedia_page(title):
    """
    Fetch the plain text of an English Wikipedia article.

    Parameters:
    title (str): Title of the page to look up.

    Returns:
    str or None: The page text, or None when the page does not exist.
    """
    client = wikipediaapi.Wikipedia(language='en', user_agent='docvis/1.0 (bhatnagar007vidit@gmail.com)')
    article = client.page(title)
    if not article.exists():
        return None
    return article.text

def clean_and_validate_json(response):
    """
    Extract the JSON object embedded in a model response and keep only
    well-formed relationship entries.

    The span from the first '{' to the last '}' is parsed as JSON. The parsed
    object must contain a list under "Relationships"; each entry is kept only
    if it is a dict carrying all of "Person1", "Person2" and "Relationship".
    Entries that are not dicts (numbers, strings, nulls, ...) are skipped
    individually instead of invalidating the whole response.

    Parameters:
    response (str): Raw text returned by the model.

    Returns:
    dict: ``{"Relationships": [...]}`` with the valid entries, or an empty
    list when nothing usable could be parsed. This function never raises.
    """
    required_keys = ("Person1", "Person2", "Relationship")
    cleaned_response = None
    try:
        # Remove any text before the first '{' and after the last '}'
        json_content = re.search(r'\{.*\}', response, re.DOTALL)
        if not json_content:
            return {"Relationships": []}
        cleaned_response = json_content.group(0)

        # Parse the JSON
        data = json.loads(cleaned_response)

        # Ensure the structure is correct
        if not isinstance(data, dict) or not isinstance(data.get("Relationships"), list):
            return {"Relationships": []}

        # Keep only dict entries that have all required keys. The isinstance
        # check matters: `key in rel` on a string is a substring test and on a
        # number raises TypeError, which previously discarded every entry.
        valid_relationships = [
            rel for rel in data["Relationships"]
            if isinstance(rel, dict) and all(key in rel for key in required_keys)
        ]

        return {"Relationships": valid_relationships}
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Problematic JSON:\n{cleaned_response}")
        return {"Relationships": []}
    except Exception as e:
        # Last-resort guard (e.g. a non-string response) so callers always get a dict.
        print(f"Unexpected Error: {e}")
        print(f"Failed to parse the response. Original Response:\n{response}")
        return {"Relationships": []}

import json

import re
import json

def extract_relationships_with_transformer(text):
    """
    Function to extract relationships between people from the given text.

    The model is prompted with a fixed system prompt (conditions plus a worked
    example) and at most the first 4000 characters of `text`. Its reply is
    cleaned with `clean_model_response` and parsed as JSON.

    The prompt asks the model for "Person1Name"/"Person2Name" keys, while the
    rest of this notebook (de-duplication, graph building) reads
    "Person1"/"Person2". Returned entries are therefore normalised to the
    "Person1"/"Person2"/"Relationship" schema so they are not silently dropped
    downstream.

    Parameters:
    text (str): The input text from which to extract relationships.

    Returns:
    tuple: A tuple containing the extracted relationships dictionary
    (``{"Relationships": [...]}``) and the cleaned response string. The list is
    empty when the reply cannot be parsed as JSON or has no "Relationships" list.
    """

    # Define the system prompt with detailed conditions and example paragraph
    system_prompt = """
    You are a helpful assistant that extracts relationships between people based on the provided text.

    *Conditions for Extraction*:
    1. *Extract only relationships between two distinct people*. Ignore any relationships involving organizations, places, countries, or objects.
    2. *Ensure the names of people are properly identified*. If a name appears with a title (e.g., "President John F. Kennedy"), use only the person's full name ("John F. Kennedy").
    3. *The relationship must be a meaningful human relationship* (e.g., "colleagues", "friends", "family members", "rivals"). Avoid vague or non-human descriptions.
    4. *Exclude non-person entities completely* (e.g., organizations like NASA, countries like the United States, or events like Apollo 11).
    5. *Return relationships in the following structured format*:
    json
    {
      "Relationships": [
        {
          "Person1Name": "<Person's Name>",
          "Person2Name": "<Another Person's Name>",
          "Relationship": "<Description of their relationship>"
        }
      ]
    }

    *Example Paragraph for Context*:
    During World War II, several key figures had notable interactions. Winston Churchill and Franklin D. Roosevelt
    maintained a strong alliance, planning many wartime strategies together. Adolf Hitler often communicated directly
    with Joseph Goebbels to discuss propaganda. General Eisenhower worked closely with Bernard Montgomery during the
    invasion of Normandy. Neil Armstrong and Buzz Aldrin worked together during the Apollo 11 mission. John F. Kennedy
    and Robert F. Kennedy, being brothers, collaborated on many political issues.

    *Example Output*:
    The extracted relationships should look similar to the following:

    json
    {
      "Relationships": [
        {
          "Person1Name": "Winston Churchill",
          "Person2Name": "Franklin D. Roosevelt",
          "Relationship": "Maintained a strong alliance and collaborated on wartime strategies"
        },
        {
          "Person1Name": "Adolf Hitler",
          "Person2Name": "Joseph Goebbels",
          "Relationship": "Communicated directly to discuss propaganda strategies"
        },
        {
          "Person1Name": "General Eisenhower",
          "Person2Name": "Bernard Montgomery",
          "Relationship": "Worked closely during the invasion of Normandy"
        },
        {
          "Person1Name": "Neil Armstrong",
          "Person2Name": "Buzz Aldrin",
          "Relationship": "Worked together during the Apollo 11 mission"
        },
        {
          "Person1Name": "John F. Kennedy",
          "Person2Name": "Robert F. Kennedy",
          "Relationship": "Brothers who collaborated on various political strategies"
        }
      ]
    }

    Only include relationships similar to these examples. Ensure that both names are proper people’s names and not organizations, countries, or events.
    """

    # Step 2: Create a user prompt for the model based on the example and the input text
    user_prompt = f"Extract relationships between people from the given text:\n\nBased on the example paragraph and relationships provided, extract relationships from the following text:\n\n{text[:4000]}"

    # Step 3: Generate response using the structured prompt
    response = generate([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ])

    print(f"Raw Model Response:\n{response}")  # Print the raw response for debugging

    # Step 4: Clean the response to fix any JSON formatting issues
    cleaned_response = clean_model_response(response)
    print(f"Cleaned Model Response:\n{cleaned_response}")  # Print cleaned response

    # Step 5: Parse the cleaned response as JSON
    try:
        relationships_data = json.loads(cleaned_response)
    except json.JSONDecodeError:
        print("Failed to decode JSON. The model response might be incorrectly formatted.")
        return {"Relationships": []}, cleaned_response  # Return empty relationships and cleaned response

    # Step 6: Pull out the relationship list, tolerating a reply whose top level
    # is not an object or whose "Relationships" value is not a list.
    raw_relationships = []
    if isinstance(relationships_data, dict):
        candidate = relationships_data.get("Relationships", [])
        if isinstance(candidate, list):
            raw_relationships = candidate

    # Step 7: Rename the prompt's key names to the schema used by the rest of the notebook
    final_relationships = [_normalize_relationship_keys(rel) for rel in raw_relationships]

    # Return the final output with all the relationships and the cleaned response
    return {"Relationships": final_relationships}, cleaned_response  # Ensure a tuple is returned here


# Key names the prompt asks the model to emit, mapped to the names consumed downstream.
_RELATIONSHIP_KEY_ALIASES = {"Person1Name": "Person1", "Person2Name": "Person2"}


def _normalize_relationship_keys(rel):
    """Return a copy of `rel` with "Person1Name"/"Person2Name" renamed to "Person1"/"Person2".

    Non-dict entries are returned unchanged. When an entry already carries the
    canonical key, that value wins and the aliased duplicate is dropped.
    """
    if not isinstance(rel, dict):
        return rel
    normalized = {}
    for key, value in rel.items():
        canonical = _RELATIONSHIP_KEY_ALIASES.get(key, key)
        if canonical != key and canonical in rel:
            continue  # explicit canonical key takes precedence over its alias
        normalized[canonical] = value
    return normalized




def clean_model_response(response):
    """
    Clean the model response to ensure it can be parsed as JSON.

    The model often wraps its JSON in commentary, a bare "json" label or a
    Markdown code fence. This function keeps only the span from the first '{'
    to the last '}', flattens newlines and removes trailing commas before
    closing braces/brackets.

    Text inside the JSON object is left untouched; in particular the word
    "json" is no longer stripped from values, because the label preceding the
    object is already removed together with the rest of the leading text.

    Parameters:
    response (str): The raw response from the model.

    Returns:
    str: The cleaned response. It is an empty string when the response contains
    no '{'. The result is not guaranteed to be valid JSON (for example when the
    model output was truncated), so callers must still handle parse errors.
    """
    # Remove any text before the JSON response starts (labels, fences, prose)
    response = re.sub(r'^[^{]*', '', response)

    # Remove anything after the last closing brace (closing fence, commentary);
    # otherwise json.loads fails with "Extra data".
    last_brace = response.rfind('}')
    if last_brace != -1:
        response = response[:last_brace + 1]

    # Fix common JSON issues like trailing commas
    response = response.replace("\n", " ").strip()
    response = re.sub(r",\s*}", "}", response)  # Remove trailing commas before closing braces
    response = re.sub(r",\s*\]", "]", response)  # Remove trailing commas before closing brackets

    return response


# Utility function to check if a given string is likely to be a person's name

# Words and phrases associated with non-person entities (organizations, countries, events, ...).
_NON_PERSON_TERMS = (
    "Company", "Corporation", "Organization", "Country", "Government", "Party",
    "United States", "Soviet Union", "NASA", "Apollo", "Alliance", "Axis", "Force",
    "Empire", "Republic", "State", "Ministry", "Agency", "Group", "Union",
)

# Whole-word, case-insensitive match with an optional plural "s", so that
# "Air Forces" is rejected but a surname such as "Wilberforce" is not.
_NON_PERSON_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(term) for term in _NON_PERSON_TERMS) + r")s?\b",
    re.IGNORECASE,
)


def is_person_name(name):
    """
    Check if a given name is likely to be a person's name.

    This is a basic heuristic check that excludes entities like countries or
    organizations: the name is rejected when it contains one of the known
    non-person terms as a whole word (case-insensitive, optional plural).
    Matching on whole words avoids rejecting real names that merely contain a
    term as a substring (e.g. "Wilberforce" contains "force").

    Parameters:
    name (str): The candidate name.

    Returns:
    bool: False for empty/blank or non-string input and for names containing a
    non-person term; True otherwise.
    """
    if not isinstance(name, str) or not name.strip():
        return False

    # Additional heuristics can be added here if needed
    return _NON_PERSON_PATTERN.search(name) is None


def process_long_text(text, chunk_size=4000, overlap=1000):
    """
    Extract relationships from a long text by processing overlapping chunks.

    The text is cut into windows of `chunk_size` characters that start every
    `chunk_size - overlap` characters. Each window is passed to
    `extract_relationships_with_transformer`, and the results are merged with
    exact duplicates (same people and same relationship text) removed, keeping
    first-seen order.

    The extraction prompt asks the model for "Person1Name"/"Person2Name" keys,
    so both that spelling and "Person1"/"Person2" are accepted here; the output
    always uses "Person1"/"Person2"/"Relationship".

    Parameters:
    text (str): The full text to process.
    chunk_size (int): Window length in characters. The extractor itself only
        reads the first 4000 characters of each chunk.
    overlap (int): Number of characters shared by consecutive windows; must be
        non-negative and smaller than `chunk_size`.

    Returns:
    dict: ``{"Relationships": [...]}`` with the unique, complete relationships.

    Raises:
    ValueError: If `chunk_size` is not positive or `overlap` is outside
    ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), step)]

    all_relationships = []
    for chunk in chunks:
        chunk_relationships, _ = extract_relationships_with_transformer(chunk)  # Make sure to unpack the tuple
        all_relationships.extend(chunk_relationships.get("Relationships", []))

    # Remove duplicates and handle malformed relationships
    unique_relationships = []
    seen = set()
    for rel in all_relationships:
        try:
            person1 = _first_non_empty(rel, ("Person1", "Person1Name"))
            person2 = _first_non_empty(rel, ("Person2", "Person2Name"))
            relationship = rel.get("Relationship", "")

            if person1 and person2 and relationship:  # Check if all elements are non-empty
                rel_tuple = (person1, person2, relationship)
                if rel_tuple not in seen:
                    seen.add(rel_tuple)
                    unique_relationships.append({
                        "Person1": person1,
                        "Person2": person2,
                        "Relationship": relationship
                    })
        except Exception as e:
            # A malformed entry (not a dict, unhashable value, ...) is reported and skipped
            # so that it does not discard the relationships extracted from other chunks.
            print(f"Error processing relationship: {rel}. Error: {e}")

    return {"Relationships": unique_relationships}  # Return the final relationships


def _first_non_empty(rel, keys):
    """Return the first truthy value stored in `rel` under any of `keys`, or "" if none."""
    for key in keys:
        value = rel.get(key, "")
        if value:
            return value
    return ""

if __name__ == "__main__":
    # Demo run: download one article and extract its relationships.
    # `result` is only assigned here (nothing is printed on success), so it
    # remains available as a module-level name for later inspection.
    title = "The Beatles"
    text = fetch_wikipedia_page(title)
    if not text:
        print("Failed to retrieve the Wikipedia page.")
    else:
        result = process_long_text(text)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model is using: cuda
Generated Response:
The extracted relationships should look similar to the following:

json
{
  "Relationships": [
    {
      "Person1Name": "John Lennon",
      "Person2Name": "Paul McCartney",
      "Relationship": "Primary songwriters and bandmates"
    },
    {
      "Person1Name": "John Lennon",
      "Person2Name": "George Harrison",
      "Relationship": "Bandmate"
    },
    {
      "Person1Name": "John Lennon",
      "Person2Name": "Ringo Starr",
      "Relationship": "Bandmate"
    },
    {
      "Person1Name": "Stuart Sutcliffe",
      "Person2Name": "John Lennon",
      "Relationship": "Initial bandmate"
    },
    {
      "Person1Name": "Pete Best",
      "Person2Name": "John Lennon",
      "Relationship": "Initial drummer"
    },
    {
      "Person1Name": "Ringo Starr",
      "Person2Name": "John Lennon",
      "Relationship": "Joined the band"
    },
    {
      "Person1Name": "Brian Epstein",
      "Person2Name": "The Beatles",
      "Relationship

In [43]:
import json
import networkx as nx
import plotly.graph_objects as go
from google.colab import files

# Function to load relationships from JSON data
def load_relationships(json_data):
    """Return the list stored under the "Relationships" key of `json_data`.

    Raises KeyError when the key is missing.
    """
    relationships = json_data["Relationships"]
    return relationships

# Function to remove duplicate relationships from JSON data
def remove_duplicates(relationships):
    """Return the relationships with exact repeats removed, keeping first-seen order.

    Two entries are duplicates when their "Person1", "Person2" and
    "Relationship" values are all equal. The first occurrence (the original
    dict object) is kept. Raises KeyError if an entry lacks one of those keys.
    """
    deduped = []
    fingerprints = set()  # identities of the relationships already emitted
    for entry in relationships:
        fingerprint = (entry["Person1"], entry["Person2"], entry["Relationship"])
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        deduped.append(entry)
    return deduped

# Function to create a NetworkX graph from relationships
def create_graph(relationships):
    """
    Build an undirected graph with one node per person and one edge per pair.

    An undirected `nx.Graph` stores a single edge per pair of nodes, and adding
    the same edge again overwrites its attributes. To avoid losing information
    when two people are linked by several relationships, the descriptions are
    merged into one "relationship" string separated by "; " (repeated
    descriptions are not duplicated).

    Parameters:
    relationships (list): Dicts with "Person1", "Person2" and, optionally,
        "Relationship" (defaults to "Unknown").

    Returns:
    nx.Graph: Graph whose edges carry a string "relationship" attribute.
    """
    G = nx.Graph()
    for rel in relationships:
        person1, person2 = rel["Person1"], rel["Person2"]
        label = rel.get("Relationship", "Unknown")

        existing = G.get_edge_data(person1, person2)
        if existing is not None:
            previous = str(existing.get("relationship", ""))
            if str(label) in previous.split("; "):
                continue  # this description is already recorded on the edge
            if previous:
                label = f"{previous}; {label}"

        G.add_edge(person1, person2, relationship=label)
    return G

# Function to create and display a radial graph using Plotly
def create_radial_graph(G, title='<b>Apollo 11 Relationships - Radial Graph</b>'):
    """
    Build a Plotly figure of the relationship graph using a spring layout.

    Nodes are labelled with the person's name; hovering a node lists its
    connections, and hovering an edge end-point shows that edge's relationship.
    Nodes are coloured by their number of neighbours.

    Parameters:
    G (nx.Graph): Graph whose edges may carry a "relationship" attribute
        ("Unknown" is shown when it is missing).
    title (str): Figure title (HTML allowed). Defaults to the original
        hard-coded title so existing callers are unaffected.

    Returns:
    go.Figure: The assembled figure (not shown or saved here).
    """
    # Use a higher `k` value in spring_layout to increase node spacing
    pos = nx.spring_layout(G, k=1.5, iterations=100)  # Increase `k` for more spacing

    edge_x = []
    edge_y = []
    edge_text = []  # Hover text, one entry per plotted point
    for source, target, attrs in G.edges(data=True):
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        # Each edge contributes three points (start, end, gap). Hover text is
        # looked up per point, so it must be repeated for both end-points and
        # padded for the gap to stay aligned with the coordinates.
        label = f"Relationship: {attrs.get('relationship', 'Unknown')}<br>{source} ↔ {target}"
        edge_text.extend([label, label, ""])

    # Increase line width and use a darker color for better visibility
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=2.5, color='#555'),  # Adjust line width and color
        hoverinfo='text',
        hovertext=edge_text,
        mode='lines'
    )

    node_x = []
    node_y = []
    node_labels = []   # Short label drawn next to each marker
    node_hover = []    # Detailed text shown on hover
    node_degrees = []  # Number of neighbours, used for the marker colour
    for node, (x, y) in pos.items():
        node_x.append(x)
        node_y.append(y)

        # Collect information about connections for each node
        connections = []
        for neighbor in G.neighbors(node):
            relationship = G.edges[node, neighbor].get('relationship', 'Unknown')
            connections.append(f"Connected to {neighbor} for relationship - {relationship}")

        node_labels.append(str(node))
        node_hover.append("<br>".join([f"Name: {node}"] + connections))
        node_degrees.append(len(connections))

    # Increase node size and color
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_labels,          # Only the name is drawn on the canvas
        textposition='top center',
        hoverinfo='text',
        hovertext=node_hover,      # Connection details appear on hover only
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_degrees,  # Colour encodes the number of connections
            size=18,  # Increase node size for better visibility
            colorbar=dict(
                thickness=15,
                # `title=dict(text=..., side=...)` replaces the deprecated `titleside`
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2)
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # `title=dict(text=..., font=...)` replaces the deprecated `titlefont_size`
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=[dict(
                            text="Radial Graph - Entity Relationships",
                            showarrow=False,
                            xref="paper", yref="paper",
                            x=0.005, y=-0.002)],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    return fig

# Function to create and display a scatter plot using Plotly
def create_scatter_plot(relationships, title='<b>Apollo 11 Relationships - Scatter Plot</b>'):
    """
    Build a Plotly figure that lays the entities out along the x-axis.

    Each entity gets an x index in order of first appearance (so the layout is
    the same on every run), and is drawn as a marker on the row y=0. Each
    relationship is drawn as a line from Person1 at y=0 to Person2's x index at
    y=1. Nodes are labelled with their name, list their connections on hover
    and are coloured by the number of relationships they take part in.

    NOTE(review): markers are only drawn on the y=0 row, so the upper end of
    every line has no marker — confirm whether a second row of markers at y=1
    was intended.

    Parameters:
    relationships (list): Dicts with "Person1", "Person2" and, optionally,
        "Relationship" (defaults to "Unknown").
    title (str): Figure title (HTML allowed). Defaults to the original
        hard-coded title so existing callers are unaffected.

    Returns:
    go.Figure: The assembled figure (not shown or saved here).
    """
    entity_to_index = {}  # entity -> x position, in first-seen order
    connections = {}      # entity -> list of "Connected to ..." hover lines

    edge_x = []
    edge_y = []
    edge_text = []  # Hover text, one entry per plotted point

    # Single pass: assign indices, record connections and create edges
    for rel in relationships:
        person1, person2 = rel["Person1"], rel["Person2"]
        label = rel.get('Relationship', 'Unknown')

        for entity in (person1, person2):
            entity_to_index.setdefault(entity, len(entity_to_index))

        connections.setdefault(person1, []).append(
            f"Connected to {person2} for relationship - {label}")
        if person2 != person1:  # a self-relationship is listed once
            connections.setdefault(person2, []).append(
                f"Connected to {person1} for relationship - {label}")

        edge_x.extend([entity_to_index[person1], entity_to_index[person2], None])
        edge_y.extend([0, 1, None])
        # Three points per edge (start, end, gap): repeat the hover text for
        # both end-points and pad the gap so it stays aligned with x/y.
        hover = f"Relationship: {label}<br>{person1} ↔ {person2}"
        edge_text.extend([hover, hover, ""])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1.5, color='#888'),  # Increase line width for better visibility
        hoverinfo='text',
        hovertext=edge_text,  # Set hover text for edges
        mode='lines'
    )

    node_x = list(entity_to_index.values())
    node_y = [0] * len(entity_to_index)
    node_labels = [str(node) for node in entity_to_index]
    node_hover = [
        "<br>".join([f"Name: {node}"] + connections.get(node, []))
        for node in entity_to_index
    ]
    # Degree = number of relationships the entity appears in
    node_degrees = [len(connections.get(node, [])) for node in entity_to_index]

    # Increase node size and color
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_labels,          # Only the name is drawn on the canvas
        textposition='top center',
        hoverinfo='text',
        hovertext=node_hover,      # Connection details appear on hover only
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_degrees,
            size=18,  # Increase node size for better visibility
            colorbar=dict(
                thickness=15,
                # `title=dict(text=..., side=...)` replaces the deprecated `titleside`
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2)
    )

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        # `title=dict(text=..., font=...)` replaces the deprecated `titlefont_size`
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    return fig

# Function to generate and save both graphs
def generate_and_display_graphs(relationships):
    """
    Render the radial and scatter views of the relationships, then save and download them.

    Duplicate relationships are removed first. Both figures are shown, written
    to HTML files in the working directory, and handed to `files.download`.

    Parameters:
    relationships (list): Relationship dicts with "Person1", "Person2" and "Relationship".
    """
    deduped = remove_duplicates(relationships)
    graph = create_graph(deduped)

    # Pair every figure with the file it is exported to; the radial view is
    # built (and later shown, saved and downloaded) before the scatter view.
    outputs = [
        (create_radial_graph(graph), "radial_relationship_graph.html"),
        (create_scatter_plot(deduped), "scatter_relationship_graph.html"),
    ]

    # Display both figures
    for figure, _ in outputs:
        figure.show()

    # Save both graphs as HTML files
    for figure, filename in outputs:
        figure.write_html(filename)
    print("Graphs saved as 'radial_relationship_graph.html' and 'scatter_relationship_graph.html'")

    # Offer the saved files for download
    for _, filename in outputs:
        files.download(filename)

# Example usage
# Hard-coded sample input: a dict with a "Relationships" list whose entries use
# the "Person1" / "Person2" / "Relationship" keys read by the functions above.
# Some pairs (e.g. Martin/Lennon, Beatles/Martin) appear more than once with
# different relationship descriptions.
json_data = {
    "Relationships": [
        {"Person1": "Martin", "Person2": "Lennon", "Relationship": "Collaborated on songwriting and recording, adapting to different approaches"},
        {"Person1": "Martin", "Person2": "McCartney", "Relationship": "Worked more naturally with, influenced by conventionally articulate style"},
        {"Person1": "Martin", "Person2": "Lennon", "Relationship": "Challenged by intuitive approach, led to more original arrangements"},
        {"Person1": "Harrison", "Person2": "Martin", "Relationship": "Recognized stabilizing role in interpreting madness"},
        {"Person1": "Beatles", "Person2": "Martin", "Relationship": "Urge experimentation, innovative use of technology in studio"},
        {"Person1": "Beatles", "Person2": "EMI staff engineers", "Relationship": "Contributed significantly to their records"},
        {"Person1": "Beatles", "Person2": "Martin", "Relationship": "Combined innovative studio techniques, augmented songs with unconventional instruments"},
        {"Person1": "Rolling Stone magazine", "Person2": "Greenfield", "Relationship": "Compared the Beatles to Picasso"},
        {"Person1": "Philip Larkin", "Person2": "Beatles", "Relationship": "Described their work as revolutionary, distinctive in popular music"}
    ]
}


# Load relationships from JSON data and display graphs
# (shows both figures, writes the two HTML files and triggers their download).
relationships = load_relationships(json_data)
generate_and_display_graphs(relationships)


Graphs saved as 'radial_relationship_graph.html' and 'scatter_relationship_graph.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>