In [10]:
import pandas as pd  # Importing the pandas library for data manipulation and analysis
import numpy as np  # Importing the numpy library for numerical operations
import os  # Importing the os library to interact with the operating system
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader  # Importing PDF loaders for processing PDFs
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader  # Importing loaders to handle PDF directories
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Importing a text splitter for breaking down documents
from pathlib import Path  # Importing Path from pathlib for working with file paths
import random  # Importing random library to use for generating random values

## Directory the source documents are read from (relative to the notebook's working directory)
inputdirectory = Path(".") / "input"
## Directory the generated csv files are written to
outputdirectory = Path(".") / "output"

In [None]:
# Load the PDF document.
# "#FileName" is a placeholder: replace it with the name of a PDF inside ./input.
loader = PyPDFLoader("input/#FileName")
documents = loader.load()

# Split the loaded document(s) into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,  # Maximum chunk size, measured with `length_function` (characters here)
    chunk_overlap=150,  # Overlap carried over between consecutive chunks
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)

# The output directory is otherwise only created by a later cell, so on a fresh
# checkout the open() below would fail with FileNotFoundError.
os.makedirs("output", exist_ok=True)

# Save the chunks to a text file, separated by a blank line.
# The encoding is explicit so that text extracted from the PDF does not depend on
# the platform's default codec (which cannot represent every character on some systems).
with open("output/chunks.txt", "w", encoding="utf-8") as file:
    for chunk in pages:
        file.write(chunk.page_content + "\n\n")

print("Number of chunks = ", len(pages))

In [None]:
from helpers.df_helpers import documents2Dataframe  # Importing a helper function to convert documents to a DataFrame
# NOTE(review): documents2Dataframe is a project helper that is not visible here; later cells
# read `chunk_id` and `text` columns from the csv written from this frame, so presumably it
# returns one row per chunk with at least those two columns — confirm against helpers/df_helpers.
df = documents2Dataframe(pages)  # Convert the split chunks into a DataFrame
print(df.shape)  # (rows, columns) of the chunk table
df.head()  # Last expression of the cell, so the notebook renders it as the cell output


In [13]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph  # Importing the df2Graph function from df_helpers to generate graph data from a DataFrame
from helpers.df_helpers import graph2Df  # Importing the graph2Df function from df_helpers to convert graph nodes into a DataFrame


In [None]:
import os  # Importing os to interact with the file system
import pandas as pd  # Importing pandas for data manipulation
import numpy as np  # Importing numpy for numerical operations

# Set to False to reuse the graph.csv written by an earlier run instead of calling the LLM again
regenerate = True

if not regenerate:
    # Reuse the graph extracted by a previous run
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")
else:
    # Ask the LLM for the concepts/relations of every chunk, then flatten them into a table
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)

    # Make sure the output directory is there before writing into it
    outputdirectory.mkdir(parents=True, exist_ok=True)

    # Persist both the extracted graph and the chunks it was extracted from ("|"-separated)
    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)

# Treat empty strings as missing, keep only rows that have both nodes and an edge,
# and give every remaining row a `count` of 4
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)

# Size of the cleaned table, followed by a preview rendered as the cell output
print(dfg1.shape)
dfg1.head()


In [None]:
def contextual_proximity(input_df: pd.DataFrame) -> pd.DataFrame:
    """Derive "contextual proximity" edges between nodes that share a chunk.

    Args:
        input_df: Frame with at least the columns ``chunk_id``, ``node_1`` and
            ``node_2``. ``chunk_id`` values must be strings because they are
            joined with ``","``.

    Returns:
        A new frame with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the joined rows), ``count`` (number of joined rows
        for the pair) and ``edge`` (always ``"contextual proximity"``). Pairs
        are directional, so both ``(a, b)`` and ``(b, a)`` appear. Pairs with a
        count of exactly 1 are dropped. ``input_df`` is not modified.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one once that cell has run.
    """
    # Long format: one row per (chunk_id, node) occurrence, whichever column it came from
    melted_df = pd.melt(
        input_df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self-join on chunk_id: every combination of nodes mentioned in the same chunk
    merged_df = pd.merge(melted_df, melted_df, on="chunk_id", suffixes=("_1", "_2"))

    # Remove self-loops (a node paired with itself)
    filtered_df = merged_df[merged_df["node_1"] != merged_df["node_2"]].reset_index(drop=True)

    # Per node pair: comma-joined chunk ids and the number of joined rows
    grouped_df = (
        filtered_df.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped_df.columns = ["node_1", "node_2", "chunk_id", "count"]

    # Treat empty strings as missing and drop pairs with a missing node
    grouped_df = grouped_df.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Keep only pairs whose count is not 1. The explicit copy makes the result an
    # independent frame, so adding the "edge" column below does not write into a
    # slice of another frame (which pandas reports as SettingWithCopyWarning).
    grouped_df = grouped_df[grouped_df["count"] != 1].copy()
    grouped_df["edge"] = "contextual proximity"

    return grouped_df


# Build the contextual-proximity edges from the LLM-extracted graph
output_df = contextual_proximity(dfg1)

# Print the original dataframe (dfg1) for reference
print(dfg1)

# Display the last few rows of the resulting dataframe.
# A bare expression is only rendered when it is the final statement of a cell; it
# previously sat in the middle of the cell, where its value was silently discarded.
output_df.tail()


In [None]:
import pandas as pd  # Importing pandas for data manipulation
import numpy as np  # Importing numpy for numerical operations

def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive direct and indirect "contextual proximity" edges between nodes.

    Direct edges link two different nodes that occur in the same chunk. Indirect
    edges link two different nodes that are both directly linked to a common
    intermediate node.

    Args:
        df: Frame with at least the columns ``chunk_id``, ``node_1`` and
            ``node_2``. ``chunk_id`` values must be strings because they are
            joined with ``","``.

    Returns:
        A new frame with the columns ``node_1``, ``node_2``, ``edge``,
        ``chunk_id`` and ``count``. ``edge`` is ``"contextual proximity"`` for
        direct edges (directional: both ``(a, b)`` and ``(b, a)`` appear) and
        ``"indirect contextual proximity"`` for indirect ones (node pair stored
        in sorted order). ``df`` is not modified.
    """
    # Long format: one row per (chunk_id, node) occurrence
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self-join on chunk_id links every pair of terms occurring in the same chunk
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))

    # Drop self-loops (node_1 identical to node_2)
    dfg2 = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(drop=True)

    # Per node pair: comma-joined chunk ids and the number of joined rows
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2 = dfg2.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop pairs counted exactly once. The copy makes this an independent frame so
    # the column assignment below does not write into a slice of another frame.
    dfg2 = dfg2[dfg2["count"] != 1].copy()
    dfg2["edge"] = "contextual proximity"

    # Indirect edges: two neighbours of the same intermediate node
    indirect_edges = []
    nodes = dfg2[["node_1", "node_2"]].stack().unique()

    for node in nodes:
        # Every node directly linked to `node`, whichever side of the edge it is on
        connected_nodes = pd.concat([
            dfg2[dfg2["node_1"] == node][["node_2", "chunk_id"]],
            dfg2[dfg2["node_2"] == node][["node_1", "chunk_id"]].rename(columns={"node_1": "node_2"}),
        ])
        # Plain lists avoid a positional DataFrame lookup per pair inside the nested loop
        neighbours = connected_nodes["node_2"].tolist()
        neighbour_chunks = connected_nodes["chunk_id"].tolist()

        for i in range(len(neighbours)):
            for j in range(i + 1, len(neighbours)):
                # The direct table holds both (a, b) and (b, a), so a neighbour can be
                # listed twice; pairing it with itself would create a self-loop, which
                # the direct stage above removes on purpose.
                if neighbours[i] == neighbours[j]:
                    continue
                # Sorted so that (x, y) and (y, x) fall into the same group
                pair = sorted([neighbours[i], neighbours[j]])
                chunk_ids = ",".join([neighbour_chunks[i], neighbour_chunks[j]])
                indirect_edges.append((pair[0], pair[1], node, chunk_ids))

    if indirect_edges:
        indirect_df = pd.DataFrame(indirect_edges, columns=["node_1", "node_2", "via_node", "chunk_id"])
        # Per node pair: all chunk ids joined, and how many (pair, via_node) records were found
        indirect_df = (
            indirect_df.groupby(["node_1", "node_2"])
            .agg({"chunk_id": ",".join, "via_node": "count"})
            .reset_index()
        )
        indirect_df.columns = ["node_1", "node_2", "chunk_id", "count"]
        indirect_df["edge"] = "indirect contextual proximity"
        final_df = pd.concat([dfg2, indirect_df], ignore_index=True)
    else:
        # Nothing to merge in; skip grouping an empty, untyped frame
        final_df = dfg2

    # One row per (node pair, edge type)
    final_df = (
        final_df.groupby(["node_1", "node_2", "edge"])
        .agg({"chunk_id": ",".join, "count": "sum"})
        .reset_index()
    )

    return final_df

# Build the proximity edges (direct and indirect) from the LLM-extracted graph `dfg1`.
# `contextual_proximity` here is the definition from this cell.
dfg2 = contextual_proximity(dfg1)

# Last expression of the cell: the notebook renders the final rows of the result
dfg2.tail()


In [None]:
# Plain-text dump of the proximity edges (direct and indirect) held in dfg2
print(dfg2)


In [18]:
# Stack the LLM-extracted edges (dfg1) on top of the proximity edges (dfg2).
# Row labels are kept as they are, so the combined frame can contain repeated index values.
dfg = pd.concat([dfg1, dfg2], axis=0)


In [None]:
# Step 1: Collapse rows that describe the same (node_1, node_2) pair into one row:
# chunk ids and edge labels are joined with commas, counts are added up.
# `as_index=False` keeps the grouping keys as ordinary columns.
updated_dfg = dfg.groupby(["node_1", "node_2"], as_index=False).agg(
    {"chunk_id": ",".join, "edge": ",".join, "count": "sum"}
)

# Step 2: Show the merged table (optional sanity check)
print(updated_dfg)

# Step 3: Write the merged table to output/updated_dfg_grouped.csv (no index column)
output_directory = "output"
output_file = os.path.join(output_directory, "updated_dfg_grouped.csv")
updated_dfg.to_csv(output_file, index=False)

# Confirm where the file went
print(f"DataFrame saved to {output_file}")


In [None]:
def remove_duplicate_chunk_ids_and_save(updated_dfg: pd.DataFrame, output_file: str):
    """Deduplicate each row's comma-separated chunk ids, then write the frame to csv.

    The ``chunk_id`` column of ``updated_dfg`` is rewritten in place: every value
    is split on ``","``, duplicates are removed, and the remaining ids are sorted
    and joined with ``","`` again. The frame is then saved to ``output_file``
    without the index and a confirmation line is printed. Returns ``None``.
    """
    def _unique_sorted_ids(raw_ids):
        # "c2,c1,c2" -> "c1,c2"
        distinct_ids = set(raw_ids.split(','))
        return ','.join(sorted(distinct_ids))

    updated_dfg['chunk_id'] = updated_dfg['chunk_id'].apply(_unique_sorted_ids)

    updated_dfg.to_csv(output_file, index=False)
    print(f"DataFrame with unique chunk_ids saved to {output_file}")

# Same target as the grouping cell above: the csv written there is overwritten
# with the deduplicated version.
output_directory = "output"  # Directory the csv lives in
output_file = os.path.join(output_directory, "updated_dfg_grouped.csv")  # Full path of the csv

# Deduplicates `updated_dfg['chunk_id']` in place and rewrites the file
remove_duplicate_chunk_ids_and_save(updated_dfg, output_file)


In [None]:
import pandas as pd  # Importing pandas for data manipulation
import os  # Importing os to interact with the file system

# chunks.csv is written by the graph-generation cell with "|" as the field separator
chunks_file = os.path.join("output", "chunks.csv")

# Read it back with the same separator
chunks_df = pd.read_csv(chunks_file, sep='|')

# Quick look at the first rows to confirm the file parsed as expected
print(chunks_df.head())


In [None]:
import pandas as pd  # Importing pandas for data manipulation

# Function to replace chunk_ids in updated_dfg with the corresponding content from chunks_df
def replace_chunk_ids_with_content(updated_dfg: pd.DataFrame, chunks_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``updated_dfg`` whose chunk ids are replaced by chunk text.

    Args:
        updated_dfg: Frame with a ``chunk_id`` column holding comma-separated
            chunk ids (strings).
        chunks_df: Frame with ``chunk_id`` and ``text`` columns.

    Returns:
        A copy of ``updated_dfg`` in which ``chunk_id`` is renamed to
        ``text_from_chunk_id`` and holds the texts of the row's distinct chunk
        ids joined by a single space, in order of first appearance. Ids missing
        from ``chunks_df`` become ``"[Unknown chunk_id: <id>]"``.
        ``updated_dfg`` itself is left unchanged.
    """
    # Keys are normalised to str: the ids in `updated_dfg` are always strings, while
    # ids read back from a csv may have been parsed as numbers and would never match.
    chunk_id_to_content = dict(zip(chunks_df['chunk_id'].astype(str), chunks_df['text']))

    def replace_chunk_id_with_text(chunk_ids):
        # Strip before deduplicating so "a" and " a" count as the same id, and use a
        # dict instead of a set so the order (and hence the text) is the same on every run.
        unique_chunk_ids = dict.fromkeys(chunk_id.strip() for chunk_id in chunk_ids.split(','))
        replaced_content = [
            chunk_id_to_content.get(chunk_id, f"[Unknown chunk_id: {chunk_id}]")
            for chunk_id in unique_chunk_ids
        ]
        return ' '.join(replaced_content)

    # Work on a copy so the caller's frame keeps its chunk ids
    contentreplacedforchunk_dfg = updated_dfg.copy()
    contentreplacedforchunk_dfg['chunk_id'] = contentreplacedforchunk_dfg['chunk_id'].apply(replace_chunk_id_with_text)

    # The column now holds text rather than ids, so rename it accordingly
    return contentreplacedforchunk_dfg.rename(columns={'chunk_id': 'text_from_chunk_id'})

# Example usage:

# Replace chunk_ids with content and get the new DataFrame
contentreplacedforchunk_dfg = replace_chunk_ids_with_content(updated_dfg, chunks_df)

# Save the new DataFrame to a CSV file.
# It gets its own file name on purpose: "contentreplacedforchunk_dfg.csv" is the file
# `check_and_process_file` treats as the cache of LLM-labelled edges. Writing the
# unlabelled frame to that same path made the cache look present on every run, and
# answering "no" at the prompt then loaded edges that were never relabelled.
output_file = os.path.join("output", "contentreplacedforchunk_dfg_raw.csv")
contentreplacedforchunk_dfg.to_csv(output_file, index=False)  # Save without row indices

# Print a message to confirm that the new DataFrame has been saved
print(f"DataFrame with chunk content saved to {output_file}")


In [None]:
import pandas as pd  # Importing pandas for data manipulation
import os  # Importing os to interact with the file system
from ollama.client import generate  # Importing the generate function from the Ollama client to interact with the LLM

# Function to update edge labels using an LLM
def update_edge_labels_with_llm(contentreplacedforchunk_dfg: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Relabel every edge with a label produced by the LLM, save the frame, return it.

    For each row the model named ``model_name`` is prompted with ``node_1``,
    ``node_2`` and the row's ``text_from_chunk_id``; the stripped answer is
    written to the row's ``edge`` cell. An answer that is empty, "unknown" or
    "not found" (case-insensitive) is replaced by "contextual proximity".

    The passed frame is modified in place, written to
    ``output/contentreplacedforchunk_dfg.csv`` without the index, and returned.
    """
    # Answers (lower-cased) that carry no information
    unhelpful_answers = ["", "unknown", "not found"]

    def get_refined_edge(node_1, node_2, text_from_chunk_id):
        """Ask the LLM for a relationship label for one node pair."""
        prompt = (
            'Based on the following context, please provide a short and accurate label '
            f'for the relationship between "{node_1}" and "{node_2}". '
            'The label should be concise and meaningful. Do not use any outside knowledge, only use the context provided.\n\n'
            f'Context: "{text_from_chunk_id}"'
        )

        # generate() returns a pair; only the first element (the response text) is used
        response, _ = generate(model_name=model_name, prompt=prompt)
        label = response.strip()

        if label.lower() in unhelpful_answers:
            return "contextual proximity"
        return label

    frame = contentreplacedforchunk_dfg
    per_row_inputs = zip(frame.index, frame['node_1'], frame['node_2'], frame['text_from_chunk_id'])
    for index, node_1, node_2, text_from_chunk_id in per_row_inputs:
        # One LLM call per row; the label is written back as soon as it is available
        frame.at[index, 'edge'] = get_refined_edge(node_1, node_2, text_from_chunk_id)

    output_file = os.path.join("output", "contentreplacedforchunk_dfg.csv")
    frame.to_csv(output_file, index=False)

    print(f"Updated DataFrame with refined edge labels saved to {output_file}")

    return frame

# Function to check if the file already exists and process it accordingly
def check_and_process_file(contentreplacedforchunk_dfg: pd.DataFrame, model_name: str):
    """Reuse the saved labelled edges if the user wants to, otherwise relabel with the LLM.

    While ``output/contentreplacedforchunk_dfg.csv`` exists the user is asked
    whether to regenerate it:

    * "no"  -> the csv is read and returned as a DataFrame;
    * "yes" -> the edges are relabelled with ``update_edge_labels_with_llm``;
    * anything else -> the question is asked again.

    When the file does not exist the edges are relabelled without asking.
    """
    output_file = os.path.join("output", "contentreplacedforchunk_dfg.csv")

    # Keep asking until a valid answer is given (or the file is no longer there)
    while os.path.exists(output_file):
        user_input = input(f"The file {output_file} already exists. Do you want to regenerate it? (yes/no): ").strip().lower()

        if user_input == 'no':
            print(f"Using the existing file: {output_file}")
            return pd.read_csv(output_file)

        if user_input == 'yes':
            print("Regenerating the file...")
            break

        print("Invalid input. Please enter 'yes' or 'no'.")

    # Either there is no saved file or the user asked for a fresh one
    return update_edge_labels_with_llm(contentreplacedforchunk_dfg, model_name)

# Example usage:

# `contentreplacedforchunk_dfg` must already exist (it is built by the cell that
# replaces chunk ids with chunk text).

# Either reloads the saved csv or relabels the edges with the given model.
# NOTE: this can block on an interactive yes/no prompt when the csv already exists.
contentreplacedforchunk_dfg = check_and_process_file(contentreplacedforchunk_dfg, model_name="mistral-openorca:latest")


In [None]:
# Collect every distinct node that appears on either side of an edge.
# Both columns are taken from `contentreplacedforchunk_dfg`, the frame the graph edges
# are built from; `node_2` was previously read from `dfg`, a different frame, so the
# node list could disagree with the edge list (e.g. when the edges were reloaded from csv).
nodes = pd.concat(
    [contentreplacedforchunk_dfg['node_1'], contentreplacedforchunk_dfg['node_2']], axis=0
).unique()

# Number of unique nodes, rendered as the cell output
nodes.shape


In [25]:
import networkx as nx  # Importing networkx for graph creation and manipulation

# Start from an empty undirected graph
G = nx.Graph()

## Register every node under its string form so node keys have a single type
G.add_nodes_from(str(node) for node in nodes)

## One edge per row: the label goes into `title`, the weight is count / 4
for node_1, node_2, edge, count in zip(
    contentreplacedforchunk_dfg["node_1"],
    contentreplacedforchunk_dfg["node_2"],
    contentreplacedforchunk_dfg["edge"],
    contentreplacedforchunk_dfg["count"],
):
    G.add_edge(str(node_1), str(node_2), title=edge, weight=count / 4)


In [None]:
# girvan_newman yields one partition of the graph per step
communities_generator = nx.community.girvan_newman(G)

# First partition yielded by the generator
top_level_communities = next(communities_generator)

# Second partition yielded by the generator; this is the one used below
next_level_communities = next(communities_generator)

# Sort the members of each community, then sort the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)

# Report how many communities were found, followed by their members
print("Number of Communities = ", len(communities))
print(communities)


In [None]:
import seaborn as sns  # Importing seaborn to generate color palettes
palette = "hls"  # Name of the seaborn palette the community colours are drawn from

## Build a node -> colour/group table for a list of communities
def colors2Community(communities) -> pd.DataFrame:
    """Give every community a colour and a 1-based group number.

    One hex colour per community is taken from the seaborn palette named by the
    module-level ``palette``; the colours are shuffled with ``random.shuffle``,
    so the assignment differs between runs unless ``random`` is seeded.

    Returns a DataFrame with one row per node and the columns ``node``,
    ``color`` and ``group``.
    """
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)

    # Colours are handed out from the end of the shuffled list, one per community
    rows = [
        {"node": node, "color": color, "group": group}
        for group, (community, color) in enumerate(zip(communities, reversed(p)), start=1)
        for node in community
    ]

    return pd.DataFrame(rows)

# Build the node -> colour/group table for the detected communities
colors = colors2Community(communities)

# Last expression of the cell: rendered as the cell output
colors


In [28]:
# Copy group and colour from the `colors` table onto the graph nodes, and set each
# node's `size` to its degree (number of connections)
for node, group, color in zip(colors['node'], colors['group'], colors['color']):
    G.nodes[node].update(group=group, color=color, size=G.degree[node])


In [29]:
from pyvis.network import Network  # Importing the Network class from the pyvis library for network visualization
import os  # Importing os for file system interactions

# Where the HTML visualisation is written.
# Note: this rebinds `output_directory`, which earlier cells set to "output".
output_directory = "./docs"  # Directory that receives the HTML file
graph_output_directory = os.path.join(output_directory, "index.html")  # Despite the name, this is the path of the HTML *file*

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)  # No error when the directory is already there

# Configure the pyvis network used for rendering
net = Network(
    notebook=False,  # Not rendering inline in the notebook
    cdn_resources="remote",  # Load the JS/CSS resources from a remote CDN
    height="900px",  # Canvas height
    width="100%",  # Canvas width
    select_menu=True,  # Enable the node selection menu
    filter_menu=False,  # Disable the filter menu
)

# Copy nodes, edges and their attributes from the NetworkX graph G
net.from_nx(G)

# Force Atlas 2 based layout with the given gravity settings
net.force_atlas_2based(central_gravity=0.015, gravity=-31)

# Show the physics controls in the generated page
net.show_buttons(filter_=["physics"])

# Write the HTML file
# NOTE(review): depending on the installed pyvis release, show() may also need
# `notebook=False` and may try to open a browser — confirm against the pinned version.
net.show(graph_output_directory)


In [None]:
from sentence_transformers import SentenceTransformer  # Importing SentenceTransformer for generating embeddings
from scipy.spatial.distance import cosine  # Importing cosine similarity calculation
import pandas as pd  # Importing pandas for data manipulation
from ollama.client import generate  # Importing Ollama client for LLM generation

# Sentence-embedding model shared by the helpers in this cell
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(text_list):
    """Encode ``text_list`` with the module-level SentenceTransformer ``model`` and return the result."""
    embeddings = model.encode(text_list)
    return embeddings

def add_embeddings_to_contentreplacedforchunk_dfg(contentreplacedforchunk_dfg: pd.DataFrame) -> pd.DataFrame:
    """Add ``combined_text`` and ``embedding`` columns to the frame and return it.

    ``combined_text`` is "<node_1> <edge> <node_2>" for each row and ``embedding``
    is the embedding of that text. The passed frame is modified in place; the
    returned object is that same frame.
    """
    def _triple_as_text(row):
        # One sentence-like string per edge
        return f"{row['node_1']} {row['edge']} {row['node_2']}"

    frame = contentreplacedforchunk_dfg
    frame['combined_text'] = frame.apply(_triple_as_text, axis=1)

    # Embed all texts in one call, then store one embedding per row
    frame['embedding'] = list(generate_embeddings(frame['combined_text'].tolist()))

    return frame

def generate_final_answer_with_llm(relationships, nodes_entities, context):
    """Ask the "mistral-openorca:latest" model for a summary answer and return it stripped.

    The three arguments are interpolated into the prompt as the relationships,
    the entities involved and the context sections respectively.
    """
    # Prompt sections, separated by a blank line
    sections = [
        "Given the following relationships and context, provide a summary answer to the query:",
        f"Relationships:\n{relationships}",
        f"Entities Involved:\n{nodes_entities}",
        f"Context:\n{context}",
        "Answer:",
    ]
    prompt = "\n\n".join(sections)

    # generate() returns a pair; only the first element (the response text) is used
    full_response, _ = generate("mistral-openorca:latest", prompt)
    return full_response.strip()

def answer_query_with_all_relationships(query: str, contentreplacedforchunk_dfg: pd.DataFrame, df: pd.DataFrame, similarity_threshold=0.3) -> str:
    """Answer ``query`` from the graph edges most similar to it.

    Args:
        query: The user's question.
        contentreplacedforchunk_dfg: Edge table with ``node_1``, ``node_2``,
            ``edge``, ``text_from_chunk_id`` and ``embedding`` columns. A
            ``similarity`` column (cosine similarity to the query) is added to
            it in place.
        df: Chunk table. No longer read by this function; kept so existing
            calls keep working.
        similarity_threshold: Minimum cosine similarity for an edge to be used.

    Returns:
        A text block with the LLM's answer followed by the relationships,
        entities and context that were given to it.
    """
    # Step 1: Embed the query
    query_embedding = generate_embeddings([query])[0]

    # Step 2: Cosine similarity between the query and every edge embedding
    # (scipy's `cosine` is a distance, hence the `1 -`)
    contentreplacedforchunk_dfg['similarity'] = contentreplacedforchunk_dfg['embedding'].apply(
        lambda emb: 1 - cosine(query_embedding, emb)
    )

    # Step 3: Keep the edges at or above the threshold
    relevant_rows = contentreplacedforchunk_dfg[contentreplacedforchunk_dfg['similarity'] >= similarity_threshold]

    # Step 4: Gather relationships, context and entities.
    # Dicts are used as insertion-ordered sets so the prompt is identical on every run.
    relationships = []
    context_seen = {}
    nodes_seen = {}

    for _, row in relevant_rows.iterrows():
        relationships.append(f"{row['node_1']} - {row['edge']} - {row['node_2']}")

        # `text_from_chunk_id` already holds the chunk *text* (the ids were replaced when
        # this table was built). Splitting it on commas and looking the pieces up as chunk
        # ids, as was done before, could not match anything and left the context empty.
        chunk_text = row['text_from_chunk_id']
        if isinstance(chunk_text, str) and chunk_text:
            context_seen[chunk_text] = None  # Edges often share chunks; include each text once

        nodes_seen[str(row['node_1'])] = None
        nodes_seen[str(row['node_2'])] = None

    relationships_text = "\n".join(relationships)
    context = " ".join(context_seen)
    nodes_entities_text = ", ".join(nodes_seen)

    # Step 5: Let the LLM write the answer
    final_answer = generate_final_answer_with_llm(relationships_text, nodes_entities_text, context)

    # Final output: the answer plus the evidence it was based on
    answer = (
        f"Final Answer: {final_answer}\n\n"
        f"All Relationships:\n{relationships_text}\n\n"
        f"Entities Involved:\n{nodes_entities_text}\n\n"
        f"Context:\n{context}"
    )

    return answer

def get_context_from_chunks(text_from_chunk_ids, df):
    """Join the text of every chunk in `df` whose id is listed in `text_from_chunk_ids`.

    Args:
        text_from_chunk_ids: Collection of chunk ids to look up.
        df: DataFrame with a 'chunk_id' column and a 'text' column.

    Returns:
        The matching 'text' values, in `df` row order, joined by single
        spaces; an empty string when no id matches.
    """
    id_is_wanted = df['chunk_id'].isin(text_from_chunk_ids)
    matched_texts = df.loc[id_is_wanted, 'text'].tolist()
    if not matched_texts:
        return ""
    return " ".join(matched_texts)

# Prerequisite: `contentreplacedforchunk_dfg` (the graph data) and `df` (the text chunks) are not
# created in this cell; both must already exist in the kernel before it runs.
contentreplacedforchunk_dfg = add_embeddings_to_contentreplacedforchunk_dfg(contentreplacedforchunk_dfg)  # Rebinds the name to the helper's result; presumably this adds the 'embedding' column read by the similarity step — confirm against the helper

# To answer a query:
query = "#ENTER YOUR QUERY HERE"  # Placeholder text, not a real question: replace it before running
response = answer_query_with_all_relationships(query, contentreplacedforchunk_dfg, df)

# Print the response (a single string with "Final Answer", "All Relationships",
# "Entities Involved" and "Context" sections)
print(response)


In [None]:
from sentence_transformers import SentenceTransformer  # Importing SentenceTransformer for generating text embeddings
from scipy.spatial.distance import cosine  # Importing cosine similarity calculation
import pandas as pd  # Importing pandas for data manipulation
from ollama.client import generate  # Importing Ollama client for interacting with the LLM

# Load the pre-trained sentence-embedding model once, at module level.
# `generate_embeddings` reads this `model` global, so every embedding call reuses the same instance.
model = SentenceTransformer('all-MiniLM-L6-v2')  # 'all-MiniLM-L6-v2' is the model name handed to SentenceTransformer

def generate_embeddings(text_list):
    """Encode a list of texts with the module-level `model`.

    Args:
        text_list: The texts to embed.

    Returns:
        Whatever `model.encode` returns for `text_list` (one embedding per
        input text).
    """
    embeddings = model.encode(text_list)
    return embeddings

def add_embeddings_to_dfg(dfg: pd.DataFrame) -> pd.DataFrame:
    """Add 'combined_text' and 'embedding' columns to the graph DataFrame.

    Each row's 'node_1', 'edge' and 'node_2' values are joined with single
    spaces into 'combined_text', which is then embedded via
    `generate_embeddings`.

    Args:
        dfg: DataFrame with 'node_1', 'edge' and 'node_2' columns. It is
            modified in place (two columns are added or overwritten).

    Returns:
        The same `dfg` object, for convenient reassignment.
    """
    # Build the texts column-wise instead of with a row-wise `apply`: on an
    # empty frame `DataFrame.apply(axis=1)` returns a DataFrame, which cannot
    # be assigned to a single column, and per-row Series construction is slow.
    combined_texts = [
        f"{node_1} {edge} {node_2}"
        for node_1, edge, node_2 in zip(dfg['node_1'], dfg['edge'], dfg['node_2'])
    ]
    dfg['combined_text'] = combined_texts

    # Skip the encoder entirely when there is nothing to encode.
    # `list(...)` splits the encoder output into one embedding per row.
    dfg['embedding'] = list(generate_embeddings(combined_texts)) if combined_texts else []

    return dfg

def generate_final_answer_with_llm(relationships, nodes_entities, context, query=""):
    """Ask the LLM for a summary answer based on retrieved graph data.

    Args:
        relationships: Text listing the relevant relationships.
        nodes_entities: Text listing the entities involved.
        context: Source text backing the relationships.
        query: The user's question. Optional for backward compatibility; when
            empty, the prompt contains only the three sections above.

    Returns:
        The LLM's response text with surrounding whitespace stripped.
    """
    # The instruction line tells the model to answer "the query", so the query
    # itself must be part of the prompt whenever the caller supplies it.
    query_section = f"Query:\n{query}\n\n" if query else ""
    prompt = (
        "Given the following relationships and context, provide a summary answer to the query:\n\n"
        f"{query_section}"
        f"Relationships:\n{relationships}\n\n"
        f"Entities Involved:\n{nodes_entities}\n\n"
        f"Context:\n{context}\n\n"
        "Answer:"
    )

    # `generate` returns a pair; only its first element (the response text) is used.
    full_response, _ = generate("mistral-openorca:latest", prompt)
    return full_response.strip()

def answer_query_with_all_relationships(query: str, dfg: pd.DataFrame, df: pd.DataFrame, similarity_threshold=0.5) -> str:
    """Answer a user query from the graph rows most similar to it.

    Steps: embed the query, score every row of `dfg` by cosine similarity
    against its 'embedding', keep rows at or above `similarity_threshold`,
    collect their relationships, entities and chunk texts, and ask the LLM for
    a final answer.

    Args:
        query: The user's question.
        dfg: Graph DataFrame with 'node_1', 'edge', 'node_2', 'chunk_id'
            (comma-separated ids) and 'embedding' columns. A 'similarity'
            column is written onto it as a side effect.
        df: Chunk DataFrame passed to `get_context_from_chunks`.
        similarity_threshold: Minimum similarity for a row to be used.

    Returns:
        A string with "Final Answer", "All Relationships",
        "Entities Involved" and "Context" sections.
    """
    # Step 1: embed the query (the encoder takes a list, so wrap and unwrap).
    query_embedding = generate_embeddings([query])[0]

    # Step 2: scipy's `cosine` is a distance, so similarity is 1 - distance.
    dfg['similarity'] = dfg['embedding'].apply(lambda emb: 1 - cosine(query_embedding, emb))

    # Step 3: keep only rows that are similar enough to the query.
    relevant_rows = dfg[dfg['similarity'] >= similarity_threshold]

    # Step 4: gather relationships, entities and supporting context.
    relationships = []
    context_list = []
    nodes_entities = {}  # dict used as an insertion-ordered set, so output order is deterministic
    seen_chunk_ids = set()

    for _, row in relevant_rows.iterrows():
        relationships.append(f"{row['node_1']} - {row['edge']} - {row['node_2']}")
        nodes_entities[row['node_1']] = None
        nodes_entities[row['node_2']] = None

        # Several relationships usually cite the same chunk; fetch each chunk's
        # text only once so the prompt is not padded with repeated passages.
        new_chunk_ids = [
            chunk_id
            for chunk_id in dict.fromkeys(row['chunk_id'].split(','))
            if chunk_id not in seen_chunk_ids
        ]
        if not new_chunk_ids:
            continue
        seen_chunk_ids.update(new_chunk_ids)
        chunk_text = get_context_from_chunks(new_chunk_ids, df)
        if chunk_text:
            context_list.append(chunk_text)

    relationships_text = "\n".join(relationships)
    context = " ".join(context_list)
    nodes_entities_text = ", ".join(nodes_entities)

    # Step 5: let the LLM write the final answer, now with the question included.
    final_answer = generate_final_answer_with_llm(relationships_text, nodes_entities_text, context, query)

    # Combine everything for the final output
    answer = (
        f"Final Answer: {final_answer}\n\n"
        f"All Relationships:\n{relationships_text}\n\n"
        f"Entities Involved:\n{nodes_entities_text}\n\n"
        f"Context:\n{context}"
    )

    return answer

def get_context_from_chunks(chunk_ids, df):
    """Collect the text of the chunks in `df` whose 'chunk_id' is in `chunk_ids`.

    Args:
        chunk_ids: Collection of chunk ids to look up.
        df: DataFrame with 'chunk_id' and 'text' columns.

    Returns:
        The matching texts, in `df` row order, joined by single spaces, or an
        empty string when nothing matches.
    """
    selected_rows = df.loc[df['chunk_id'].isin(chunk_ids)]
    texts = selected_rows['text'].tolist()
    if texts:
        return " ".join(texts)
    return ""

def interactive_pdf_query(dfg: pd.DataFrame, df: pd.DataFrame):
    """Run an interactive question/answer loop over the document.

    Reads questions with `input()`, answers each one through
    `answer_query_with_all_relationships`, and prints the response. The loop
    ends when the user types 'exit' or 'quit' (case-insensitive), or when
    input ends or is interrupted.

    Args:
        dfg: Graph DataFrame forwarded to `answer_query_with_all_relationships`.
        df: Chunk DataFrame forwarded to `answer_query_with_all_relationships`.
    """
    print("You can now interact with the PDF. Ask questions about the document.")
    print("Type 'exit' or 'quit' to end the interaction.")

    while True:
        try:
            query = input("Ask your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session
            # cleanly instead of surfacing a traceback.
            print("Ending the interaction. Goodbye!")
            break

        if query.lower() in ('exit', 'quit'):
            print("Ending the interaction. Goodbye!")
            break

        # A blank line is not a question: re-prompt rather than spending an
        # embedding and LLM call on an empty string.
        if not query:
            continue

        response = answer_query_with_all_relationships(query, dfg, df)

        # Surround the response with blank lines so turns are visually separated.
        print("\n" + response + "\n")

# Prerequisite: `dfg` (the graph data) and `df` (the text chunks) are not created in this cell;
# both must already exist in the kernel before it runs.
dfg = add_embeddings_to_dfg(dfg)  # Adds the 'combined_text' and 'embedding' columns and rebinds `dfg` to the result

# Start the interactive query session. This call blocks on input() until the user types 'exit' or 'quit'.
interactive_pdf_query(dfg, df)
