## Setup

In [3]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the corpus sub-folder; the same name is reused for input and output
data_dir = "cureus"
out_dir = data_dir
## Input data directory (the PDFs are loaded from here)
inputdirectory = Path("./data_input") / data_dir
## This is where the output csv files will be written
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [4]:
## Dir PDF Loader
## Loads the PDFs found under `inputdirectory` into a list of documents.
loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
## Alternative loaders kept for reference (single file / generic directory):
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
# loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks of at most 500 units with an overlap of 50
## between neighbouring chunks; sizes are measured with `len`.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

## `pages` holds the chunks (not PDF pages); it feeds the dataframe step below.
pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Sanity check: show the text of one chunk.
## NOTE(review): index 3 is hard-coded, so this raises IndexError when the
## corpus produces fewer than four chunks.
print(pages[3].page_content)


Ignoring wrong pointing object 56 0 (offset 0)
Ignoring wrong pointing object 59 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)


Number of chunks =  770
risk factors for CVD and the current state of prevention, guidelines, and management of CVD in women.TRADITIONAL RISK FACTORS
Non-Modifiable
Age.  Age is one of the most powerful risk factors for developing 
CVD. The cardioprotective effect of estrogen for premenopausal 
women results in a roughly 8- to 10-year lag in the onset of CAD in women. After the age of 55, the risk for CAD increases similarly in both men and women.
7
Modifiable
Smoking.  According to the American Heart Association


## Create a dataframe of all the chunks

In [5]:
## Project helper that turns the list of chunks into a dataframe.
from helpers.df_helpers import documents2Dataframe
## One row per chunk; the displayed frame has the columns
## text, source, page and chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(770, 4)


Unnamed: 0,text,source,page,chunk_id
0,REVIEW METHODIST DEBAKEY CARDIOVASC J | 13 (4)...,data_input/cureus/i1947-6094-13-4-185.pdf,0,3dc851440134402d87573a15088a4351
1,between men and women are summarized in Table ...,data_input/cureus/i1947-6094-13-4-185.pdf,0,51dc59c88fb141e1a0ad643fd6109a21
2,"therefore, should begin with awareness of the ...",data_input/cureus/i1947-6094-13-4-185.pdf,0,5e5c6f31324f4f8ea805b0964a33cf41
3,risk factors for CVD and the current state of ...,data_input/cureus/i1947-6094-13-4-185.pdf,0,08615e84dd1245e2b5c0037f8325f1c0
4,"(AHA), 13% of women in the United States who a...",data_input/cureus/i1947-6094-13-4-185.pdf,0,bee9f43fb9a2448195d98d5d70438e54


## Extract Concepts

In [6]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

In [7]:
# Function to process data in batches
def process_batches(dataframe, batch_size=2):
    """Yield consecutive row slices of ``dataframe``, ``batch_size`` rows each.

    The slices are taken positionally with ``iloc`` in the original row order;
    the last slice is shorter when the row count is not a multiple of
    ``batch_size``. An empty dataframe yields nothing.

    Args:
        dataframe: The dataframe to split.
        batch_size: Maximum number of rows per slice; must be at least 1.

    Yields:
        One ``dataframe.iloc[i:i + batch_size]`` slice per batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # range() rejects a step of 0 with an unrelated message and silently
    # produces nothing for a negative step, so validate explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for i in range(0, len(dataframe), batch_size):
        yield dataframe.iloc[i:i+batch_size]

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to disk in CSV format so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from the output directory.

In [None]:
## To regenerate the graph with LLM, set this to True
regenerate = True

graph_csv = os.path.join(outputdirectory, "graph.csv")
chunks_csv = os.path.join(outputdirectory, "chunks.csv")
## Column layout of graph.csv. Batches after the first are appended without a
## header, so every batch must be written with exactly these columns in this
## order; a batch that produced no concepts then becomes an empty frame
## instead of failing on missing columns.
graph_columns = ["node_1", "node_2", "edge", "chunk_id", "count"]

if regenerate:
    ## to_csv does not create missing folders.
    os.makedirs(outputdirectory, exist_ok=True)
    ## Each batch is written to disk as soon as it has been processed: the
    ## first batch overwrites any previous files, later batches append.
    for chunk_idx, chunk in enumerate(process_batches(df)):
        concepts_list = df2Graph(chunk, model='mistral-openorca:latest')
        batch_edges = (
            pd.DataFrame(concepts_list)
            .reindex(columns=graph_columns)
            .replace("", np.nan)
            ## Relations with a blank node or a blank edge label are discarded.
            .dropna(subset=["node_1", "node_2", "edge"])
            ## Increasing the weight of the relation to 4.
            ## We will assign the weight of 1 when later the contextual proximity will be calculated.
            .assign(count=4)
        )

        is_first_batch = chunk_idx == 0
        write_mode = "w" if is_first_batch else "a"
        batch_edges.to_csv(graph_csv, sep="|", mode=write_mode, index=False, header=is_first_batch)
        chunk.to_csv(chunks_csv, sep="|", mode=write_mode, index=False, header=is_first_batch)

## Whether the graph was just regenerated or not, the edge list is loaded from
## the csv file so both paths produce the same dataframe.
dfg1 = (
    pd.read_csv(graph_csv, sep="|")
    .replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
print(dfg1.head())

 [
   {
       "node_1": "Cardiovascular disease (CVD)",
       "node_2": "leading cause of mortality",
       "edge": "in women in the United States"
   }, {
       "node_1": "Overall management of CVD",
       "node_2": "similar for both genders",
       "edge": ""
   }, {
       "node_1": "Gender-based variations",
       "node_2": "exist in pathophysiology",
       "edge": ""
   }, {
       "node_1": "Gender-based variations",
       "node_2": "exist in symptoms",
       "edge": ""
   }, {
       "node_1": "Gender-based variations",
       "node_2": "exist in presentation",
       "edge": ""
   }, {
       "node_1": "Gender-based variations",
       "node_2": "exist in efficacy of diagnostic tests",
       "edge": ""
   }, {
       "node_1": "Gender-based variations",
       "node_2": "exist in response to pharmacological interventions",
       "edge": ""
   }
] [
   {
       "node_1": "women",
       "node_2": "men",
       "edge": "comparison between men and women"
   }, {
      

In [None]:
## Reload the edge list from disk, discard relations with a blank node or edge
## label, and give every LLM-extracted relation a weight of 4.
dfg1 = (
    pd.read_csv(os.path.join(outputdirectory, "graph.csv"), sep="|")
    .replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
print(dfg1.head())

## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that occur in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``, ``node_1``
            and ``node_2``.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the chunks in which the pair was seen), ``count``
        (number of co-occurrences) and ``edge`` (always
        ``"contextual proximity"``). Pairs seen only once are left out, and
        each remaining pair appears in both directions.
    """
    ## One row per (chunk, node) occurrence, whichever side of the relation it was on
    node_per_chunk = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    paired = node_per_chunk.merge(node_per_chunk, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    is_self_loop = paired["node_1"] == paired["node_2"]
    paired = paired[~is_self_loop].reset_index(drop=True)
    ## Group and count edges.
    edge_counts = (
        paired.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edge_counts.columns = ["node_1", "node_2", "chunk_id", "count"]
    ## Pairs involving a blank node are discarded
    edge_counts = edge_counts.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count
    repeated = edge_counts[edge_counts["count"] != 1]
    return repeated.assign(edge="contextual proximity")


## Co-occurrence edges derived from the LLM-extracted edge list in dfg1.
dfg2 = contextual_proximity(dfg1)
## Show the last rows as a quick check of the result.
dfg2.tail()

### Merge both the dataframes

In [None]:
## Stack the LLM-extracted edges and the contextual-proximity edges, then merge
## rows that share the same (node_1, node_2) pair: chunk ids and edge labels are
## comma-joined and the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

## Calculate the NetworkX Graph

In [None]:
## Every distinct node name that appears on either side of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
## Number of distinct nodes.
nodes.shape

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph (node names are always stored as strings)
G.add_nodes_from(str(name) for name in nodes)

## Add edges to the graph
## The edge label goes into `title`; the weight is the merged count divided by 4.
for source, target, label, occurrences in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=label, weight=occurrences/4)

### Calculate communities for coloring the nodes

In [None]:
## girvan_newman returns an iterator of community partitions; each `next`
## call advances it by one level.
communities_generator = nx.community.girvan_newman(G)
## The first partition is fetched only to advance the iterator; it is not used
## again in this notebook.
top_level_communities = next(communities_generator)
## The second partition is the one used for colouring.
next_level_communities = next(communities_generator)
## Sort the members inside each community, then the communities themselves,
## so the listing is in a stable order.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

### Create a dataframe for community colors

In [None]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one group number to every community.

    Args:
        communities: Sequence of communities, each an iterable of node names.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based position of the node's community).
        The colours come from the module-level ``palette`` and are shuffled
        with ``random.shuffle``, so the assignment differs between runs unless
        the ``random`` module has been seeded.
    """
    ## One hex colour per community, in random order
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)
    ## Colours are handed out starting from the end of the shuffled list
    records = [
        {"node": member, "color": shade, "group": group_no}
        for group_no, (community, shade) in enumerate(
            zip(communities, reversed(shades)), start=1
        )
        for member in community
    ]
    return pd.DataFrame(records)


## One row per node with the colour and group number of its community.
colors = colors2Community(communities)
colors

### Add colors to the graph

In [None]:
## Copy the community group and colour onto each graph node; the node size is
## set to the node's degree.
for node_name, group_no, shade in zip(colors["node"], colors["group"], colors["color"]):
    attributes = G.nodes[node_name]
    attributes['group'] = group_no
    attributes['color'] = shade
    attributes['size'] = G.degree[node_name]

In [None]:
from pyvis.network import Network

## Path of the HTML file the interactive graph is written to.
## NOTE(review): despite the name this is a file path, not a directory, and
## nothing in this cell creates ./docs — confirm the folder exists before running.
graph_output_directory = "./docs/index.html"

## Canvas settings for the interactive view; the commented-out options are an
## alternative dark colour scheme.
net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the NetworkX graph, including the node and edge attributes set above.
net.from_nx(G)
## Layout: force_atlas_2based is the active choice; the commented-out calls are
## alternative physics settings kept for reference.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Show only the physics controls in the page.
net.show_buttons(filter_=["physics"])

## Write the HTML page to graph_output_directory.
net.show(graph_output_directory, notebook=False)