## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the corpus folder; it is reused for both the input and the output location
data_dir = "cureus"
## Source documents are read from ./data_input/<data_dir>
inputdirectory = Path(f"./data_input/{data_dir}")
## Output csv files (graph.csv, chunks.csv) are written to ./data_output/<out_dir>
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [2]:
## Alternative loaders kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the loaded documents into chunks of at most 500 characters,
## with 50 characters of overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. The fourth chunk is shown when it exists;
## on a small corpus the last available chunk is shown instead of raising IndexError.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


 50%|██████████████████████▌                      | 1/2 [01:37<01:37, 97.80s/it]


Number of chunks =  135916
The authors declare no conflict of interest.

4 8 2022 9 2022 4 8 2022 11 9 e00181-2221 3 2022 19 7 2022 Copyright © 2022 Palmieri et al. 2022 Palmieri et al. https://creativecommons.org/licenses/by/4.0/ This is an open-access article distributed under the terms of the Creative Commons Attribution 4.0 International license.

ABSTRACT


## Create a dataframe of all the chunks

In [3]:
from helpers.df_helpers import documents2Dataframe
## Convert the split chunks into a dataframe; the printed shape lets us check
## that the row count matches the number of chunks reported above.
df = documents2Dataframe(pages)
print(df.shape)
## Show the first rows as the cell output
df.head()

(135916, 3)


Unnamed: 0,text,source,chunk_id
0,# File: PMC9476978.txt\n\n==== Front Microbiol...,data_input/cureus/op1000file.txt,e291fbfbdf6a482391de9bc015c8e89b
1,35924938 00181-22 10.1128/mra.00181-22 mra.001...,data_input/cureus/op1000file.txt,f582a51c2c1b4ddc88eea1014faba83e
2,Udriet Pauline a https://orcid.org/0000-0002-3...,data_input/cureus/op1000file.txt,d3736d3c22874f41bfe914c4b02616f1
3,The authors declare no conflict of interest.\n...,data_input/cureus/op1000file.txt,e60d2deaaf254247b115b7dffbea2bd9
4,"ABSTRACT\n\nHere, we report the complete genom...",data_input/cureus/op1000file.txt,52c4f6c785764df684cf85439640d1c4


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

In [5]:
def process_batches(dataframe, batch_size=2):
    """Yield consecutive row slices of ``dataframe``.

    Each yielded slice holds at most ``batch_size`` rows; the final slice may be
    shorter. Nothing is yielded for an empty dataframe.
    """
    yield from (
        dataframe.iloc[start:start + batch_size]
        for start in range(0, len(dataframe), batch_size)
    )

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to CSV files so that we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [6]:
## To regenerate the graph with LLM, set this to True
regenerate = False

## Fixed column layout of graph.csv. Every batch is coerced to this layout before it is
## appended, so a batch whose extracted records carry extra or missing keys cannot shift
## the fields of the rows that follow (presumably the cause of the
## "Expected 5 fields ... saw 6" ParserError seen when reading the file back).
graph_columns = ["node_1", "node_2", "edge", "chunk_id", "count"]
graph_csv = os.path.join(outputdirectory, "graph.csv")
chunks_csv = os.path.join(outputdirectory, "chunks.csv")

if regenerate:
    ## to_csv does not create missing folders, so make sure the output directory exists.
    os.makedirs(outputdirectory, exist_ok=True)

    ## Process the chunks in small batches and append each batch to disk right away,
    ## so partial results are kept if the run is interrupted.
    for chunk_idx, chunk in enumerate(process_batches(df)):
        concepts_list = df2Graph(chunk, model='mistral-openorca:latest')
        batch_edges = (
            pd.DataFrame(concepts_list)
            ## Also makes an empty batch a valid, empty frame with the expected columns.
            .reindex(columns=graph_columns)
            .replace("", np.nan)
            .dropna(subset=["node_1", "node_2", "edge"])
            .assign(count=4)
        )

        ## The first batch (re)creates the files with a header; later batches append.
        is_first_batch = chunk_idx == 0
        mode = 'w' if is_first_batch else 'a'
        batch_edges.to_csv(graph_csv, sep="|", mode=mode, index=False, header=is_first_batch)
        chunk.to_csv(chunks_csv, sep="|", mode=mode, index=False, header=is_first_batch)

## Whether freshly regenerated or not, the edges are always loaded from graph.csv.
## Relations extracted by the LLM get a fixed count of 4; the graph-building cell
## later divides the count by 4 to obtain the edge weight.
dfg1 = (
    pd.read_csv(graph_csv, sep="|")
    .replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

ParserError: Error tokenizing data. C error: Expected 5 fields in line 1765, saw 6


In [15]:
## Reload the extracted relations from disk, discard rows with a missing node or edge,
## and give every extracted relation a count of 4.
dfg1 = (
    pd.read_csv(os.path.join(outputdirectory, "graph.csv"), sep="|")
    .replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)
print(dfg1.shape)
print(dfg1.head())

(3374, 5)
                         node_1  \
0  Cardiovascular disease (CVD)   
1                         women   
2                         women   
3                         women   
4                         women   

                                              node_2  \
0                         leading cause of mortality   
1                                                men   
2                      coronary artery disease (CAD)   
3  1-year mortality rate after acute myocardial i...   
4                                        black women   

                                                edge  \
0                      in women in the United States   
1                   comparison between men and women   
2  having less obstructive but more diffuse coron...   
3  higher 1-year mortality rate after acute myoca...   
4                 having the highest CAD death rates   

                           chunk_id  count  
0  3dc851440134402d87573a15088a4351      4  
1  51dc59c88fb1

## Calculating contextual proximity

In [16]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive co-occurrence edges between nodes that appear in the same text chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge table with at least the columns ``chunk_id``, ``node_1`` and ``node_2``.
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``node_1``, ``node_2``, ``chunk_id``, ``count`` and ``edge``:

        * ``chunk_id`` is the comma-joined list of chunk ids of every pairing,
        * ``count`` is the number of pairings of the two nodes within shared chunks,
        * ``edge`` is always the literal ``"contextual proximity"``.

        Because the pairing is a self join, each pair is present in both
        directions, i.e. as (a, b) and as (b, a). Pairs with a count of 1 are dropped.
    """
    ## Melt the dataframe into a list of nodes: one row per (chunk_id, node) mention
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg(chunk_id=("chunk_id", ",".join), count=("chunk_id", "count"))
        .reset_index()
    )
    ## Discard pairs where either node is an empty string
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count
    edges = edges[edges["count"] != 1]
    ## assign() works on a copy, so the label is not written onto a filtered slice
    ## (which is what triggers pandas' SettingWithCopyWarning).
    return edges.assign(edge="contextual proximity")


## Build the "contextual proximity" edges from the extracted relations in dfg1
dfg2 = contextual_proximity(dfg1)
## Show the last rows as the cell output
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
51029,zebrafish and in mice,SIRT1,"9508450fd46943c092632c7e026877c4,9508450fd4694...",4,contextual proximity
51038,β blockers,ARBs,"a3cb4cf2955a46e6a1149c6b9c9a8c15,a3cb4cf2955a4...",2,contextual proximity
51054,€ 192 billion,CVD,"5858afe30c1d44558b8f8632fc6ad382,5858afe30c1d4...",6,contextual proximity
51069,≥ 7.5%,elevated hs-CRP,"426fe5b8c62e4adfa880f23ac1d76acc,426fe5b8c62e4...",2,contextual proximity
51081,≥140 and ≥90,AH,"c20c2bbc85ba423ebb0a4c64cf758931,c20c2bbc85ba4...",2,contextual proximity


### Merge both the dataframes

In [17]:
## Stack the extracted relations on top of the contextual-proximity edges, then collapse
## rows that share the same (node_1, node_2) pair: chunk ids and edge labels are
## comma-joined and the counts are added up.
stacked_edges = pd.concat([dfg1, dfg2], axis=0)
dfg = stacked_edges.groupby(["node_1", "node_2"], as_index=False).agg(
    {"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'}
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,"marine n-3 fatty acids, fish intake, and the ...",Physicians’ Health Study,"c1322d3da33247ada5e8fc7a1034048b,c1322d3da3324...",contextual proximity,3
1,(1·69–2·63),2·11,"fcfdafc6471042d08b1fe8b96a7be029,fcfdafc647104...",contextual proximity,2
2,(1·69–2·63),Hazard ratio,"fcfdafc6471042d08b1fe8b96a7be029,fcfdafc647104...",contextual proximity,2
3,(95% CI),Author Manuscript,"61c8344660df49bdb13316c58314cd72,61c8344660df4...",contextual proximity,2
4,(95% CI),effect estimate,"61c8344660df49bdb13316c58314cd72,61c8344660df4...",contextual proximity,2
...,...,...,...,...,...
12865,zebrafish and in mice,SIRT1,"9508450fd46943c092632c7e026877c4,9508450fd4694...","observed to regulate blood vessel growth in,co...",8
12866,β blockers,ARBs,"a3cb4cf2955a46e6a1149c6b9c9a8c15,a3cb4cf2955a4...",contextual proximity,2
12867,€ 192 billion,CVD,"5858afe30c1d44558b8f8632fc6ad382,5858afe30c1d4...",contextual proximity,6
12868,≥ 7.5%,elevated hs-CRP,"426fe5b8c62e4adfa880f23ac1d76acc,426fe5b8c62e4...",contextual proximity,2


## Calculate the NetworkX Graph

In [18]:
## Distinct node labels that appear on either end of an edge
nodes = pd.concat(
    [dfg[column] for column in ("node_1", "node_2")],
    axis=0,
).unique()
nodes.shape

(4591,)

In [19]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph (labels are stored as strings)
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph: the relation text becomes the edge title and the
## count, divided by 4, becomes the edge weight.
for source, target, relation, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=relation, weight=count / 4)

### Calculate communities for coloring the nodes

In [20]:
## girvan_newman yields one partition per step; take the first two steps and
## use the second one for the communities.
communities_generator = nx.community.girvan_newman(G)
top_level_communities, next_level_communities = (
    next(communities_generator),
    next(communities_generator),
)
## Sort the members of each community, then sort the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  686


### Create a dataframe for community colors

In [21]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    A palette with as many colors as there are communities is generated from the
    module-level ``palette`` name and shuffled. Groups are numbered from 1 in the
    order the communities are given.

    Returns a dataframe with one row per node and the columns
    ``node``, ``color`` (hex string) and ``group``.
    """
    ## Define a color palette
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    ## Colors are handed out starting from the end of the shuffled list
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (members, shade) in enumerate(
            zip(communities, reversed(hex_colors)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


## One row per node with the color and group number of its community
colors = colors2Community(communities)
## Show the table as the cell output
colors

Unnamed: 0,node,color,group
0,Omega-3 fatty acids in high-risk cardiovascul...,#57db94,1
1,"Filion K, El Khoury F, Bielinski M, Schiller I...",#57db94,1
2,"marine n-3 fatty acids, fish intake, and the ...",#8d57db,2
3,(1·69–2·63),#8d57db,2
4,(95% CI),#8d57db,2
...,...,...,...
4586,using HIV-population cohorts,#db577c,684
4587,to stimulus,#d5db57,685
4588,variable cardiovascular responses,#d5db57,685
4589,untreated HIV patients,#57db61,686


### Add colors to the graph

In [22]:
## Copy the community group and color onto each graph node; the node size is its degree.
for node_name, group_id, hex_color in zip(colors["node"], colors["group"], colors["color"]):
    G.nodes[node_name].update(
        group=group_id,
        color=hex_color,
        size=G.degree[node_name],
    )

In [23]:
from pyvis.network import Network

## Path of the HTML file that will hold the interactive graph
graph_output_directory = "./docs/index.html"
## Make sure the target folder exists before the HTML file is written
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the NetworkX graph, including the node and edge attributes set earlier
net.from_nx(G)
## Layout: force_atlas_2based is the one in use; the other two are kept as alternatives
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Show only the physics controls in the generated page
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
