In [5]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random
import json


In [6]:
# Point the loader at the data_input folder inside the current working directory.
input_dir = os.getcwd() + "/data_input"
loader = DirectoryLoader(input_dir, show_progress=True)

# Read the files into langchain documents (progress bar enabled above).
documents = loader.load()

100%|██████████| 1/1 [00:01<00:00,  1.61s/it]


In [7]:
# Chunking settings: 1500-character chunks (measured with len) that overlap by
# 150 characters; separators are treated as plain strings, not regexes.
chunking_options = {
    "chunk_size": 1500,
    "chunk_overlap": 150,
    "length_function": len,
    "is_separator_regex": False,
}
splitter = RecursiveCharacterTextSplitter(**chunking_options)

# Split the loaded documents into chunks; `pages` is consumed by later cells.
pages = splitter.split_documents(documents)

# Quick sanity check: chunk count plus the text of the fourth chunk.
print("Number of chunks = ", len(pages))
print(pages[3].page_content)

Number of chunks =  23
An extensive literature search was performed, and 56 articles published in peer-reviewed journals between 2005 and 2021 were selected and analyzed. The corresponding authors' experiential knowledge served as the foundation for the analysis.


# Create a DataFrame of all chunks

In [8]:
from df_helpers import documents2Dataframe
# Turn the list of chunks into a DataFrame using the project helper.
# The output displayed below shows text, source and chunk_id columns.
df = documents2Dataframe(pages)
# Print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

(23, 3)


Unnamed: 0,text,source,chunk_id
0,Abstract India’s health indicators have improv...,/home/ec2-user/llm_knowledge_graph/data_input/...,4b44b0cbc2d9443c8b8509dbe263a4eb
1,"Categories: Public Health, Epidemiology/Public...",/home/ec2-user/llm_knowledge_graph/data_input/...,314b98edc90a4d48ac444834991d1f2a
2,Introduction And Background India’s health ind...,/home/ec2-user/llm_knowledge_graph/data_input/...,729789511249431980174f368116a57d
3,"An extensive literature search was performed, ...",/home/ec2-user/llm_knowledge_graph/data_input/...,5cfe543c48b549f189395cb30610bd14
4,Review Overview of the public and private heal...,/home/ec2-user/llm_knowledge_graph/data_input/...,5215ec9f2a5947bb957b199d55acb4ed


In [9]:
from df_helpers import df2Graph
from df_helpers import graph2Df

In [14]:
# Set to True to rebuild the graph with the LLM; when False the previously
# saved graph.csv is loaded instead.
regenerate = False

# Build each path once so the write and read branches cannot drift apart.
# NOTE(review): graph.csv / chunks.csv are written into the same folder the
# DirectoryLoader cell reads from -- confirm they are not picked up as input
# documents on a later run.
data_dir = os.path.join(os.getcwd(), "data_input")
graph_csv = os.path.join(data_dir, "graph.csv")
chunks_csv = os.path.join(data_dir, "chunks.csv")

if regenerate:
    concept_list = df2Graph(df, model='phi3:latest')
    dfg1 = graph2Df(concept_list)
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory.
    os.makedirs(data_dir, exist_ok=True)
    dfg1.to_csv(graph_csv, sep='|', index=False)
    df.to_csv(chunks_csv, sep='|', index=False)
else:
    dfg1 = pd.read_csv(graph_csv, sep='|')

# Treat empty strings as missing, drop rows lacking either node or the edge
# text, and give every remaining relation a fixed count of 4 (the graph
# construction cell later divides count by 4 to obtain the edge weight).
# Done as one chain instead of in-place mutation so the cell reads linearly.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=['node_1', 'node_2', 'edge'])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(159, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,india,health indicators,India's health indicators have improved in rec...,4b44b0cbc2d9443c8b8509dbe263a4eb,4
1,india,active health workers density (doctors and nur...,The country has an estimated active health wor...,4b44b0cbc2d9443c8b8509dbe263a4eb,4
2,who threshold,"active health workers density (doctors, nurses...",The estimated active health workers density in...,4b44b0cbc2d9443c8b8509dbe263a4eb,4
3,skilled health workforce,healthcare,The paucity of skilled personnel must be addre...,4b44b0cbc2d9443c8b8509dbe263a4eb,4
4,federal health budget,opportunity for augmenting skilled health work...,The recent increase in the federal health budg...,4b44b0cbc2d9443c8b8509dbe263a4eb,4


# Calculating contextual proximity


In [15]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that appear in the same text chunk.

    Args:
        df: Frame with at least the columns ``chunk_id``, ``node_1`` and
            ``node_2``. It is not modified.

    Returns:
        A new frame with columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids of every co-occurrence), ``count`` (number of
        co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        Pairs are ordered, so both (a, b) and (b, a) are present. Pairs that
        co-occur only once are dropped. The index is the one left by the
        filtering steps; it is not reset.
    """
    # Melt the dataframe into a long list of (chunk_id, node) rows.
    nodes_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join with chunk id as the key creates a link between terms
    # occurring in the same text chunk.
    pairs = pd.merge(nodes_long, nodes_long, on="chunk_id", suffixes=("_1", "_2"))

    # Drop self loops.
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)

    # Group and count edges.
    grouped = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped.columns = ["node_1", "node_2", "chunk_id", "count"]

    # Empty-string node names are treated as missing and removed.
    grouped = grouped.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with 1 count. The explicit copy makes the result an
    # independent frame, so adding the "edge" column below cannot raise
    # SettingWithCopyWarning on a filtered slice.
    proximity = grouped[grouped["count"] != 1].copy()
    proximity["edge"] = "contextual proximity"
    return proximity

In [16]:
# Derive co-occurrence ("contextual proximity") edges from the rows of dfg1.
dfg2 = contextual_proximity(dfg1)
# Preview the last rows of the result.
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
3974,vulnerable populations,government-funded health sector,"5215ec9f2a5947bb957b199d55acb4ed,5215ec9f2a594...",2,contextual proximity
3989,vulnerable populations,private for-profit health sector,"5215ec9f2a5947bb957b199d55acb4ed,5215ec9f2a594...",2,contextual proximity
4014,who threshold,india,"4b44b0cbc2d9443c8b8509dbe263a4eb,4b44b0cbc2d94...",2,contextual proximity
4024,world-class health facilities,informal providers,"b8a21542c68747f9a8f99bc7e591646e,b8a21542c6874...",2,contextual proximity
4027,world-class health facilities,nhm strategies,"b8a21542c68747f9a8f99bc7e591646e,b8a21542c6874...",2,contextual proximity


# Merge both dataframes

In [17]:
# Stack the rows of dfg1 on top of the contextual-proximity rows of dfg2, then
# collapse rows sharing the same ordered (node_1, node_2) pair: chunk ids and
# edge labels are comma-joined, counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,18 months of training,accredited social health activists (asha),"b553a0b8abd94b03bb28d2e645c49f5d,b553a0b8abd94...",contextual proximity,2
1,23 days’ initial training plus on-the-job shor...,accredited social health activists (asha),"b553a0b8abd94b03bb28d2e645c49f5d,b553a0b8abd94...",contextual proximity,2
2,3d printing,manufacturing of protective equipment,b1ace8359a5044aaa8fd803f2d654ad0,provided a solution to shortage of medical sup...,4
3,accredited social health activists (asha),18 months of training,"b553a0b8abd94b03bb28d2e645c49f5d,b553a0b8abd94...",contextual proximity,2
4,accredited social health activists (asha),23 days’ initial training plus on-the-job shor...,"b553a0b8abd94b03bb28d2e645c49f5d,b553a0b8abd94...",contextual proximity,2
...,...,...,...,...,...
522,who threshold,"active health workers density (doctors, nurses...",4b44b0cbc2d9443c8b8509dbe263a4eb,The estimated active health workers density in...,4
523,who threshold,india,"4b44b0cbc2d9443c8b8509dbe263a4eb,4b44b0cbc2d94...",contextual proximity,2
524,world-class health facilities,informal providers,"b8a21542c68747f9a8f99bc7e591646e,b8a21542c6874...",contextual proximity,2
525,world-class health facilities,nhm strategies,"b8a21542c68747f9a8f99bc7e591646e,b8a21542c6874...",contextual proximity,2


# Build the NetworkX graph

In [18]:
# Distinct node names: every node_1 value followed by every node_2 value,
# de-duplicated in order of first appearance.
nodes = pd.concat([dfg[column] for column in ("node_1", "node_2")], axis=0).unique()
nodes.shape

(275,)

In [19]:
import networkx as nx
G = nx.Graph()

## Register every node, keyed by its string form
G.add_nodes_from(str(name) for name in nodes)

## Add one edge per dfg row: the edge text becomes the title and count / 4 the weight.
## NOTE(review): nx.Graph is undirected and dfg holds both (a, b) and (b, a) rows,
## so a later row replaces the title/weight stored by the earlier one -- confirm
## that this overwrite is intended.
G.add_edges_from(
    (str(source), str(target), {"title": label, "weight": occurrences / 4})
    for source, target, label, occurrences in zip(
        dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
    )
)

# Calculate communities for coloring the nodes


In [20]:
# Take the first two partitions produced by the Girvan-Newman generator;
# only the second one is used for colouring.
communities_generator = nx.community.girvan_newman(G)
top_level_communities, next_level_communities = (
    next(communities_generator),
    next(communities_generator),
)

# Sort the members inside each community, then sort the communities themselves.
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  63
[['$30 billion (doubled allocation)', 'indian government health expenditure'], ['1.29% of its budget', 'government of india healthcare allocation'], ['18 months of training', '23 days’ initial training plus on-the-job short training', 'accredited social health activists (asha)', 'anm training duration', 'anms', 'asha', 'asha education level', 'asha initial training duration', 'asha payment model', 'ashas and anms evaluation', 'auxiliary nurse midwives (anm)', 'female community health workers', 'government employees or private contractors', 'health systems perspective synthesis with few positive findings', 'meant to serve a population of 5,000', 'payment proportionate to the amount of work performed', 'population coverage ratio of 1,000 ashas per population', 'realistically serving up to 20,000 people', 'up to the eighth grade and sometimes less'], ["3.84% of india's gdp", "india's gdp"], ['3d printing', 'manufacturing of protective equipment'], ['50 to 175 t

# Create a dataframe for community colors

In [22]:
import seaborn as sns
# Palette name that colors2Community passes to sns.color_palette.
palette = "hls"

## Assign one colour per community and return the result as a dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Build a node -> colour/group lookup table.

    Args:
        communities: Sized iterable of communities, each an iterable of node names.

    Returns:
        DataFrame with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based position of the node's community).
        Colours come from the module-level ``palette`` and are shuffled, so the
        community-to-colour assignment differs between runs.
    """
    ## One hex colour per community, in random order
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    ## Colours are consumed from the end of the shuffled list, one per community
    rows = [
        {"node": member, "color": shade, "group": position}
        for position, (members, shade) in enumerate(
            zip(communities, reversed(p)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(rows)


# Build the node/colour/group table for the detected communities and display it.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,$30 billion (doubled allocation),#db579b,1
1,indian government health expenditure,#db579b,1
2,1.29% of its budget,#57cddb,2
3,government of india healthcare allocation,#57cddb,2
4,18 months of training,#db5775,3
...,...,...,...
270,training initiative,#68db57,61
271,taking on more patients than reasonably served,#57dbaa,62
272,underpaid physicians,#57dbaa,62
273,training initiative for nurses in india,#57db91,63


# Add colors to graph

In [23]:
# Attach community group, colour and a degree-based size to every node listed in `colors`.
for node_name, hex_color, group_id in zip(
    colors["node"], colors["color"], colors["group"]
):
    G.nodes[node_name].update(
        group=group_id,
        color=hex_color,
        size=G.degree[node_name],
    )

In [28]:
from pyvis.network import Network

# Target HTML file for the rendered graph. os.path.join inserts the path
# separator; plain concatenation (cwd + "index.html") glued the file name onto
# the directory name and wrote "<cwd>index.html" beside the working directory.
graph_output_directory = os.path.join(os.getcwd(), "index.html")

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import nodes, edges and their attributes from the NetworkX graph.
net.from_nx(G)
# Layout: force-atlas-2-based physics. Alternative layouts tried earlier are
# kept below as commented-out options.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Expose only the physics controls in the generated page.
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

/home/ec2-user/llm_knowledge_graphindex.html
