## Setup

In [19]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; it is reused below for both the input and the output location
data_dir = "cureus"
## Directory the source documents are loaded from
inputdirectory = Path(f"./data_input/{data_dir}")
## Directory the output csv files (graph.csv, chunks.csv) are written to;
## the output folder name mirrors the input folder name
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [20]:
## Load every file found in the input directory.
## Alternative loaders that were tried earlier are kept for reference:
# loader = PyPDFDirectoryLoader(inputdirectory)
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks of at most 500 characters (length_function=len)
## with a 50 character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. The index is clamped so that a corpus
## producing fewer than four chunks no longer raises IndexError.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


 50%|██████████████████████▌                      | 1/2 [00:00<00:00,  3.12it/s]

Number of chunks =  23
of research has been conducted continuously on these solutions in recent years

(Dai et al, 2020b). The proposed knowledge graphs are widely employed in

various AI systems recently (Ko et al, 2021; Mohamed et al, 2021), such as rec-

ommender systems, question answering, and information retrieval. They are

also widely applied in many fields (e.g., education and medical care) to benefit

human life and society. (Sun et al, 2020; Bounhas et al, 2020).





## Create a dataframe of all the chunks

In [21]:
## documents2Dataframe is a project helper (not shown here). In the run recorded below
## it produced one row per chunk with the columns text, source and chunk_id.
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(23, 3)


Unnamed: 0,text,source,chunk_id
0,1 Introduction\n\nKnowledge plays a vital role...,data_input/cureus/cureus-0015-00000040274.txt,3d1c25046d0c4e73a20dcc73939628af
1,human knowledge according to different concept...,data_input/cureus/cureus-0015-00000040274.txt,fa8539dd48624b88ab07b8f688cd2b72
2,"entities (Hogan et al, 2021; Cheng et al, 2022...",data_input/cureus/cureus-0015-00000040274.txt,4bd1375f70a14cab8a811946f3dba23c
3,of research has been conducted continuously on...,data_input/cureus/cureus-0015-00000040274.txt,e2345789c78941bb801442c491a79140
4,"Therefore, knowledge graphs have seized great ...",data_input/cureus/cureus-0015-00000040274.txt,4ef57d094e9b43d4929eca2c82b45faf


## Extract Concepts

In [22]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

In [23]:
# Function to process data in batches
def process_batches(dataframe, batch_size=2):
    """Yield consecutive row slices of ``dataframe``, ``batch_size`` rows at a time.

    Args:
        dataframe: Object supporting ``len()`` and positional ``.iloc`` slicing
            (a pandas DataFrame in this notebook).
        batch_size: Maximum number of rows per yielded slice. The last slice
            may be shorter.

    Yields:
        ``dataframe.iloc[start:start + batch_size]`` for each successive start offset.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently skip all the data,
    # so reject non-positive sizes explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(dataframe), batch_size):
        yield dataframe.iloc[start:start + batch_size]

If `regenerate` is set to True, the dataframes are regenerated and both are written to CSV files so that we don't have to compute them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the dataframes are read from the output directory.

In [24]:
## To regenerate the graph with LLM, set this to True
regenerate = True

## Columns persisted to graph.csv (the count column is added below). Pinning them keeps
## every appended batch aligned with the header written by the first batch, and gives
## an empty batch the columns that dropna() expects.
graph_columns = ["node_1", "node_2", "edge", "chunk_id"]
graph_csv = os.path.join(outputdirectory, "graph.csv")
chunks_csv = os.path.join(outputdirectory, "chunks.csv")

if regenerate:
    ## to_csv does not create missing directories
    os.makedirs(outputdirectory, exist_ok=True)

    ## The chunks are sent to the LLM in small batches and each batch is appended to
    ## the csv files straight away, so finished batches are already on disk if a
    ## later call fails.
    for chunk_idx, chunk in enumerate(process_batches(df)):
        concepts_list = df2Graph(chunk, model='mistral-openorca:latest')
        batch_edges = (
            pd.DataFrame(concepts_list, columns=graph_columns)
            .replace("", np.nan)
            .dropna(subset=["node_1", "node_2", 'edge'])
            ## Increasing the weight of the relation to 4.
            ## The weight of 1 is assigned later when the contextual proximity is calculated.
            .assign(count=4)
        )

        ## The first batch truncates the files and writes the header; later batches append.
        is_first_batch = chunk_idx == 0
        write_mode = 'w' if is_first_batch else 'a'
        batch_edges.to_csv(graph_csv, sep="|", mode=write_mode, index=False, header=is_first_batch)
        chunk.to_csv(chunks_csv, sep="|", mode=write_mode, index=False, header=is_first_batch)

## Whether it was just regenerated or not, the edge dataframe is read back from disk
## and cleaned the same way.
dfg1 = (
    pd.read_csv(graph_csv, sep="|")
    .replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)
print(dfg1.shape)
print(dfg1.head())

 [
   {
       "node_1": "Knowledge",
       "node_2": "Human existence and development",
       "edge": "knowledge plays a vital role"
   },
   {
       "node_1": "Learning",
       "node_2": "Representing human knowledge",
       "edge": "crucial tasks in artificial intelligence research"
   },
   {
       "node_1": "Artificial Intelligence",
       "node_2": "Solve complex tasks in realistic scenarios",
       "edge": "AI systems require additional knowledge to obtain the same abilities"
   },
   {
       "node_1": "Emergence of approaches",
       "node_2": "Representing human knowledge",
       "edge": "support these systems, we have seen"
   }
] [
    {
        "node_1": "human knowledge",
        "node_2": "conceptual models",
        "edge": "according to"
    },
    {
        "node_1": "knowledge graphs",
        "node_2": "standard solution",
        "edge": "in this space"
    },
    {
        "node_1": "knowledge graphs",
        "node_2": "research trend",
        "edge": 

In [34]:
# dfg1 = pd.read_csv(os.path.join(outputdirectory, "graph.csv"), sep="|")
# dfg1.replace("", np.nan, inplace=True)
# dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
# dfg1['count'] = 4 
# print(dfg1.shape)
# print(dfg1.head())

(116, 5)
                    node_1                                      node_2  \
0                Knowledge             Human existence and development   
1                 Learning                Representing human knowledge   
2  Artificial Intelligence  Solve complex tasks in realistic scenarios   
3  Emergence of approaches                Representing human knowledge   
4          human knowledge                           conceptual models   

                                                edge  \
0                       knowledge plays a vital role   
1  crucial tasks in artificial intelligence research   
2  AI systems require additional knowledge to obt...   
3                support these systems, we have seen   
4                                       according to   

                           chunk_id  count  
0  3d1c25046d0c4e73a20dcc73939628af      4  
1  3d1c25046d0c4e73a20dcc73939628af      4  
2  3d1c25046d0c4e73a20dcc73939628af      4  
3  3d1c25046d0c4e73a20dcc7393

## Calculating contextual proximity

In [35]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that appear in the same text chunk.

    Args:
        df: Edge list with at least the columns ``node_1``, ``node_2`` and
            ``chunk_id``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids of every co-occurrence), ``count`` (number of
        co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        Pairs that co-occur exactly once are dropped. Both orientations of a
        pair, (a, b) and (b, a), are present. The index is not reset after
        filtering.
    """
    ## Melt the dataframe into a list of nodes
    node_list = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    node_pairs = pd.merge(node_list, node_list, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    node_pairs = node_pairs[node_pairs["node_1"] != node_pairs["node_2"]].reset_index(
        drop=True
    )
    ## Group and count edges.
    pair_counts = (
        node_pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    # Flatten the two-level column index produced by the list of aggregations.
    pair_counts.columns = ["node_1", "node_2", "chunk_id", "count"]
    pair_counts = pair_counts.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count
    repeated_pairs = pair_counts[pair_counts["count"] != 1]
    # assign() returns a new frame, so the label is never written onto a slice of
    # pair_counts (the original in-place assignment was a chained-assignment pattern).
    return repeated_pairs.assign(edge="contextual proximity")


## Derive the co-occurrence edges from the LLM-extracted edge list and preview the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
1492,various areas,knowledge graphs,"4ef57d094e9b43d4929eca2c82b45faf,4ef57d094e9b4...",4,contextual proximity
1499,well-structured ontological knowledge base,DBpedia,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",3,contextual proximity
1500,well-structured ontological knowledge base,Facebook's entity graph,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",2,contextual proximity
1501,well-structured ontological knowledge base,Freebase,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",2,contextual proximity
1503,well-structured ontological knowledge base,knowledge graph,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",3,contextual proximity


### Merge both the dataframes

In [36]:
## Stack the LLM-extracted edges on top of the contextual-proximity edges, then collapse
## repeated (node_1, node_2) pairs into one row: chunk ids and edge labels are
## comma-joined and the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,"(Bill Gates, founderOf, Microsoft)",knowledge graph,"f035c1db86364da28114308ab5ba634b,f035c1db86364...","example of triple in,contextual proximity",9
1,"(e1 , r1 , e2 )",example of a simple knowledge graph,"5e217a142f9f4085bc72161560094a99,5e217a142f9f4...",contextual proximity,2
2,"(e1 , r1 , e2 )",knowledge graph,"b564f4818b6a40e881a17664f76e6969,5e217a142f9f4...",contextual proximity,2
3,"(e1 , r1 , e2 )",knowledge graph construction,"b564f4818b6a40e881a17664f76e6969,b564f4818b6a4...",contextual proximity,3
4,"(e1 , r1 , e2 )",ontology construction,"b564f4818b6a40e881a17664f76e6969,b564f4818b6a4...",contextual proximity,2
...,...,...,...,...,...
488,various areas,knowledge graphs,"4ef57d094e9b43d4929eca2c82b45faf,4ef57d094e9b4...",contextual proximity,4
489,well-structured ontological knowledge base,DBpedia,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",contextual proximity,3
490,well-structured ontological knowledge base,Facebook's entity graph,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",contextual proximity,2
491,well-structured ontological knowledge base,Freebase,"de50b40ec9594391a1b6740daf2aad3b,de50b40ec9594...",contextual proximity,2


## Calculate the NetworkX Graph

In [37]:
## Distinct node labels that appear on either end of an edge
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(153,)

In [38]:
import networkx as nx

## Undirected graph; every node label is stored as a string
G = nx.Graph()
G.add_nodes_from(str(label) for label in nodes)

## One edge per aggregated (node_1, node_2) row. The edge label becomes the "title"
## attribute, and the count is divided by 4 to give the weight (4 is the count assigned
## to every LLM-extracted relation when dfg1 was built).
for source, target, relation, occurrences in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(
        str(source),
        str(target),
        title=relation,
        weight=occurrences / 4,
    )

### Calculate communities for coloring the nodes

In [39]:
## girvan_newman returns a generator; each next() call below pulls one partition of G
## NOTE(review): per the NetworkX docs each successive partition is one level deeper in the algorithm — confirm
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
## The second partition yielded is the one used for colouring the nodes
next_level_communities = next(communities_generator)
## Sort the nodes inside each community, then sort the communities themselves,
## so the list has a deterministic order
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  16
[['(Bill Gates, founderOf, Microsoft)', '(e1 , r1 , e2 )', '2007', '2008', 'Auer et al', 'Bollacker et al', 'Bordes et al, 2011', 'DBpedia', 'Ehrlinger and Wöß, 2016', "Facebook's entity graph", 'Fig. 1', 'Freebase', 'Google Knowledge Graph', 'Knowledge Graphs', 'Knowledge Graphs: Opportunities and Challenges', 'Labeled Property Graphs (LPGs)', 'Opportunities and Challenges', 'Rebele et al, 2016', 'Resource Description Framework (RDF)', 'Vrandečić and Krötzsch, 2014', 'Wikidata', 'Wikipedia', 'WordNet', 'Yago', 'analyse the semantic similarity', 'contains a number of hierarchical concept graphs', 'context', 'directed graph', 'directed graph composed of nodes and edges', 'edge', 'education', 'entity', 'example of a knowledge graph', 'example of a simple knowledge graph', 'fundamental unit', 'hierarchical concept graphs', 'human society', 'information retrieval', 'introduced', 'knowledge base', 'knowledge graph', 'knowledge graph challenges', 'knowledge graph 

### Create a dataframe for community colors

In [40]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community a colour and a 1-based group number.

    Args:
        communities: Sequence of communities, each an iterable of node labels.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (position of the community, starting at 1).
        Colours come from the module-level seaborn ``palette`` and are
        shuffled with ``random.shuffle``, so they change between runs unless
        the ``random`` module is seeded.
    """
    ## One hex colour per community, in random order
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)
    ## Walking the shuffled list backwards hands out the colours from the end of the list
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (members, shade) in enumerate(
            zip(communities, reversed(shades)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,"(Bill Gates, founderOf, Microsoft)",#57db80,1
1,"(e1 , r1 , e2 )",#57db80,1
2,2007,#57db80,1
3,2008,#57db80,1
4,Auer et al,#57db80,1
...,...,...,...
148,example,#57a2db,14
149,five main knowledge graph technologies,#db5780,15
150,introduce,#db5780,15
151,"news, research papers, and patents",#57dbb2,16


### Add colors to the graph

In [41]:
## Attach the community number and colour to each node; the node size is its degree in G
for entry in colors.itertuples(index=False):
    G.nodes[entry.node].update(
        group=entry.group,
        color=entry.color,
        size=G.degree[entry.node],
    )

In [42]:
from pyvis.network import Network

## Despite the variable name, this is the path of the HTML file passed to net.show(),
## not a directory
graph_output_directory = "./docs/index.html"

## Configure the pyvis network; the commented-out keyword arguments are an
## alternative dark colour scheme
net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the NetworkX graph built above into the pyvis network
net.from_nx(G)
## Physics layout in use; the commented-out calls are alternative layouts with other parameters
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Request the configuration buttons, restricted to the "physics" group
net.show_buttons(filter_=["physics"])

## NOTE(review): presumably writes the HTML file and opens it outside the notebook — confirm against the pyvis docs
net.show(graph_output_directory, notebook=False)

./docs/index.html
