## Libraries

In [1]:
#main
import pandas as pd
import numpy as np
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

#helpers
import uuid
import pandas as pd
import numpy as np


#loader
from bs4 import BeautifulSoup
import requests
from langchain_core.documents import Document


## Load Documents from a Web Page (HTML)



In [2]:
# loaders.py

def load_from_html(url: str, timeout: float = 30.0) -> list:
    """Fetch an HTML page and return its text blocks as LangChain ``Document`` objects.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One ``Document`` per non-empty ``<p>``, ``<div>``, ``<article>`` or
        ``<section>`` element, in the order ``find_all`` returns them. The
        metadata holds the source URL and the element's position (``block_id``)
        among all matched elements. Nested containers are matched as well, so
        the same text can show up in a parent block and again in its children.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Without a timeout the request can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of indexing the body of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # You can tweak this selector logic to suit your use case
    content_blocks = soup.find_all(["p", "div", "article", "section"])
    documents = []

    for i, block in enumerate(content_blocks):
        text = block.get_text(strip=True)
        if text:
            documents.append(Document(page_content=text, metadata={"source": url, "block_id": i}))

    return documents


In [10]:
from readability import Document as ReadabilityDoc
import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

def load_readable_html(url: str, min_length: int = 50, timeout: float = 30.0) -> list:
    """Download a page, reduce it to its main article, and return its paragraphs.

    The raw HTML is passed through readability's ``summary()`` before the
    ``<p>`` elements are collected.

    Args:
        url: Address of the page to download.
        min_length: Paragraphs whose stripped text is not longer than this
            many characters are skipped.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``Document`` objects, one per kept paragraph. The metadata
        holds the source URL and the paragraph's position (``block_id``) among
        all ``<p>`` elements of the summary, including the skipped ones.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Without a timeout the request can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of summarising the body of an error page.
    response.raise_for_status()
    html = response.text

    readable_article = ReadabilityDoc(html).summary()
    soup = BeautifulSoup(readable_article, "html.parser")
    paragraphs = soup.find_all("p")
    docs = []
    for i, p in enumerate(paragraphs):
        text = p.get_text(strip=True)
        if len(text) > min_length:
            docs.append(Document(page_content=text, metadata={"source": url, "block_id": i}))
    return docs


In [None]:

# Page that is downloaded and split into chunks below.
WEBPAGE_URL = "https://en.wikipedia.org/wiki/Wikipedia"


# Fetch the page and keep its readable paragraphs as Document objects.
docs = load_readable_html(WEBPAGE_URL)


# chunk_size and chunk_overlap are measured with len(), i.e. in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)
# `pages` is the list of chunked documents used by the following cells.
pages = splitter.split_documents(docs)

print("Number of chunks:", len(pages))


Number of chunks: 142


## Document to dataframe

In [12]:
def documents2Dataframe(documents) -> pd.DataFrame:
    """Turn document chunks into a DataFrame with one row per chunk.

    Args:
        documents: Iterable of objects exposing ``page_content`` (the chunk
            text) and ``metadata`` (a mapping of extra fields).

    Returns:
        A DataFrame with a ``text`` column, one column per metadata key, and a
        ``chunk_id`` column holding a freshly generated ``uuid4`` hex string
        for every row. Because of the key order in the row dict, a ``text``
        key in the metadata replaces the page content and a ``chunk_id`` key
        in the metadata is replaced by the generated id.
    """
    # Build all rows in one pass; re-creating the list for every chunk
    # (rows = rows + [row]) copies it each time and is quadratic.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)

def extractConcepts(dataframe: pd.DataFrame, model=None) -> list:
    """Run ``graphPrompt`` on every chunk and collect all extracted relations.

    Args:
        dataframe: Chunk table; each row must provide ``text`` and ``chunk_id``.
        model: Passed through unchanged as the third argument of ``graphPrompt``.

    Returns:
        One flat list containing the items of every per-chunk result list.
        Chunks for which ``graphPrompt`` returned ``None`` contribute nothing.
        An empty list is returned when the frame is empty or no chunk
        produced a usable result.
    """
    # apply() on an empty frame does not return a Series of results, so
    # short-circuit instead of trying to flatten whatever comes back.
    if dataframe.empty:
        return []

    results = dataframe.apply(
        lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}, model), axis=1
    )
    # graphPrompt returns None for responses it cannot parse; drop those rows.
    results = results.dropna()

    ## Flatten the list of lists to one single list of entities.
    # A plain comprehension also copes with zero remaining results, where
    # np.concatenate would raise "need at least one array to concatenate".
    return [concept for chunk_concepts in results for concept in chunk_concepts]


def cleanConcepts2df(nodes_list) -> pd.DataFrame:
    """Build the relation table from extracted concepts and normalise node names.

    Args:
        nodes_list: List of dicts, each expected to carry ``node_1`` and
            ``node_2`` (other keys are kept as additional columns).

    Returns:
        A DataFrame without the rows whose ``node_1`` or ``node_2`` is missing
        or blank, and with both node columns converted to lower-case strings.
        An empty input yields an empty frame that still has both node columns.
    """
    graph_dataframe = pd.DataFrame(nodes_list)

    # Guarantee the node columns exist so an empty or malformed extraction
    # produces an empty table instead of a KeyError in dropna().
    for column in ("node_1", "node_2"):
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    ## Remove all NaN entities
    # Treat empty and whitespace-only strings as missing, not just a single " ".
    graph_dataframe = graph_dataframe.replace(r"^\s*$", np.nan, regex=True)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])

    # The model may emit numbers as nodes; str() keeps .lower() from failing.
    for column in ("node_1", "node_2"):
        graph_dataframe[column] = graph_dataframe[column].map(lambda x: str(x).lower())

    return graph_dataframe


In [13]:
# One row per chunk: text, the chunk's metadata columns and a generated chunk_id.
df = documents2Dataframe(pages)
print(df.shape)

# NOTE(review): these options are global and stay in effect for every later
# cell, so any frame displayed afterwards is rendered without truncation.
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", None)  # No limit on width
pd.set_option("display.max_colwidth", None)  # No limit on column width

df.head()

# Debug helper, disabled: prints the text of every chunk.
# print(df.apply(lambda row: print(row.text), axis=1))


(142, 4)


Unnamed: 0,text,source,block_id,chunk_id
0,"Wikipedia[b]is afreeonlineencyclopedia, written and maintained by a community ofvolunteers, known asWikipedians, throughopen collaborationand thewikisoftwareMediaWiki. Founded byJimmy WalesandLarry Sangeron January 15, 2001, Wikipedia has been hosted since 2003 by theWikimedia Foundation, an Americannonprofit organizationfunded mainly by donations from readers.[2]Wikipedia is the largest and most-readreference workin history.[3][4]",https://en.wikipedia.org/wiki/Wikipedia,2,ca39ec7af23f47d0ad6561e668897fff
1,"Initially available only in English, Wikipedia now existsin over 340 languages. TheEnglish Wikipedia, with over 6 millionarticles, remains the largest of the editions, which together comprise more than 64 million articles and attract more than 1.5 billion unique device visits and 13 million edits per month (about 5edits per second on average) as of April 2024[update].[W 1]As of March 2025[update], over 25% of Wikipedia'strafficcomes from theUnited States, followed byJapanat 6.38%, theUnited Kingdomat 5.81%,Germanyat 4.97%,Russiaat 4.86%, and the remaining 52.25% split among all othercountries.[5]",https://en.wikipedia.org/wiki/Wikipedia,3,2493ffd6ee7940faa94105ec83d95c1e
2,"Wikipedia has been praised for enabling thedemocratization of knowledge, its extensive coverage, unique structure, and culture.Wikipedia has been censoredby some national governments, ranging from specific pages to the entire site.[6][7]Although Wikipedia's volunteer editors have written extensively on a wide variety of topics,the encyclopedia has been criticizedforsystemic bias, such as agender biasagainst women andgeographical biasagainst theGlobal South(Eurocentrism).[8][9]While thereliability of Wikipediawas frequently criticized in the 2000s, it has improved over time, receiving greater praise from the late 2010s onward,[3][10][11]while becomingan important fact-checking site.[12][13]Articles onbreaking newsare often accessed as sources for up-to-date information about those events.[14][15]",https://en.wikipedia.org/wiki/Wikipedia,4,869781f63b3247339f4d20a135f504a8
3,"Various collaborative online encyclopedias were attempted before the start of Wikipedia, but with limited success.[16]Wikipedia began as a complementary project for Nupedia, a free online English-language encyclopedia project whose articles were written by experts and reviewed under a formal process.[17]It was founded on March 9, 2000, under the ownership ofBomis, aweb portalcompany. Its main figures were Bomis CEOJimmy WalesandLarry Sanger, editor-in-chief for Nupedia and later Wikipedia.[1][18]Nupedia was initially licensed under its own NupediaOpen ContentLicense, but before Wikipedia was founded, Nupedia switched to theGNU Free Documentation Licenseat the urging ofRichard Stallman.[W 2]Wales is credited with defining the goal of making a publicly editable encyclopedia,[19][W 3]while Sanger is credited with the strategy of using awikito reach that goal.[W 4]On January 10, 2001, Sanger proposed on the Nupedia mailing list to create a wiki as a ""feeder"" project for Nupedia.[W 5]",https://en.wikipedia.org/wiki/Wikipedia,7,1548bfa7f06045b6ae0937aa3fe90119
4,"Wikipedia was launched on January 15, 2001[17](referred to asWikipedia Day) as a singleEnglish languageedition with the domain namewww.wikipedia.com,[W 6]and was announced by Sanger on the Nupedia mailing list.[19]The name originated from ablendof the wordswikiandencyclopedia.[20][21]Its integral policy of ""neutral point-of-view""[W 7]was codified in its first few months. Otherwise, there were initially relatively few rules, and it operated independently of Nupedia.[19]Bomis originally intended for it to be a for-profit business.[22]",https://en.wikipedia.org/wiki/Wikipedia,9,4b95de49d8244dcb8603b450068ba67c


## Extract Concepts from the Dataframe

**Note:** `ollama.client` doesn't work here because `client` is a file in the main project folder.


In [44]:
import sys
from yachalk import chalk
sys.path.append("..")

import json
import ollama


def graphPrompt(input: str, metadata=None, model="mistral-openorca:latest"):
    """Ask the Ollama model for the concept relations contained in one text chunk.

    Args:
        input: The text chunk; it is embedded in the user prompt between
            triple backticks.
        metadata: Extra key/value pairs merged into every returned relation
            (for example ``{"chunk_id": ...}``). ``None`` means no extra fields.
        model: Model name; ``None`` falls back to ``"mistral-openorca:latest"``.
            NOTE(review): this value is currently not used. The request below
            is always sent to the model named ``"kg_good"``. Confirm whether
            that is intended before wiring the parameter through.

    Returns:
        A list of dicts, one per item of the JSON returned by the model, each
        merged with ``metadata``. ``None`` when the response cannot be parsed
        or does not have the expected structure; the offending response and
        the exception are printed in that case.
    """
    # A shared mutable default ({}) is replaced by a per-call dict.
    if metadata is None:
        metadata = {}
    if model is None:
        model = "mistral-openorca:latest"

    # Reference only: this system prompt is NOT sent with the request.
    # NOTE(review): presumably it is baked into the "kg_good" model — confirm.
    # SYS_PROMPT = (
    #     "You are a network graph maker who extracts terms and their relations from a given context. "
    #     "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
    #     "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
    #     "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
    #         "\tTerms may include object, entity, location, organization, person, \n"
    #         "\tcondition, acronym, documents, service, concept, etc.\n"
    #         "\tTerms should be as atomistic as possible\n\n"
    #     "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
    #         "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
    #         "\tTerms can be related to many other terms\n\n"
    #     "Thought 3: Find out the relation between each such related pair of terms. \n\n"
    #     "Format your output as a list of json. Each element of the list contains a pair of terms"
    #     "and the relation between them, like the follwing: \n"
    #     "[\n"
    #     "   {\n"
    #     '       "node_1": "A concept from extracted ontology",\n'
    #     '       "node_2": "A related concept from extracted ontology",\n'
    #     '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
    #     "   }, {...}\n"
    #     "]"
    # )

    USER_PROMPT = f"context: ```{input}``` \n\n output: "
    response = ollama.chat(model="kg_good", messages=[ {"role": "user", "content": USER_PROMPT}], options={"use_gpu": True})

    try:
        parsed = json.loads(response['message']['content'])
        # A lone JSON object is treated as a one-element list; iterating the
        # dict itself would walk its keys and discard the relation.
        if isinstance(parsed, dict):
            parsed = [parsed]
        result = [dict(item, **metadata) for item in parsed]
    except Exception as e:
        # Best effort: one unusable response must not abort the whole run.
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        print("Exception:", str(e))
        result = None

    return result


In [45]:
## To regenerate the graph with LLM, set this to True
regenerate = False

# NOTE(review): `outputdirectory` is not assigned in any cell shown in this
# notebook; it has to exist before this cell runs and must support the `/`
# operator (presumably a pathlib.Path — confirm).
if regenerate:
    # NOTE(review): graphPrompt sends every request to the "kg_good" model,
    # so the model name passed here has no effect at the moment.
    concepts_list = extractConcepts(df, model='zephyr:latest')
    dfg1 = cleanConcepts2df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    # Cache both the extracted relations and the chunks as "|"-separated CSV.
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    # Reuse the relations cached by an earlier run.
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

# Drop relations with an empty/missing node or edge label.
dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 10
## Every LLM-extracted relation gets a fixed count of 10
## (an earlier version of this comment said 4; the code assigns 10).
## Contextual-proximity rows computed later carry their own co-occurrence count.
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", None)  # No limit on width
pd.set_option("display.max_colwidth", None)  # No limit on column width
print(dfg1.shape)
dfg1.head()

(52, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,1,number,represented as,5a6ec9a2868449f787cbbd04d229fe70,10
1,1,single,indicated a single unit,5a6ec9a2868449f787cbbd04d229fe70,10
2,introduction,virtual unwrapping techniques,covered,453a9cb6de4d459ca97edff5ab9aee69,10
3,brief history of virtual unwrapping techniques,virtual unwrapping techniques,related to,453a9cb6de4d459ca97edff5ab9aee69,10
4,how it works,virtual unwrapping techniques,explained,453a9cb6de4d459ca97edff5ab9aee69,10


In [46]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that occur in the same chunk.

    Args:
        df: Relation table with ``node_1``, ``node_2`` and ``chunk_id`` columns.
            It is not modified.

    Returns:
        A DataFrame with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids, one entry per co-occurrence), ``count``
        (number of co-occurrences) and ``edge`` (always
        ``"contextual proximity"``). Pairs are ordered, so both (a, b) and
        (b, a) appear. Pairs that co-occur only once are dropped.
    """
    ## Melt the dataframe into a list of nodes
    node_occurrences = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    pairs = pd.merge(node_occurrences, node_occurrences, on="chunk_id", suffixes=("_1", "_2"))

    # drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)

    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with 1 count. The explicit copy() makes the result an
    # independent frame, so adding the "edge" column below does not trigger
    # pandas' SettingWithCopyWarning.
    edges = edges[edges["count"] != 1].copy()
    edges["edge"] = "contextual proximity"
    return edges


# Derive co-occurrence edges from the relation table and show the last rows.
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
969,x-ray computed tomography (ct),virtual unwrapping,"0788cec8ead6448d8476a3f54174f34d,0788cec8ead6448d8476a3f54174f34d,0788cec8ead6448d8476a3f54174f34d,0788cec8ead6448d8476a3f54174f34d,0788cec8ead6448d8476a3f54174f34d",5,contextual proximity
971,x-ray ct scan,algorithms,"96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5",2,contextual proximity
982,x-ray ct scan,research,"96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5",3,contextual proximity
986,x-ray ct scan,virtual unwrapping,"96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5",5,contextual proximity
994,x-ray source,scanning,"88f48d8de0b84cb690c4211e7db00389,88f48d8de0b84cb690c4211e7db00389,88f48d8de0b84cb690c4211e7db00389",3,contextual proximity


## Merge both dataframes


In [47]:
# Stack the LLM relations (dfg1) and the co-occurrence edges (dfg2), then
# collapse rows sharing the same ordered (node_1, node_2) pair: chunk ids and
# edge labels are comma-joined and counts are summed.
# (a, b) and (b, a) are different groups here and stay separate rows.
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg
# Alternative kept for reference (disabled): same merge, but duplicate edge
# labels within a group are removed via set() before joining.
# dfg = pd.concat([dfg1, dfg2], axis=0, ignore_index=True)

# dfg = dfg.groupby(["node_1", "node_2"], as_index=False).agg({
#     "chunk_id": ",".join, 
#     "edge": lambda x: ','.join(set(x)),  # Avoid duplicate edges in text
#     "count": "sum"  # Ensure the count is properly accumulated
# })
# dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,1,number,"5a6ec9a2868449f787cbbd04d229fe70,5a6ec9a2868449f787cbbd04d229fe70,5a6ec9a2868449f787cbbd04d229fe70","represented as,contextual proximity",12
1,1,single,"5a6ec9a2868449f787cbbd04d229fe70,5a6ec9a2868449f787cbbd04d229fe70,5a6ec9a2868449f787cbbd04d229fe70","indicated a single unit,contextual proximity",12
2,3d image,scanning,"88f48d8de0b84cb690c4211e7db00389,88f48d8de0b84cb690c4211e7db00389,88f48d8de0b84cb690c4211e7db00389",contextual proximity,3
3,6,4,9fe9dfea575147fca818074c003d84a3,divided into,10
4,advanced algorithms,layers within scanned data,88f48d8de0b84cb690c4211e7db00389,identify,10
5,advanced algorithms,scanning,"88f48d8de0b84cb690c4211e7db00389,88f48d8de0b84cb690c4211e7db00389,88f48d8de0b84cb690c4211e7db00389",contextual proximity,3
6,algorithm,contrast,5e6c95b9fcd24c0c8d18cb6756e91713,achieved,10
7,algorithm,layers,"5e6c95b9fcd24c0c8d18cb6756e91713,5e6c95b9fcd24c0c8d18cb6756e91713",contextual proximity,2
8,algorithms,analysis,"96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5",contextual proximity,2
9,algorithms,cultural artifacts,"96e305fc48d946daab61df9d5c64d1b5,96e305fc48d946daab61df9d5c64d1b5",contextual proximity,2


In [48]:
# Distinct node names appearing on either side of any edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(75,)

In [49]:
import networkx as nx

# Undirected graph: (a, b) and (b, a) refer to the same edge.
G = nx.Graph()

## Add nodes to the graph
# Node keys are stored as strings.
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph
for _, row in dfg.iterrows():
    source, target = str(row["node_1"]), str(row["node_2"])
    weight = row["count"] / 4
    # dfg holds ordered pairs, so the same undirected edge can arrive twice
    # (once per direction). Calling add_edge again would overwrite the title
    # and weight stored by the first row, e.g. replacing an LLM relation with
    # a bare "contextual proximity" link. Keep the heavier of the two rows.
    if G.has_edge(source, target) and G[source][target]["weight"] >= weight:
        continue
    G.add_edge(source, target, title=row["edge"], weight=weight)

In [50]:
# girvan_newman returns a generator of partitions of G; two partitions are
# pulled from it here and only the second one is used below.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
# Sort the members of each community and then the communities themselves so
# the printed result has a stable order.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  6
[['1', 'number', 'single'], ['3d image', 'advanced algorithms', 'ct imaging', 'faint text or images', 'layers within scanned data', 'object', 'scanned data', 'scanning', 'segmentation & layer detection', 'texture mapping', 'x-ray source'], ['4', '6'], ['algorithm', 'contrast', 'digital algorithms', 'hidden text or details', 'image processing techniques', 'ink deposits', 'intensity', 'layers', 'layers of interest', 'machine learning', 'physical layers', 'texture', 'virtual layers', 'writing materials'], ['algorithms', 'analysis', 'archeology', 'artifacts', 'conservation of cultural heritage', 'cultural artifacts', 'deep learning algorithms', 'delicate writings and artifacts', 'digital procedure', 'digital reconstruction algorithms', 'forensic medicine', 'fragile items', 'herculaneum scrolls', 'historical knowledge', 'images', 'imaging technologies', 'innards', 'intelligence', 'internal structure imaging', 'layers of text', 'limits', 'material composition', 'me

In [51]:
import seaborn as sns
# Name of the seaborn palette the community colours are drawn from.
palette = "hls"


def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one group number to every community.

    A palette with as many colours as there are communities is generated,
    shuffled with ``random.shuffle`` (so the colour assignment differs between
    runs unless the ``random`` module is seeded), and one colour is taken from
    the end of the shuffled list for each community in turn.

    Args:
        communities: Sized collection of communities, each an iterable of nodes.

    Returns:
        DataFrame with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based index of the node's community).
    """
    shuffled_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled_colors)

    records = []
    for group_number, members in enumerate(communities, start=1):
        community_color = shuffled_colors.pop()
        records.extend(
            {"node": member, "color": community_color, "group": group_number}
            for member in members
        )
    return pd.DataFrame(records)


# One row per node with the colour and group number of its community.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,1,#d3db57,1
1,number,#d3db57,1
2,single,#d3db57,1
3,3d image,#5f57db,2
4,advanced algorithms,#5f57db,2
5,ct imaging,#5f57db,2
6,faint text or images,#5f57db,2
7,layers within scanned data,#5f57db,2
8,object,#5f57db,2
9,scanned data,#5f57db,2


In [52]:
# Copy each node's community group and colour onto the graph node, and use the
# node's degree (number of incident edges) as its size attribute.
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [53]:
# !pip install pyvis

from pyvis.network import Network

# Despite its name this is the path of the generated HTML file.
graph_output_directory = "./docs/index.html"

# Make sure the target folder exists so writing the HTML file does not fail
# when the notebook is run from a location without a ./docs folder.
os.makedirs(os.path.dirname(graph_output_directory), exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import nodes, edges and their attributes from the networkx graph.
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Show only the physics controls in the rendered page.
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
