## Setup

In [1]:
pip install yachalk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

## Create a dataframe of all the chunks

In [3]:
from docx import Document as Docx
from langchain.schema import Document

def read_docx(file_path):
    doc = Docx(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

file_path = '..\Resources\SongHongDoc.docx'
text = read_docx(file_path)
page = Document(page_content=text)

from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=128)

# Only take the first the raw_documents
pages = text_splitter.split_documents([page])


from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(19, 2)


Unnamed: 0,text,chunk_id
0,Bể trầm tích Sông Hồng là bể trầm tích Kainozo...,2b796e4335064983a21222b4251d11e9
1,"�ng đối tượng, khu vực và nghiên cứu tiềm năng...",65d47ce81f6f4f298178f626dd46dd9f
2,ề tiềm năng còn lại và đề xuất định hướng TKTD...,78f3613f808c40b59f9ff520584b8cfe
3,�c nước thay đổi từ 30 đến 800m có chỗ trên 10...,917952e0c1434dfaaca1ddcf15bcd9a4
4,"i các Lô 102, 103, 106, 107; mạng lưới tuyến 2...",8c8ea02594854884a84593411403b3c8


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df, generate_OpenAIGPT

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [5]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, generate = generate_OpenAIGPT)
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head(100)

.[{'node_1': 'Sông Hồng sedimentary basin', 'node_2': 'Kainozoi', 'edge': 'is a type of'}, {'node_1': 'Sông Hồng sedimentary basin', 'node_2': 'petroleum', 'edge': 'contains'}, {'node_1': 'Vietnam Oil and Gas Group', 'node_2': 'Sông Hồng sedimentary basin', 'edge': 'is interested in'}, {'node_1': 'Sông Hồng sedimentary basin', 'node_2': 'Ham Rong discovery', 'edge': 'has'}, {'node_1': 'Sông Hồng sedimentary basin', 'node_2': 'Ki Lan discovery', 'edge': 'has'}, {'node_1': 'Sông Hồng sedimentary basin', 'node_2': 'Blue Whale discovery', 'edge': 'has'}, {'node_1': 'Sông Hồng sedimentary basin', 'node_2': 'Ken Bau gas discovery', 'edge': 'has'}, {'node_1': 'Ham Rong discovery', 'node_2': 'carbonate reservoir', 'edge': 'is in'}, {'node_1': 'Ki Lan discovery', 'node_2': 'northern part of the basin', 'edge': 'is in'}, {'node_1': 'Blue Whale discovery', 'node_2': 'middle Miocene carbonate', 'edge': 'is in'}, {'node_1': 'Ken Bau gas discovery', 'node_2': 'late Miocene trap', 'edge': 'is in'}]
.

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,sông hồng sedimentary basin,kainozoi,is a type of,2b796e4335064983a21222b4251d11e9,4
1,sông hồng sedimentary basin,petroleum,contains,2b796e4335064983a21222b4251d11e9,4
2,vietnam oil and gas group,sông hồng sedimentary basin,is interested in,2b796e4335064983a21222b4251d11e9,4
3,sông hồng sedimentary basin,ham rong discovery,has,2b796e4335064983a21222b4251d11e9,4
4,sông hồng sedimentary basin,ki lan discovery,has,2b796e4335064983a21222b4251d11e9,4


In [24]:
dfg1

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,sông hồng sedimentary basin,kainozoi,is a type of,2b796e4335064983a21222b4251d11e9,4
1,sông hồng sedimentary basin,petroleum,contains,2b796e4335064983a21222b4251d11e9,4
2,vietnam oil and gas group,sông hồng sedimentary basin,is interested in,2b796e4335064983a21222b4251d11e9,4
3,sông hồng sedimentary basin,ham rong discovery,has,2b796e4335064983a21222b4251d11e9,4
4,sông hồng sedimentary basin,ki lan discovery,has,2b796e4335064983a21222b4251d11e9,4
...,...,...,...,...,...
184,seismic acquisition company,projection system,at,885ae2861dad4d389009d4992a896e5b,4
185,seismic data,common positioning system,is converted to,885ae2861dad4d389009d4992a896e5b,4
186,seismic data,projection system,is converted to,885ae2861dad4d389009d4992a896e5b,4
187,seismic data,coordinate system,is transformed to,885ae2861dad4d389009d4992a896e5b,4


## Calculating contextual proximity

In [10]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
3652,đá móng lộ thiên mesozoi – paleozoi,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",10,contextual proximity
3662,đá móng lộ thiên mesozoi – paleozoi đảo hải nam,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",10,contextual proximity
3672,địa hình đáy biển,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",10,contextual proximity
3682,địa lý,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",10,contextual proximity
3692,độ sâu nước biển,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",10,contextual proximity


### Merge both the dataframes

In [11]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,0.4 mmcf/d,flow,"a525c1c6d8e845319952801092a2d426,a525c1c6d8e84...",contextual proximity,2
1,0.4 mmcf/d,gk 114-kèn bầu-1x,"a525c1c6d8e845319952801092a2d426,a525c1c6d8e84...",contextual proximity,2
2,0.4 mmcf/d,well 105-cl-1x,"a525c1c6d8e845319952801092a2d426,a525c1c6d8e84...",contextual proximity,4
3,102/10-sp-1x well,106/10-hrn-1x well,"8ea238aabedf45fb8d685ea666b54b54,8ea238aabedf4...",contextual proximity,4
4,102/10-sp-1x well,"exploration block 102/10&106/10, pvep","8ea238aabedf45fb8d685ea666b54b54,8ea238aabedf4...",contextual proximity,2
...,...,...,...,...,...
1322,đá móng lộ thiên mesozoi – paleozoi,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",contextual proximity,10
1323,đá móng lộ thiên mesozoi – paleozoi đảo hải nam,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",contextual proximity,10
1324,địa hình đáy biển,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",contextual proximity,10
1325,địa lý,bể sông hồng,"78f3613f808c40b59f9ff520584b8cfe,78f3613f808c4...",contextual proximity,10


## Calculate the NetworkX Graph

In [12]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(245,)

In [13]:
pip install networkx

Collecting networkx
  Obtaining dependency information for networkx from https://files.pythonhosted.org/packages/38/e9/5f72929373e1a0e8d142a130f3f97e6ff920070f87f91c4e13e40e0fba5a/networkx-3.3-py3-none-any.whl.metadata
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-3.3-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.7 MB 1.1 MB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.7 MB 2.3 MB/s eta 0:00:01
   --------------- ------------------------ 0.7/1.7 MB 4.1 MB/s eta 0:00:01
   -------------------------- ------------- 1.1/1.7 MB 5.5 MB/s eta 0:00:01
   -------------------------------------- - 1.6/1.7 MB 6.4 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 6.4 MB/s eta 0:00:00
Installing collected packages: networkx
Successfully installed networkx-3.


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [15]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  13
[['0.4 mmcf/d', '102/10-sp-1x well', '106/10-hrd-1x well', '106/10-hrn-1x well', '2009', '4 gas reservoirs', 'august 2013', 'blue whale discovery', 'bạch đằng petroleum exploration production', 'carbon dioxide (co2)', 'carbonate reservoir', 'co2 content', 'coal reservoir', 'commercial gas', 'commercial result', 'completion', 'condensate barrels', 'contractor', 'crude oil signs', 'depth 2399-2437mmd', 'depth 3.201m', 'drilling well', 'dry gas', 'dst#1', 'dst#2', 'dấu hiệu dầu khí', 'dấu hiệu dầu thô', 'eni', 'eni, chrisenergy, neon', 'exploration', 'exploration and exploitation of oil and gas', 'exploration block 102/10&106/10, pvep', 'exploration drilling', 'flow', 'formation', 'gas', 'gas field', 'gas flow rate', 'gas reservoir', 'gas-condensate reservoir', 'gas-condensate, dry gas reservoirs', 'gk 102-tb-1x', 'gk 106-ds-1x', 'gk 106-yt-1x', 'gk 107-kỳ lân-1x', 'gk 114-kèn bầu-1x', 'ham rong discovery', 'hrn-1x', 'hrn-1x well', 'hydrogen sulfide (h2s)', 'hà

### Create a dataframe for community colors

In [17]:
pip install pyvis

Collecting pyvis
  Obtaining dependency information for pyvis from https://files.pythonhosted.org/packages/ab/4b/e37e4e5d5ee1179694917b445768bdbfb084f5a59ecd38089d3413d4c70f/pyvis-0.3.2-py3-none-any.whl.metadata
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jinja2>=2.9.6 (from pyvis)
  Obtaining dependency information for jinja2>=2.9.6 from https://files.pythonhosted.org/packages/30/6d/6de6be2d02603ab56e72997708809e8a5b0fbfee080735109b40a3564843/Jinja2-3.1.3-py3-none-any.whl.metadata
  Using cached Jinja2-3.1.3-py3-none-any.whl.metadata (3.3 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Obtaining dependency information for jsonpickle>=1.4.1 from https://files.pythonhosted.org/packages/19/c3/453e4e2da82d5efad9e653916a120d94daf5062f7eae43e28f39fff1bc6a/jsonpickle-3.0.4-py3-none-any.whl.metadata
  Downloading jsonpickle-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2>=2.9.6->pyvis)
  Obtaining dependency information for MarkupSa


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,0.4 mmcf/d,#63db57,1
1,102/10-sp-1x well,#63db57,1
2,106/10-hrd-1x well,#63db57,1
3,106/10-hrn-1x well,#63db57,1
4,2009,#63db57,1
...,...,...,...
240,seismic data quality,#57b5db,11
241,geological survey area,#7357db,12
242,northern and eastern phu khanh basin area,#7357db,12
243,seismic line density,#dbd957,13


### Add colors to the graph

In [19]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [22]:
from pyvis.network import Network

graph_output_directory = "./docs/index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html


UnicodeEncodeError: 'charmap' codec can't encode character '\u1ea7' in position 13237: character maps to <undefined>