In [1]:
!pip install langchain neo4j openai wikipedia tiktoken langchain_openai pdfplumber



In [2]:
#Package Imports
from langchain.graphs import Neo4jGraph
import neo4j

In [3]:
#Neo4j connection
url = "neo4j+s://39edf771.databases.neo4j.io"
username ="neo4j"
password = "31Nwe5MwJKLGHFCTtkmWQVO7R3DU1fYYvX_D63HZGEM"
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [4]:
#Langchain for graph
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [5]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
      return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [6]:
import openai
import json
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain import LLMChain
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Load config values
with open(r'config.json') as config_file:
    config_details = json.load(config_file)

# The base URL for your Azure OpenAI resource.
openai_api_base = config_details['OPENAI_API_BASE']

# API version e.g. "2023-07-01-preview"
openai_api_version = config_details['OPENAI_API_VERSION']

# The name of your Azure OpenAI deployment chat model. e.g. "gpt-35-turbo-0613"
deployment_name = config_details['DEPLOYMENT_NAME']

# The API key for your Azure OpenAI resource.
openai_api_key = os.getenv("OPENAI_API_KEY")

# This is set to `azure`
openai_api_type = "azure"

In [18]:
from langchain.chat_models import AzureChatOpenAI

# Create an instance of chat llm
llm = AzureChatOpenAI(
    azure_endpoint=openai_api_base,
    openai_api_version=openai_api_version,
    deployment_name=deployment_name,
    openai_api_key=openai_api_key,
    openai_api_type=openai_api_type,
)

In [19]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.openai_functions import create_structured_output_chain

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
    [(
      "system",
      f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a specialized algorithm designed to extract structured financial data from bank annual reports to build a comprehensive knowledge graph.
- **Nodes** represent financial terms, entities, departments, and concepts specific to the banking industry.
- The goal is to structure information in a manner that highlights financial relationships, decision-making hierarchies, and policy summaries.
## 2. Sections and Nodes
- **Sections**: Target key sections such as 'Director's Report', 'Statements of Financial Position', 'Income Statement', etc. Identify relevant financial data and decisions.
- **Entities and Concepts**: Create nodes for identifiable entities like board members, financial metrics, and policy terms.
- **Node IDs**: Use names or specific identifiers for node labels. Avoid integers or vague references.
- **Relationships**: Map relationships that reflect financial dependencies, reporting structures, and policy impacts.
## 3. Detailed Financial Extraction
- **Extract Details**: Focus on numbers and financial statements, converting them into properties of nodes. For instance, assets and liabilities figures should be attached to the 'Statement of Financial Position' node.
- **Accuracy and Precision**: Ensure the extraction of financial data is accurate, citing exact figures and contextual information.
- **Contextual Relevance**: Attach relevant notes and assumptions from the 'Notes to Financial Statements' to the appropriate financial statements or metrics.
## 4. Compliance and Consistency
- **Regulatory Statements**: Extract and highlight compliance statements from 'Independent Auditors' Report' and 'Statutory Declaration'.
- **Maintain Consistency**: Use consistent terminology across different sections of the report to avoid confusion.
- **Strict Compliance**: Adhere strictly to the rules for knowledge graph construction.
## 5. Tips
- Remember to format financial data as attributes of the nodes and structure the graph to reflect the organization and flow of the annual report.
Use the given format to extract information from the following input: <input here>
Tip: Make sure to answer in the correct format
"""),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)


In [20]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None
) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document)['function']
    
    # Construct a graph document
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in data.nodes],
        relationships=[map_to_base_relationship(rel) for rel in data.rels],
        source=document.dict()  # Convert Document object to dictionary
    )
    
    # Store information into a graph
    graph.add_graph_documents([graph_document])


In [23]:
def transform_table_to_graph_nodes_and_relationships(table):
    headers = table[0]
    rows = table[1:]
    
    nodes = []
    relationships = []
    
    for row in rows:
        node_properties = {}
        for header, value in zip(headers, row):
            if header and value:  # Ensure neither header nor value is None
                node_properties[header] = value
        
        # Create a node for each row
        if headers[0] in node_properties:  # Ensure the first column has a value
            node = Node(
                id=node_properties[headers[0]],  # Assume the first column is a unique identifier
                type="FinancialData",
                properties=[Property(key=k, value=str(v)) for k, v in node_properties.items() if k and v]
            )
            nodes.append(node)
        
            # Create relationships if needed (e.g., linking related rows)
            for related_row in rows:
                if row != related_row and headers[0] in related_row:  # Example condition, adjust as needed
                    relationship = Relationship(
                        source=node,
                        target=Node(id=related_row[0], type="FinancialData"),
                        type="Related",
                        properties=[]
                    )
                    relationships.append(relationship)
    
    return nodes, relationships

# Read text and tables from PDF
raw_text, raw_tables = extract_text_and_tables_from_pdf(pdf_path)
if raw_text:
    # Preprocess the text
    preprocessed_text = preprocess_text(raw_text)
    
    # Define chunking strategy
    text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

    # Apply text splitter
    documents = text_splitter.split_text(preprocessed_text)

    # Output the first few chunks of text
    for doc in documents[:3]:  # Adjust the number of documents to output as needed
        print("Document Chunk:")
        print(doc)

    # Process and store tables
    nodes, relationships = process_tables(raw_tables)
    
    # Output nodes and relationships for later processing
    for node in nodes[:3]:  # Adjust the number of nodes to output as needed
        print("Node:")
        print(node)
    for relationship in relationships[:3]:  # Adjust the number of relationships to output as needed
        print("Relationship:")
        print(relationship)
else:
    print("No text extracted from the PDF.")


Document Chunk:
Together drive sustainable growth FINANCIAL REPORT 2022 RHB Bank Berhad 196501000373 6171 FINANCIAL REPORT 2022 RHB Bank Berhad 196501000373 6171 Level 10 Tower One RHB Centre Jalan Tun Razak 50400 Kuala Lumpur Malaysia Tel 603 9287 8888 Fax 603 9281 9314 facebook com RHBgroup instagram com RHBgroup Rhbgroup twitter com RHBgroup youtube com RHBgroup www rhbgroup com mySTATUTORY FINANCIAL STATEMENTS2 Responsibility Statement Board Directors 3 Directors Report 8 Statements Financial Position 10 Income Statements 11 Statements Comprehensive Income 12 Statements Changes Equity 16 Statements Cash Flows 22 Summary Significant Accounting Policies Critical Accounting Estimates Assumptions 53 Notes Financial Statements 196 Statement Directors 196 Statutory Declaration 197 Independent Auditors Report Members RHB Bank BerhadINTEGRATED REPORT 2022 Responsibility Statement Board Directors Directors responsible ensuring audited financial statement Group Bank drawn accordance Malaysia

In [24]:
from tqdm import tqdm
from langchain.schema import Document

# Process each document chunk and extract/store the graph
for i, chunk in tqdm(enumerate(documents), total=len(documents)):
    doc = Document(page_content=chunk, metadata={"source": pdf_path, "chunk": i})
    try:
        extract_and_store_graph(doc)
        print(f"Successfully processed and stored chunk {i+1}")
    except Exception as e:
        print(f"An error occurred while processing chunk {i+1}: {e}")


  0%|          | 0/57 [00:00<?, ?it/s]

  warn_deprecated(
  2%|▏         | 1/57 [00:48<45:41, 48.95s/it]

Successfully processed and stored chunk 1


  4%|▎         | 2/57 [03:27<1:43:43, 113.15s/it]

Successfully processed and stored chunk 2


  5%|▌         | 3/57 [03:28<55:59, 62.21s/it]   

Successfully processed and stored chunk 3


  7%|▋         | 4/57 [04:11<48:22, 54.76s/it]

Successfully processed and stored chunk 4


  9%|▉         | 5/57 [05:09<48:23, 55.83s/it]

Successfully processed and stored chunk 5


 11%|█         | 6/57 [06:31<54:57, 64.66s/it]

Successfully processed and stored chunk 6


 12%|█▏        | 7/57 [07:03<44:52, 53.84s/it]

Successfully processed and stored chunk 7


 14%|█▍        | 8/57 [07:26<36:06, 44.21s/it]

Successfully processed and stored chunk 8


 16%|█▌        | 9/57 [07:44<28:44, 35.93s/it]

Successfully processed and stored chunk 9


 18%|█▊        | 10/57 [07:52<21:21, 27.26s/it]

Successfully processed and stored chunk 10


 19%|█▉        | 11/57 [08:19<20:59, 27.39s/it]

Successfully processed and stored chunk 11


 21%|██        | 12/57 [08:27<16:00, 21.35s/it]

Successfully processed and stored chunk 12


 23%|██▎       | 13/57 [08:56<17:19, 23.63s/it]

Successfully processed and stored chunk 13


 25%|██▍       | 14/57 [09:14<15:41, 21.90s/it]

Successfully processed and stored chunk 14


 26%|██▋       | 15/57 [09:28<13:49, 19.74s/it]

Successfully processed and stored chunk 15


 28%|██▊       | 16/57 [10:16<19:15, 28.18s/it]

Successfully processed and stored chunk 16


 30%|██▉       | 17/57 [10:28<15:30, 23.26s/it]

Successfully processed and stored chunk 17


 32%|███▏      | 18/57 [11:38<24:12, 37.23s/it]

Successfully processed and stored chunk 18


 33%|███▎      | 19/57 [12:06<21:46, 34.37s/it]

Successfully processed and stored chunk 19


 35%|███▌      | 20/57 [12:39<20:56, 33.97s/it]

Successfully processed and stored chunk 20


 37%|███▋      | 21/57 [15:58<50:15, 83.77s/it]

Successfully processed and stored chunk 21


 39%|███▊      | 22/57 [16:07<35:44, 61.28s/it]

An error occurred while processing chunk 22: 1 validation error for _OutputFormatter
output -> rels
  field required (type=value_error.missing)


 40%|████      | 23/57 [16:32<28:32, 50.36s/it]

Successfully processed and stored chunk 23


 42%|████▏     | 24/57 [16:42<20:57, 38.11s/it]

Successfully processed and stored chunk 24


 44%|████▍     | 25/57 [16:48<15:11, 28.47s/it]

Successfully processed and stored chunk 25


 46%|████▌     | 26/57 [17:18<15:00, 29.04s/it]

Successfully processed and stored chunk 26


 47%|████▋     | 27/57 [17:41<13:33, 27.12s/it]

Successfully processed and stored chunk 27


 49%|████▉     | 28/57 [18:46<18:40, 38.64s/it]

Successfully processed and stored chunk 28


 51%|█████     | 29/57 [19:08<15:43, 33.68s/it]

Successfully processed and stored chunk 29


 53%|█████▎    | 30/57 [19:17<11:50, 26.30s/it]

Successfully processed and stored chunk 30


 54%|█████▍    | 31/57 [19:31<09:42, 22.40s/it]

Successfully processed and stored chunk 31


 56%|█████▌    | 32/57 [20:27<13:35, 32.61s/it]

Successfully processed and stored chunk 32


 58%|█████▊    | 33/57 [20:39<10:36, 26.53s/it]

Successfully processed and stored chunk 33


 60%|█████▉    | 34/57 [21:22<12:00, 31.31s/it]

Successfully processed and stored chunk 34


 61%|██████▏   | 35/57 [21:56<11:50, 32.28s/it]

An error occurred while processing chunk 35: 1 validation error for _OutputFormatter
output -> rels
  field required (type=value_error.missing)


 63%|██████▎   | 36/57 [22:11<09:29, 27.10s/it]

Successfully processed and stored chunk 36


 65%|██████▍   | 37/57 [22:42<09:20, 28.04s/it]

Successfully processed and stored chunk 37


 67%|██████▋   | 38/57 [23:26<10:27, 33.04s/it]

Successfully processed and stored chunk 38


 68%|██████▊   | 39/57 [23:53<09:19, 31.06s/it]

An error occurred while processing chunk 39: 1 validation error for _OutputFormatter
output -> rels
  field required (type=value_error.missing)


 70%|███████   | 40/57 [24:23<08:40, 30.64s/it]

Successfully processed and stored chunk 40


 72%|███████▏  | 41/57 [24:32<06:29, 24.32s/it]

Successfully processed and stored chunk 41


 74%|███████▎  | 42/57 [24:51<05:40, 22.73s/it]

Successfully processed and stored chunk 42


 75%|███████▌  | 43/57 [25:11<05:06, 21.88s/it]

Successfully processed and stored chunk 43


 77%|███████▋  | 44/57 [25:24<04:08, 19.11s/it]

Successfully processed and stored chunk 44


 79%|███████▉  | 45/57 [25:33<03:13, 16.11s/it]

Successfully processed and stored chunk 45


 81%|████████  | 46/57 [29:03<13:39, 74.46s/it]

An error occurred while processing chunk 46: 1 validation error for _OutputFormatter
__root__
  Unterminated string starting at: line 2556 column 17 (char 52846) (type=value_error.jsondecode; msg=Unterminated string starting at; doc={
  "output": {
    "nodes": [
      {
        "id": "179",
        "type": "Financial Figure",
        "properties": [
          {
            "key": "Value",
            "value": "179"
          }
        ]
      },
      {
        "id": "193",
        "type": "Financial Figure",
        "properties": [
          {
            "key": "Value",
            "value": "193"
          }
        ]
      },
      {
        "id": "204",
        "type": "Financial Term",
        "properties": []
      },
      {
        "id": "Foreign exchange related contract",
        "type": "Financial Concept",
        "properties": []
      },
      {
        "id": "7",
        "type": "Financial Figure",
        "properties": [
          {
            "key": "Value",
        

 82%|████████▏ | 47/57 [30:16<12:20, 74.04s/it]

Successfully processed and stored chunk 47


 84%|████████▍ | 48/57 [31:00<09:43, 64.82s/it]

Successfully processed and stored chunk 48


 86%|████████▌ | 49/57 [31:12<06:32, 49.09s/it]

Successfully processed and stored chunk 49


 88%|████████▊ | 50/57 [31:17<04:10, 35.85s/it]

Successfully processed and stored chunk 50


 89%|████████▉ | 51/57 [32:04<03:55, 39.25s/it]

Successfully processed and stored chunk 51


 91%|█████████ | 52/57 [32:05<02:18, 27.71s/it]

Successfully processed and stored chunk 52


 93%|█████████▎| 53/57 [33:24<02:52, 43.03s/it]

Successfully processed and stored chunk 53


 95%|█████████▍| 54/57 [33:29<01:34, 31.63s/it]

Successfully processed and stored chunk 54


 96%|█████████▋| 55/57 [34:32<01:22, 41.17s/it]

Successfully processed and stored chunk 55


 98%|█████████▊| 56/57 [38:02<01:31, 91.77s/it]

An error occurred while processing chunk 56: 1 validation error for _OutputFormatter
__root__
  Expecting ',' delimiter: line 2566 column 1 (char 57340) (type=value_error.jsondecode; msg=Expecting ',' delimiter; doc={
  "output": {
    "nodes": [
      {
        "id": "shariah_act_guide",
        "type": "concept",
        "properties": [
          {
            "key": "description",
            "value": "Related Shariah Act guide"
          }
        ]
      },
      {
        "id": "personnel",
        "type": "concept",
        "properties": [
          {
            "key": "description",
            "value": "Personnel engaged in Islamic business activity"
          }
        ]
      },
      {
        "id": "shariah_principle",
        "type": "concept",
        "properties": [
          {
            "key": "description",
            "value": "Shariah principle"
          }
        ]
      },
      {
        "id": "prudence",
        "type": "concept",
        "properties": [
   

100%|██████████| 57/57 [38:05<00:00, 40.09s/it]

Successfully processed and stored chunk 57



