In [45]:
!pip install langchain neo4j openai wikipedia tiktoken langchain_openai pdfplumber



In [46]:
#Package Imports
from langchain.graphs import Neo4jGraph
import neo4j

In [47]:
#Neo4j connection
# Credentials come from the environment so that no secret is stored in the
# notebook. If NEO4J_PASSWORD is not set, the user is prompted for it.
# NOTE(review): the password that used to be hardcoded here has been exposed
# in the notebook history and should be rotated.
import os
from getpass import getpass

url = os.environ.get("NEO4J_URI", "neo4j+s://e1557b9c.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [48]:
#Langchain for graph
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
    """A single property consisting of key and value"""

    # Name of the property.
    key: str = Field(default=..., description="key")
    # Value of the property, carried as a string.
    value: str = Field(default=..., description="value")

class Node(BaseNode):
    # Base graph node whose properties are declared as an optional list of
    # key/value `Property` items; `map_to_base_node` turns them into a dict.
    properties: Optional[List[Property]] = Field(
        default=None,
        description="List of node properties",
    )

class Relationship(BaseRelationship):
    # Base graph relationship whose properties are declared as an optional
    # list of key/value `Property` items; `map_to_base_relationship` turns
    # them into a dict.
    properties: Optional[List[Property]] = Field(
        default=None,
        description="List of relationship properties",
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""

    # Every node of the graph (required).
    nodes: List[Node] = Field(
        default=...,
        description="List of nodes in the knowledge graph",
    )
    # Every relationship of the graph (required).
    rels: List[Relationship] = Field(
        default=...,
        description="List of relationships in the knowledge graph",
    )

In [49]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase key.

    The first word is lower-cased, every following word is capitalized, and
    the pieces are concatenated without separators. A string containing no
    words (empty or whitespace only) is returned unchanged.
    """
    parts = s.split()
    if len(parts) == 0:
        return s
    camel = parts[0].lower()
    for part in parts[1:]:
        camel += part.capitalize()
    return camel

def props_to_dict(props) -> dict:
    """Build a plain dict from an iterable of objects exposing ``key``/``value``.

    Keys are normalised with ``format_property_key``. A falsy ``props``
    (``None`` or empty) yields an empty dict; when two properties normalise
    to the same key, the later one wins.
    """
    if not props:
        return {}
    return {format_property_key(prop.key): prop.value for prop in props}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into a ``BaseNode``.

    The id is title-cased and the type capitalized. The property list is
    flattened into a dict, and the title-cased id is also stored under the
    ``name`` property (overriding any extracted ``name``) for better Cypher
    statement generation.
    """
    attrs = {}
    if node.properties:
        attrs = props_to_dict(node.properties)
    attrs["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(),
        type=node.type.capitalize(),
        properties=attrs,
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into a ``BaseRelationship``.

    Both endpoints are normalised through ``map_to_base_node``; the
    relationship type is kept as-is and the property list is flattened into
    a dict (empty when there are no properties).
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    attrs = {}
    if rel.properties:
        attrs = props_to_dict(rel.properties)
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=attrs,
    )

In [50]:
import openai
import json
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain import LLMChain
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Load config values.
# The encoding is given explicitly so the JSON file is decoded the same way
# regardless of the platform's default code page.
with open('config.json', encoding='utf-8') as config_file:
    config_details = json.load(config_file)

# The base URL for your Azure OpenAI resource.
openai_api_base = config_details['OPENAI_API_BASE']

# API version e.g. "2023-07-01-preview"
openai_api_version = config_details['OPENAI_API_VERSION']

# The name of your Azure OpenAI deployment chat model. e.g. "gpt-35-turbo-0613"
deployment_name = config_details['DEPLOYMENT_NAME']

# The API key for your Azure OpenAI resource (read from the environment,
# which load_dotenv() above may have populated from a .env file).
openai_api_key = os.getenv("OPENAI_API_KEY")

# This is set to `azure`
openai_api_type = "azure"


# Create an instance of chat llm.
# AzureChatOpenAI is already imported at the top of this cell, so the second
# import that used to sit here was redundant and has been dropped.
llm = AzureChatOpenAI(
    azure_endpoint=openai_api_base,
    openai_api_version=openai_api_version,
    deployment_name=deployment_name,
    openai_api_key=openai_api_key,
    openai_api_type=openai_api_type,
)

In [51]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.openai_functions import create_structured_output_chain

def _format_allowed_clause(title: str, values: Optional[List[str]]) -> str:
    """Return a prompt bullet listing ``values``, or ``""`` when there are none.

    Braces are doubled because the result becomes part of a prompt template
    in which single braces denote template variables.
    """
    if not values:
        return ""
    joined = ", ".join(values).replace("{", "{{").replace("}", "}}")
    return f"\n- **{title}**: {joined}"


def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a ``KnowledgeGraph``.

    Args:
        allowed_nodes: Optional node labels the model should restrict itself
            to. When given, they are listed in section 2 of the system prompt.
        allowed_rels: Optional relationship types, listed the same way.

    Returns:
        The chain created by ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt
        built here.

    Previously both arguments were accepted but never used. With no
    restrictions the system prompt is identical to the original text.
    """
    node_rule = _format_allowed_clause("Allowed Node Labels", allowed_nodes)
    rel_rule = _format_allowed_clause("Allowed Relationship Types", allowed_rels)
    prompt = ChatPromptTemplate.from_messages(
    [(
      "system",
      f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a specialized algorithm designed to extract structured financial data from bank annual reports to build a comprehensive knowledge graph.
- **Nodes** represent financial terms, entities, departments, and concepts specific to the banking industry.
- The goal is to structure information in a manner that highlights financial relationships, decision-making hierarchies, and policy summaries.

## 2. Sections and Nodes
- **Sections**: Target key sections such as 'Director's Report', 'Statements of Financial Position', 'Income Statement', etc. Identify relevant financial data and decisions.
- **Entities and Concepts**: Create nodes for identifiable entities like board members, financial metrics, and policy terms.{node_rule}
- **Node IDs**: Use names or specific identifiers for node labels. Avoid integers or vague references.
- **Relationships**: Map relationships that reflect financial dependencies, reporting structures, and policy impacts.{rel_rule}

## 3. Detailed Financial Extraction
- **Extract Details**: Focus on numbers and financial statements, converting them into properties of nodes. For instance, assets and liabilities figures should be attached to the 'Statement of Financial Position' node.
- **Accuracy and Precision**: Ensure the extraction of financial data is accurate, citing exact figures and contextual information.
- **Contextual Relevance**: Attach relevant notes and assumptions from the 'Notes to Financial Statements' to the appropriate financial statements or metrics.

## 4. Compliance and Consistency
- **Regulatory Statements**: Extract and highlight compliance statements from 'Independent Auditors' Report' and 'Statutory Declaration'.
- **Maintain Consistency**: Use consistent terminology across different sections of the report to avoid confusion.
- **Strict Compliance**: Adhere strictly to the rules for knowledge graph construction.

## 5. Symbolic Tokens and Special Characters
- **Symbolic Tokens**: Identify and properly handle symbolic tokens (e.g., %, $, £, etc.) and ensure they are accurately represented in the data.
- **Special Characters**: Recognize special characters and their meanings (e.g., ± for approximately, > for greater than, etc.) and handle them appropriately in the extraction process.

## 6. Domain Vocabulary and Abbreviations
- **Domain Vocabulary**: Utilize and recognize domain-specific vocabulary and terminology relevant to the banking and financial industry.
- **Abbreviations**: Identify and expand domain-related abbreviations (e.g., FVOCI for Fair Value through Other Comprehensive Income) to ensure clarity and accuracy.

## 7. N-gram Extraction
- **N-grams**: Extract meaningful N-grams (bigrams, trigrams, etc.) that represent significant financial terms or phrases and include them as part of the node properties.

## 8. Tips
- Remember to format financial data as attributes of the nodes and structure the graph to reflect the organization and flow of the annual report.
- Use the given format to extract information from the following input: <input here>
- Tip: Make sure to answer in the correct format
"""),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)


In [52]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None
) -> None:
    """Extract a knowledge graph from one document and store it in the graph.

    Args:
        document: Document whose ``page_content`` is sent to the extraction chain.
        nodes: Optional node labels forwarded to ``get_extraction_chain``.
        rels: Optional relationship types forwarded to ``get_extraction_chain``.

    Any exception raised by the chain, the mapping helpers or the graph store
    propagates to the caller.
    """
    # Extract graph data using OpenAI functions.
    # Only the text is passed to the chain. NOTE(review): passing the Document
    # object itself presumably gets it string-formatted into the prompt's
    # {input} slot, metadata included — confirm against the chain's behaviour.
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']

    # Construct a graph document
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in data.nodes],
        relationships=[map_to_base_relationship(rel) for rel in data.rels],
        source=document.dict()  # Convert Document object to dictionary
    )

    # Store information into a graph
    graph.add_graph_documents([graph_document])


In [25]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [36]:
!pip install tensorflow keras torch transformers spacy





In [41]:
import pdfplumber
import os
import re
import nltk
import spacy
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langchain.text_splitter import TokenTextSplitter
import pickle

# Ensure nltk resources are downloaded
# (stop-word list, WordNet data for the lemmatizer, and the punkt tokenizer
# data — the three NLTK features imported above).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load FinBERT model for NER
# NOTE(review): the checkpoint owner name suggests this may be a
# Finnish-language BERT model rather than a finance-domain one — confirm it
# is suitable for English financial reports.
model_checkpoint = "Kansallisarkisto/finbert-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple", framework="pt"
)

# Define the folder containing the PDF files and the ground truth folder
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run on another machine without editing them.
pdf_folder_path = r"C:\Users\WendyChuaXingZhao\OneDrive - SRKK Group of Companies\Documents\MsDS\Knowledge Graph Test\Financial Documents"
ground_truth_folder_path = r"C:\Users\WendyChuaXingZhao\OneDrive - SRKK Group of Companies\Documents\MsDS\Knowledge Graph Test\ground_truth"

# Create ground truth folder if it doesn't exist
os.makedirs(ground_truth_folder_path, exist_ok=True)

def extract_text_and_tables_from_pdf(pdf_path):
    """Read a PDF with pdfplumber and return its text and its tables.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        ``(full_text, tables)``. ``full_text`` is the text of every page that
        yielded any, joined with newlines; ``tables`` is a flat list of all
        tables returned by ``page.extract_tables()``. If anything fails, the
        error is printed and ``(None, None)`` is returned.
    """
    try:
        page_texts = []
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)
        # Pages are joined with a newline: concatenating them directly fused
        # the last word of one page with the first word of the next.
        return "\n".join(page_texts), tables
    except Exception as e:
        # Best effort: report the failure and let the caller skip this file.
        print(f"An error occurred while reading the PDF {pdf_path}: {e}")
        return None, None

def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    Steps: lower-case; collapse whitespace runs; replace every run of
    characters other than a-z, 0-9 and whitespace with a space; tokenize with
    NLTK; drop English stop words; lemmatize with WordNet.
    """
    lowered = text.lower()
    normalized = re.sub(r'[^a-z0-9\s]+', ' ', re.sub(r'[\s]+', ' ', lowered))

    words = word_tokenize(normalized)
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for word in words:
        if word.lower() in english_stop_words:
            continue
        lemmas.append(word)
    lemmas = list(map(lemmatizer.lemmatize, lemmas))
    return ' '.join(lemmas)

def extract_entities_spacy(text):
    """Return the text of each spaCy entity whose label is in the kept set.

    Kept labels: PERSON, ORG, GPE, LOC, DATE, MONEY, PERCENT. Entities are
    returned in document order, duplicates included.
    """
    wanted_labels = {'PERSON', 'ORG', 'GPE', 'LOC', 'DATE', 'MONEY', 'PERCENT'}
    found = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted_labels:
            found.append(ent.text)
    return found

def extract_entities_finbert(text):
    """Run the token-classification pipeline and return each result's ``word``."""
    words = []
    for hit in token_classifier(text):
        words.append(hit['word'])
    return words

def combine_and_postprocess_entities(spacy_entities, finbert_entities):
    """Merge two entity lists into one de-duplicated, space-joined string.

    Args:
        spacy_entities: Entity strings from the spaCy extractor.
        finbert_entities: Entity strings from the token-classification pipeline.

    Returns:
        The unique entities, in first-seen order (spaCy's first), that are
        longer than one character and purely alphanumeric, joined by spaces.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # set-based version produced a different word order on every run because
    # string hashing is randomised per process.
    unique_entities = dict.fromkeys(list(spacy_entities) + list(finbert_entities))

    # Post-processing to remove irrelevant entities (single characters or
    # non-alphanumeric entities).
    # NOTE(review): str.isalnum() is False for anything containing a space, so
    # every multi-word entity is dropped here — confirm that is intended.
    processed_entities = [
        entity for entity in unique_entities if len(entity) > 1 and entity.isalnum()
    ]
    return ' '.join(processed_entities)

def extract_and_preprocess_pdf(pdf_path):
    """Run the full extraction pipeline for one PDF.

    The PDF text is preprocessed, entities are extracted with both spaCy and
    the token-classification pipeline, merged into a single string, and that
    string is split into token chunks (2048 tokens, overlap 24).

    Returns:
        ``(chunks, entity_text)``; ``([], '')`` when the PDF produced no text.
    """
    raw_text, _tables = extract_text_and_tables_from_pdf(pdf_path)
    if not raw_text:
        return [], ''

    cleaned_text = preprocess_text(raw_text)
    from_spacy = extract_entities_spacy(cleaned_text)
    from_finbert = extract_entities_finbert(cleaned_text)
    entity_text = combine_and_postprocess_entities(from_spacy, from_finbert)

    splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)
    return splitter.split_text(entity_text), entity_text

# List all PDF files in the directory.
# The listing is sorted because os.listdir returns names in arbitrary order,
# which would make the order of `all_documents` vary between machines.
pdf_paths = [
    os.path.join(pdf_folder_path, f)
    for f in sorted(os.listdir(pdf_folder_path))
    if f.lower().endswith('.pdf')
]

# Extract and preprocess documents
all_documents = []
for pdf_path in pdf_paths:
    documents, combined_entities = extract_and_preprocess_pdf(pdf_path)
    all_documents.extend(documents)

    # Save the combined entity text as ground truth.
    # splitext swaps only the real extension: the previous
    # .replace('.pdf', '.txt') missed upper-case ".PDF" names (which the filter
    # above accepts) and also rewrote ".pdf" occurring inside a file name.
    # NOTE(review): this "ground truth" is the pipeline's own output, so any
    # evaluation against it is circular.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] + '.txt'
    ground_truth_path = os.path.join(ground_truth_folder_path, pdf_name)
    with open(ground_truth_path, 'w', encoding='utf-8') as f:
        f.write(combined_entities)

# Save preprocessed documents for later use
with open('processed_documents.pkl', 'wb') as f:
    pickle.dump(all_documents, f)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\WendyChuaXingZhao\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\WendyChuaXingZhao\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WendyChuaXingZhao\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt is already up-to-date!


In [42]:
import os
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

def load_ground_truth_data(ground_truth_folder_path):
    """Read every ``.txt`` file in a folder into a dict of lower-cased text.

    The extension check is case-insensitive. Keys are the file names exactly
    as listed; values are the UTF-8 decoded contents, lower-cased.
    """
    texts_by_file = {}
    txt_names = (
        name for name in os.listdir(ground_truth_folder_path)
        if name.lower().endswith('.txt')
    )
    for name in txt_names:
        full_path = os.path.join(ground_truth_folder_path, name)
        with open(full_path, 'r', encoding='utf-8') as handle:
            texts_by_file[name] = handle.read().lower()
    return texts_by_file

def calculate_precision_recall_f1(extracted_text, ground_truth_text):
    """Score extracted text against ground truth on unique whitespace tokens.

    Both texts are split on whitespace and reduced to sets, so repeated
    tokens count once. Returns ``(precision, recall, f1)``; each value is 0
    when its denominator would be zero.
    """
    predicted = set(extracted_text.split())
    expected = set(ground_truth_text.split())

    hits = len(predicted & expected)
    spurious = len(predicted - expected)
    missed = len(expected - predicted)

    if hits + spurious > 0:
        precision = hits / (hits + spurious)
    else:
        precision = 0

    if hits + missed > 0:
        recall = hits / (hits + missed)
    else:
        recall = 0

    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

# Define the folder containing the ground truth text files
ground_truth_folder_path = r"C:\Users\WendyChuaXingZhao\OneDrive - SRKK Group of Companies\Documents\MsDS\Knowledge Graph Test\ground_truth"

# Load ground truth data
ground_truth_data = load_ground_truth_data(ground_truth_folder_path)
if not ground_truth_data:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f"No ground truth .txt files found in {ground_truth_folder_path}")

# Load preprocessed documents
with open('processed_documents.pkl', 'rb') as f:
    all_documents = pickle.load(f)

# Concatenate all extracted documents for simplicity. This does not depend on
# the file being scored, so it is computed once rather than per iteration.
# NOTE(review): every ground-truth file is scored against the text extracted
# from ALL PDFs, and the ground-truth files were written by the extraction
# pipeline itself — so recall is 1.00 by construction and precision mostly
# reflects how many files there are. Treat these numbers with caution.
extracted_text = ' '.join(all_documents)

# Evaluate the extracted documents
precisions = []
recalls = []
f1_scores = []

for file_name, ground_truth_text in ground_truth_data.items():
    precision, recall, f1 = calculate_precision_recall_f1(extracted_text, ground_truth_text)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate average metrics
average_precision = sum(precisions) / len(precisions)
average_recall = sum(recalls) / len(recalls)
average_f1 = sum(f1_scores) / len(f1_scores)

print(f"Average Precision: {average_precision:.2f}")
print(f"Average Recall: {average_recall:.2f}")
print(f"Average F1-Score: {average_f1:.2f}")


Average Precision: 0.17
Average Recall: 1.00
Average F1-Score: 0.29


In [45]:
!pip install stanza

Collecting stanza
  Using cached stanza-1.8.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Using cached emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Using cached stanza-1.8.2-py3-none-any.whl (990 kB)
Using cached emoji-2.12.1-py3-none-any.whl (431 kB)
Installing collected packages: emoji, stanza
Successfully installed emoji-2.12.1 stanza-1.8.2


In [46]:
import pdfplumber
import os
import re
import nltk
import spacy
from transformers import pipeline
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from langchain.text_splitter import TokenTextSplitter
import pickle
import stanza

# Ensure nltk resources are downloaded
# (stop-word list and WordNet data for the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load Stanza model
# The English resources are downloaded first, then a pipeline is built with
# Stanza's default processors for 'en'.
stanza.download('en')
stanza_nlp = stanza.Pipeline('en')

# Load FinBERT model for NER
# NOTE(review): the checkpoint owner name suggests this may be a
# Finnish-language BERT model rather than a finance-domain one — confirm it
# is suitable for English financial reports.
model_checkpoint = "Kansallisarkisto/finbert-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple", framework="pt"
)

# Define the folder containing the PDF files and the ground truth folder
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run on another machine without editing them.
pdf_folder_path = r"C:\Users\WendyChuaXingZhao\OneDrive - SRKK Group of Companies\Documents\MsDS\Knowledge Graph Test\Financial Documents"
ground_truth_folder_path = r"C:\Users\WendyChuaXingZhao\OneDrive - SRKK Group of Companies\Documents\MsDS\Knowledge Graph Test\ground_truth"

# Create ground truth folder if it doesn't exist
os.makedirs(ground_truth_folder_path, exist_ok=True)

def extract_text_and_tables_from_pdf(pdf_path):
    """Read a PDF with pdfplumber and return its text and its tables.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        ``(full_text, tables)``. ``full_text`` is the text of every page that
        yielded any, joined with newlines; ``tables`` is a flat list of all
        tables returned by ``page.extract_tables()``. If anything fails, the
        error is printed and ``(None, None)`` is returned.
    """
    try:
        page_texts = []
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)
        # Pages are joined with a newline: concatenating them directly fused
        # the last word of one page with the first word of the next.
        return "\n".join(page_texts), tables
    except Exception as e:
        # Best effort: report the failure and let the caller skip this file.
        print(f"An error occurred while reading the PDF {pdf_path}: {e}")
        return None, None

def preprocess_text_gensim(text):
    """Tokenize with Gensim, drop Gensim stop words, lemmatize with WordNet.

    Returns the resulting lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    # simple_preprocess is called with deacc=True, as in the original.
    for token in simple_preprocess(text, deacc=True):
        if token in STOPWORDS:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

def preprocess_text_stanza(text):
    """Tokenize and lemmatize with Stanza, dropping stop words and non-words.

    A word is kept when its lower-cased text is not a Gensim stop word and
    its text consists only of word characters that are not digits.

    Returns:
        The lemmas of the kept words joined by single spaces.
    """
    doc = stanza_nlp(text)
    lemmas = []
    for sentence in doc.sentences:
        for word in sentence.words:
            # STOPWORDS holds lower-case entries, so compare case-insensitively;
            # otherwise capitalised stop words such as "The" slip through.
            if word.text.lower() in STOPWORDS:
                continue
            if not re.match(r'[^\W\d]*$', word.text):
                continue
            # Defensive: fall back to the surface form if no lemma is set, since
            # a None entry would make the join below raise TypeError.
            lemmas.append(word.lemma if word.lemma is not None else word.text)
    return ' '.join(lemmas)

def extract_entities_spacy(text):
    """Return the text of each spaCy entity whose label is in the kept set.

    Kept labels: PERSON, ORG, GPE, LOC, DATE, MONEY, PERCENT. Entities are
    returned in document order, duplicates included.
    """
    wanted_labels = {'PERSON', 'ORG', 'GPE', 'LOC', 'DATE', 'MONEY', 'PERCENT'}
    found = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted_labels:
            found.append(ent.text)
    return found

def extract_entities_finbert(text):
    """Run the token-classification pipeline and return each result's ``word``."""
    words = []
    for hit in token_classifier(text):
        words.append(hit['word'])
    return words

def combine_and_postprocess_entities(spacy_entities, finbert_entities):
    """Merge two entity lists into one de-duplicated, space-joined string.

    Args:
        spacy_entities: Entity strings from the spaCy extractor.
        finbert_entities: Entity strings from the token-classification pipeline.

    Returns:
        The unique entities, in first-seen order (spaCy's first), that are
        longer than one character and purely alphanumeric, joined by spaces.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # set-based version produced a different word order on every run because
    # string hashing is randomised per process.
    unique_entities = dict.fromkeys(list(spacy_entities) + list(finbert_entities))

    # Post-processing to remove irrelevant entities (single characters or
    # non-alphanumeric entities).
    # NOTE(review): str.isalnum() is False for anything containing a space, so
    # every multi-word entity is dropped here — confirm that is intended.
    processed_entities = [
        entity for entity in unique_entities if len(entity) > 1 and entity.isalnum()
    ]
    return ' '.join(processed_entities)

def extract_and_preprocess_pdf(pdf_path, preprocess_method):
    """Run the extraction pipeline for one PDF with the chosen preprocessor.

    Args:
        pdf_path: Path of the PDF file to process.
        preprocess_method: ``"gensim"`` or ``"stanza"``.

    Returns:
        ``(chunks, entity_text)``: the merged entity string split into token
        chunks (2048 tokens, overlap 24) and the merged string itself.
        ``([], '')`` when the PDF produced no text.

    Raises:
        ValueError: If ``preprocess_method`` is not one of the supported
            names. Previously this surfaced later as an UnboundLocalError.
    """
    if preprocess_method not in ("gensim", "stanza"):
        raise ValueError(
            f"Unknown preprocess_method {preprocess_method!r}; expected 'gensim' or 'stanza'"
        )

    raw_text, _ = extract_text_and_tables_from_pdf(pdf_path)
    if not raw_text:
        return [], ''

    if preprocess_method == "gensim":
        preprocessed_text = preprocess_text_gensim(raw_text)
    else:
        preprocessed_text = preprocess_text_stanza(raw_text)

    # Extract entities using SpaCy
    spacy_entities = extract_entities_spacy(preprocessed_text)

    # Extract entities using FinBERT
    finbert_entities = extract_entities_finbert(preprocessed_text)

    # Combine and post-process entities
    combined_entities = combine_and_postprocess_entities(spacy_entities, finbert_entities)

    text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)
    documents = text_splitter.split_text(combined_entities)
    return documents, combined_entities

# List all PDF files in the directory.
# The listing is sorted because os.listdir returns names in arbitrary order,
# which would make the order of the extracted chunks vary between machines.
pdf_paths = [
    os.path.join(pdf_folder_path, f)
    for f in sorted(os.listdir(pdf_folder_path))
    if f.lower().endswith('.pdf')
]


def _extract_corpus(method):
    """Run the pipeline with ``method`` over every PDF and persist the results.

    For each PDF the merged entity text is written to
    ``<stem>_<method>.txt`` in the ground-truth folder; all chunks are then
    pickled to ``processed_documents_<method>.pkl`` and returned.
    """
    chunks = []
    for pdf_path in pdf_paths:
        documents, combined_entities = extract_and_preprocess_pdf(pdf_path, preprocess_method=method)
        chunks.extend(documents)

        # Save the combined entity text as ground truth.
        # splitext swaps only the real extension: .replace('.pdf', ...) missed
        # upper-case ".PDF" names and rewrote ".pdf" inside a file name.
        # NOTE(review): this "ground truth" is the pipeline's own output, so
        # any evaluation against it is circular.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        ground_truth_path = os.path.join(ground_truth_folder_path, f"{stem}_{method}.txt")
        with open(ground_truth_path, 'w', encoding='utf-8') as f:
            f.write(combined_entities)

    # Save preprocessed documents for later use
    with open(f'processed_documents_{method}.pkl', 'wb') as f:
        pickle.dump(chunks, f)
    return chunks


# Extract and preprocess documents using Gensim
all_documents_gensim = _extract_corpus("gensim")

# Extract and preprocess documents using Stanza
all_documents_stanza = _extract_corpus("stanza")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\WendyChuaXingZhao\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\WendyChuaXingZhao\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-09 22:52:01 INFO: Downloaded file to C:\Users\WendyChuaXingZhao\stanza_resources\resources.json
2024-06-09 22:52:01 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

2024-06-09 22:52:47 INFO: Downloaded file to C:\Users\WendyChuaXingZhao\stanza_resources\en\default.zip
2024-06-09 22:52:51 INFO: Finished downloading models and saved to C:\Users\WendyChuaXingZhao\stanza_resources
2024-06-09 22:52:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-09 22:52:51 INFO: Downloaded file to C:\Users\WendyChuaXingZhao\stanza_resources\resources.json
2024-06-09 22:52:53 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-06-09 22:52:53 INFO: Using device: cpu
2024-06-09 22:52:53 INFO: Loading: tokenize
2024-06-09 22:52:53 INFO: Loading: mwt
2024-06-09 22:52:53 INFO: Loading: pos
2024-06-09 22:52:53 INFO: Loading: lemma
2024-06-09 22:52:53 INFO: Loading: constituency
2024-06-09 22:52:54 INFO: Loading: depparse
2024-06-09 22:52:54 INFO: Loading: sentiment
2024-06-09 22:52:54 INFO: 

In [None]:
import os
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

def load_ground_truth_data(ground_truth_folder_path, suffix):
    """Read every file whose lower-cased name ends with ``suffix``.

    Keys are the file names exactly as listed; values are the UTF-8 decoded
    contents, lower-cased.
    """
    texts_by_file = {}
    matching_names = (
        name for name in os.listdir(ground_truth_folder_path)
        if name.lower().endswith(suffix)
    )
    for name in matching_names:
        full_path = os.path.join(ground_truth_folder_path, name)
        with open(full_path, 'r', encoding='utf-8') as handle:
            texts_by_file[name] = handle.read().lower()
    return texts_by_file

def calculate_precision_recall_f1(extracted_text, ground_truth_text):
    """Score extracted text against ground truth on unique whitespace tokens.

    Both texts are split on whitespace and reduced to sets, so repeated
    tokens count once. Returns ``(precision, recall, f1)``; each value is 0
    when its denominator would be zero.
    """
    predicted = set(extracted_text.split())
    expected = set(ground_truth_text.split())

    hits = len(predicted & expected)
    spurious = len(predicted - expected)
    missed = len(expected - predicted)

    if hits + spurious > 0:
        precision = hits / (hits + spurious)
    else:
        precision = 0

    if hits + missed > 0:
        recall = hits / (hits + missed)
    else:
        recall = 0

    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

# Define the folder containing the ground truth text files
ground_truth_folder_path = r"C:\Users\WendyChuaXingZhao\OneDrive - SRKK Group of Companies\Documents\MsDS\Knowledge Graph Test\ground_truth"

# Load ground truth data for gensim
ground_truth_data_gensim = load_ground_truth_data(ground_truth_folder_path, "_gensim.txt")
if not ground_truth_data_gensim:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f"No *_gensim.txt ground truth files found in {ground_truth_folder_path}")

# Load preprocessed documents for gensim
with open('processed_documents_gensim.pkl', 'rb') as f:
    all_documents_gensim = pickle.load(f)

# Concatenate all extracted documents for simplicity. This does not depend on
# the file being scored, so it is computed once rather than per iteration.
# NOTE(review): each ground-truth file is scored against the text extracted
# from ALL PDFs, and the ground-truth files were written by the extraction
# pipeline itself, so these metrics are circular.
extracted_text = ' '.join(all_documents_gensim)

# Evaluate the extracted documents for gensim
precisions_gensim = []
recalls_gensim = []
f1_scores_gensim = []

for file_name, ground_truth_text in ground_truth_data_gensim.items():
    precision, recall, f1 = calculate_precision_recall_f1(extracted_text, ground_truth_text)
    precisions_gensim.append(precision)
    recalls_gensim.append(recall)
    f1_scores_gensim.append(f1)

# Calculate average metrics for gensim.
# average_f1_gensim was previously referenced without ever being assigned,
# which raised NameError when the cell ran.
average_precision_gensim = sum(precisions_gensim) / len(precisions_gensim)
average_recall_gensim = sum(recalls_gensim) / len(recalls_gensim)
average_f1_gensim = sum(f1_scores_gensim) / len(f1_scores_gensim)

print(f"Average Precision (gensim): {average_precision_gensim:.2f}")
print(f"Average Recall (gensim): {average_recall_gensim:.2f}")
print(f"Average F1-Score (gensim): {average_f1_gensim:.2f}")


In [None]:
import pickle
from tqdm import tqdm
from langchain.schema import Document
import json
import logging

# Set up logging
# INFO and above go to error_log.log; filemode='w' truncates the file each
# time this configuration is applied.
logging.basicConfig(level=logging.INFO, filename='error_log.log', filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s')

# Load preprocessed documents
# NOTE(review): this reads processed_documents.pkl (written by the first
# extraction cell), not the *_gensim / *_stanza variants — confirm that is the
# intended input for graph extraction.
with open('processed_documents.pkl', 'rb') as f:
    documents = pickle.load(f)

def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None
) -> None:
    """Extract a knowledge graph from one document and store it in the graph.

    Args:
        document: Document whose ``page_content`` is sent to the extraction chain.
        nodes: Optional node labels forwarded to ``get_extraction_chain``.
        rels: Optional relationship types forwarded to ``get_extraction_chain``.

    Raises:
        Exception: Any failure is logged (with traceback) and then re-raised.
            Previously every error was swallowed here, so the calling loop
            reported success for chunks that had actually failed.
    """
    try:
        # Extract graph data using OpenAI functions.
        # Only the text is passed to the chain. NOTE(review): passing the
        # Document object itself presumably gets it string-formatted into the
        # prompt's {input} slot, metadata included — confirm.
        extract_chain = get_extraction_chain(nodes, rels)
        response = extract_chain.invoke(document.page_content)
    except Exception as e:
        logging.error("An error occurred in extract_and_store_graph: %s", e, exc_info=True)
        raise

    # Log the response for debugging
    logging.info("Response from extract_chain: %s", response)

    try:
        data = response['function']
        # Construct a graph document
        graph_document = GraphDocument(
            nodes=[map_to_base_node(node) for node in data.nodes],
            relationships=[map_to_base_relationship(rel) for rel in data.rels],
            source=document.dict()  # Convert Document object to dictionary
        )

        # Store information into a graph
        graph.add_graph_documents([graph_document])

    except json.JSONDecodeError as e:
        logging.error("JSON decoding error: %s", e)
        logging.error("Invalid JSON response: %s", response)
        raise
    except KeyError as e:
        logging.error("Key error: %s", e)
        logging.error("Response missing expected key: %s", response)
        raise
    except Exception as e:
        logging.error("An unexpected error occurred: %s", e, exc_info=True)
        logging.error("Response: %s", response)
        raise

# Walk the chunks with a progress bar; each chunk is wrapped in a Document
# (0-based index in the metadata, 1-based in the printed messages) and handed
# to extract_and_store_graph.
for i, chunk in enumerate(tqdm(documents, total=len(documents))):
    doc = Document(
        page_content=chunk,
        metadata={"source": "batch_process", "chunk": i},
    )
    try:
        extract_and_store_graph(doc)
    except Exception as e:
        print(f"An error occurred while processing chunk {i+1}: {e}")
    else:
        print(f"Successfully processed and stored chunk {i+1}")
