# Chatbot with HybridGRAG + Auto KG Construction Pipeline (English)

#### Auto reload modules

In [1]:
%load_ext autoreload
%autoreload 2

#### Imports
- Make sure it prints `proceed` before continuing.

In [2]:
import warnings 
warnings.filterwarnings('ignore')

from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain_community.graphs.neo4j_graph import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_core.documents.base import Document
from langchain_community.document_loaders import (
    TextLoader,
    UnstructuredFileLoader,
    JSONLoader,
)

from utils.langchainLGT import LLMGraphTransformer 
from utils.checkpoint_manager import CheckpointManager
from utils.neo4j_utils import *
from utils.document_utils import *
from typing import Sequence

from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_ollama import OllamaEmbeddings
import os

from langchain_experimental.llms.ollama_functions import OllamaFunctions
from neo4j import  Driver
from openai import OpenAI
from langchain.chat_models import ChatOpenAI

import logging
from tqdm import tqdm
import json
import re
from tqdm import tqdm
from time import sleep
from datetime import datetime

from PyPDF2 import PdfReader
import pdfplumber
from docx import Document as DocxDocument
import docx
from markdown import markdown

import pickle
from typing import List, Dict, Any
from pathlib import Path

from dotenv import load_dotenv

if not load_dotenv():
    print("Warning: .env not correctly set up")
else:
    print("Proceed")


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from utils.langchainLGT import LLMGraphTransformer


Proceed


---

#### Configure Const
- `MODEL`: The primary LLM for entity and relationship extraction, and later Chatbot with HybridGRAG intergration.
- `EMB_MODEL`: The text embedding model that will be used for the RAG part of HybridGRAG.
- `CHECKPOINT_PATH`: Path for JSON checkpoint in the automatic KG construction pipeline.
- `DOC_DIR`: Your documents directory.
- `ALLOWED_NODES`: List of type of nodes that you only want in your KG.
- `ALLOWED_RELATIONSHIPS`: List of type of relations that you only want in your KG.
- `STRICT_MODE`: Whether to explicitly enforce node and relation types or not.

In [3]:
MODEL = "qwen2.5-7b-instruct"
EMB_MODEL = "text-embedding-nomic-embed-text-v1.5@q8_0"
CHECKPOINT_PATH = "en_processing_checkpoint.json"
DOC_DIR = "documents/"
ALLOWED_NODES = []
ALLOWED_RELATIONSHIPS = []
STRICT_MODE = False

#### Check to see if the `neo4j` database have been populated yet

In [4]:
if not is_database_populated():
    print("DB is not populated")
else:
    print("DB IS POPULATED")

DB is not populated


---

## MAIN AUTOMATIC KNOWLEDGE GRAPH CONSTRUCTION PIPELINE

##### Step 1.
- Put you documents in `DOC_DIR`, supported formats: `.txt`, `.md`, `.json`, `.docx`, `.pdf` (Mostly will only extract text). Scanned PDF will not work.
- Chunking strategy: Default text chunking is 100 words (" " delimiter) with 20 words over lap.
- Pipeline: Load all the files in `DOC_DIR` and process chunking.

##### Step 2.
- Converting `Document` to `GraphDocument` with `LLMGraphTransformer` from `langchain`, using `MODEL` as the entity relation extractor.

##### Step 3.
- Adding to **Neo4j** database, complete the pipeline.

In [6]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

checkpoint = CheckpointManager(CHECKPOINT_PATH)

graph = Neo4jGraph()
client = ChatOpenAI(
    base_url="http://127.0.0.1:8000/v1",
    api_key="lm-studio",
    model=MODEL,
    temperature=0,
    request_timeout=240,
)

llm_transformer = LLMGraphTransformer(
    llm=client,
    allowed_nodes=ALLOWED_NODES,
    allowed_relationships=ALLOWED_RELATIONSHIPS,
    strict_mode=STRICT_MODE,
)

file_paths = sorted(
    [
        os.path.join(root, f)
        for root, _, files in os.walk(DOC_DIR)
        if ".ipynb_checkpoints" not in root
        for f in files
        if f.lower().endswith(("md", "json", "docx", "pdf", "txt"))
    ]
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=lambda x: len(x.split()),
    separators=[" "]
)

documents: List[Document] = []

# Stage 1: File loading and splitting
for file_path in tqdm(file_paths, desc="Processing files"):
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()
    file_hash = hash(content)

    # Check if the file has been processed already by checking the content hash
    if checkpoint.is_file_processed(file_path, file_hash):
        logging.info(f"File {file_path} has already been processed.")
        continue  # Skip if the file has already been processed

    try:
        # Load and clean file content
        content = load_and_clean(file_path)
        if not content:
            logging.warning(f"File {file_path} is empty after cleaning. Skipping.")
            continue

        # Split the document into chunks
        chunks = text_splitter.create_documents([content])

        # Add metadata (filename and any other relevant details)
        for chunk in chunks:
            chunk.metadata = {
                'source': file_path,  # Store the filename in the chunk metadata
                'hash': file_hash  # Store the hash of the file content for comparison
            }

        documents.extend(chunks)

        # Add file to checkpoint with content hash
        checkpoint.add_processed_file(file_path, file_hash)  # Ensure file_hash is stored
        logging.info(f"Processed {file_path} into {len(chunks)} chunks")

    except Exception as e:
        logging.error(f"Failed to process {file_path}: {e}")
        continue  # Skip this file and continue with others

# Stage 2: Graph document conversion
graph_documents = []
for doc in tqdm(documents, desc="Converting to graph"):
    doc_id = f"{doc.metadata.get('source', 'unknown')}_{hash(doc.page_content)}"

    # Check if this graph document has already been processed
    if checkpoint.is_graph_doc_processed(doc_id):
        logging.info(f"Graph document {doc_id} has already been processed.")
        continue

    max_retries = 5
    retry_delay = 1  # Seconds before retrying
    attempt = 0

    while attempt < max_retries:
        try:
            # Attempt to convert the document
            graph_doc = llm_transformer.convert_to_graph_documents([doc])[0]

            # Assign original metadata to GraphDocument
            graph_doc.source = doc
            graph_documents.append(graph_doc)

            # Mark this graph document as processed
            checkpoint.add_processed_graph_doc(doc_id)
            logging.info(f"Converted document {doc_id} to GraphDocument.")
            break  # Success, exit retry loop

        except Exception as e:
            attempt += 1
            logging.warning(f"Attempt {attempt}/{max_retries} failed for {doc_id}: {e}")

            if attempt < max_retries:
                sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"Failed to convert document {doc_id} after {max_retries} attempts.")
                break  # Stop retrying after max attempts

# Stage 3: Add to Neo4j
batch_size = 10
for doc in tqdm(graph_documents, desc="Inserting documents"):
    doc_hash = hash(doc.source.page_content)

    if checkpoint.is_document_inserted(doc_hash):
        logging.info(f"Document with hash {doc_hash} has already been inserted.")
        continue

    try:
        source = doc.source.metadata.get('source', 'unknown')
        file_hash = doc.source.metadata.get('hash', doc_hash)
        doc_id = str(doc_hash)

        # Insert or update the Document node with page_content
        doc_query = f"""
        MERGE (doc:Document {{id: '{doc_id}'}})
        ON CREATE SET doc.source = '{source}', doc.hash = {file_hash}, doc.page_content = '{doc.source.page_content}'
        ON MATCH SET doc.page_content = '{doc.source.page_content}', doc.updated = timestamp()
        """
        graph.query(doc_query)

        for node in doc.nodes:
            # Apply only the __Entity__ label to all nodes
            node_query = f"""
            MERGE (n:__Entity__ {{id: '{node.id}'}})
            ON CREATE SET n.type = '{node.type}'
            """
            graph.query(node_query)
            
            # Linking Document to Node
            link_doc_node_query = f"""
            MATCH (doc:Document {{id: '{doc_id}'}})
            MATCH (n:__Entity__ {{id: '{node.id}'}})
            MERGE (doc)-[:Mentions]->(n)
            """
            graph.query(link_doc_node_query)
        
        for rel in doc.relationships:
            rel_query = f"""
            MERGE (source:__Entity__ {{id: '{rel.source.id}'}})
            MERGE (target:__Entity__ {{id: '{rel.target.id}'}})
            MERGE (source)-[r:{rel.type}]->(target)
            """
            graph.query(rel_query)

        checkpoint.add_inserted_document(doc_id)
        logging.info(f"Inserted document {doc_id}")

    except Exception as e:
        logging.error(f"Failed to insert document {doc_id}: {e}")
        continue

logging.info("Processing completed successfully")        

2025-03-20 11:33:26,274 - INFO - Processed documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf into 179 chunks
Processing files: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.30it/s]
2025-03-20 11:33:27,220 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK", ?it/s]
2025-03-20 11:33:27,234 - INFO - Converted document documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf_-5399310952819964288 to GraphDocument.
2025-03-20 11:33:27,342 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK".05it/s]
2025-03-20 11:33:27,343 - INFO - Converted document documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf_95917820899393132 to GraphDocument.
2025-03-20 11:33:27,455 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 

---

#### Create embeddings for Textual Nodes (Document Nodes)

In [7]:
import requests
import json

graph = Neo4jGraph()

def get_embedding(text):
    response = requests.post(
        "http://127.0.0.1:8000/v1/embeddings",
        headers={"Content-Type": "application/json"},
        json={"input": text, "model": EMB_MODEL}
    )
    return response.json()["data"][0]["embedding"]

class LMStudioEmbeddings:
    def __init__(self, prepend_document="search_document: ", prepend_query="search_query: "):
        self.prepend_document = prepend_document
        self.prepend_query = prepend_query

    def embed_documents(self, texts):
        return [get_embedding(self.prepend_document + text) for text in texts]

    def embed_query(self, text):
        return get_embedding(self.prepend_query + text)

embeddings = LMStudioEmbeddings()

vector_index = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["page_content"],
    embedding_node_property="embedding"
)

class CustomRetriever:
    def __init__(self, vector_index):
        self.vector_index = vector_index

    def get_relevant_documents(self, query):
        retrieved_docs = self.vector_index.similarity_search(query)
        formatted_docs = []
        for doc in retrieved_docs:
            source = doc.metadata.get('source', 'Unknown Source')
            content = f"In document: {source}: {doc.page_content}\n"
            formatted_docs.append(Document(page_content=content, metadata=doc.metadata))
        return formatted_docs

vector_retriever = CustomRetriever(vector_index)

query = "Your query here"
results = vector_retriever.get_relevant_documents(query)
for result in results:
    print(result.page_content)

In document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf: 
page_content: Proceedings of the 46th Interna-
tional ACM SIGIR Conference on Research and De-
velopment in Information Retrieval , pages 781–790.
Jinheon Baek, Alham Fikri Aji, and Amir Saffari. 2023.
Knowledge-augmented language model prompting
for zero-shot knowledge graph question answering.
arXiv preprint arXiv:2306.04136 .
Teodoro Baldazzi, Luigi Bellomarini, Stefano Ceri,
Andrea Colombo, Andrea Gentili, and Emanuel
Sallinger. 2023. Fine-tuning large enterprise

In document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf: 
page_content: In Proceedings of the 61st Annual Meeting of
the Association for Computational Linguistics (Vol-
ume 1: Long Papers) , pages 9802–9822.
Xuting Mao, Hao Sun, Xiaoqian Zhu, and Jianping Li.
2022. Financial fraud detection using the related-
party transaction knowledge graph. Procedia Com-

---

#### Create Neo4j Indices for graph queries

In [8]:
driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))

def create_fulltext_index(tx):
    query = '''
CREATE FULLTEXT INDEX `fulltext_entity_index` 
FOR (n:__Entity__) 
ON EACH [n.id, n.type];
    '''
    tx.run(query)

def create_index():
    with driver.session() as session:
        session.execute_write(create_fulltext_index)
        print("Fulltext index created successfully.")

try:
    create_index()
except:
    pass

driver.close()

---

#### Entity extraction from user's query using LLM

In [9]:
class Entities(BaseModel):
    names: list[str] = Field(..., description="Extracted entities from the text.")

llm = ChatOpenAI(
    openai_api_base="http://127.0.0.1:8000/v1",
    openai_api_key="lm-studio",
    model=MODEL,
    temperature=0
)

prompt = ChatPromptTemplate.from_template(
"""
You need to extract all entites present within the user's query.
Only return a JSON object with the key "entities" containing the extracted entities.
ONLY RETURN THE JSON OBJECT. NO FURTHER EXPLANATION.

#####
EXAMPLES:

Query: "Who is the president of the USA?"
Return: {{ "entities": ["President", "USA"] }}

Query: "What products does Apple offer?"
Return: {{ "entities": ["Apple", "product"] }}

Query: "Who is the founder of Microsoft?"
Return: {{ "entities": ["founder", "Microsoft"] }}

#####
Now extract the entites from the query: {question}
"""
)

def extract_entities(question):
    formatted_prompt = prompt.format(question=question)
    response = llm.invoke(formatted_prompt)
    
    response_text = response.content if hasattr(response, "content") else str(response)
    
    cleaned_response = re.sub(r"```json|```", "", response_text).strip()
    
    try:
        parsed_response = json.loads(cleaned_response)
        
        if "entities" not in parsed_response:
            return {"error": "No 'entities' field found in response", "raw": cleaned_response}
        
        parsed_response["names"] = parsed_response.pop("entities")
        
        return Entities(**parsed_response)
    
    except json.JSONDecodeError as e:
        return {"error": f"JSON Decode Error: {str(e)}", "raw": cleaned_response}
    except KeyError as e:
        return {"error": f"Missing key: {str(e)}", "raw": cleaned_response}
    except TypeError as e:
        return {"error": f"Type Error: {str(e)}", "raw": cleaned_response}

question = "Does Microsoft have any employee named John?"
entities = extract_entities(question)

print(entities)

2025-03-20 11:41:50,275 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


names=['Microsoft', 'employee', 'John']


---

#### Querying Neo4j Graph database

In [10]:
def graph_retriever(question: str) -> str:
    result = []
    
    entities = extract_entities(question)
    print(entities)
    if isinstance(entities, dict):
        return "Doesn't seem to have any information on this."
    
    for entity in entities.names:
        formatted_query = entity.replace(" ", "_")
        fuzzy_query = f"{formatted_query}*"
        query_length = len(formatted_query)

        response = graph.query(
            """
CALL db.index.fulltext.queryNodes('fulltext_entity_index', $query, {limit:10}) 
YIELD node, score
WITH node, score, $query_length AS query_length, size(node.id) AS id_length
WHERE score >= 0.8 OR (toFloat(query_length) / id_length) >= 0.8
CALL {
  WITH node
  MATCH (node)-[r]->(neighbor)
  RETURN CASE 
    WHEN 'Document' IN labels(node) 
    THEN coalesce([label IN labels(node) WHERE label <> '__Entity__'][0], 'Unknown') + ": " + node.source + ' - ' + type(r) + ' -> ' + neighbor.id 
    ELSE node.id + ' - ' + type(r) + ' -> ' + neighbor.id
  END AS output
  UNION ALL
  WITH node
  MATCH (node)<-[r]-(neighbor)
  RETURN CASE 
    WHEN 'Document' IN labels(neighbor) 
    THEN coalesce([label IN labels(neighbor) WHERE label <> '__Entity__'][0], 'Unknown') + ": " + neighbor.source + ' - ' + type(r) + ' -> ' + node.id
    ELSE neighbor.id + ' - ' + type(r) + ' -> ' + node.id
  END AS output
}
RETURN output LIMIT 15
            """,
            {"query": fuzzy_query, "query_length": query_length},
        )
        
        if response:
            filtered = [el['output'].replace("_", " ") for el in response if not re.search(r"^[0-9a-f]{32}", el['output'])]
            result.extend(filtered)
    
    return "\n".join(result) if result else "Doesn't seem to have sufficient information on this."

In [11]:
print(graph_retriever("What is IELTS?"))

2025-03-20 11:41:52,073 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


names=['IELTS']
Doesn't seem to have sufficient information on this.


---

#### HybridGRAG: Both GraphRAG and RAG

In [12]:
def full_retriever(question: str):
    graph_data = graph_retriever(question)
    vector_data = [el.page_content for el in vector_retriever.get_relevant_documents(question)]
    
    final_data = f"""
{graph_data}

{"".join(vector_data)}
    """

    final_data = final_data.replace("page_content: ", "")

    return final_data

In [13]:
print(full_retriever("Does The Verge have any reports on Elon Musk?"))

2025-03-20 11:41:55,124 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


names=['The Verge', 'reports', 'Elon Musk']

Doesn't seem to have sufficient information on this.

In document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf: 
and Illia Polosukhin. 2017. Attention is all
you need. Advances in neural information processing
systems , 30.
Blerta Veseli, Simon Razniewski, Jan-Christoph Kalo,
and Gerhard Weikum. 2023. Evaluating the knowl-
edge base completion potential of gpt. Findings of
EMNLP 2023 .
Boxin Wang, Wei Ping, Peng Xu, Lawrence McAfee,
Zihan Liu, Mohammad Shoeybi, Yi Dong, Oleksii
Kuchaiev, Bo Li, Chaowei
In document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf: 
new knowl-
edge. In Proceedings of the 2023 Conference onEmpirical Methods in Natural Language Processing ,
pages 1397–1414.
Zhangyue Yin, Qiushi Sun, Cheng Chang, Qipeng
Guo, Junqi Dai, Xuan-Jing Huang, and Xipeng Qiu.
2023b. Exchange-of-thought: Enhancing large lan-
guage model

---

## Chatbot with HybridGRAG Demo

In [14]:
import requests

LM_STUDIO_URL = "http://127.0.0.1:8000/v1/chat/completions"

def query_lm_studio(question):
    """
    Fetches context using full_retriever() and queries LM Studio for a response.
    """
    context = full_retriever(question)

    prompt = f"""
You are a smart and helpful assitant.
Answer the user's query based on the given context.
Think step by step, reviewing all information within the context before answering.
Never give an answer containing false information.

#####
CONTEXT:
{context}

#####
USER'S QUERY:
{question}
"""

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.5
    }
    response = requests.post(LM_STUDIO_URL, json=payload)

    print(f"\nCONTEXT{'=' * 90}\n{context}")
    print("ANSWER" + "=" * 90 + "\n")
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        return f"Error: {response.text}"

In [15]:
print(query_lm_studio("What can you tell me about knowledge graphs. Some papers on it? Answer consisely."))

2025-03-20 11:42:00,230 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


names=['knowledge graphs', 'papers']


Knowledge graphs - USED TO CREATE -> faithful reasoning paths based on various relations
Document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf - Mentions -> Knowledge graphs
KB-Binder - INTEGRATES -> Knowledge Graphs
Document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf - Mentions -> Knowledge Graphs
arXiv preprint arXiv:2112.02732 - RELATED TO -> knowledge graphs for commonsense question answering
Document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf - Mentions -> knowledge graphs for commonsense question answering
Knowledge graphs (KGs) - ORGANIZE INFORMATION IN -> structured format
Document: documents/Agrawal et al. - 2024 - Can Knowledge Graphs Reduce Hallucinations in LLMs A Survey.pdf - Mentions -> Knowledge graphs (KGs)

In document: documents/Agrawal et al. - 2024 - Can Knowledge 

## Some useful utilities

#### Start Neo4j
```bash
docker-compose up -d --build
```

#### Stop Neo4j
```bash
docker-compose down
```

#### Delete database
```cypher
MATCH (n) DETACH DELETE n;
```

#### Display n number of nodes
```cypher
MATCH (n)
RETURN n LIMIT 25
```

#### Indixes manipulation
```cypher
SHOW INDEXES;
DROP INDEX vector;
```