The goal was to build a Knowledge graph based retrieval augmented generation system.
The knowledge graph was built from 'triplets' extracted from sentences.
The triplets were of the form 'head', 'tail', 'relationship'.
An example of how such a triplet can be extracted from a sentence is shown below.

In [33]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model & tokenizer
model_name = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Prompt-style input
sentence = "Insulin regulates glucose metabolism in the human body."
prompt = f"extract relation triplets from: {sentence}"

# Encode and generate
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
outputs = model.generate(
    **inputs,
    max_length=512,
    num_beams=5,
    early_stopping=True
)

# Decode
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Raw Output:", output_text)

def extract_triplets(text):
    triplets = []
    lines = text.strip().split("\n")
    for line in lines:
        parts = line.strip().split("  ")  # split on exactly two spaces
        if len(parts) == 3:
            head, tail, relation = map(str.strip, parts)
            triplets.append((head, tail, relation))
    return triplets


triplets = extract_triplets(output_text)
print("Extracted Triplets:", triplets)


Raw Output:  Insulin  glucose metabolism  subject has role
Extracted Triplets: [('Insulin', 'glucose metabolism', 'subject has role')]


The sentences or text was obtained by scraping the website https://patents.justia.com/. 
The code used for that is as given below.
This data was saved to justia_patents.csv

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Base URL (modify query if needed)
BASE_URL = "https://patents.justia.com/search?q=HVAC&page={}"

# Headers to mimic a real browser request
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Open a CSV file to save results
with open("justia_patents.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Title", "Link", "Abstract"])  # Write the header

    # Loop through the first 35 pages
    for page in range(1, 36):
        url = BASE_URL.format(page)
        print(f"🔍 Scraping Page {page}...")

        # Send request to Justia Patents
        response = requests.get(url, headers=HEADERS)

        # Check if request was successful
        if response.status_code != 200:
            print(f"❌ Error: Page {page} could not be fetched (Status Code: {response.status_code})")
            continue  # Skip this page and move to the next

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the main container with patents
        patent_container = soup.find("div", id="search-results", class_="wrapper")

        # Check if container exists
        if not patent_container:
            print(f"⚠️ No patents found on page {page}.")
            continue

        # Find all individual patent entries
        patents = patent_container.find_all("li")

        # Loop through patents and extract information
        for patent in patents:
            title_elem = patent.find("h6")  # Patent title
            link_elem = patent.find("a")  # Patent link
            abstract_elem = patent.find("div", class_="abstract")  # Abstract

            # Extract text safely
            title = title_elem.text.strip() if title_elem else "No title"
            link = "https://patents.justia.com" + link_elem["href"] if link_elem else "No link"
            abstract = abstract_elem.text.strip() if abstract_elem else "No abstract available"

            # Save to CSV
            writer.writerow([title, link, abstract])

        print(f"✅ Page {page} scraped successfully!")

        # Add a short delay to avoid getting blocked
        time.sleep(2)

print("🎉 All 35 pages scraped successfully! Data saved to justia_patents.csv")


Now, this data was cleaned, each sentence was segmented and triplets were extracted using the code given below. The model used for classifying the parts of the sentence was Babelscape/rebel-large.

In [20]:
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class RebelComponent:
    def __init__(self, nlp, model_name='Babelscape/rebel-large', device=0):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        self.device = device

    def __call__(self, doc):
        text = doc.text
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
# Don't add .to(self.device)

        outputs = self.model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
        output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        doc._.relations = self.extract_triplets(output_text)
        return doc

    def extract_triplets(self, text):
        triplets = []
        parts = text.split("<triplet>")
        for part in parts[1:]:
            try:
                head = part.split("<subj>")[1].split("<obj>")[0].strip()
                relation = part.split("<obj>")[1].split("<tail>")[0].strip()
                tail = part.split("<tail>")[1].strip()
                triplets.append({'head': head, 'relation': relation, 'tail': tail})
            except IndexError:
                continue
        return triplets

# Register the custom component in spaCy's pipeline
@spacy.registry.misc("rebel_component")
def create_rebel_component():
    return RebelComponent



In [32]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import spacy
import csv

# Load spaCy for sentence splitting
nlp = spacy.load("en_core_web_sm")

# Load Babelscape REBEL model
model_name = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Triplet parser based on double-space format
def extract_triplets(text):
    triplets = []
    lines = text.strip().split("\n")
    for line in lines:
        parts = line.strip().split("  ")
        if len(parts) == 3:
            head, tail, relation = map(str.strip, parts)
            triplets.append((head, tail, relation))
    return triplets

# Read input CSV
df = pd.read_csv(r"C:\Users\Admin\Desktop\hvac_kg_project\justia_patents.csv")
# Python uses 0-based indexing

# Store output triplets here
all_triplets = []

# Process each row
for idx, row in df.iterrows():
    title = row.get("Title", "")
    link = row.get("Link", "")
    abstract_raw = row.get("Abstract", "")

    # Clean abstract text
    if "Abstract:" in abstract_raw:
        abstract_text = abstract_raw.split("Abstract:")[-1].strip()
    else:
        abstract_text = abstract_raw.strip()

    # Sentence segmentation using spaCy
    doc = nlp(abstract_text)
    sentences = [sent.text.strip() for sent in doc.sents]

    # Process each sentence
    for sentence in sentences:
        if not sentence:
            continue
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True).to(device)
        outputs = model.generate(**inputs, max_length=256)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        triplets = extract_triplets(decoded_output)

        for head, tail, relation in triplets:
            all_triplets.append({
                "Title": title,
                "Link": link,
                "Sentence": sentence,
                "Head": head,
                "Tail": tail,
                "Relation": relation
            })

# Write to output CSV
output_file = "justia_triplets_all.csv"
with open(output_file, mode="w", newline='', encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Title", "Link", "Sentence", "Head", "Tail", "Relation"])
    writer.writeheader()
    for row in all_triplets:
        writer.writerow(row)

print(f"✅ Triplets extracted and saved to {output_file}")


✅ Triplets extracted and saved to justia_triplets_all.csv


After saving these triplets, basic preprocessing was performed.

In [7]:
import pandas as pd

# Load your triplets file
df = pd.read_csv(r"C:\Users\Admin\Desktop\hvac_kg_project\scripts\justia_triplets_all.csv")

# Step 1: Drop rows with any missing values in head, tail, or relation
df = df.dropna(subset=["Head", "Tail", "Relation"])

# Step 2: Normalize text — strip and lowercase
for col in ["Head", "Tail", "Relation"]:
    df[col] = df[col].astype(str).str.strip().str.lower()

# Step 3: Remove self-loops (where head == tail)
df = df[df["Head"] != df["Tail"]]

# Step 4: Remove very short or meaningless entries (e.g., 1-character)
df = df[df["Head"].str.len() > 1]
df = df[df["Tail"].str.len() > 1]
df = df[df["Relation"].str.len() > 1]

# Step 5: Drop exact duplicates (same head, tail, relation)
df_cleaned = df.drop_duplicates(subset=["Head", "Tail", "Relation"])

# Optional: reset index
df_cleaned = df_cleaned.reset_index(drop=True)

# Step 6: Save the cleaned triplets to a new CSV
df_cleaned.to_csv(r"C:\Users\Admin\Desktop\hvac_kg_project\scripts\cleaned_triplets.csv", index=False)
print("Original rows:", len(df))
print("After cleaning:", len(df_cleaned))

print("✅ Cleaned triplets saved to 'cleaned_triplets.csv'")


Original rows: 1378
After cleaning: 619
✅ Cleaned triplets saved to 'cleaned_triplets.csv'


These cleaned triplets were used to create a knowledge graph in Neo4j, where 'Head' and 'Tail' where treated as entities and 'Relation' was treated as relations. 

Langchain was installed. 

In [12]:
%pip install -U langchain langchain-community


Collecting langchain
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloading langsmith-0.3.24-py3-none-any.whl.metadata (15 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.40-cp312-cp312-win_amd64.whl.metadata (9.9 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain-community)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-set

After this, the connection with neo4j was tested.

In [15]:
from py2neo import Graph

# If Neo4j Desktop is running and your DB is started, this should work
graph = Graph("bolt://localhost:7687")

def test_graph_connection():
    try:
        result = graph.run("MATCH (n) RETURN n LIMIT 1").data()
        return "Success!" if result else "Connected, but no data found."
    except Exception as e:
        return f"Neo4j connection error: {e}"

print(test_graph_connection())


Success!


The llm's working was tested. 

In [18]:
from langchain_community.llms import Ollama

llm = Ollama(model="mistral")

response = llm.invoke("What is a patent?")
print(response)


 A patent is an intellectual property right granted by the government that gives its owner the exclusive rights to exclude others from making, using, selling, and importing an invention for a specified period of time. The purpose of a patent is to promote innovation and technological advancement by providing inventors with financial incentives for their inventions. To be eligible for a patent, the invention must meet certain requirements, such as being new, useful, and non-obvious. Patents can be granted for various types of inventions, including mechanical devices, chemical compounds, electrical circuits, software algorithms, and more.


A function was defined to search the graph and retrieve context. 

In [20]:
def search_graph(graph, user_query):
    # You can modify this to search Tail or Relation too
    cypher_query = f"""
    MATCH (h)-[r]->(t)
    WHERE toLower(h.Name) CONTAINS toLower('{user_query}')
       OR toLower(t.Name) CONTAINS toLower('{user_query}')
       OR toLower(r.`relation`) CONTAINS toLower('{user_query}')
    RETURN h.Name AS Head, r.relation AS Relation, t.Name AS Tail
    LIMIT 10
    """
    results = graph.run(cypher_query).data()
    
    # Format result into a context string
    context = "\n".join([f"{row['Head']} --{row['Relation']}--> {row['Tail']}" for row in results])
    return context


Whether the function is working properly was tested with the example given below.

In [21]:
user_query = "engine"  # You can replace this with any user input
context = search_graph(graph, user_query)
print("📘 Retrieved context:\n", context)


📘 Retrieved context:
 hvac energy analytics engine --facet of--> hvac system
hvac energy analytics engine --facet of--> hvac system
expressive decision tables --instance of--> engine
expressive decision tables --instance of--> engine
start-stop --subclass of--> engine operation
start-stop --subclass of--> engine operation
powertrain control module --use--> engine operation
powertrain control module --use--> engine operation
parametric model --instance of--> knowledge-based engineering library
parametric model --instance of--> knowledge-based engineering library


Then, it was tested whether the llm is answering the queries correctly and with reference to the context.

In [22]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama

# Step 1: Connect to Ollama (make sure it's running)
llm = Ollama(model="mistral")

# Step 2: Create prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the context below to answer the question.
If the context does not contain enough information, say "I don't know based on the current knowledge."

Context:
{context}

Question: {question}

Answer:
"""
)

# Step 3: Wrap into a chain
qa_chain = LLMChain(llm=llm, prompt=prompt_template)

# Step 4: Use your previous context + user query
response = qa_chain.run({
    "context": context,  # this comes from your graph search
    "question": user_query
})

print("🤖 Answer from LLM:\n", response)


🤖 Answer from LLM:
  In the given context, an "engine" is a facet of both HVAC (Heating, Ventilation, and Air Conditioning) systems and is also associated with powertrain control modules. The "engine" concept is further subclassified into "engine operation", which includes "start-stop". Additionally, it interacts with "parametric models", which are instances of the "knowledge-based engineering library".


Whether or not gradio is working was tested.

In [24]:
import gradio as gr
from py2neo import Graph

def test_connection(question):
    try:
        graph = Graph("bolt://localhost:7687")  # No auth for desktop Neo4j
        result = graph.run("MATCH (n) RETURN COUNT(n) AS count").data()
        count = result[0]['count']
        return f"Neo4j is connected ✅. Your graph has {count} nodes.\nYou asked: {question}"
    except Exception as e:
        return f"❌ Error connecting to Neo4j: {str(e)}"

demo = gr.Interface(
    fn=test_connection,
    inputs=gr.Textbox(label="Test Question"),
    outputs="text",
    title="Neo4j Test",
    description="Checks if Neo4j Desktop connection works"
)

demo.launch()


* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




All of the above functionalities were integrated to get one user interface where you can enter topic specific queries and the llm with reference to the knowledge graph, gives you answers. 

In [32]:
import gradio as gr
from py2neo import Graph
from langchain_community.llms import Ollama
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# ---- SETUP ----

# Connect to Neo4j Desktop (adjust the connection if needed)
graph = Graph("bolt://localhost:7687")  # No auth assumed for Neo4j Desktop

# Connect to local Ollama model
llm = Ollama(model="mistral")

# Prompt template for LLM
prompt_template = PromptTemplate(
    input_variables=["context", "user_query"],
    template="""
You are a helpful assistant. Use the context below to answer the question.
If the context does not contain enough information, say "I don't know based on the current knowledge."

Context:
{context}

Question: {user_query}

Answer:
"""
)
qa_chain = LLMChain(llm=llm, prompt=prompt_template)

# ---- GRAPH SEARCH ----
def search_graph(graph, user_query):
    # You can modify this to search Tail or Relation too
    cypher_query = f"""
    MATCH (h)-[r]->(t)
    WHERE toLower(h.Name) CONTAINS toLower('{user_query}')
       OR toLower(t.Name) CONTAINS toLower('{user_query}')
       OR toLower(r.`relation`) CONTAINS toLower('{user_query}')
    RETURN h.Name AS Head, r.relation AS Relation, t.Name AS Tail
    LIMIT 10
    """
    results = graph.run(cypher_query).data()
    
    # Format result into a context string
    context = "\n".join([f"{row['Head']} --{row['Relation']}--> {row['Tail']}" for row in results])
    return context

def answer_question(user_query):
    context = search_graph(graph, user_query)
    if context.startswith("Error") or "No matching" in context:
        return f"🔍 " + context
    response = qa_chain.run({
        "context": context,
        "user_query": user_query
    })
    return f"🤖 Answer: {response}\n\n📚 Context Used:\n{context}"

demo = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask a question about your knowledge graph"),
    outputs=gr.Textbox(label="Answer with context"),
    title="Neo4j + Ollama RAG Chatbot",
    description="Ask any question. The bot searches your Neo4j knowledge graph for relevant facts, then generates an answer using a local CPU-friendly LLM.",
)

# Launch
demo.launch()

* Running on local URL:  http://127.0.0.1:7871

To create a public link, set `share=True` in `launch()`.


