# Build a RAG chatbot using Neo4j (Knowledge Graph)

Create a Neo4j knowledge graph and build a UI on top of it. The user enters a query in natural language, an LLM converts it into Cypher, and the resulting Cypher query is then run against the knowledge graph.

### 1. Installation requirements

In [1]:
# Install/upgrade the packages imported by the LangChain cells of this notebook.
# NOTE(review): the final Streamlit cell also imports streamlit, spacy and
# llama_index, none of which are installed by these commands.
%pip install --upgrade --quiet  neo4j
%pip install --upgrade --quiet  langchain-openai langchain-community
%pip install --upgrade --quiet  tiktoken
%pip install --upgrade --quiet  docx2txt

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### 2. We want to use OpenAIEmbeddings, so we need an OpenAI API key.

In [1]:
import getpass
import os

# Prompt for the key only when it is not already configured, so re-running this
# cell (or running where OPENAI_API_KEY is already exported) neither overwrites
# the existing value nor blocks on interactive input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key:········


### 3. Load packages

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import Docx2txtLoader

### 4. Load the Chhatrapati Shivaji Maharaj diary .docx file and split the text into small chunks

In [3]:
# Location of the diary document to ingest.
docx_path = "C:/Users/SSK/Ch. S M project/data/docx/Can 1111.docx"

# Read the .docx file into LangChain documents.
loader = Docx2txtLoader(docx_path)
documents = loader.load()

# Cut the documents into chunks of up to 1000 characters with no overlap
# between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
docs = text_splitter.split_documents(documents)

# Embedding model instance.
embeddings = OpenAIEmbeddings()

ValueError: File path C:/Users/SSK/Ch. S M project/data/docx/Can 1111.docx is not a valid file or url

### 5. Connect to the neo4j database

In [11]:
# Neo4jVector requires the Neo4j database credentials.
# Each value can be overridden through an environment variable so real
# credentials do not have to be stored in the notebook; the literals are only
# local-development fallbacks.

url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")


## Similarity Search with Cosine Distance (Default)

In [12]:
# Neo4jVector connects to Neo4j and creates a vector index if one is needed,
# storing the chunks in `docs` together with their OpenAI embeddings.

db = Neo4jVector.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(),
    url=url,
    username=username,
    password=password,
)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

## Working with vectorstore

Above, we created a vectorstore from scratch. However, we often want to work with an existing vectorstore. To do that, we can initialize it directly from the existing index.

In [15]:
# Index to attach to; "vector" is the default index name.
index_name = "vector"

# Open the already-populated index instead of ingesting the documents again.
store = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    index_name=index_name,
    url=url,
    username=username,
    password=password,
)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
import streamlit as st
import os
from neo4j import GraphDatabase
import spacy
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, ServiceContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from langchain_openai import OpenAI  # Import the correct package for LangChain with OpenAI support

# ---- NEO4J SETUP ----
# Connection settings are taken from the environment when present; the literals
# are local-development fallbacks only.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "12345678")
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

# ---- ENVIRONMENT VARIABLES ----
# SECURITY: API keys must never be hard-coded in source. The key is read from
# the OPENAI_API_KEY environment variable; any key that was previously committed
# in this file should be treated as compromised and revoked.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail early with a visible message instead of a later authentication error.
    st.error("OPENAI_API_KEY is not set. Export it before starting the app.")
    st.stop()

# ---- PROMPT TEMPLATE ----
# str.format template with three placeholders -- {context}, {graph_insights}
# and {question} -- that is filled in when the user submits a question.
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Graph Insights: {graph_insights}
Question: {question}

Answer the question and provide additional helpful information,
based on the pieces of information and graph insights, if applicable. Be succinct.

Responses should be properly formatted to be easily read.
"""
# Define the context for your prompt
# Fixed description substituted for the {context} placeholder above.
context = "This directory contains multiple documents providing examples and solutions for various programming tasks."

# Data ingestion: read every file found in the data directory.
directory_path = r"C:\Users\SSK\Ch. S M project\data"
reader = SimpleDirectoryReader(directory_path)
documents = reader.load_data()

# spaCy pipeline whose named entities are used to populate the graph.
nlp = spacy.load("en_core_web_sm")

# Function to extract entities and relationships from documents
def populate_graph(documents, driver, nlp):
    """Write the ORG/PRODUCT entities of each document into Neo4j.

    For every document, the text is run through ``nlp`` and the entities
    labelled ``ORG`` or ``PRODUCT`` are kept in order of appearance. Each
    distinct entity becomes a ``Concept`` node, and each pair of consecutive
    entities is linked with a ``RELATED_TO`` relationship pointing from the
    earlier entity to the later one.

    Args:
        documents: Iterable of objects exposing the document body as ``.text``.
        driver: Object whose ``session()`` returns a context manager yielding
            a session with a ``run(query, **params)`` method.
        nlp: Callable that takes a string and returns an object with an
            ``ents`` iterable; each entity exposes ``.text`` and ``.label_``.

    Returns:
        None. The graph is modified as a side effect.
    """
    concept_labels = {"ORG", "PRODUCT"}

    with driver.session() as session:
        for doc in documents:
            concepts = [
                ent.text for ent in nlp(doc.text).ents if ent.label_ in concept_labels
            ]

            # MERGE is idempotent, so each distinct name only needs to be sent
            # once per document; dict.fromkeys keeps first-seen order.
            for concept in dict.fromkeys(concepts):
                session.run("MERGE (:Concept {name: $concept})", concept=concept)

            # Link each entity to the one that follows it in the text.
            for concept, next_concept in zip(concepts, concepts[1:]):
                if concept == next_concept:
                    # The same entity mentioned twice in a row would otherwise
                    # produce a meaningless self-loop (c)-[:RELATED_TO]->(c).
                    continue
                session.run(
                    """
                    MATCH (c1:Concept {name: $concept}), (c2:Concept {name: $next_concept})
                    MERGE (c1)-[:RELATED_TO]->(c2)
                    """,
                    concept=concept, next_concept=next_concept
                )

# Populate the Neo4j graph
# NOTE(review): Streamlit is understood to re-execute the script on each user
# interaction, which would repeat the graph population and index build below;
# consider wrapping them in st.cache_resource -- confirm before changing.
populate_graph(documents, driver, nlp)

# Split the documents into nodes
# SentenceSplitter is not among the file's top-level imports, so it is imported
# here to keep this step self-contained.
from llama_index.core.node_parser import SentenceSplitter

text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

# Set up embedding model and LLM using OpenAI API
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): "gpt-3.5-turbo" is a chat model while langchain_openai.OpenAI is
# the completion-style wrapper; ChatOpenAI may be required here -- verify.
llm = OpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

# Create service context
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Create vector store index
# Build the index from the nodes produced above so the chunk_size/chunk_overlap
# settings actually apply. `node_parser` expects a parser, not a node list, so
# `from_documents(..., node_parser=nodes)` did not use these nodes.
vector_index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)
vector_index.storage_context.persist(persist_dir="./storage_mini")

# Load the index from storage
storage_context = StorageContext.from_defaults(persist_dir="./storage_mini")
index = load_index_from_storage(storage_context, service_context=service_context)

# Query Engine Setup
query_engine = index.as_query_engine(service_context=service_context)

# Streamlit UI components
st.title("Graph-Powered Query System")

question = st.text_input("Ask your question:", "Explain Python?")
if st.button("Ask"):
    # Nothing useful can be retrieved or answered for a blank question.
    if not question.strip():
        st.warning("Please enter a question.")
        st.stop()

    # Look up concepts that are mentioned in the question. The question is the
    # longer string, so it must be on the left of CONTAINS: a concept such as
    # "Python" matches "Explain Python?", whereas the reverse test would need
    # the concept name to contain the entire question.
    with driver.session() as session:
        result = session.run(
            """
            MATCH (c:Concept)
            WHERE toLower($question) CONTAINS toLower(c.name)
            OPTIONAL MATCH (c)-[:RELATED_TO]->(other:Concept)
            RETURN c.name AS concept, collect(other.name) AS related_concepts
            """,
            question=question,
        )
        # Records are consumed while the session is still open.
        insights = [
            f"Concept: {record['concept']}, Related Concepts: {', '.join(record['related_concepts'])}"
            for record in result
        ]
    graph_insights = "\n".join(insights) if insights else "No relevant graph insights found."

    # Construct the query prompt
    query_prompt = prompt_template.format(context=context, graph_insights=graph_insights, question=question)
    response = query_engine.query(query_prompt)

    st.write("### Answer:")
    st.write(response.response)
