In [13]:
import os

from dotenv import load_dotenv

import textwrap

from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader

In [14]:
loader = PyPDFLoader("cv.pdf")
pages = loader.load_and_split()

In [15]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = text_splitter.split_documents(pages)

In [16]:
import warnings
warnings.filterwarnings("ignore")

load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI') 
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
NEO4J_DATABASE = 'neo4j'
VECTOR_INDEX_NAME = 'pdf_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [17]:
neo4j_vector_store = Neo4jVector.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=chunks,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTY,
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)



In [18]:
# Create a PDF node
cypher = """
MERGE (p:PDF {name: $pdfName})
RETURN p
"""
kg.query(cypher, params={'pdfName': "cv.pdf"})

# Connect chunks to their parent PDF with a PART_OF relationship
cypher = """
MATCH (c:Chunk), (p:PDF)
WHERE p.name = $pdfName
MERGE (c)-[newRelationship:PART_OF]->(p)
RETURN count(newRelationship)
"""
kg.query(cypher, params={'pdfName': "cv.pdf"})

# Create a NEXT relationship between subsequent chunks
cypher = """
MATCH (c1:Chunk), (c2:Chunk)
WHERE c1.chunkSeqId = c2.chunkSeqId - 1
MERGE (c1)-[r:NEXT]->(c2)
RETURN count(r)
"""
kg.query(cypher)

[{'count(r)': 0}]

In [19]:
# Create a retriever from the vector store
retriever = neo4j_vector_store.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
chain = RetrievalQAWithSourcesChain.from_chain_type(
    OpenAI(temperature=0), 
    chain_type="stuff",
    retriever=retriever
)

In [20]:
question = "What is the main topic of this PDF document?"
answer = chain(
    {"question": question},
    return_only_outputs=True,
)
print(textwrap.fill(answer["answer"]))

 The main topic of this PDF document is the author's CV and their
experience in web development, blockchain, and software engineering.


In [21]:
question = "List me all the tools he used for frontend development."
answer = chain(
    {"question": question},
    return_only_outputs=True,
)
print(textwrap.fill(answer["answer"]))

 The tools used for frontend development are React, Next.js, Tailwind,
Zustand, React Query, i18n, Zod, StoryBook, Styled Components, shadcn,
MUI, AntD, AntV Chart.
