# Adding PDF to Neo4j knowledge graph

### Imports and environmental variables

General setup of model & LLM transformer

In [1]:
import os
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
neo4j_uri = os.getenv("NEO4J_URI")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")

os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["NEO4J_URI"] = neo4j_uri
os.environ["NEO4J_USERNAME"] = neo4j_username
os.environ["NEO4J_PASSWORD"] = neo4j_password
    
graph = Neo4jGraph()

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

llm_transformer = LLMGraphTransformer(llm=llm)

Test on wikipedia loader

In [2]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import TokenTextSplitter

# Read the wikipedia article
raw_documents = WikipediaLoader(query="Elizabeth I").load()
print(raw_documents[:3])
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_documents[:3])



  lis = BeautifulSoup(html).find_all('li')


[Document(page_content='Elizabeth I (7 September 1533 – 24 March 1603) was Queen of England and Ireland from 17 November 1558 until her death in 1603. She was the last monarch of the House of Tudor.\nElizabeth was the only surviving child of Henry VIII and Anne Boleyn, his second wife, who was executed when Elizabeth was two years old. Anne\'s marriage to Henry was annulled, and Elizabeth was declared illegitimate. Henry restored her to the line of succession when she was 10, via the Third Succession Act 1543. After Henry\'s death in 1547, Elizabeth\'s younger half-brother Edward VI ruled until his own death in 1553, bequeathing the crown to a Protestant cousin, Lady Jane Grey, and ignoring the claims of his two half-sisters, the Catholic Mary and the younger Elizabeth, in spite of statutes to the contrary. Edward\'s will was set aside within weeks of his death and Mary became queen, deposing and executing Jane. During Mary\'s reign, Elizabeth was imprisoned for nearly a year on suspic

In [4]:
print("Documents:", documents)

Documents: [Document(page_content='Elizabeth I (7 September 1533 – 24 March 1603) was Queen of England and Ireland from 17 November 1558 until her death in 1603. She was the last monarch of the House of Tudor.\nElizabeth was the only surviving child of Henry VIII and Anne Boleyn, his second wife, who was executed when Elizabeth was two years old. Anne\'s marriage to Henry was annulled, and Elizabeth was declared illegitimate. Henry restored her to the line of succession when she was 10, via the Third Succession Act 1543. After Henry\'s death in 1547, Elizabeth\'s younger half-brother Edward VI ruled until his own death in 1553, bequeathing the crown to a Protestant cousin, Lady Jane Grey, and ignoring the claims of his two half-sisters, the Catholic Mary and the younger Elizabeth, in spite of statutes to the contrary. Edward\'s will was set aside within weeks of his death and Mary became queen, deposing and executing Jane. During Mary\'s reign, Elizabeth was imprisoned for nearly a yea

Parse PDF into text

- use pytesseract OCR as pdf might not contain "text" but rather images of text

In [4]:
import fitz
import pytesseract
from PIL import Image
import io
from langchain_community.document_loaders import PDFPlumberLoader

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_pdf(file_name):
    text = ""
    doc = fitz.open(file_name)
    for page_num in range(len(doc)):
        # Load the page
        page = doc.load_page(page_num)
        
        # Convert the PDF page to an image
        image_data = convert_page_to_image(page)
        
        # Convert image data back to a PIL Image object
        image = Image.open(io.BytesIO(image_data))
        
        # Perform OCR using pytesseract on the image and add to text
        ocr_text = pytesseract.image_to_string(image)
        text += ocr_text
        
    return text

def convert_page_to_image(page):
    # Get the pixmap of the page as bytes
    pixmap = page.get_pixmap()
    img_bytes = pixmap.tobytes()
    
    return img_bytes

# Example usage
file_name = 'documents/test_document_1.pdf'
pdf_text = extract_text_from_pdf(file_name)
# print(pdf_text)
print(len(pdf_text))

2302


extract body text using openAI

In [43]:
# Define your prompt and append pdf_text
prompt = f"""
You will be given text extracted from a news article PDF.
The text will contain the main body content, as well as irrelevant sections such as headers, trending news headlines, random metadata, slogans, or the news outlet name. 
Your task is to extract only the relevant body text from the article, excluding all other irrelevant information.
To guide you, the body text typically:
- Is written in paragraph form, with multiple sentences forming a coherent narrative
- Does not contain short, fragmented phrases or single-sentence headlines
- Does not include the news outlet name, slogans, or metadata
- May include quotes or attributions to sources within the paragraphs
Please output only the extracted body text, without any additional formatting or comments.

Extracted PDF Text:
{pdf_text}
"""

# Invoke the language model with the combined prompt
response = llm.invoke(prompt)

content="A roadside bomb went off Friday in Egypt's northern Sinai Peninsula, killing two members of the country’s security forces and wounding five, security and medical officials said. According to the officials, the security forces were patrolling in the town of Bir al-Abd when their armored vehicle was hit by a remotely-detonated bomb. The wounded were transferred to a military hospital in Sinai’s coastal city of El-Arish. The officials spoke on condition of anonymity to discuss the attacks with the media.\n\nFriday's bombing was the second in the past three days. On Wednesday, one member of the security forces was killed and three were wounded in a roadside bombing in a village near Rafah, a town on the border with the Gaza Strip.\n\nThere was no clear claim of responsibility for Friday's attack, but the Islamic State group posted a statement on Friday, saying it was behind Wednesday's bombing and three other recent attacks. The claims could not be independently verified.\n\nEgypt

clean body text

In [45]:
import re

clean_text = re.sub(r'\n', '', response.content)

print(clean_text)

A roadside bomb went off Friday in Egypt's northern Sinai Peninsula, killing two members of the country’s security forces and wounding five, security and medical officials said. According to the officials, the security forces were patrolling in the town of Bir al-Abd when their armored vehicle was hit by a remotely-detonated bomb. The wounded were transferred to a military hospital in Sinai’s coastal city of El-Arish. The officials spoke on condition of anonymity to discuss the attacks with the media.Friday's bombing was the second in the past three days. On Wednesday, one member of the security forces was killed and three were wounded in a roadside bombing in a village near Rafah, a town on the border with the Gaza Strip.There was no clear claim of responsibility for Friday's attack, but the Islamic State group posted a statement on Friday, saying it was behind Wednesday's bombing and three other recent attacks. The claims could not be independently verified.Egypt has been battling an

split by tokens


In [51]:
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
texts = text_splitter.split_text(clean_text)
print(texts[0])

A roadside bomb went off Friday in Egypt's northern Sinai Peninsula, killing two members of the country’s security forces and wounding five, security and medical officials said. According to the officials, the security forces were patrolling in the town of Bir al-Abd when their armored vehicle was hit by a remotely-detonated bomb. The wounded were transferred to a military hospital in Sinai’s coastal city of El-Arish. The officials spoke on condition of anonymity to discuss the attacks with the media.Friday's bombing was the second in the past three days. On Wednesday, one member of the security forces was killed and three were wounded in a roadside bombing in a village near Rafah, a town on the border with the Gaza Strip.There was no clear claim of responsibility for Friday's attack, but the Islamic State group posted a statement on Friday, saying it was behind Wednesday's bombing and three other recent attacks. The claims could not be independently verified.Egypt has been battling an

feed into LLMGraphTransformer Module 

In [53]:
from langchain_core.documents import Document

documents = [Document(page_content=texts[0])]
# Extract graph data
graph_documents = llm_transformer.convert_to_graph_documents(documents)
# Store to neo4j
graph.add_graph_documents(
  graph_documents, 
  baseEntityLabel=True, 
  include_source=True
)

now that it is working, lets define some methods for reusability

### Final code

In [59]:
import os
import re
import fitz
import pytesseract
from PIL import Image
import io
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document

load_dotenv()

def extract_text_from_pdf(file_name):
    text = ""
    doc = fitz.open(file_name)
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_data = convert_page_to_image(page)
        image = Image.open(io.BytesIO(image_data))
        ocr_text = pytesseract.image_to_string(image)
        text += ocr_text
    return text

def convert_page_to_image(page):
    pixmap = page.get_pixmap()
    img_bytes = pixmap.tobytes()
    return img_bytes

def generate_body_text(pdf_text, llm):
    prompt = f"""
    You will be given text extracted from a news article PDF.
    The text will contain the main body content, as well as irrelevant sections such as headers, trending news headlines, random metadata, slogans, or the news outlet name. 
    Your task is to extract only the relevant body text from the article, excluding all other irrelevant information.
    To guide you, the body text typically:
    - Is written in paragraph form, with multiple sentences forming a coherent narrative
    - Does not contain short, fragmented phrases or single-sentence headlines
    - Does not include the news outlet name, slogans, or metadata
    - May include quotes or attributions to sources within the paragraphs
    Please output only the extracted body text, without any additional formatting or comments.

    Extracted PDF Text:
    {pdf_text}
    """
    response = llm.invoke(prompt)
    
    print(response.content)
    
    clean_text = re.sub(r'\n', '', response.content)
    return clean_text

def process_and_store_text(clean_text, llm_transformer, graph):
    text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
    texts = text_splitter.split_text(clean_text)
    documents = [Document(page_content=text) for text in texts]
    graph_documents = llm_transformer.convert_to_graph_documents(documents)
    graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)
    
def main(file_name):
    openai_api_key = os.getenv("OPENAI_API_KEY")
    neo4j_uri = os.getenv("NEO4J_URI")
    neo4j_username = os.getenv("NEO4J_USERNAME")
    neo4j_password = os.getenv("NEO4J_PASSWORD")
    
    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["NEO4J_URI"] = neo4j_uri
    os.environ["NEO4J_USERNAME"] = neo4j_username
    os.environ["NEO4J_PASSWORD"] = neo4j_password
    
    graph = Neo4jGraph()
    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
    llm_transformer = LLMGraphTransformer(llm=llm)
    
    pdf_text = extract_text_from_pdf(file_name)
    clean_text = generate_body_text(pdf_text, llm)
    process_and_store_text(clean_text, llm_transformer, graph)

if __name__ == "__main__":
    file_name = 'documents/test_document_1.pdf'
    main(file_name)