In [None]:
# System prompt for step-back prompting: asks the model to paraphrase a
# specific question into a more generic one and supplies two few-shot
# "input"/"output" examples. Sent as the system turn by generate_stepback.
stepback_system_message = """
You are an expert at world knowledge. Your task is to step back
and paraphrase a question to a more generic step-back question, which
is easier to answer. Here are a few examples

"input": "Could the members of The Police perform lawful arrests?"
"output": "what can the members of The Police do?"

"input": "Jan Sindel’s was born in what country?"
"output": "what is Jan Sindel’s personal history?"

Here's a paraphrased version of the question:


"""

In [None]:
import ollama

def generate_stepback(question: str):
    """Ask the local model to rewrite ``question`` as a broader step-back question.

    Sends ``stepback_system_message`` as the system turn and the question as
    the user turn to the ``llama3.2`` model through ``ollama.chat``.

    Args:
        question: The original, specific question.

    Returns:
        Whatever ``ollama.chat`` returns, unmodified. Callers in this notebook
        read the generated text from ``["message"]["content"]``.
    """
    conversation = [
        {"role": "system", "content": stepback_system_message},
        {"role": "user", "content": f"{question}"},
    ]
    return ollama.chat(model='llama3.2', messages=conversation)


In [None]:
# Try the step-back rewrite on a sample question.
question = "Which team did Thierry Audel play for from 2007 to 2008?"
step_back_question = generate_stepback(question)
# Prints the response's whole "message" entry, not only its "content" field.
print(f"Stepback results: {step_back_question['message']}")

In [None]:
import requests

# Remote PDF to fetch and the local filename it is saved under.
remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch03-downloaded.pdf"

# Pass an explicit timeout: requests applies none by default, so without it
# this cell could block forever on an unresponsive server.
response = requests.get(remote_pdf_url, timeout=30)

if response.status_code == 200:
    # Write the raw bytes; the PDF is parsed from this file in a later cell.
    with open(pdf_filename, "wb") as pdf_file:
        pdf_file.write(response.content)
else:
    print("Failed to download the PDF. Status code:", response.status_code)

In [None]:
import pdfplumber

# Concatenate the extracted text of every page into one string (pages are
# joined with no separator).
with pdfplumber.open(pdf_filename) as pdf:
    # `or ""` guards against a page for which extract_text() yields None
    # (NOTE(review): older pdfplumber releases reportedly did this for pages
    # without text — confirm for the installed version); concatenating None
    # would raise TypeError.
    text = "".join(page.extract_text() or "" for page in pdf.pages)

In [None]:
import re

def split_text_by_titles(text):
    """Split ``text`` into sections at numbered title lines.

    A title is a line of its own such as ``1. Introduction`` or
    ``2A.  Methods``: digits, an optional capital letter, a period, one to
    three spaces and up to 60 further characters. The pattern is compiled with
    ``re.DOTALL``, so those trailing characters may include line breaks and a
    match can extend over lines that follow the title.

    Args:
        text: The full document text.

    Returns:
        A list of strings. The first element is whatever precedes the first
        title, unchanged (possibly empty). Each later element is a stripped
        title match, a newline, and the stripped text up to the next match.
    """
    title_pattern = re.compile(r"(\n\d+[A-Z]?\. {1,3}.{0,60}\n)", re.DOTALL)
    # The capturing group makes split() keep the delimiters, producing
    # [preamble, title_1, body_1, title_2, body_2, ...].
    preamble, *rest = title_pattern.split(text)
    headings = rest[0::2]
    bodies = rest[1::2]
    return [preamble] + [
        heading.strip() + "\n" + body.strip()
        for heading, body in zip(headings, bodies)
    ]

In [None]:
# Split the extracted PDF text into sections and report how many were found
# (the count includes the text before the first title).
sections = split_text_by_titles(text)
print(f"Number of sections: {len(sections)}")

In [None]:
def num_tokens_from_string(string: str) -> int:
    """Count the whitespace-separated words in ``string``.

    "Tokens" here are simply the pieces produced by ``str.split()``; no model
    tokenizer is involved.
    """
    words = string.split()
    return len(words)

In [None]:
from utils import chunk_text

# Cut every section into parent chunks and collect them in one flat list.
# NOTE(review): chunk_text comes from utils and is not shown here; 200 and 40
# are presumably chunk size and overlap — confirm. The child chunks built
# later in this notebook use 500, which is larger than this 200; verify that
# parents are really meant to be smaller than their children.
parent_chunks = []
for s in sections:
    parent_chunks.extend(chunk_text(s, 200, 40))

In [None]:
# Print the first three parent chunks as a quick sanity check of the chunking.
print(parent_chunks[:3])

In [None]:
from utils import embed
from utils import neo4j_driver

# Cypher statement executed once per parent chunk. It MERGEs the PDF node and
# a Parent node (id "<pdf_id>-<id>"), stores the parent text, links the two
# with HAS_PARENT, then MERGEs one Child node per entry of $children
# (id "<pdf_id>-<id>-<index>") carrying its text and embedding, and links it
# to the parent with HAS_CHILD.
cypher_import_query = """
MERGE (pdf:PDF {id:$pdf_id})
MERGE (p:Parent {id:$pdf_id + '-' + $id})
SET p.text = $parent
MERGE (pdf)-[:HAS_PARENT]->(p)
WITH p, $children AS children, $embeddings as embeddings
UNWIND range(0, size(children) - 1) AS child_index
MERGE (c:Child {id: $pdf_id + '-' + $id + '-' + toString(child_index)})
SET c.text = children[child_index], c.embedding = embeddings[child_index]
MERGE (p)-[:HAS_CHILD]->(c);
"""

for i, chunk in enumerate(parent_chunks):
    # Split the parent chunk into child chunks and embed each of them.
    child_chunks = chunk_text(chunk, 500, 20)
    embeddings = embed(child_chunks)

    neo4j_driver.execute_query(
        cypher_import_query,
        id=str(i),  # position of this parent chunk in parent_chunks
        pdf_id='1709.00666',
        parent=chunk,
        children=child_chunks,
        embeddings=embeddings.tolist()  # relies on embed() returning an object with .tolist()
    )
    

In [None]:
# Fetch up to 25 PDF -> Parent -> Child paths to inspect the imported graph.
neo4j_driver.execute_query("""
MATCH p=(pdf:PDF)-[:HAS_PARENT]->()-[:HAS_CHILD]->()
RETURN p LIMIT 25
""")

In [None]:
# Create the vector index over Child.embedding if it does not exist yet. The
# index is named "parent" although it covers Child nodes; parent_retrieval
# queries it under that name.
# NOTE(review): no OPTIONS (vector dimensions / similarity function) are
# given — confirm the target Neo4j version accepts that.
neo4j_driver.execute_query("""CREATE VECTOR INDEX parent IF NOT EXISTS FOR (c:Child) ON c.embedding""")

In [None]:
from typing import List


# Vector search over the index named by $index_name that returns parent
# texts: take the $k * 4 nearest nodes, walk HAS_CHILD back to each parent,
# keep the best child score per parent, and return the top $k parents ordered
# by that score.
retrieval_query = """
CALL db.index.vector.queryNodes($index_name, $k * 4, $question_embedding)
YIELD node, score
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score
RETURN parent.text AS text, score
ORDER BY score DESC
LIMIT toInteger($k)
"""

def parent_retrieval(question: str, k: int = 4) -> List[str]:
    """Return the texts of the parents whose children best match ``question``.

    The question is embedded with ``embed`` and ``retrieval_query`` is run
    against the vector index named ``"parent"``.

    Args:
        question: Text to search for.
        k: Passed to the query as ``$k``, which caps the number of rows
            returned.

    Returns:
        The ``text`` field of every returned record, in query order.
    """
    query_vector = embed([question])[0].tolist()
    records, _summary, _keys = neo4j_driver.execute_query(
        retrieval_query,
        index_name="parent",
        k=k,
        question_embedding=query_vector,
    )
    texts = []
    for record in records:
        texts.append(record['text'])
    return texts

In [None]:
# System prompt for answer generation: restricts the model to the provided
# documents.
# NOTE(review): "en" looks like a typo for "an"; left untouched here because
# this text is sent to the model at runtime.
system_message = "You're en Einstein expert, but can only use the provided documents to respond to the question."

def generate_answer(question: str, documents: List[str]) -> str:
    """Answer ``question`` with the ``llama3.2`` model using ``documents``.

    Args:
        question: The question to answer.
        documents: Retrieved texts. The list is interpolated into the prompt
            as-is, i.e. in its ``str()`` form.

    Returns:
        The ``content`` of the message returned by ``ollama.chat``.
    """
    user_message = f"""
Use the following documents to answer the question that will follow:
{documents}

---

The question to answer using information only from the above documents: {question}
"""
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    response = ollama.chat(model='llama3.2', messages=conversation)
    return response['message']['content']

In [None]:
def rag_pipeline(question: str) -> str:
    """Answer ``question`` via step-back rewriting, parent retrieval and generation.

    The step-back question is printed and used for retrieval only; the answer
    is generated for the original ``question``.

    Args:
        question: The user's question.

    Returns:
        The answer text produced by ``generate_answer``.
    """
    stepback_response = generate_stepback(question)
    stepback_prompt = stepback_response['message']['content']
    print(f"Stepback prompt: {stepback_prompt}")
    return generate_answer(question, parent_retrieval(stepback_prompt))


In [None]:
# Run the full pipeline on a sample question; the answer is shown as the cell output.
# NOTE(review): "Einsten's" in the query looks like a typo for "Einstein's".
rag_pipeline("Who was the Einsten's collaborator on sound reproduction system?")

In [None]:
# Second sample question run through the full pipeline.
rag_pipeline("When was Einstein granted the patent for his blouse design?")