In [1]:
%load_ext dotenv
%dotenv

In [2]:
import requests

remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch03-downloaded.pdf"

response = requests.get(remote_pdf_url)

if response.status_code == 200:
    with open(pdf_filename, "wb") as pdf_file:
        pdf_file.write(response.content)
else:
    print("Failed to download the PDF. Status code:", response.status_code)

In [3]:
import pdfplumber

text = ""

with pdfplumber.open(pdf_filename) as pdf:
    for page in pdf.pages:
        text += page.extract_text()

In [4]:
import re

from utils import num_tokens_from_string


def split_text_by_titles(text):
    # Compile a regular expression pattern for titles that
    # match lines starting with one or more digits, an optional uppercase letter,
    # followed by a dot, a space, and then up to 50 characters
    title_pattern = re.compile(r"(\n\d+[A-Z]?\. {1,3}.{0,50}\n)", re.DOTALL)
    # Find all titles using the pattern
    titles = title_pattern.findall(text)
    # Split the text at these titles
    sections = re.split(title_pattern, text)
    sections_with_titles = []
    # Append the first section
    sections_with_titles.append(sections[0])
    # Iterate over the rest of sections
    for i in range(1, len(titles) + 1):
        section_text = sections[i * 2 - 1].strip() + "\n" + sections[i * 2].strip()
        sections_with_titles.append(section_text)

    return sections_with_titles


sections = split_text_by_titles(text)
print(f"Number of sections: {len(sections)}")

for s in sections:
    print(num_tokens_from_string(s))

Number of sections: 8
154
254
4186
570
2703
1441
194
600


In [None]:
print(sections[1])

In [6]:
from utils import chunk_text

parent_chunks = []
for s in sections:
    parent_chunks.extend(chunk_text(s, 2000, 40))

for c in parent_chunks:
    print(num_tokens_from_string(c))

154
254
434
503
447
478
467
451
465
469
444
109
442
135
453
452
562
460
440
393
466
451
463
88
194
415
199


In [7]:
from utils import neo4j_driver

In [8]:
index_name = "parent"
try:
    neo4j_driver.execute_query(
        f"CALL db.index.vector.createNodeIndex($index_name, 'Child', 'embedding', 1536, 'cosine')",
        index_name=index_name,
    )
except Exception as e:
    print("Vector Index already exists")

Vector Index already exists


In [9]:
from utils import embed

for i, chunk in enumerate(parent_chunks):
    child_chunks = chunk_text(chunk, 500, 20)
    embeddings = embed(child_chunks)
    # Add to neo4j
    cypher_query = """
    MERGE (p:Parent {id:$id})
    SET p.text = $parent
    WITH p, $children AS children, $embeddings as embeddings
    UNWIND range(0, size(children) - 1) AS child_index
    MERGE (c:Child {id: $id + '-' + toString(child_index)})
    SET c.text = children[child_index], c.embedding = embeddings[child_index]
    MERGE (p)-[:HAS_CHILD]->(c)
    """
    neo4j_driver.execute_query(
        cypher_query,
        id=str(i),
        parent=chunk,
        children=child_chunks,
        embeddings=embeddings,
    )

In [11]:
import os
from openai import OpenAI
from utils import chat


question = "At what time was Einstein really interested in experimental works?"

system_message = f"""
"You are an expert at world knowledge. Your task is to step back "
"and paraphrase a question to a more generic step-back question, which "
"is easier to answer. Here are a few examples
"input": "Could the members of The Police perform lawful arrests?"
"output": "what can the members of The Police do?"

"input": "Jan Sindel’s was born in what country?"
"output": "what is Jan Sindel’s personal history?"
"""
user_message = f"""{question}"""

print("Question:", question)

step_back_question = chat(
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
)
print(f"Stepback results: {step_back_question}")

Question: At what time was Einstein really interested in experimental works?
Stepback results: What was Einstein's interest in experimental works?


In [12]:
question_embedding = embed([step_back_question])[0]

vector_k = 10
final_k = 4

query = """
CALL db.index.vector.queryNodes($index_name, $vector_k, $question_embedding)
YIELD node, score
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score
RETURN parent.text AS text, score
ORDER BY score DESC
LIMIT toInteger($final_k)
"""
similar_records, _, _ = neo4j_driver.execute_query(
    query,
    question_embedding=question_embedding,
    vector_k=vector_k,
    final_k=final_k,
    index_name=index_name,
)

for record in similar_records:
    print(record["text"])
    print(record["score"])
    print("======")

63. Einstein’s Inventions and Patents
Table 1: Patents of Jacob Einstein (Albert Einstein's uncle).
Date Patent Collaborators Description
number
30/08/1890 CH2131 Sebastian Kornprobst New electrical measuring and registering
apparatus
31/12/1886 DE41824 J. A. Essberger Improvements in electric arc lamps
30/11/1889 DE53207 ‐ Automatic circuit breaker for electric arc
lamps
26/02/1890 DE53546 Sebastian Kornprobst Apparatus for stabilising irregular indicator
movement in electric meter displays
21/11/1889 DE53846 Sebastian Kornprobst Improvements to electric measurement
apparatus
23/02/1890 DE60361 Sebastian Kornprobst Spring loaded friction wheel
10.10/1893 DE74429 Control of carbonisation in electric arc
lamps
DE‐Germany, CH‐Switzerland
Considering Einstein’s upbringing, his interest in inventions and patents was not unusual.
Being a manufacturer’s son, Einstein grew upon in an environment of machines and instruments.
When his father’s company obtained the contract to illuminate Munich 

In [13]:
system_message = "You're en Einstein expert, but can only use the provided documents to respond to the questions."
user_message = f"""
Use the following documents to answer the question that will follow:
{[doc["text"] for doc in similar_records]}

---

The question to answer using information only from the above documents: {question}
"""

print("Question:", question)

result = chat(
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
)
print("Response:", result)

Question: At what time was Einstein really interested in experimental works?
Response: Einstein was genuinely interested in experimental works during his ETH days.
