In [None]:
import requests

# Source PDF and the local file name that the extraction cell reads from.
remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch02-downloaded.pdf"

# requests applies no timeout unless one is passed, so an unresponsive server
# would otherwise block this cell indefinitely.
response = requests.get(remote_pdf_url, timeout=30)

if response.status_code == 200:
    # Binary mode: the response body is raw PDF bytes, not text.
    with open(pdf_filename, "wb") as pdf_file:
        pdf_file.write(response.content)
else:
    print("Failed to download the PDF. Status code:", response.status_code)

In [None]:
import pdfplumber

# Collect the text of every page, then join once at the end instead of growing
# a string with repeated `+=`.
page_texts = []

with pdfplumber.open(pdf_filename) as pdf:
    for page in pdf.pages:
        # extract_text() can yield None for a page without extractable text in
        # some pdfplumber releases; treat that as an empty page so the
        # concatenation cannot raise a TypeError.
        page_texts.append(page.extract_text() or "")

text = "".join(page_texts)

# Quick sanity check: show the first 20 characters of the extracted text.
print(text[0:20])

In [None]:
def chunk_text(text, chunk_size, overlap, split_on_whitespace_only=True):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Number of characters the cursor advances per chunk (at
            least, in whitespace mode). Must be positive.
        overlap: Number of characters a chunk reaches back before the cursor.
            Must not be negative.
        split_on_whitespace_only: When true (the default), a chunk starts at
            the last space character (``" "``) at or before
            ``cursor - overlap`` (or at the start of the text) and ends at the
            first space at or after ``cursor + chunk_size`` (or at the end of
            the text). When false, a fixed window from
            ``cursor - overlap + 1`` to ``cursor + chunk_size + overlap`` is
            used and the cursor advances by exactly ``chunk_size``.

    Returns:
        A list of stripped, non-empty chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            negative.
    """
    # A non-positive chunk_size would keep the cursor from advancing in
    # fixed-window mode, and a negative overlap could index past the text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    chunks = []
    index = 0

    while index < len(text):
        if split_on_whitespace_only:
            # Last space at or before (index - overlap); fall back to the start
            # of the text when there is none or the position is negative.
            left_index = index - overlap
            if left_index >= 0:
                prev_whitespace = max(text.rfind(" ", 0, left_index + 1), 0)
            else:
                prev_whitespace = 0

            # First space at or after (index + chunk_size); fall back to the
            # end of the text.
            next_whitespace = text.find(" ", index + chunk_size)
            if next_whitespace == -1:
                next_whitespace = len(text)

            chunk = text[prev_whitespace:next_whitespace].strip()
            # Resume just past the space that closed this chunk.
            index = next_whitespace + 1
        else:
            start = max(0, index - overlap + 1)
            end = min(index + chunk_size + overlap, len(text))
            chunk = text[start:end].strip()
            index += chunk_size

        # A window made only of whitespace strips to "", which is skipped.
        if chunk:
            chunks.append(chunk)

    return chunks

In [None]:
# Chunk the extracted PDF text with chunk_size=500 and overlap=40, using the
# function's default whitespace-splitting mode.
chunks = chunk_text(text, chunk_size=500, overlap=40)
print(len(chunks))

# Preview the first two chunks with a divider line between them.
for position in (0, 1):
    if position == 1:
        print("---")
    print(chunks[position])

In [None]:
from sentence_transformers import SentenceTransformer

# Pretrained Sentence Transformer model, loaded once and reused by embed().
model = SentenceTransformer("all-MiniLM-L6-v2")


def embed(texts):
    """Encode ``texts`` with the module-level ``model`` and return the result as a tensor."""
    return model.encode(texts, convert_to_tensor=True)


# Embed every chunk, then report how many embeddings came back and the length
# of the first one.
embeddings = embed(chunks)
print(len(embeddings))
print(len(embeddings[0]))



In [None]:
# Dimensions of the embeddings object produced by embed(chunks).
embeddings_shape = embeddings.shape
print(embeddings_shape)

In [None]:
from neo4j import GraphDatabase

import os

# Connection settings are read from the environment when present, so real
# credentials never have to be written into the notebook. The fallbacks keep
# the local-development values.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "password")

driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

In [None]:
# Vector index named `pdf` over the `embedding` property of `Chunk` nodes.
# IF NOT EXISTS lets this cell be run again without error.
create_vector_index_query = """CREATE VECTOR INDEX pdf IF NOT EXISTS
FOR (c:Chunk)
ON c.embedding"""

driver.execute_query(create_vector_index_query)

In [None]:
# Cypher's range() includes its upper bound, so the last index has to be
# size - 1; with size as the bound, one extra Chunk node without text or
# embedding would be merged.
cypher_query = '''
WITH $chunks as chunks, range(0, size($chunks) - 1) AS index
UNWIND index AS i
WITH i, chunks[i] AS chunk, $embeddings[i] AS embedding
MERGE (c:Chunk {index: i})
SET c.text = chunk, c.embedding = embedding
'''

# The neo4j driver cannot handle tensors directly, so each embedding is
# converted to a plain list. A separate name is used so that `embeddings`
# stays a tensor and this cell can be re-run.
embedding_lists = [e.tolist() for e in embeddings]

driver.execute_query(cypher_query, chunks=chunks, embeddings=embedding_lists)

In [None]:
# Read the chunk with index 0 back from the database as a spot check.
lookup_query = """
MATCH (c:Chunk) WHERE c.index = 0 RETURN c.embedding, c.text
"""
records, _, _ = driver.execute_query(lookup_query)

# Show the first 30 characters of the text and the first 3 embedding values.
first_record = records[0]
print(first_record["c.text"][0:30])
print(first_record["c.embedding"][0:3])

In [None]:
question = "At what time was Einstein really interested in experimental works?"

# embed() is called with a one-element list; the question's embedding is the
# first (only) entry of the result.
question_batch = embed([question])
question_embedding = question_batch[0]

In [None]:
# Query the `pdf` vector index for the 2 nodes nearest to the question
# embedding, returning each node's text, score and chunk index.
query = '''
CALL db.index.vector.queryNodes('pdf',2, $question_embedding) YIELD node AS hits, score
RETURN hits.text AS text, score, hits.index AS index
'''

# The embedding is passed to the driver as a plain list.
question_vector = question_embedding.tolist()
similar_records, _, _ = driver.execute_query(query, question_embedding=question_vector)
print(similar_records)

In [None]:
# For each hit: its text, then score and chunk index on one line, then a divider.
for hit in similar_records:
    hit_text, hit_score, hit_index = hit["text"], hit["score"], hit["index"]
    print(hit_text)
    print(hit_score, hit_index)
    print("---------")

In [None]:
# Shell out to the Ollama CLI for the `llama3.2` model that the chat cells use.
# NOTE(review): `ollama run` normally starts an interactive session — confirm
# that this cell returns when executed from a notebook; `ollama pull llama3.2`
# may be what is intended here.
!ollama run llama3.2

In [None]:
import ollama

system_message = "You're an Einstein expert, but can only use the provided documents to respond to the questions."

# Join the retrieved texts with blank lines so the prompt contains the document
# text itself rather than a Python list repr (brackets, quotes, escaped "\n").
documents_text = "\n\n".join(doc["text"] for doc in similar_records)

user_message = f"""
Use the following documents to answer the question that will follow:
{documents_text}

---

The question to answer using information only from the above documents: {question}
"""

print("Question:", question)

# stream=True makes the reply arrive in pieces, printed as they come in.
stream = ollama.chat(
    model='llama3.2',
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    stream=True,
)

for part in stream:
    print(part['message']['content'], end='', flush=True)


In [None]:
# IF NOT EXISTS makes the index creation safe to re-run, so no exception
# handling is needed, and genuine failures (for example a connection error)
# surface instead of being mistaken for an existing index.
driver.execute_query("CREATE FULLTEXT INDEX ftPdfChunk IF NOT EXISTS FOR (c:Chunk) ON EACH [c.text]")

In [None]:
# execute_query returns a (records, summary, keys) result. Iterating the result
# itself would walk those three fields, so loop over `.records` to print one
# line per index.
result = driver.execute_query("SHOW INDEXES YIELD name, type, entityType, labelsOrTypes, properties, options")

for record in result.records:
    print(record)

In [None]:
# Hybrid search: the union of hits from the `pdf` vector index and the
# `ftPdfChunk` full-text index. Each branch divides its scores by that branch's
# maximum score; afterwards every node keeps its highest score and the top $k
# nodes are returned.
hybrid_query = '''
CALL {
    // vector index
    CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max
    UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
    UNION
    // keyword index
    CALL db.index.fulltext.queryNodes('ftPdfChunk', $question, {limit: $k})
    YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max
    UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
}
// dedup
WITH node, max(score) AS score ORDER BY score DESC LIMIT $k
RETURN node, score
'''

# Query parameters: the question embedding as a plain list, the raw question
# text for the full-text search, and the number of results to keep.
hybrid_params = {
    "question_embedding": question_embedding.tolist(),
    "question": question,
    "k": 4,
}
similar_hybrid_records, _, _ = driver.execute_query(hybrid_query, **hybrid_params)

# For each hit: the node text, then its score and chunk index, then a divider.
for hit in similar_hybrid_records:
    node = hit["node"]
    print(node["text"])
    print(hit["score"], node["index"])
    print("======")




In [None]:
# Join the hybrid-search texts with blank lines so the prompt contains the
# document text itself rather than a Python list repr (brackets, quotes,
# escaped "\n").
hybrid_documents_text = "\n\n".join(doc["node"]["text"] for doc in similar_hybrid_records)

user_message = f"""
Use the following documents to answer the question that will follow:
{hybrid_documents_text}

---
The question to answer using information only from the above documents: {question}
"""

print("Question:", question)

# stream=True makes the reply arrive in pieces, printed as they come in.
stream = ollama.chat(
    model='llama3.2',
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    stream=True,
)

for part in stream:
    print(part['message']['content'], end='', flush=True)