In [2]:
from src.pdf_reader import extract_full_text

text = extract_full_text("study_in_scarlet.pdf")  # Replace with actual file
text[:1000]  # Show first 1000 chars


'A Study In Scarlet\nArthur Conan Doyle\nThis text is provided to you “as-is” without any warranty. No warranties of any kind, expressed or implied, are made to you as to the\ntext or any medium it may be on, including but not limited to warranties of merchantablity or ﬁtness for a particular purpose.\nThis text was formatted from various free ASCII and HTML variants. See http://sherlock-holm.es for an electronic form of this text\nand additional information about it.\nThis text comes from the collection’s version 3.1.\nTable of contents\nPart I\nMr. Sherlock Holmes . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n5\nThe Science Of Deduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n8\nThe Lauriston Garden Mystery . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n12\nWhat John Rance Had To Tell. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

In [4]:
from src.pdf_reader import extract_text_by_page
from src.chunker import chunk_text

pages = extract_text_by_page("study_in_scarlet.pdf")
chunks = chunk_text(pages)

print(f"Total chunks: {len(chunks)}")
print(chunks[0][:300])  # Preview first chunk
total_chars = sum(len(c) for c in chunks)
print(f"Total characters: {total_chars}")



Total chunks: 541
A Study In Scarlet
Arthur Conan Doyle
This text is provided to you “as-is” without any warranty. No warranties of any kind, expressed or implied, are made to you as to the
text or any medium it may be on, including but not limited to warranties of merchantablity or ﬁtness for a particular purpose.
T
Total characters: 256521


In [6]:
from src.pdf_reader import extract_text_by_page
from src.chunker import chunk_text
from src.embedding import embed_texts

pages = extract_text_by_page("study_in_scarlet.pdf")
chunks = chunk_text(pages)

embeddings = embed_texts(chunks[:5])  # First few for testing
print(f"{len(embeddings)} embeddings created. Each dim: {len(embeddings[0])}")


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.92it/s]

5 embeddings created. Each dim: 384





In [2]:
from src.pdf_reader import extract_text_by_page
from src.chunker import chunk_text
from src.embedding import embed_texts
from src.vector_store import get_or_create_collection, add_chunks_to_collection

# 1) read + chunk
pages = extract_text_by_page("study_in_scarlet.pdf")
chunks = chunk_text(pages)

# 2) embed
embeddings = embed_texts(chunks)

# 3) store
collection = get_or_create_collection("my-pdf-docs")
add_chunks_to_collection(collection, chunks, embeddings, source="study_in_scarlet.pdf")

print("Docs in collection:", collection.count())


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 17/17 [00:28<00:00,  1.66s/it]


Docs in collection: 1082


In [3]:
from src.index_manager import ingest_pdf

collection = ingest_pdf("study_in_scarlet.pdf")
print("Docs in collection:", collection.count())


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 17/17 [00:10<00:00,  1.58it/s]

Docs in collection: 1082





In [10]:
from src.search import search_query

hits = search_query("who is sherlock?", k=3)
print(hits)


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 53.85it/s]

{'documents': ['and additional information about it.\nThis text comes from the collection’s version 3.1.\nTable of contents\nPart I\nMr. Sherlock Holmes . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n5\nThe Science Of Deduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n8\nThe Lauriston Garden Mystery . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n12', 'and additional information about it.\nThis text comes from the collection’s version 3.1.\nTable of contents\nPart I\nMr. Sherlock Holmes . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n5\nThe Science Of Deduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n8\nThe Lauriston Garden Mystery . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n12', 'his tape and his glass in his pocket.\n“They




In [1]:
from src.qa_pipeline import iterative_summarize

result = summarize_pdf("Summarize the PDF", k=5)
print("=== SUMMARY ===")
print(result["summary"])


hello


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


NameError: name 'summarize_pdf' is not defined

In [3]:
from src.qa_pipeline import iterative_summarize
result = iterative_summarize("Summarize the PDF", k=10)
print(result["summary"])


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.57it/s]


"The Book of                Life" is a remarkable mixture of shrewdness and absurdity" “He was an uncommon drunk sort o’ man,” said John Rance. “You seem to be a walking calendar of crime,’ Stamford said.


In [5]:
from src.qa_pipeline import iterative_summarize

result = iterative_summarize("Summarize the PDF", k=10)

print("=== FINAL SUMMARY ===")
print(result["summary"])


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.09it/s]


=== FINAL SUMMARY ===
"The Book of                Life" is a remarkable mixture of shrewdness and absurdity" “He was an uncommon drunk sort o’ man,” said John Rance. “You seem to be a walking calendar of crime,’ Stamford said.


In [6]:
from src.rag_qa import answer_question_rag

print(answer_question_rag("What is this book about?", k=5))


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 2 files: 100%|█████████████████████████████████████████████████████████████████| 2/2 [10:10<00:00, 305.34s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 13.45it/s]
Device set to use cpu
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.11it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy t

ValueError: Input length of input_ids is 300, but `max_length` is set to 300. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [7]:
import ollama

response = ollama.chat(
    model='mistral',
    messages=[
        {"role": "user", "content": "Summarize the plot of A Study in Scarlet."}
    ]
)

print(response['message']['content'])


 Title: A Study in Scarlet

Author: Arthur Conan Doyle

Plot Summary:

"A Study in Scarlet" is a detective mystery novel that introduces Sherlock Holmes and Dr. John Watson. The story begins with Holmes, a consulting detective living at 221B Baker Street in London, and his new roommate, the military doctor John Watson, returning from an injury sustained during the Second Anglo-Afghan War.

The narrative unfolds as Watson writes about their first case together. They are approached by Inspector Lestrade of Scotland Yard, who asks Holmes to consult on a mysterious murder case. The victim was found dead in a house with the word "Rache" (German for "revenge") scrawled on the wall.

Holmes deduces that the crime scene is not the actual site of the murder, and they find a trail leading to Broughton-Hall, a remote country estate in Lancashire, England. After examining the house, Holmes identifies it as the home of one Enoch J. Drebber and Joseph T. Stamford, American brothers who are notorious

In [1]:
from src.rag_qa import answer_question_rag
print(answer_question_rag("What is this book about?", k=5))


hello


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 31.13it/s]


The book is titled "The Book of Life" and it attempts to show how much an observant person can learn by examining everything that comes their way accurately and systematically. It is described as a mixture of shrewdness and absurdity, with the reasoning being close and intense but the deductions appearing far-fetched and exaggerated. The author's knowledge seems remarkable in some areas but extremely limited in others, particularly contemporary literature, philosophy, and politics.


In [1]:
from src.pdf_reader import extract_text_by_page
from src.chunker import chunk_text
from src.embedding import embed_texts
from src.vector_store import get_or_create_collection, add_chunks_to_collection

# Step 1: Extract & chunk the PDF
pages = extract_text_by_page("study_in_scarlet.pdf")
chunks = chunk_text(pages)

# Step 2: Test with first 50 chunks
sample_chunks = chunks[:50]
sample_embeddings = embed_texts(sample_chunks)

# Step 3: Add to vector store
collection = get_or_create_collection("my-pdf-docs")
add_chunks_to_collection(collection, sample_chunks, sample_embeddings, source="study_in_scarlet.pdf")


hello


  from tqdm.autonotebook import trange
No sentence-transformers model found with name hkunlp/instructor-base. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


In [2]:
from src.rag_qa import answer_question_rag

print(answer_question_rag("What is this book about?", k=5))


`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


This book appears to be "A Study In Scarlet" by Arthur Conan Doyle, as indicated in the context provided. However, without additional information, it's challenging to summarize the entire plot of the book based solely on the provided text. The story primarily revolves around the character of Sherlock Holmes and his friend Dr. John Watson, and the mystery they solve together. This particular segment of the text suggests that the characters are discussing a strange man, possibly connected to a case or mystery, and Holmes' unique methods of gathering information.
