In [4]:
import sys
# Install pypdf using the interpreter that runs this kernel (sys.executable is
# interpolated into the shell command), so the package lands in the kernel's environment.
!{sys.executable} -m pip install pypdf


Collecting pypdf
  Using cached pypdf-6.7.1-py3-none-any.whl.metadata (7.1 kB)
Using cached pypdf-6.7.1-py3-none-any.whl (331 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.7.1


In [5]:
from pypdf import PdfReader
from typing import List

def load_pdf(file_path: str) -> List[dict]:
    """Load a PDF and extract its text page by page.

    Args:
        file_path: Path handed to ``PdfReader``; also recorded as the
            ``source`` of every returned document.

    Returns:
        One dictionary per page that produced non-blank text, each holding
        ``page_content`` (the stripped page text) and ``metadata`` with the
        1-based ``page`` number and the ``source`` path. Pages whose
        extracted text is empty or whitespace-only are skipped.
    """
    reader = PdfReader(file_path)
    documents = []

    # Count from 1 so the stored page number is 1-based. Skipped pages still
    # advance the counter, so the number reflects the position in the PDF.
    for page_number, page in enumerate(reader.pages, start=1):
        # Strip before testing: a falsy result and a whitespace-only result
        # are both "no usable text" and must not become an empty document.
        text = (page.extract_text() or "").strip()
        if not text:
            continue

        documents.append(
            {
                "page_content": text,
                "metadata": {
                    "page": page_number,
                    "source": file_path,
                },
            }
        )

    return documents

In [6]:
from typing import List, Dict

def chunk_text(
        documents: List[Dict],
        chunk_size: int=500,
        overlap: int=100,
)-> List[Dict]:
    """
    Split documents into overlapping, word-based chunks.

    Args:
        documents: Output from the loader; each item must provide
            ``page_content`` (a string) and ``metadata`` (a mapping).
        chunk_size: Maximum number of whitespace-separated words per chunk.
        overlap: Number of words shared by consecutive chunks of the same
            document. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk dictionaries with ``chunk_id`` (a counter running across
        all documents), ``text`` (the words joined by single spaces) and
        ``metadata`` (a shallow copy of the source document's metadata).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    # A non-positive step would keep `start` from ever reaching the end of the
    # word list, so reject such parameters up front instead of looping forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    chunk_id = 0

    for doc in documents:
        words = doc["page_content"].split()

        start = 0
        while start < len(words):
            end = start + chunk_size

            chunks.append(
                {
                    "chunk_id": chunk_id,
                    "text": " ".join(words[start:end]),
                    # Copy so chunks do not all alias one mutable dict.
                    "metadata": dict(doc["metadata"]),
                }
            )
            chunk_id += 1

            # This chunk already reached the last word; advancing again would
            # only emit a tail that is fully contained in the chunk just added.
            if end >= len(words):
                break
            start += step

    return chunks

In [7]:
# test_ingestion.py

from ingestion.loader import load_pdf
from ingestion.chunker import chunk_text

if __name__ == "__main__":
    # Smoke test for the ingestion pipeline: load a PDF, chunk it, and show
    # a short preview of the first chunk.
    file_path = "sample-1.pdf"

    print("Loading PDF...")
    documents = load_pdf(file_path)
    print(f"Pages loaded: {len(documents)}")

    print("Chunking text...")
    chunks = chunk_text(documents)

    print(f"Total chunks created: {len(chunks)}")

    # A PDF without extractable text yields no chunks; indexing chunks[0]
    # unconditionally would then raise IndexError.
    if chunks:
        print("\nFirst chunk preview:")
        print(chunks[0]["text"][:300])
    else:
        print("\nNo text could be extracted; nothing to preview.")


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


Loading PDF...
Pages loaded: 11
Chunking text...
Total chunks created: 21

First chunk preview:
GROWW PAY SERVICES PRIVATE LIMITED Vaishnavi Tech Park, South Tower, 4th Floor, Sarjapur Main Road, Bengaluru, Karnataka -560103 CIN: U67100KA2021PTC149048 email: corp.secretarial@groww.in Tel:080-69601300 Terms of Use (Last updated – May 2023) Introduction The Unified Payments Interface (“UPI”) is 


In [1]:
import uuid
from typing import List, Dict


def chunk_text(
    documents: List[Dict],
    chunk_size: int = 500,
    overlap: int = 100,
) -> List[Dict]:
    """
    Split documents into overlapping, word-based chunks and assign a doc_id.

    Args:
        documents: Items providing ``page_content`` (a string) and
            ``metadata`` (a mapping).
        chunk_size: Maximum number of whitespace-separated words per chunk.
        overlap: Number of words shared by consecutive chunks of the same
            document. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk dictionaries with ``doc_id``, ``chunk_id``, ``text`` and
        ``metadata``. One random UUID4 string is generated per call and used
        as ``doc_id`` for every chunk; ``chunk_id`` is a counter running
        across all documents. ``metadata`` is a new dict containing the
        source metadata plus ``doc_id`` and ``chunk_id``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    # A non-positive step would keep `start` from ever reaching the end of the
    # word list, so reject such parameters up front instead of looping forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    doc_id = str(uuid.uuid4())  # one ID shared by everything passed in this call
    chunk_id = 0

    for doc in documents:
        words = doc["page_content"].split()
        start = 0

        while start < len(words):
            end = start + chunk_size

            chunks.append(
                {
                    "doc_id": doc_id,
                    "chunk_id": chunk_id,
                    "text": " ".join(words[start:end]),
                    "metadata": {
                        **doc["metadata"],
                        "doc_id": doc_id,
                        "chunk_id": chunk_id,
                    },
                }
            )
            chunk_id += 1

            # This chunk already reached the last word; advancing again would
            # only emit a tail that is fully contained in the chunk just added.
            if end >= len(words):
                break
            start += step

    return chunks


In [3]:
import sys
# NOTE(review): other cells import `ingestion.loader` / `ingestion.chunker` after adding the
# project root to sys.path, so `ingestion` looks like a local package. This command installs
# a PyPI distribution that merely shares the name — confirm it is actually wanted.
!{sys.executable} -m pip install ingestion


Collecting ingestion

ERROR: Cannot install ingestion==0.0.24, ingestion==0.0.25, ingestion==0.0.26, ingestion==0.0.27, ingestion==0.0.28, ingestion==0.0.29, ingestion==0.0.30, ingestion==0.0.31, ingestion==0.0.32, ingestion==0.0.33, ingestion==0.0.34, ingestion==0.0.35, ingestion==0.0.36, ingestion==0.0.38, ingestion==0.0.39, ingestion==0.0.40, ingestion==0.0.41 and ingestion==0.0.42 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts



  Using cached ingestion-0.0.42-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting azure-keyvault-secrets==4.4.0 (from ingestion)
  Using cached azure_keyvault_secrets-4.4.0-py3-none-any.whl.metadata (28 kB)
Collecting azure-storage-blob==12.12.0 (from ingestion)
  Using cached azure_storage_blob-12.12.0-py3-none-any.whl.metadata (25 kB)
Collecting pandas==1.4.2 (from ingestion)
  Downloading pandas-1.4.2-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting openpyxl==3.0.9 (from ingestion)
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting xlrd==2.0.1 (from ingestion)
  Downloading xlrd-2.0.1-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting sqlalchemy==1.4.37 (from ingestion)
  Downloading SQLAlchemy-1.4.37-cp310-cp310-win_amd64.whl.metadata (10.0 kB)
INFO: pip is looking at multiple versions of ingestion to determine which version is compatible with other requirements. This could take a while.
Collecting ingestion
  Downloading ingestion-0.0.41-py2.py3-no

In [17]:
from ingestion.loader import load_pdf
from ingestion.chunker import chunk_text

# Load the sample PDF from the parent directory and split its pages into chunks.
documents = load_pdf("../sample-1.pdf")

chunks = chunk_text(documents)

# Report the chunk count and dump the first chunk dictionary in full.
print("Total chunks:", len(chunks))
print("Example chunk:")
print(chunks[0])


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


Total chunks: 21
Example chunk:
{'chunk_id': 0, 'text': 'GROWW PAY SERVICES PRIVATE LIMITED Vaishnavi Tech Park, South Tower, 4th Floor, Sarjapur Main Road, Bengaluru, Karnataka -560103 CIN: U67100KA2021PTC149048 email: corp.secretarial@groww.in Tel:080-69601300 Terms of Use (Last updated – May 2023) Introduction The Unified Payments Interface (“UPI”) is a payment platform built by National Payments Corporation of India (“NPCI”) that allows instant online payments between the bank accounts of any two parties. UPI offers an architecture and a set of standard Application Programming Interface (“API") specifications to facilitate these online payments. It aims to simplify and provide a single interface across all NPCI systems besides creating interoperability and superior customer experience. These terms and conditions (“Terms”) regulate the payments done under UPI , a payment service platform (“Platform”) developed by NPCI, an umbrella organisation incorporated in 2008 and acting as the 

In [21]:
# embeddings/embedder.py

from sentence_transformers import SentenceTransformer
from typing import List

class Embedder:
    """Wraps a SentenceTransformer model and exposes list-based embeddings."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Create the underlying ``SentenceTransformer`` for ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_texts(
        self,
        texts: List[str],
        show_progress_bar: bool = True,
    ) -> List[List[float]]:
        """
        Converts list of texts into embedding vectors.

        Args:
            texts: Strings to embed.
            show_progress_bar: Forwarded to ``model.encode``. Defaults to
                ``True``; pass ``False`` to keep the progress bar out of
                notebook or server output.

        Returns:
            One vector (list of floats) per input text, in input order.
            An empty input returns an empty list without calling the model.
        """
        # Nothing to encode: skip the model call rather than rely on how the
        # model handles an empty batch.
        if not texts:
            return []

        embeddings = self.model.encode(texts, show_progress_bar=show_progress_bar)
        return embeddings.tolist()



In [4]:
import sys
import os

# Make the parent of the current working directory importable, so modules
# living there can be imported from this notebook.
project_root = os.path.abspath("..")
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)


In [5]:
import embeddings.embedder
# Sanity check: list the names defined in the imported embeddings.embedder module.
print(dir(embeddings.embedder))


  from .autonotebook import tqdm as notebook_tqdm


['Embedder', 'List', 'SentenceTransformer', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__']


In [9]:
import os

# Point HF_HOME at an "hf_cache" directory under the current user's home
# instead of a machine-specific absolute path, so the notebook is portable.
# NOTE(review): presumably this only takes effect if it runs before any
# Hugging Face library is imported in the kernel — confirm and move this cell up.
os.environ["HF_HOME"] = os.path.join(os.path.expanduser("~"), "hf_cache")
os.makedirs(os.environ["HF_HOME"], exist_ok=True)


In [12]:
import os
import huggingface_hub

# Show the cache-related environment variables as this process currently sees them
# (None means the variable is not set).
print("HF_HOME:", os.environ.get("HF_HOME"))
print("HUGGINGFACE_HUB_CACHE:", os.environ.get("HUGGINGFACE_HUB_CACHE"))
print("TRANSFORMERS_CACHE:", os.environ.get("TRANSFORMERS_CACHE"))

from huggingface_hub import hf_hub_download
# Value held by the already-imported library; it can differ from os.environ["HF_HOME"].
# NOTE(review): presumably huggingface_hub captures HF_HOME when it is first imported, so the
# variable has to be set before that import — confirm.
print("HF default cache:", huggingface_hub.constants.HF_HOME)


HF_HOME: C:\Users\venka_5gwzxwk\hf_cache
HUGGINGFACE_HUB_CACHE: None
TRANSFORMERS_CACHE: None
HF default cache: D:\huggingface


In [1]:
import os

# Single cache location under the current user's home directory, computed once
# instead of repeating a machine-specific absolute path for every variable.
hf_cache_dir = os.path.join(os.path.expanduser("~"), "hf_cache")

# All three variables point at the same directory.
# NOTE(review): TRANSFORMERS_CACHE is presumably a legacy name superseded by
# HF_HOME in recent transformers releases — confirm before relying on it.
os.environ["HF_HOME"] = hf_cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache_dir
os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir

os.makedirs(hf_cache_dir, exist_ok=True)


In [4]:
import sys
import os

# Make the parent of the current working directory importable, so modules
# living there can be imported from this notebook.
project_root = os.path.abspath("..")
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added:", project_root)


Project root added: c:\Users\venka_5gwzxwk\OneDrive\Desktop\100x\notebook-lm


In [6]:
import sys
import os

# Make the parent of the current working directory importable; skip the append
# when the entry is already present so re-running the cell adds no duplicates.
if os.path.abspath("..") not in sys.path:
    sys.path.append(os.path.abspath(".."))


In [7]:
from ingestion.loader import load_pdf
from ingestion.chunker import chunk_text

# Rebuild the chunk list from the sample PDF; later cells read `chunks`.
documents = load_pdf("../sample-1.pdf")
chunks = chunk_text(documents)

print("Chunks created:", len(chunks))


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


Chunks created: 21


In [8]:
from embeddings.embedder import Embedder

# Create the embedder with its default model; later cells reuse `embedder`.
embedder = Embedder()

# Embed the text of every chunk; `vectors` stays index-aligned with `chunks`.
texts = [chunk["text"] for chunk in chunks]
vectors = embedder.embed_texts(texts)

# Sanity check: one vector per chunk, plus the length of the first vector.
print("Number of vectors:", len(vectors))
print("Vector dimension:", len(vectors[0]))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Number of vectors: 21
Vector dimension: 384


In [12]:
import sys
# Install faiss-cpu using the interpreter that runs this kernel (sys.executable is
# interpolated into the shell command), so the package lands in the kernel's environment.
!{sys.executable} -m pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-cp310-win_amd64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-cp310-win_amd64.whl (18.9 MB)
   ---------------------------------------- 0.0/18.9 MB ? eta -:--:--
    --------------------------------------- 0.3/18.9 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.9 MB 1.4 MB/s eta 0:00:14
   - -------------------------------------- 0.5/18.9 MB 1.4 MB/s eta 0:00:14
   - -------------------------------------- 0.8/18.9 MB 817.9 kB/s eta 0:00:23
   -- ------------------------------------- 1.3/18.9 MB 1.2 MB/s eta 0:00:15
   ---- ----------------------------------- 2.1/18.9 MB 1.6 MB/s eta 0:00:11
   ----- ---------------------------------- 2.6/18.9 MB 1.8 MB/s eta 0:00:10
   ------ --------------------------------- 3.1/18.9 MB 1.9 MB/s eta 0:00:09
   ------- -------------------------------- 3.7/18.9 MB 1.9 MB/s eta 0:00:08
   -------- ------------------------------- 4.2/18.9 MB 2.0 MB/s eta 0:00:08
   -

In [13]:
# vectorstore/store.py

import faiss
import numpy as np
from typing import List


class VectorStore:
    """FAISS L2 index paired with the chunk dictionaries it indexes.

    Row ``i`` of ``index`` corresponds to ``text_chunks[i]``; ``add`` keeps
    the two in step by always appending to both.
    """

    def __init__(self, dimension: int):
        """Create an empty ``IndexFlatL2`` for vectors of length ``dimension``."""
        self.dimension = dimension
        self.index = faiss.IndexFlatL2(dimension)
        self.text_chunks = []

    def add(self, vectors: List[List[float]], chunks: List[dict]):
        """Append vectors and their chunks to the store.

        Args:
            vectors: Embedding vectors, one per chunk.
            chunks: Chunk dictionaries, in the same order as ``vectors``.

        Raises:
            ValueError: If ``vectors`` and ``chunks`` differ in length, which
                would misalign index rows and ``text_chunks`` for every
                later search.
        """
        if len(vectors) != len(chunks):
            raise ValueError(
                f"vectors and chunks must have the same length "
                f"(got {len(vectors)} and {len(chunks)})"
            )
        # An empty batch would become a 1-D array that the index cannot
        # accept; there is nothing to add anyway.
        if len(vectors) == 0:
            return

        vectors_np = np.asarray(vectors, dtype="float32")
        self.index.add(vectors_np)
        self.text_chunks.extend(chunks)

    def search(self, query_vector: List[float], top_k: int = 3):
        """Return the chunks nearest to ``query_vector``.

        Args:
            query_vector: Embedding of the query.
            top_k: Maximum number of chunks to return.

        Returns:
            Up to ``top_k`` chunk dictionaries in the order the index returns
            them. Returns an empty list when the store is empty or ``top_k``
            is not positive.
        """
        if top_k <= 0 or not self.text_chunks:
            return []

        query_np = np.asarray([query_vector], dtype="float32")
        _distances, indices = self.index.search(query_np, top_k)

        # FAISS fills missing hits with index -1 when fewer than top_k vectors
        # are stored; as a Python list index that would silently select the
        # last chunk, so keep only indices that address a stored chunk.
        stored = len(self.text_chunks)
        return [self.text_chunks[int(idx)] for idx in indices[0] if 0 <= idx < stored]


In [14]:
from vectorstore.store import VectorStore

# Size the index from the first embedding vector.
dimension = len(vectors[0])
store = VectorStore(dimension)

# Index every vector together with its chunk; later cells query `store`.
store.add(vectors, chunks)

print("Vectors stored:", store.index.ntotal)


Vectors stored: 21


In [15]:
# llm/generator.py

import requests


class LLMGenerator:
    """Minimal client for a text-generation HTTP endpoint on localhost:11434.

    NOTE(review): the URL and payload look like a local Ollama server — confirm.
    """

    def __init__(self, model_name="llama3", timeout=300):
        """Store the request settings.

        Args:
            model_name: Model identifier sent in the request body.
            timeout: Seconds to wait for the server before giving up; passed
                to ``requests.post``. ``None`` waits indefinitely.
        """
        self.model_name = model_name
        self.url = "http://localhost:11434/api/generate"
        self.timeout = timeout

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the endpoint and return the generated text.

        Returns:
            The ``"response"`` field of the JSON reply.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no reply arrives within ``self.timeout``.
        """
        response = requests.post(
            self.url,
            json={
                "model": self.model_name,
                "prompt": prompt,
                "stream": False,
            },
            # Without a timeout, a stalled server would block this call forever.
            timeout=self.timeout,
        )
        # Fail with the HTTP error itself instead of a KeyError on "response"
        # when the server returns an error payload.
        response.raise_for_status()

        return response.json()["response"]


In [24]:
def build_prompt(question, retrieved_chunks):
    """Assemble the question-answering prompt sent to the LLM.

    Each retrieved chunk is rendered as a block listing its source, page and
    chunk ID (falling back to "Unknown" for any that is missing) followed by
    the chunk text. The blocks are separated by a ``---`` rule and embedded,
    together with the question, in a fixed instruction template.
    """

    def _render(entry):
        # One context block per chunk; missing metadata degrades to "Unknown".
        meta = entry["metadata"]
        page = meta.get("page", "Unknown")
        source = meta.get("source", "Unknown")
        chunk_id = entry.get("chunk_id", "Unknown")
        return f"""
Source: {source}
Page: {page}
Chunk ID: {chunk_id}

{entry['text']}
"""

    context = "\n\n---\n\n".join([_render(entry) for entry in retrieved_chunks])

    return f"""
You are a document assistant.

Answer using ONLY the context below.
Cite the page number like (Page X).

Context:
{context}

Question:
{question}

Answer:
"""


In [28]:
from llm.generator import LLMGenerator

# Generator with its default model; later cells reuse `llm`.
llm = LLMGenerator()

question = "The trademarks displayed on the Groww UPI App will become property of whoms?"

# Embed the question and fetch the three nearest chunks from the store.
query_vector = embedder.embed_texts([question])[0]
retrieved = store.search(query_vector, top_k=3)

# Wrap question and retrieved chunks in the QA prompt template.
prompt = build_prompt(question, retrieved)

answer = llm.generate(prompt)

print(answer)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

According to (Page 8), "The trademarks, logos and service marks displayed on the Groww UPI App (“Marks”) are the property of Groww or other respective third parties, as the case may be."

So, the trademarks displayed on the Groww UPI App will become property of either Groww or other respective third parties.


In [29]:
# Print the answer again, followed by the page and source of each retrieved chunk.
print(answer)

print("\nSources:")
# enumerate() counts from 0; i+1 gives 1-based source labels.
for i, chunk in enumerate(retrieved):
    print(f"[Source {i+1}] Page {chunk['metadata']['page']} ({chunk['metadata']['source']})")


According to (Page 8), "The trademarks, logos and service marks displayed on the Groww UPI App (“Marks”) are the property of Groww or other respective third parties, as the case may be."

So, the trademarks displayed on the Groww UPI App will become property of either Groww or other respective third parties.

Sources:
[Source 1] Page 8 (../sample-1.pdf)
[Source 2] Page 10 (../sample-1.pdf)
[Source 3] Page 2 (../sample-1.pdf)


In [30]:
def answer_question(question: str, top_k: int = 3):
    """Answer a question from the indexed document and report its sources.

    Uses the notebook-level ``embedder``, ``store`` and ``llm`` objects and
    ``build_prompt``.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve as context. Defaults to 3.

    Returns:
        A dict with ``answer`` (the LLM output) and ``citations``: one entry
        per retrieved chunk holding its ``source``, ``page``, ``chunk_id``
        and ``highlight_text`` (the first 300 characters of the chunk text).
    """
    query_vector = embedder.embed_texts([question])[0]
    retrieved = store.search(query_vector, top_k=top_k)

    prompt = build_prompt(question, retrieved)
    answer = llm.generate(prompt)

    citations = [
        {
            "source": chunk["metadata"]["source"],
            "page": chunk["metadata"]["page"],
            "chunk_id": chunk["chunk_id"],
            # Short excerpt only, so the response payload stays small.
            "highlight_text": chunk["text"][:300],
        }
        for chunk in retrieved
    ]

    return {
        "answer": answer,
        "citations": citations,
    }


In [32]:
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel

# Application object on which the /query route is registered.
app = FastAPI()

# Serve the contents of the local "uploads" directory under the /files URL prefix.
# NOTE(review): presumably StaticFiles requires the directory to exist when this
# line runs — confirm, or create it beforehand.
app.mount("/files", StaticFiles(directory="uploads"), name="files")


# Request body accepted by the /query endpoint.
class QueryRequest(BaseModel):
    # The user's question; query_doc passes it to answer_question().
    question: str


@app.post("/query")
def query_doc(request: QueryRequest):
    # POST /query: hand the question to the RAG pipeline and return its
    # answer/citations payload unchanged.
    return answer_question(request.question)


In [34]:
# Instruction text for each supported generation mode, keyed by mode name.
# generate_from_mode() validates its `mode` argument against these keys and
# places the selected text at the top of the prompt, ahead of the context.
MODES = {
    "qa": """
You are a document assistant.
Answer the question using ONLY the provided context.
Cite sources.
""",

    "summary": """
You are a professional summarizer.
Generate a concise but comprehensive summary of the document using the context.
Structure it with headings and bullet points.
""",

    "study_guide": """
Create a structured study guide from the document.
Include:
- Key Concepts
- Important Definitions
- Important Rules
- Practical Implications
""",

    "faq": """
Generate 5-10 frequently asked questions and answers
based strictly on the document context.
""",

    "timeline": """
Extract chronological events from the document
and present them in a timeline format.
""",

    "flashcards": """
Create 10 flashcards in Q&A format
based strictly on the document.
Keep them concise and exam-focused.
""",

    "key_arguments": """
Extract the main arguments or core themes
from the document and explain each briefly.
"""
}


In [35]:
def generate_from_mode(mode: str, question: str = None):
    """Generate output for one of the predefined modes.

    Retrieves the five chunks nearest to the question (or to a generic
    "summarize" query when no question is given), prepends the instruction
    text for ``mode`` and sends the prompt to the notebook-level ``llm``.

    Args:
        mode: A key of ``MODES``.
        question: Optional question; when given it drives retrieval and is
            appended to the prompt.

    Returns:
        Dict with the requested ``mode`` and the LLM ``output``.

    Raises:
        ValueError: If ``mode`` is not a key of ``MODES``.
    """
    if mode not in MODES:
        raise ValueError("Invalid mode")

    # Without a question, retrieval falls back to a generic summary query.
    retrieval_query = question or "Summarize the entire document"

    hits = store.search(embedder.embed_texts([retrieval_query])[0], top_k=5)
    context = "\n\n---\n\n".join(hit["text"] for hit in hits)

    prompt = f"""
{MODES[mode]}

Context:
{context}

"""

    # The question section is only present when a question was supplied.
    if question:
        prompt += f"\nQuestion:\n{question}\n"

    return {
        "mode": mode,
        "output": llm.generate(prompt),
    }


In [36]:
# Run the "summary" mode without a question and print the generated text.
result = generate_from_mode("summary")
print(result["output"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**TERMS AND CONDITIONS**

### 1. GENERAL TERMS

* GROWW PAY SERVICES PRIVATE LIMITED (Groww) endeavors to execute and process transactions as per the defined process, but shall not be held responsible for non-responsiveness, delay, or failure of systems due to circumstances beyond its control.
* The Terms and Conditions may change at any time without notice. You are obligated to exercise independent diligence before arriving at any decision.

### 2. USE OF GROWW UPI APP

* Groww grants a non-exclusive, limited privilege to access and use the Groww UPI App subject to compliance with these Terms.
* You agree not to engage in activities that may adversely affect the use of the Groww UPI App or its systems.

### 3. DISCLAIMERS

* The Services, including all content, software, functions, materials, and information, are provided "as is" without representation or warranty of any kind.
* Groww shall not be responsible for service interruptions or breaches of security associated with the transm

In [37]:
# Run the "study_guide" mode without a question and print the generated text.
result = generate_from_mode("study_guide")
print(result["output"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Here is a structured study guide based on the document:

**Key Concepts**

1. Terms of Service: The terms and conditions under which Groww Pay Services Private Limited (Groww) provides its services.
2. Payment Transactions: The transactions carried out through Groww's UPI App, including fund transfer and merchant transactions.
3. User Conduct: The behavior expected from users of the Groww UPI App, including compliance with applicable laws and regulations.
4. Intellectual Property Rights: The rights owned by Groww to its content, software, functions, materials, and information made available on or accessible through the Groww UPI App.

**Important Definitions**

1. "As Is": A disclaimer that Groww makes no representation or warranty for the services provided.
2. "Non-Exclusive, Limited Privilege": The right granted to users to access and use the Groww UPI App subject to compliance with the Terms.
3. "PSP" (Payment Service Provider): A third-party payment service provider that facilitate

In [None]:
# Run the "faq" mode without a question and print the generated text.
result = generate_from_mode("faq")
print(result["output"])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]