In [None]:
# ==============================================================
# 0. INSTALL DEPENDENCIES (latest versions – no pinning)
# ==============================================================
!pip install -q \
    sentence-transformers \
    langchain \
    langchain-community \
    pypdf \
    faiss-cpu \
    transformers \
    accelerate \
    bitsandbytes \
    torch

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.9/323.9 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency co

In [None]:
# ==============================================================
# 1. IMPORTS
# ==============================================================
import os
import urllib.request
import textwrap
from IPython.display import display, Markdown

import torch
from sentence_transformers import SentenceTransformer

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
# ==============================================================
# 2. DOWNLOAD SAMPLE PDF (replace with your own via upload later)
# ==============================================================
# Official IRS download location for Form 1040 (current revision).
# The URL must serve an actual PDF: the file is stored under PDF_PATH and
# handed to a PDF loader later on.
SAMPLE_PDF_URL = "https://www.irs.gov/pub/irs-pdf/f1040.pdf"
PDF_PATH = "sample_1040.pdf"


def _is_pdf(path: str) -> bool:
    """Return True when the file at ``path`` has the ``%PDF-`` signature in its first 1 KiB."""
    with open(path, "rb") as fh:
        return b"%PDF-" in fh.read(1024)


# A cached file that is not a PDF (e.g. an HTML page saved by an earlier run)
# would otherwise be reused forever because of the existence check below.
if os.path.exists(PDF_PATH) and not _is_pdf(PDF_PATH):
    print(f"{PDF_PATH} exists but is not a PDF - downloading it again.")
    os.remove(PDF_PATH)

if not os.path.exists(PDF_PATH):
    print("Downloading sample PDF...")
    urllib.request.urlretrieve(SAMPLE_PDF_URL, PDF_PATH)
    if not _is_pdf(PDF_PATH):
        # Do not leave a bogus file behind: it would be picked up as "cached".
        os.remove(PDF_PATH)
        raise ValueError(f"{SAMPLE_PDF_URL} did not return a PDF document")
    print(f"Downloaded → {PDF_PATH}")

In [None]:
# ==============================================================
# 3. LOAD & CHUNK PDF
# ==============================================================
# Configure the splitter first; the separators are listed from the coarsest
# boundary (blank line) down to the finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=200,
    chunk_size=1000,
)

# Read the PDF stored at PDF_PATH, then cut the loaded documents into chunks.
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()
chunks = splitter.split_documents(documents)

# Plain-text view of the chunks (page_content only).
chunk_texts = [piece.page_content for piece in chunks]

n_chunks, n_pages = len(chunks), len(documents)
print(f"Created {n_chunks} chunks from {n_pages} pages.")

Created 11 chunks from 2 pages.


In [None]:
# ==============================================================
# 4. EMBEDDING MODEL (LangChain wrapper – required for FAISS.from_texts)
# ==============================================================
# Name of the sentence-transformers model used to embed chunks and queries.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

embedder = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# ==============================================================
# 5. BUILD FAISS VECTOR STORE
# ==============================================================
# Index the chunk Documents themselves rather than their bare text so each
# stored entry keeps its metadata. Building the index from plain strings
# drops the metadata, and every retrieved source then falls back to the
# default page when it is displayed.
vectorstore = FAISS.from_documents(
    documents=chunks,
    embedding=embedder
)

# Retrieve the 4 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [None]:
# ==============================================================
# 6. LOAD SMALL LLM (TinyLlama-1.1B-Chat – fully open, no login)
# ==============================================================
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Options used when loading the model weights.
model_load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
    "low_cpu_mem_usage": True,
}

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_load_kwargs)

# Default generation settings applied to every call of the pipeline.
# The EOS token id doubles as the padding token id.
generation_kwargs = {
    "max_new_tokens": 256,
    "temperature": 0.3,
    "do_sample": True,
    "repetition_penalty": 1.1,
    "pad_token_id": tokenizer.eos_token_id,
}

generator = pipeline(
    "text-generation",
    model=llm,
    tokenizer=tokenizer,
    **generation_kwargs,
)

Device set to use cuda:0


In [None]:
# ==============================================================
# 7. RAG PROMPT TEMPLATE (TinyLlama chat style)
# ==============================================================
# TinyLlama-Chat uses the Zephyr-style chat layout: each turn starts with a
# role marker (<|system|>, <|user|>, <|assistant|>) on its own line and is
# terminated by the end-of-sequence token </s>. Markers such as </|system|>
# or </|user|> are not part of that format and reach the model as ordinary
# text, so they must not be used to close a turn.
# The template ends with an open <|assistant|> turn for the model to complete.
RAG_PROMPT = """<|system|>
You are a helpful assistant. Answer using only the provided context.
If the answer is not in the context, say: "I cannot answer based on the provided document."</s>
<|user|>
Context:
{context}

Question: {question}</s>
<|assistant|>
"""

# Template with two placeholders: {context} and {question}.
prompt = PromptTemplate.from_template(RAG_PROMPT)

In [None]:
# ==============================================================
# 8. RETRIEVAL + GENERATION FUNCTION
# ==============================================================
def answer_question(question: str):
    """Answer ``question`` from the indexed document.

    Retrieves the most relevant chunks, fills the RAG prompt with them and
    lets the text-generation pipeline complete the assistant turn.

    Args:
        question: The user's question in natural language.

    Returns:
        A ``(answer, docs)`` tuple: the generated answer text (stripped) and
        the list of retrieved documents that were used as context.
    """
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)
    formatted = prompt.format(context=context, question=question)

    # Ask the pipeline for the continuation only. Locating the answer by
    # splitting the full text on "<|assistant|>" breaks when the marker also
    # occurs in the context or in the generated text.
    completion = generator(formatted, return_full_text=False)[0]["generated_text"]

    # If the model keeps writing the dialogue (a made-up next turn), keep only
    # the first assistant reply.
    for role_marker in ("<|user|>", "<|system|>", "<|assistant|>"):
        completion = completion.split(role_marker, 1)[0]

    return completion.strip(), docs

In [None]:
# ==============================================================
# 9. INTERACTIVE ASK FUNCTION (FIXED)
# ==============================================================
def ask(q: str):
    """Answer ``q`` and show the answer together with its retrieved sources.

    The question and answer are rendered as Markdown; each retrieved chunk is
    then printed with its 1-based page number and a snippet of at most 500
    characters.

    Args:
        q: The question to ask.
    """
    ans, sources = answer_question(q)
    display(Markdown(f"**Q:** {q}\n\n**A:** {ans}"))

    # Report the number of chunks actually returned instead of a fixed count.
    print(f"\n--- Top-{len(sources)} Retrieved Chunks ---")
    for i, src in enumerate(sources, 1):
        raw_page = src.metadata.get("page")
        try:
            # Stored page index is treated as 0-based; show it 1-based.
            page_label = str(int(raw_page) + 1)
        except (TypeError, ValueError):
            # Missing or non-numeric page: say so instead of claiming page 1.
            page_label = "n/a"
        snippet = textwrap.shorten(src.page_content, width=500)
        print(f"\n[{i}] Page {page_label}\n{snippet}")

In [None]:
# Ask a single question against the indexed document.
estimated_tax_question = "Where do I report estimated tax payments I made during the year?"
ask(estimated_tax_question)

**Q:** Where do I report estimated tax payments I made during the year?

**A:** Yes, you can report estimated tax payments you made during the year by completing Form 1040, U.S. Individual Income Tax Return, and attaching Schedule 6, Line 12, which asks for the date and amount of each payment. The form should be filed with your federal income tax return.


--- Top-4 Retrieved Chunks ---

[1] Page 1
W-2, see instructions. 1 a Total amount from Form(s) W-2, box 1 (see instructions) . . . . . . . . . . . . . 1a b Household employee wages not reported on Form(s) W-2 . . . . . . . . . . . . . 1b c Tip income not reported on line 1a (see instructions) . . . . . . . . . . . . . . 1c d Medicaid waiver payments not reported on Form(s) W-2 (see instructions) . . . . . . . . 1d e Taxable dependent care benefits from Form 2441, line 26 . . . . . . . . . . . . 1e f Employer-provided adoption [...]

[2] Page 1
32 Add lines 27, 28, 29, and 31. These are your total other payments and refundable credits . . 32 33 Add lines 25d, 26, and 32. These are your total payments . . . . . . . . . . . . 33 Refund 34 If line 33 is more than line 24, subtract line 24 from line 33. This is the amount you overpaid . . 34 35a Amount of line 34 you want refunded to you. If Form 8888 is attached, check here . . . . 35a Direct deposit? See instructions. b Routing number c

In [None]:
# ==============================================================
# 10. TRY IT!
# ==============================================================
# Run the demo questions one after another, in this order.
for demo_question in ("What is Application Software", "What is Tax Consultancy"):
    ask(demo_question)

**Q:** What is Application Software

**A:** Answer: Application software refers to any software that is used by individuals, organizations, or businesses to perform specific tasks such as accounting, data management, project management, or customer relationship management. Examples of application software include Microsoft Office Suite, SAP Business One, and Salesforce.


--- Top-4 Retrieved Chunks ---

[1] Page 1
W-2, see instructions. 1 a Total amount from Form(s) W-2, box 1 (see instructions) . . . . . . . . . . . . . 1a b Household employee wages not reported on Form(s) W-2 . . . . . . . . . . . . . 1b c Tip income not reported on line 1a (see instructions) . . . . . . . . . . . . . . 1c d Medicaid waiver payments not reported on Form(s) W-2 (see instructions) . . . . . . . . 1d e Taxable dependent care benefits from Form 2441, line 26 . . . . . . . . . . . . 1e f Employer-provided adoption [...]

[2] Page 1
Form1040 2024U.S. Individual Income Tax Return Department of the Treasury—Internal Revenue Service OMB No. 1545-0074 IRS Use Only—Do not write or staple in this space. For the year Jan. 1–Dec. 31, 2024, or other tax year beginning , 2024, ending , 20 See separate instructions. Your first name and middle initial Last name Your social security number If joint return, spouse’s first name and middle initial Last name Spouse’s social security number

**Q:** What is Tax Consultancy

**A:** "Do you offer tax consultancy?"


--- Top-4 Retrieved Chunks ---

[1] Page 1
z Add lines 1a through 1h . . . . . . . . . . . . . . . . . . . . . . 1z Attach Sch. B if required. 2a Tax-exempt interest . . . 2a b Taxable interest . . . . . 2b 3a Qualified dividends . . . 3a b Ordinary dividends . . . . . 3b 4a IRA distributions . . . . 4a b Taxable amount . . . . . . 4b 5a Pensions and annuities . . 5a b Taxable amount . . . . . . 5b 6a Social security benefits . . 6a b Taxable amount . . . . . . 6b c If you elect to use the lump-sum election method, check here (see [...]

[2] Page 1
W-2, see instructions. 1 a Total amount from Form(s) W-2, box 1 (see instructions) . . . . . . . . . . . . . 1a b Household employee wages not reported on Form(s) W-2 . . . . . . . . . . . . . 1b c Tip income not reported on line 1a (see instructions) . . . . . . . . . . . . . . 1c d Medicaid waiver payments not reported on Form(s) W-2 (see instructions) . . . . . . . . 1d e Taxable dependent care benefits from Form 2441, line 26 . . . . . 