<a href="https://colab.research.google.com/github/upkarsssharma/notebooks/blob/main/NLP_13%2614Sep'25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

sentences = [
    "man bites dog",
    "dog bites man",
    "the dog is friendly",
    "the man is kind"
]

# CountVectorizer builds the vocabulary and the BoW matrix for us.
# It also prints feature names (columns) which are easy to explain in class.
cv = CountVectorizer()  # default: token pattern for words
X = cv.fit_transform(sentences)  # sparse matrix [n_samples, |V|]
V = cv.get_feature_names_out()

# Show as a neat table for teaching
bow_df = pd.DataFrame(X.toarray(), columns=V, index=[f"s{i}" for i in range(1, len(sentences)+1)])
bow_df

#"man bites dog",
# "dog bites man",


Unnamed: 0,bites,dog,friendly,is,kind,man,the
s1,1,1,0,0,0,1,0
s2,1,1,0,0,0,1,0
s3,0,1,1,1,0,0,1
s4,0,0,0,1,1,1,1


In [None]:
import re, math
from collections import defaultdict, Counter
import pandas as pd

def tok(s): return re.findall(r"\w+|[^\w\s]", s.lower())
corpus = ["the cat sat", "the cat ate", "the dog sat"]
V = sorted(set(t for s in corpus for t in tok(s)) | {"<bos>", "<eos>"})

bigram = defaultdict(Counter); unigram = Counter()
for s in corpus:
    T = ["<bos>"] + tok(s) + ["<eos>"]
    unigram.update(T)
    for a,b in zip(T, T[1:]): bigram[a][b] += 1

def P(next_tok, prev_tok, alpha=1.0):
    num = bigram[prev_tok][next_tok] + alpha
    den = sum(bigram[prev_tok].values()) + alpha*len(V)
    return num/den

# Show P(next | "the")
pd.Series({n: round(P(n, "ate"), 3) for n in V}).sort_values(ascending=False).head(8)

# Perplexity

Unnamed: 0,0
<eos>,0.25
<bos>,0.125
ate,0.125
cat,0.125
dog,0.125
sat,0.125
the,0.125


In [None]:
def greedy(prefix="the", max_steps=5, a=1.0):
    seq = ["<bos>"] + tok(prefix)
    for _ in range(max_steps):
        prev = seq[-1]
        nxt = max(V, key=lambda n: P(n, prev, a))
        seq.append(nxt)
        if nxt=="<eos>": break
    return " ".join(t for t in seq if t not in {"<bos>","<eos>"})

print(greedy("ate"))

ate


# RNN


In [None]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
text = 'hello world' * 10
text

'hello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello worldhello world'

In [None]:
chars = sorted(list(set(text)))
chars

[' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']

In [None]:
char_to_ix = {ch: i for i, ch in enumerate(chars)}
char_to_ix

{' ': 0, 'd': 1, 'e': 2, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}

In [None]:
ix_to_char = {i: ch  for i,ch in enumerate(chars)}
ix_to_char

{0: ' ', 1: 'd', 2: 'e', 3: 'h', 4: 'l', 5: 'o', 6: 'r', 7: 'w'}

In [None]:
vocab_size = len(chars)
vocab_size

8

In [None]:
def one_hot(ix, vocab_size): #Converts everything into 1s and 0s
    if isinstance(ix, torch.Tensor):
        ix = ix.long()
    return torch.eye(vocab_size)[ix]

In [None]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__() # Calls the basic blueprint

        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        return out, hidden

In [None]:
model = SimpleRNN(vocab_size, 50, vocab_size)
criterion = nn.CrossEntropyLoss() # Error in predictions -> (predicted - actual)^2
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
model

SimpleRNN(
  (rnn): RNN(8, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=8, bias=True)
)

In [None]:
#Training set -> Input and Output
X = torch.tensor([[char_to_ix[c] for c in text[:-1]]], dtype=torch.long)
y = torch.tensor([[char_to_ix[c] for c in text[1:]]], dtype=torch.long)

In [None]:
# Training
for epoch in range(100):
    # Zero gradients (PyTorch accumulates; reset each step)
    optimizer.zero_grad()
    # Convert indices to one-hot vectors, float for math ops
    onehot_x = one_hot(X, vocab_size).float()  # Shape: (1, 99, vocab_size)
    # Forward: Get predictions
    output, _ = model(onehot_x)
    # Compute loss: Flatten output to (99, vocab), targets to (99,)
    loss = criterion(output.reshape(-1, vocab_size), y.reshape(-1))
    # Backward: Compute gradients
    loss.backward()
    # Update weights
    optimizer.step()
    # Print progress – loss should drop as it learns!
    print(f"Epoch {epoch+1}, Loss: {loss.item():.2f}")

Epoch 1, Loss: 2.05
Epoch 2, Loss: 1.93
Epoch 3, Loss: 1.82
Epoch 4, Loss: 1.71
Epoch 5, Loss: 1.55
Epoch 6, Loss: 1.31
Epoch 7, Loss: 1.14
Epoch 8, Loss: 1.25
Epoch 9, Loss: 0.84
Epoch 10, Loss: 0.90
Epoch 11, Loss: 0.70
Epoch 12, Loss: 0.62
Epoch 13, Loss: 0.55
Epoch 14, Loss: 0.48
Epoch 15, Loss: 0.40
Epoch 16, Loss: 0.33
Epoch 17, Loss: 0.28
Epoch 18, Loss: 0.23
Epoch 19, Loss: 0.18
Epoch 20, Loss: 0.15
Epoch 21, Loss: 0.12
Epoch 22, Loss: 0.09
Epoch 23, Loss: 0.08
Epoch 24, Loss: 0.06
Epoch 25, Loss: 0.05
Epoch 26, Loss: 0.05
Epoch 27, Loss: 0.04
Epoch 28, Loss: 0.04
Epoch 29, Loss: 0.03
Epoch 30, Loss: 0.03
Epoch 31, Loss: 0.02
Epoch 32, Loss: 0.02
Epoch 33, Loss: 0.02
Epoch 34, Loss: 0.02
Epoch 35, Loss: 0.02
Epoch 36, Loss: 0.01
Epoch 37, Loss: 0.01
Epoch 38, Loss: 0.01
Epoch 39, Loss: 0.01
Epoch 40, Loss: 0.01
Epoch 41, Loss: 0.01
Epoch 42, Loss: 0.01
Epoch 43, Loss: 0.01
Epoch 44, Loss: 0.01
Epoch 45, Loss: 0.01
Epoch 46, Loss: 0.01
Epoch 47, Loss: 0.01
Epoch 48, Loss: 0.01
E

In [None]:
def generate(model, start_char, length=20):
    # Set to eval mode: Disables dropout, etc. (though none here)
    model.eval()
    # Initial hidden state: None (starts from zero)
    hidden = None
    # Start with the given char
    gen = [start_char]
    # Get index of start char
    ix = char_to_ix[start_char]  # Integer index
    # Prepare single-step input tensor
    x = torch.tensor([[ix]], dtype=torch.long)  # Shape: (1, 1)
    # One-hot it for the model
    onehot_x = one_hot(x, vocab_size).float()  # Shape: (1, 1, vocab_size)

    # Loop to generate 'length' new chars
    for _ in range(length):
        # Forward: Predict next, update hidden (memory carries over!)
        out, hidden = model(onehot_x, hidden)
        # Get probs for last position (current step)
        probs = torch.softmax(out[0, -1], dim=0)  # Softmax: Turn logits to probabilities
        # Sample next index (multinomial: weighted random choice – adds variety!)
        next_ix = torch.multinomial(probs, 1).item()  # Integer output
        # Append the char
        gen.append(ix_to_char[next_ix])
        # Update for next step
        ix = next_ix
        x = torch.tensor([[ix]], dtype=torch.long)  # Long tensor
        onehot_x = one_hot(x, vocab_size).float()  # Float for model
    # Join into string
    return ''.join(gen)

# Run generation with a fun tweak
# Try starting with 'h' – should echo "hello world" patterns
print(generate(model, 'h', 20))

hello worldhello worl


# Transformers

In [None]:
import torch
import torch.nn as nn
from torch.nn.functional import softmax
import numpy as np

# Toy vocab and data (simple sentences)
sentences = [ "My parents loved me.", "I knew they did.", "They failed spectacularly to embrace that I would inevitably grow older in their quest to remain young and relevant.", "They would have to acknowledge at some point that the chubby toddler fat would give way to sprawling awkward limbs.", "The dawning muscle of youth would become the sinews of the solid flesh of adolescence.", "We were but the briefest amount of time away from when adolescence would become the teen years.", "I don't think they thought much about how my growing up would ever impact their business.", "I did believe that if they could have put me into some amber and frozen me in time they would have.", "They would have done so without a doubt." ] * 5
all_words = set(w for s in sentences for w in s.split())
all_words.add('<pad>')  # Dedicated padding token
words = sorted(all_words)
word_to_ix = {w: i for i, w in enumerate(words)}
ix_to_word = {i: w for i, w in enumerate(words)}
vocab_size = len(words)
pad_ix = word_to_ix['<pad>']  # e.g., 9 – avoids conflicting with real words

# Positional encoding (unchanged – adds "position stamps" to embeddings)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Simple Transformer (unchanged – uses built-in attention magic)
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=4, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src) * np.sqrt(64)  # Scale embeddings (standard trick)
        src = self.pos_encoder(src)
        tgt = self.embedding(tgt) * np.sqrt(64)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt)
        return self.fc(output)

# Prep data with proper padding
def prep_data(sent, max_len):
    indices = [word_to_ix[w] for w in sent.split()]  # No default 0 – all real words
    if len(indices) < max_len:
        indices += [pad_ix] * (max_len - len(indices))  # Pad with <pad>
    return torch.tensor(indices, dtype=torch.long)

# Find max length (both sentences are 6 words)
max_len = max(len(s.split()) for s in sentences)

# Prepare X (source: full seq minus last) and y (target: full seq shifted right)
X = torch.stack([prep_data(s, max_len)[:-1] for s in sentences])  # Shape: (10, 5)
y = torch.stack([prep_data(s, max_len)[1:] for s in sentences])   # Shape: (10, 5)

# Quick train (5 epochs – watch loss drop as it learns patterns)
model = SimpleTransformer(vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_ix)  # Ignore <pad> in loss

for epoch in range(50):
    optimizer.zero_grad()
    tgt_input = y[:, :-1]  # Feed shifted targets to decoder (length 4)
    output = model(X, tgt_input)  # Predicts next tokens (shape: (10, 4, vocab))
    target = y[:, 1:]  # Align targets to predictions (length 4)
    loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.2f}")

# Generate interesting output (builds sequence step-by-step)
def generate(model, start_words, seq_len=10):  # Renamed for clarity (pad to seq_len)
    model.eval()
    words = start_words.split()
    # Pad source to seq_len with <pad>
    src_indices = [word_to_ix.get(w, pad_ix) for w in words] + [pad_ix] * (seq_len - len(words))
    src = torch.tensor([src_indices], dtype=torch.long)  # Shape: (1, seq_len)
    # Start decoder with first word
    tgt = torch.tensor([[word_to_ix.get(words[0], pad_ix)]], dtype=torch.long)

    for _ in range(seq_len):
        with torch.no_grad():
            out = model(src, tgt)
            probs = softmax(out[0, -1], dim=0)  # Probs for next word
            next_ix = torch.multinomial(probs, 1).item()
            if next_ix == pad_ix: break  # Stop if predicts pad
            next_word = ix_to_word[next_ix]
            words.append(next_word)
            tgt = torch.cat([tgt, torch.tensor([[next_ix]], dtype=torch.long)], dim=1)
    return ' '.join(words[:seq_len + 1])  # Limit output length

# Run fun generations – tweak start_words for laughs!
print(generate(model, "The boy", seq_len=20))  # E.g., "the cat sat on the mat" remix
# print(generate(model, "the dog", seq_len=6))  # E.g., "the dog ran in the park" hybrid

Epoch 1, Loss: 4.71
Epoch 2, Loss: 4.16
Epoch 3, Loss: 3.88
Epoch 4, Loss: 3.55
Epoch 5, Loss: 3.28
Epoch 6, Loss: 3.03
Epoch 7, Loss: 2.80
Epoch 8, Loss: 2.59
Epoch 9, Loss: 2.42
Epoch 10, Loss: 2.26
Epoch 11, Loss: 2.10
Epoch 12, Loss: 1.95
Epoch 13, Loss: 1.83
Epoch 14, Loss: 1.71
Epoch 15, Loss: 1.61
Epoch 16, Loss: 1.50
Epoch 17, Loss: 1.40
Epoch 18, Loss: 1.33
Epoch 19, Loss: 1.26
Epoch 20, Loss: 1.19
Epoch 21, Loss: 1.12
Epoch 22, Loss: 1.08
Epoch 23, Loss: 1.01
Epoch 24, Loss: 0.96
Epoch 25, Loss: 0.93
Epoch 26, Loss: 0.87
Epoch 27, Loss: 0.83
Epoch 28, Loss: 0.78
Epoch 29, Loss: 0.76
Epoch 30, Loss: 0.71
Epoch 31, Loss: 0.69
Epoch 32, Loss: 0.66
Epoch 33, Loss: 0.64
Epoch 34, Loss: 0.61
Epoch 35, Loss: 0.59
Epoch 36, Loss: 0.57
Epoch 37, Loss: 0.54
Epoch 38, Loss: 0.53
Epoch 39, Loss: 0.50
Epoch 40, Loss: 0.48
Epoch 41, Loss: 0.47
Epoch 42, Loss: 0.45
Epoch 43, Loss: 0.44
Epoch 44, Loss: 0.43
Epoch 45, Loss: 0.40
Epoch 46, Loss: 0.39
Epoch 47, Loss: 0.39
Epoch 48, Loss: 0.37
E

In [None]:
print(generate(model, "The", seq_len=20))

The into point they me. in don't amber and the way me. me. sinews would grow way me. me. but me.


In [None]:
!pip install langchain langchain-huggingface faiss-cpu sentence-transformers

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, langchain-huggingface
Successfully installed faiss-cpu-1.12.0 langchain-huggingface-0.3.1


In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting requests<3,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.29-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Upload notes: In Colab, Files > Upload > jee_newton_notes.txt
from langchain.document_loaders import TextLoader  # For .txt; use PyPDFLoader for PDF
loader = TextLoader("jee_newton_notes.txt")
documents = loader.load()
print("Loaded Notes Preview:", documents[0].page_content[:200])

Loaded Notes Preview: JEE Physics Notes: Newton's Laws of Motion

Section 1: Newton's Second Law
The acceleration of an object is directly proportional to the net force acting on it and inversely proportional to its mass. 


In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)  # Small chunks for precision
texts = text_splitter.split_documents(documents)
print("Chunks Created:", len(texts))
for i, chunk in enumerate(texts):
    print(f"Chunk {i+1}: {chunk.page_content[:100]}...")  # Shows splits, e.g., one chunk = "Newton's Second Law... Example: A 5 kg..."

NameError: name 'documents' is not defined

In [23]:
# Upload notes: In Colab, Files > Upload > jee_newton_notes.txt
from langchain.document_loaders import TextLoader  # For .txt; use PyPDFLoader for PDF
loader = TextLoader("jee_newton_notes.txt")
documents = loader.load()
print("Loaded Notes Preview:", documents[0].page_content[:200])

RuntimeError: Error loading jee_newton_notes.txt

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Free, fast
vectorstore = FAISS.from_documents(texts, embeddings)
# Test Retrieval (no LLM yet)
query = "Newton's second law example"
retrieved_docs = vectorstore.similarity_search(query, k=1)  # Top 1 for exact
print("Retrieved Exact Snippet:", retrieved_docs[0].page_content)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Free, fast
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Retrieved Exact Snippet: JEE Physics Notes: Newton's Laws of Motion

Section 1: Newton's Second Law
The acceleration of an object is directly proportional to the net force acting on it and inversely proportional to its mass. Mathematically, F = ma, where F is net force, m is mass, a is acceleration.


In [None]:
query = "give example problem of newtons second law"
retrieved_docs = vectorstore.similarity_search(query, k=1)  # Top 1 for exact
print("Retrieved Exact Snippet:", retrieved_docs[0].page_content)

Retrieved Exact Snippet: JEE Physics Notes: Newton's Laws of Motion

Section 1: Newton's Second Law
The acceleration of an object is directly proportional to the net force acting on it and inversely proportional to its mass. Mathematically, F = ma, where F is net force, m is mass, a is acceleration.


In [None]:
# Install for local model (run once)
!pip install transformers accelerate

from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Load local model (downloads ~250MB first time)
model_id = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=120,  # Shorter for focus
    temperature=0.1,  # Low for certainty
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id  # Fixes padding warning
)
llm = HuggingFacePipeline(pipeline=pipe)

# Custom prompt (integrated here to avoid default)
prompt_template = """You are a JEE Tutor. Use ONLY the context from notes to answer—no other methods. Stick to exact solution.
Context: {context}
Question: {question}
Answer step-by-step, citing notes:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
query = "Solve: 5 kg block pushed with 10 N on frictionless surface. Acceleration?"
result = qa_chain.invoke({"query": query})
print("Basic RAG Answer:", result['result'])
print("Source Used:", result['source_documents'][0].page_content)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


Basic RAG Answer: You are a JEE Tutor. Use ONLY the context from notes to answer—no other methods. Stick to exact solution.
Context: Example Problem: A 5 kg block is pushed with a constant 10 N force on a frictionless horizontal surface. Find the acceleration.
Solution: Use F = ma directly as per syllabus. Rearrange: a = F / m = 10 N / 5 kg = 2 m/s².
Question: Solve: 5 kg block pushed with 10 N on frictionless surface. Acceleration?
Answer step-by-step, citing notes:
Solution: Use F = ma directly as per syllabus. Rearrange: a = F / m = 10 N / 5 kg = 2 m/s².
Question: Solve: 5 kg block pushed with 10 N on frictionless surface. Acceleration?
Answer step-by-step, citing notes:
Solution: Use F = ma directly as per syllabus. Rearrange: a = F / m = 10 N / 5 kg = 2 m/s².
Question: Solve: 5 kg block pushed with 10 N on frictionless surface. Acceleration
Source Used: Example Problem: A 5 kg block is pushed with a constant 10 N force on a frictionless horizontal surface. Find the acceleration.
S

In [None]:
# Enhanced prompt for structure/quiz
prompt_template = """You are a JEE Tutor. Use ONLY the context from notes to answer—no other methods. Stick to exact solution.
Context: {context}
Question: {question}
Answer step-by-step, citing notes. End with a practice tip and quick quiz question:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
query = "Solve: 5 kg block pushed with 10 N on frictionless surface. Acceleration?"
result = qa_chain.invoke({"query": query})
print("Enhanced RAG Answer:", result['result'])
print("Source Used:", result['source_documents'][0].page_content)

Enhanced RAG Answer: You are a JEE Tutor. Use ONLY the context from notes to answer—no other methods. Stick to exact solution.
Context: Example Problem: A 5 kg block is pushed with a constant 10 N force on a frictionless horizontal surface. Find the acceleration.
Solution: Use F = ma directly as per syllabus. Rearrange: a = F / m = 10 N / 5 kg = 2 m/s².
Question: Solve: 5 kg block pushed with 10 N on frictionless surface. Acceleration?
Answer step-by-step, citing notes. End with a practice tip and quick quiz question:
Question: Solve: 5 kg block pushed with 10 N on frictionless surface.
Answer step-by-step, citing notes. End with a practice tip and quick quiz question:
Question: Solve: 5 kg block pushed with 10 N on frictionless surface.
Answer step-by-step, citing notes. End with a practice tip and quick quiz question:
Question: Solve: 5 kg block pushed with 10 N on frictionless surface.
Answer step-by-step, citing notes. End with a practice tip and quick quiz question:
Question:
Sour