In [None]:
# 📦 Step 1: Install all required packages
!pip install -q unsloth
!pip install -q faiss-cpu
!pip install -q sentence-transformers
!pip install -q langchain
!pip install -q accelerate
!pip install -q transformers datasets


In [None]:
!pip install -q git+https://github.com/unslothai/unsloth.git


In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# 🧠 Choose Unsloth 4-bit quantized model
model_name = "unsloth/llama-3-8b-bnb-4bit"

# 🔄 Load model in 4-bit for low memory usage
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    load_in_4bit = True,
    device_map = "auto"  # GPU use karega agar available ho
)

# ✅ Make model fast for inference
FastLanguageModel.for_inference(model)


In [None]:
!pip install -q bitsandbytes

In [None]:
!pip install -q unsloth_zoo

In [None]:
# Poppler (needed for PDF rendering)
!apt-get install -y poppler-utils

# Required Python packages
!pip install pdf2image
!pip install unstructured
!pip install "unstructured[local-inference]"


In [None]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("Attention.pdf")
documents = loader.load()
print(f"Total documents loaded: {len(documents)}")


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking config
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

# Split your PDF documents into chunks
chunks = text_splitter.split_documents(documents)
print(f"Total chunks created: {len(chunks)}")


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Use a compact, fast model (for Colab compatibility)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [None]:
!pip install faiss-cpu

from langchain.vectorstores import FAISS

# Index the chunks
vectorstore = FAISS.from_documents(chunks, embedding_model)

# Save it locally (optional)
vectorstore.save_local("rag_faiss_index")


In [None]:
from langchain.vectorstores import FAISS

# ✅ Allow loading trusted pickle file
vectorstore = FAISS.load_local(
    "rag_faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)


In [20]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableMap, RunnableLambda

# ✅ Prompt
prompt = ChatPromptTemplate.from_template("""
Use the following context to answer the question **clearly and concisely**.

Context:
{context}

Question:
{question}

Answer:
""")

def generate_response(input_text):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optional cleanup: remove duplicate lines if model repeats
    lines = decoded.splitlines()
    unique_lines = list(dict.fromkeys(lines))  # Removes duplicates while preserving order
    return "\n".join(unique_lines)

# New RAG chain with better prompt
rag_chain = (
    RunnableMap({
        "context": lambda x: retriever.invoke(x["question"]),
        "question": lambda x: x["question"],
    })
    | RunnableLambda(lambda x: prompt.format(**x))  # use improved prompt
    | RunnableLambda(generate_response)
    | StrOutputParser()
)

query = "What is self-attention mechanism in transformers?"
response = rag_chain.invoke({"question": query})
print("🤖 Answer:", response)


# ✅ Ask a question
query = "What is self-attention mechanism in transformers?"
response = rag_chain.invoke({"question": query})
print("🤖 Answer:", response)


  return forward_call(*args, **kwargs)


🤖 Answer: Human: 
Use the following context to answer the question **clearly and concisely**.

Context:
[Document(id='286ea722-a4c4-4f75-a0a7-a472772932c9', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '','moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5','subject': '', 'title': '', 'trapped': '/False','source': 'Attention.pdf', 'total_pages': 15, 'page': 2, 'page_label': '3'}, page_content='Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two\nsub-layers. The fir

  return forward_call(*args, **kwargs)


🤖 Answer: Human: 
Use the following context to answer the question **clearly and concisely**.

Context:
[Document(id='286ea722-a4c4-4f75-a0a7-a472772932c9', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '','moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5','subject': '', 'title': '', 'trapped': '/False','source': 'Attention.pdf', 'total_pages': 15, 'page': 2, 'page_label': '3'}, page_content='Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two\nsub-layers. The fir