In [1]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ✅ Step 1: Extract text from a single PDF file
def parse_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# ✅ Step 2: Chunk the text for RAG-style retrieval
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "]
    )
    return splitter.split_text(text)

# ✅ Step 3: Manually specify PDF paths
pdf_paths = [
    "1706.03762v7.pdf",
    "1810.04805v2.pdf",
    "2005.14165v4.pdf",
    "2010.11929v2.pdf",
    "2302.13971v1.pdf",
    "GenAI_in_Academic_Writing.pdf"
]

# ✅ Step 4: Process PDFs
all_chunks = []

for pdf_file in pdf_paths:
    print(f"📄 Reading: {pdf_file}")
    full_text = parse_pdf(pdf_file)
    chunks = chunk_text(full_text)
    all_chunks.extend(chunks)

print(f"\n✅ Total chunks created from all PDFs: {len(all_chunks)}")

📄 Reading: 1706.03762v7.pdf
📄 Reading: 1810.04805v2.pdf
📄 Reading: 2005.14165v4.pdf
📄 Reading: 2010.11929v2.pdf
📄 Reading: 2302.13971v1.pdf
📄 Reading: GenAI_in_Academic_Writing.pdf

✅ Total chunks created from all PDFs: 1310


In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Step 1: Wrap your chunks as LangChain Documents
documents = [Document(page_content=chunk) for chunk in all_chunks]

# Step 2: Use BAAI/bge-base-en-v1.5 embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cpu"},  # or "cuda" if you have a GPU
    encode_kwargs={"normalize_embeddings": True}  # recommended for bge models
)

# Step 3: Build FAISS index from documents
vector_store = FAISS.from_documents(documents, embedding_model)

# Step 4: Save FAISS index
vector_store.save_local("faiss_store_bge")


  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,
        "fetch_k": 15,
        "lambda_mult": 0.7
    }
)


In [9]:
from langchain_groq import ChatGroq
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

load_dotenv()

# ✅ Instantiate Groq LLaMA3
qlm = ChatGroq(
    model_name="llama3-70b-8192"
)

# ✅ Simplified Prompt (good for most LLMs, including LLaMA3)
prompt = PromptTemplate(
    input_variables=["question"],
    template="""
Rephrase the following question into 3 different, yet related, search queries. List them without any explanation or numbering.

Question: {question}
"""
)



# ✅ Create chain using qlm
multi_query_chain = LLMChain(llm=qlm, prompt=prompt)

# ✅ Function to generate multiple queries
def generate_multi_queries_fn(user_query: str):
    output = multi_query_chain.run(user_query)
    queries = [line.strip() for line in output.strip().split("\n") if line.strip()]
    return queries[:3]

# ✅ Test run
queries = generate_multi_queries_fn("What causes rainbows to form?")
print("Generated Queries:", queries)


Generated Queries: ['how do rainbows occur', 'what creates a rainbow in the sky', 'formation of rainbows explained']


In [10]:
from langchain.schema import Document

def multi_query_retrieve_fn(user_query: str, top_n: int = 5):
    # Step 1: Generate reformulated search queries
    queries = generate_multi_queries_fn(user_query)

    # Step 2: Initialize list of all retrieved documents
    all_docs = []

    # Step 3: Retrieve from original query
    original_docs = retriever.get_relevant_documents(user_query)
    all_docs.extend(original_docs)

    # Step 4: Retrieve for each reformulated query
    for query in queries:
        docs = retriever.get_relevant_documents(query)
        all_docs.extend(docs)

    # Step 5: Deduplicate based on document content
    unique_docs = list({doc.page_content: doc for doc in all_docs}.values())

    # Step 6: Return top-N results
    return unique_docs[:top_n]


In [11]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [12]:
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough,RunnableLambda
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
parallel_chain = RunnableParallel({
    "question": RunnablePassthrough(),
    "context": RunnablePassthrough() | multi_query_retrieve_fn | RunnableLambda(format_docs)
})


In [13]:
parallel_chain.invoke('What Is transformer')

  original_docs = retriever.get_relevant_documents(user_query)


{'question': 'What Is transformer',
 'context': 'language modeling tasks [34].\nTo the best of our knowledge, however, the Transformer is the first transduction model relying\nentirely on self-attention to compute representations of its input and output without using sequence-\naligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate\nself-attention and discuss its advantages over models such as [17, 18] and [9].\n3\nModel Architecture\n\nscribed in Vaswani et al. (2017) and released in\nthe tensor2tensor library.1 Because the use\nof Transformers has become common and our im-\nplementation is almost identical to the original,\nwe will omit an exhaustive background descrip-\ntion of the model architecture and refer readers to\nVaswani et al. (2017) as well as excellent guides\nsuch as “The Annotated Transformer.”2\nIn this work, we denote the number of layers\n(i.e., Transformer blocks) as L, the hidden size as\n\n6\nResults\n6.1\nMachine Trans

In [14]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a knowledgeable AI assistant tasked with answering questions strictly based on the provided context.

Instructions:
- Use only the context to construct your answer.
- If the context is insufficient, say "The provided context does not contain enough information."
- Be concise, factual, and avoid speculation.
- Do not mention the existence of context in your answer.

Context:
{context}

Question:
{question}

Answer:"""
)


In [15]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

# Load environment variables (make sure GROQ_API_KEY is set in your .env file)
load_dotenv()

# Define the LLM
llm = ChatGroq(
    model_name="llama3-70b-8192",  # ✅ Best model from Groq
    temperature=0.2  # Optional: keeps responses factual and deterministic
)

# Optional test prompt (for sanity check)
test_prompt = "Explain the transformer architecture in 200 words."
response = llm.invoke(test_prompt)
print("Response:\n", response.content)


Response:
 The Transformer architecture is a type of neural network introduced in 2017 by Vaswani et al. in the paper "Attention is All You Need". It's primarily designed for sequence-to-sequence tasks, such as machine translation, but has since been widely adopted in other natural language processing (NLP) applications.

The Transformer architecture is based on self-attention mechanisms, which allow the model to weigh the importance of different input elements relative to each other. This is different from traditional recurrent neural networks (RNNs), which process sequences sequentially and have recurrence connections that allow them to capture long-range dependencies.

The Transformer model consists of an encoder and a decoder. The encoder takes in a sequence of tokens (e.g., words or characters) and outputs a continuous representation of the input sequence. The decoder generates the output sequence, one token at a time, based on the encoder's output and self-attention mechanisms.



In [17]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Define the final RAG chain
rag_chain = prompt | llm | StrOutputParser()


In [18]:
final_chain = parallel_chain | rag_chain


In [52]:
response = final_chain.invoke("What is bert?")
print(response)


BERT is a pre-trained deep bidirectional representation model designed to pre-train on unlabeled text by jointly conditioning on both left and right context in all layers.


In [51]:
from langchain_core.runnables import Runnable
from langchain_core.documents import Document

from sentence_transformers import SentenceTransformer, util

# Load models
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")


# ✅ Your LangChain chain
# Assuming final_chain is already defined as your LangChain chain
# from your previous steps: final_chain = LLMChain(prompt=..., llm=..., etc.)

# ✅ Transformers & Generative AI QA Dataset
qa_dataset = [
    ("What is a Transformer in deep learning?",
     "A Transformer is a neural network architecture based on self-attention mechanisms that allows models to weigh the importance of different parts of the input data dynamically."),

    ("How does self-attention work in Transformers?",
     "Self-attention allows a model to focus on different positions of the input sequence to capture contextual relationships by computing attention scores between all token pairs."),

    ("What is the role of positional encoding in Transformers?",
     "Positional encoding provides the model with information about the position of tokens in a sequence, which is essential since Transformers lack recurrence."),

    ("Define encoder and decoder in Transformer architecture.",
     "The encoder processes the input sequence to generate hidden representations, while the decoder uses these representations to produce the output sequence step-by-step."),

    ("What is Generative AI?",
     "Generative AI refers to AI systems that can create new content, such as text, images, or music, often using models like GPT, DALL·E, or diffusion models."),

    ("How is GPT different from BERT?",
     "GPT is a unidirectional, autoregressive language model designed for generation tasks, while BERT is bidirectional and primarily used for understanding tasks like classification."),

    ("What are large language models (LLMs)?",
     "Large language models are neural networks trained on massive corpora to understand and generate human-like language based on statistical patterns."),

    ("What is fine-tuning in Generative AI?",
     "Fine-tuning involves taking a pre-trained generative model and adapting it to a specific domain or task by training it on a smaller, specialized dataset."),

    ("Explain prompt engineering in the context of LLMs.",
     "Prompt engineering is the practice of designing input prompts to guide the output behavior of a language model effectively for specific tasks."),

    ("What are attention heads in Transformers?",
     "Attention heads are components within multi-head attention that allow the model to attend to information from different representation subspaces at different positions.")
]

# ✅ Evaluate your chain
for question, reference_answer in qa_dataset:
    # Run your chain (LangChain Runnable)
    predicted_answer = final_chain.invoke(question)

    # Compute Sentence Similarity
    embeddings = similarity_model.encode([predicted_answer, reference_answer], convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()



    # Print Results
    print(f"🔹 Question: {question}")
    print(f"🧠 Predicted Answer: {predicted_answer}")
    print(f"✅ Reference Answer: {reference_answer}")
    print(f"🔗 Cosine Similarity: {cosine_sim:.4f}")
    
    print("-" * 100)


🔹 Question: What is a Transformer in deep learning?
🧠 Predicted Answer: A Transformer is a transduction model that relies entirely on self-attention to compute representations of its input and output without using sequence-aligned RNNs or convolution.
✅ Reference Answer: A Transformer is a neural network architecture based on self-attention mechanisms that allows models to weigh the importance of different parts of the input data dynamically.
🔗 Cosine Similarity: 0.8251
----------------------------------------------------------------------------------------------------
🔹 Question: How does self-attention work in Transformers?
🧠 Predicted Answer: In the Transformer, self-attention is used in three different ways, including "encoder-decoder attention" layers, where the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder, allowing every position in the decoder to attend over all positions in the input sequence.
✅ Reference Answe