<a href="https://colab.research.google.com/github/sumitsingh3072/rag-intern-task/blob/main/rag_intern_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install \
    requests==2.32.4 \
    opentelemetry-api==1.37.0 \
    opentelemetry-sdk==1.37.0 \
    opentelemetry-proto==1.37.0 \
    opentelemetry-exporter-otlp-proto-common==1.37.0 \
    langchain_community \
    langchain-huggingface \
    langchain-groq \
    chromadb \
    pypdf \
    rank_bm25 \
    llama-cpp-python



In [26]:
import requests
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_split_pdf(
    source: str,
    is_url: bool = False,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    timeout: float = 30.0,
) -> list:
    """Load a PDF from a URL or a local path and split it into text chunks.

    Args:
        source: URL of the PDF when ``is_url`` is True, otherwise a local
            file path.
        is_url: Whether ``source`` should be downloaded over HTTP first.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.
        timeout: Seconds to wait for the HTTP download before giving up
            (only used when ``is_url`` is True).

    Returns:
        The list of document chunks produced by the splitter, or an empty
        list when the file does not exist or loading fails. Loading errors
        are reported with ``print`` rather than raised.
    """
    # Path of the downloaded temporary file, if one was created. Tracked
    # separately so the ``finally`` clause can always clean it up.
    temp_path = None
    try:
        if is_url:
            print(f"Downloading PDF from {source}...")
            # Without a timeout a stalled server would block the cell forever.
            response = requests.get(source, timeout=timeout)
            response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_f:
                # Record the name before writing so a failed write is still cleaned up.
                temp_path = temp_f.name
                temp_f.write(response.content)
            pdf_path = temp_path
            print(f"PDF saved temporarily to {pdf_path}")
        else:
            pdf_path = source
            if not os.path.exists(pdf_path):
                print(f"Error: File not found at {pdf_path}")
                return []

        docs = PyPDFLoader(pdf_path).load()
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with [].
        print(f"An error occurred during PDF loading: {e}")
        return []
    finally:
        # Single cleanup point for the downloaded file (success or failure).
        if temp_path is not None and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
                print(f"Temporary file {temp_path} removed.")
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_path}: {cleanup_error}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(docs)
    print(f"PDF loaded and split into {len(splits)} chunks.")
    return splits

In [27]:
import torch
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model used for the vector store.
model_name = "BAAI/bge-small-en-v1.5"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so this
# cell also runs on CPU-only runtimes instead of requiring a manual edit.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [28]:
from langchain_groq import ChatGroq
from google.colab import userdata

# The Groq credential is read from Colab's user secrets instead of being
# written into the notebook.
print("Connecting to Groq API...")
groq_api_key = userdata.get('GROQ_API_KEY')

# Chat model shared by the rest of the notebook.
llm = ChatGroq(model_name="llama-3.1-8b-instant", api_key=groq_api_key)

print("Groq LLM is ready!")


Connecting to Groq API...
Groq LLM is ready!


In [29]:
from langchain_community.vectorstores import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever

def setup_retriever(splits, k: int = 3, weights=(0.5, 0.5)):
    """Build a hybrid retriever (BM25 keyword + dense vector) over PDF chunks.

    Args:
        splits: Document chunks to index (as returned by the PDF splitter).
        k: Number of results each underlying retriever returns.
        weights: Ensemble weights as ``(bm25_weight, vector_weight)``.

    Returns:
        An ``EnsembleRetriever`` combining both retrievers, or ``None`` when
        ``splits`` is empty.
    """
    import uuid

    if not splits:
        print("No document splits to process. Cannot setup retriever.")
        return None

    # 1. Dense (embedding) retriever. A unique collection name is used per call
    #    so re-running the notebook with another PDF does not reuse a
    #    collection that still holds chunks from an earlier document.
    vectorstore = Chroma.from_documents(
        splits,
        embeddings,
        collection_name=f"pdf_{uuid.uuid4().hex}",
    )
    vector_retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    # 2. BM25 (keyword) retriever. Built from the Document objects rather than
    #    bare strings so each hit keeps its metadata.
    bm25_retriever = BM25Retriever.from_documents(splits)
    bm25_retriever.k = k

    return EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever],
        weights=list(weights),
    )

In [30]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Conversation memory shared with the chain: prior turns are stored under the
# "chat_history" key and, with return_messages=True, returned as messages.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

## Application Logic and workflow
The cell below contains the main application. When you run it, you will be prompted to enter either "url" or "file". Enter "file" to upload a PDF from your machine, or "url" to provide a link to a PDF. Once the PDF has been loaded, the chatbot starts and you can ask questions about the document; type "exit" or "quit" to end the session.


In [31]:
from google.colab import files
import os

print("Welcome to the PDF Chatbot!")
print("Please provide a PDF source.")

# Normalise the answer once so inputs such as " URL " or "File" are accepted.
source_type = input("Is your source a 'url' or a 'file' (for upload)? ").strip().lower()
pdf_path_in_colab = None
splits = []

# --- Step 1: obtain the PDF and split it into chunks -------------------------
try:
    if source_type == 'url':
        pdf_source = input("Enter the PDF URL: ").strip()
        splits = load_and_split_pdf(pdf_source, is_url=True)

    elif source_type == 'file':
        print("Please upload your PDF file...")
        uploaded = files.upload()

        if not uploaded:
            print("No file was uploaded. Exiting.")
        else:
            # Only the first uploaded file is used.
            filename = next(iter(uploaded))
            content = uploaded[filename]

            # Record the path before writing so the `finally` clause below
            # removes the file even if writing or loading fails.
            pdf_path_in_colab = filename
            with open(filename, 'wb') as f:
                f.write(content)
            print(f"File '{filename}' uploaded successfully to Colab.")

            splits = load_and_split_pdf(pdf_path_in_colab, is_url=False)

    else:
        print("Invalid input. Please enter 'url' or 'file'.")

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Single cleanup point for the uploaded copy (success or failure).
    if pdf_path_in_colab and os.path.exists(pdf_path_in_colab):
        os.remove(pdf_path_in_colab)
        print(f"Removed temporary file: {pdf_path_in_colab}")

# --- Step 2: build the chain and run the chat loop ---------------------------
if splits:
    print("Setting up retriever...")
    retriever = setup_retriever(splits)

    # `memory` is a notebook-level object; clear it so history from a previous
    # run of this cell (possibly about a different PDF) is not carried over.
    memory.clear()

    print("Creating conversational chain...")
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        verbose=False
    )

    print("\nPDF processed. Chatbot is ready! Type 'exit' to quit.")

    while True:
        try:
            query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupting the prompt ends the chat without a traceback.
            print()
            break

        if not query:
            # Nothing to ask; do not send an empty question to the LLM.
            continue
        if query.lower() in ("exit", "quit"):
            break

        try:
            result = qa_chain.invoke({"question": query})
        except Exception as e:
            # A single failed request should not end the whole chat session.
            print(f"Bot: Sorry, something went wrong while answering: {e}")
            continue
        print(f"Bot: {result['answer']}")
else:
    print("Could not process the PDF. Please try again.")

Welcome to the PDF Chatbot!
Please provide a PDF source.
Is your source a 'url' or a 'file' (for upload)? url
Enter the PDF URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
Downloading PDF from https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf...
PDF saved temporarily to /tmp/tmpgb0jfaf_.pdf
Temporary file /tmp/tmpgb0jfaf_.pdf removed.
PDF loaded and split into 43 chunks.
Setting up retriever...
Creating conversational chain...

PDF processed. Chatbot is ready! Type 'exit' to quit.
You: what is this document about
Bot: This document appears to be a research paper or a technical document about a deep learning model for machine translation, specifically for translating English to German and English to French. The document describes the architecture of the model, the training regime, and the experimental setup. 

The model uses a combination of self-attention and point-wise feed-f