In [1]:
!pip install PyPDF2 langchain sentence-transformers faiss-cpu

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-tran

In [None]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from transformers import pipeline # Import pipeline from transformers

# Load environment variables
load_dotenv()

# Extract the text of every page of every uploaded PDF
def extract_pdf_text(pdf_docs):
    """Return the text of all pages of all given PDFs as one string.

    Args:
        pdf_docs: Iterable of objects that ``PdfReader`` accepts; ``main``
            passes the value returned by the Streamlit file uploader.

    Returns:
        The page texts concatenated in document order, then page order, with
        no separator inserted between pages or documents. An empty iterable
        yields ``""``.
    """
    # The generator is consumed lazily, so each PDF is opened and read in turn.
    return "".join(
        page.extract_text()
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )

# Split raw text into overlapping chunks for embedding
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. The default
            keeps the previously hard-coded value of 10000.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. The
            default keeps the previously hard-coded value of 500.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    # NOTE(review): 10000-character chunks are presumably much longer than what
    # the embedding model and the generator can take in at once; consider
    # passing a smaller chunk_size -- confirm against the model limits.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# Embed the chunks and persist them as a FAISS index on disk
def create_and_save_vector_store(text_chunks):
    """Build a FAISS store from ``text_chunks`` and save it to ``faiss_index``.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        None. The index is written with ``save_local("faiss_index")``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    FAISS.from_texts(text_chunks, embedding=embedder).save_local("faiss_index")

# Build the question-answering prompt template
def create_prompt_template():
    """Return the PromptTemplate used to ask a question about retrieved context.

    The template has two placeholders, ``context`` and ``question``. Its text
    begins with a newline and every line keeps the four-space indentation of
    the literal below, so the formatted prompt contains that whitespace too.
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template="""
    Answer the question as detailed as possible from the provided context. If the answer contains any structured data like tables or lists, respond in the same format.
    If the answer is not in the provided context, just say, "The answer is not available in the context." Do not provide a wrong answer.

    Context:
    {context}

    Question:
    {question}
    """,
    )

# Answer a user question from the saved FAISS index using GPT2-large
def handle_user_query(user_question):
    """Retrieve context for ``user_question``, generate a reply and display it.

    Steps: load the ``faiss_index`` store, run a similarity search for the
    question, fill the prompt template with the retrieved text, generate with
    the ``openai-community/gpt2-large`` text-generation pipeline, and write the
    reply to the Streamlit page.

    Args:
        user_question: The question typed by the user.

    Returns:
        None. The reply (or a warning when no index exists yet) is shown via
        Streamlit.
    """
    # The index is only written by "Process PDF"; a question asked before that
    # would otherwise fail inside FAISS.load_local.
    if not os.path.exists("faiss_index"):
        st.warning("No processed PDF found. Upload a PDF and click 'Process PDF' first.")
        return

    # Must be the same embedding model that was used to build the index.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data -- only acceptable for an index this app wrote itself.
    vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = vector_store.similarity_search(user_question)

    # Combine the retrieved documents into one context string.
    context = "\n\n".join(doc.page_content for doc in docs)
    formatted_prompt = create_prompt_template().format(context=context, question=user_question)

    # NOTE(review): the pipeline is rebuilt on every question, which reloads the
    # model each time; the prompt may also exceed the model's context window
    # when the retrieved chunks are long -- confirm and consider truncating.
    generator = pipeline('text-generation', model='openai-community/gpt2-large')

    # max_new_tokens bounds only the generated part; the previous max_length=500
    # also counted the prompt, so a long context left no room for an answer.
    # return_full_text=False makes the pipeline return only the continuation,
    # replacing the fragile startswith()-based prompt stripping.
    response = generator(
        formatted_prompt,
        max_new_tokens=500,
        num_return_sequences=1,
        return_full_text=False,
    )

    reply_text = response[0]['generated_text'].strip() if response else "No response generated."

    st.write("Reply: ", reply_text)

# Main function to run the Streamlit app
def main():
    """Render the "Chat with PDF" page.

    The main area holds a question box; a non-empty question is passed to
    ``handle_user_query``. The sidebar holds the PDF uploader and a
    "Process PDF" button that extracts text, splits it into chunks and saves
    the vector store.
    """
    st.set_page_config("Chat PDF")
    st.header("Chat with PDF")

    user_question = st.text_input("Ask a relevant Question")

    if user_question:
        handle_user_query(user_question)

    with st.sidebar:
        st.title("Upload PDF 📂")
        pdf_docs = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
        if st.button("Process PDF"):
            # Nothing uploaded: there is nothing to read, and an empty chunk
            # list cannot be turned into a vector store.
            if not pdf_docs:
                st.warning("Please upload at least one PDF before processing.")
                return
            with st.spinner("Processing..."):
                raw_text = extract_pdf_text(pdf_docs)
                text_chunks = split_text_into_chunks(raw_text)
                # PDFs without extractable text produce no chunks; stop with a
                # clear message instead of failing while building the index.
                if not text_chunks:
                    st.error("No extractable text was found in the uploaded PDFs.")
                    return
                create_and_save_vector_store(text_chunks)
                st.success("Processing Done")

if __name__ == "__main__":
    main()

In [3]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [5]:
!huggingface-cli login --token XXXXXXXXXXXXXXX

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `read` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `read`


In [7]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from transformers import pipeline

# Load environment variables (if you still need them for other parts, otherwise can be removed)
load_dotenv()

# Extract text from PDFs given as file paths (no Streamlit uploader involved)
def extract_pdf_text(pdf_file_paths):
    """Return the concatenated page text of every readable PDF in the list.

    Args:
        pdf_file_paths: Iterable of filesystem paths to PDF files.

    Returns:
        The page texts joined with no separator. A file that is missing or
        that fails while being read is reported with ``print`` and skipped;
        text already collected before the failure is kept.
    """
    collected = ""
    for path in pdf_file_paths:
        try:
            # PdfReader needs a binary file handle.
            with open(path, 'rb') as handle:
                for page in PdfReader(handle).pages:
                    collected += page.extract_text()
        except Exception as exc:
            # Best effort: report the problem and move on to the next file.
            if isinstance(exc, FileNotFoundError):
                print(f"Error: PDF file not found at path: {path}")
            else:
                print(f"Error processing PDF file at {path}: {exc}")
    return collected

# Split raw text into overlapping chunks for embedding
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. The default
            keeps the previously hard-coded value of 10000.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. The
            default keeps the previously hard-coded value of 500.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    # NOTE(review): 10000-character chunks are presumably much longer than what
    # the embedding model and the generator can take in at once; consider
    # passing a smaller chunk_size -- confirm against the model limits.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# Embed the chunks and persist them as a FAISS index on disk
def create_and_save_vector_store(text_chunks):
    """Build a FAISS store from ``text_chunks`` and save it to ``faiss_index``.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        None. The index is written with ``save_local("faiss_index")`` and a
        confirmation line is printed afterwards.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    FAISS.from_texts(text_chunks, embedding=embedder).save_local("faiss_index")
    print("Vector store created and saved locally.")

# Build the question-answering prompt template
def create_prompt_template():
    """Return the PromptTemplate used to ask a question about retrieved context.

    The template has two placeholders, ``context`` and ``question``. Its text
    begins with a newline and every line keeps the four-space indentation of
    the literal below, so the formatted prompt contains that whitespace too.
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template="""
    Answer the question as detailed as possible from the provided context. If the answer contains any structured data like tables or lists, respond in the same format.
    If the answer is not in the provided context, just say, "The answer is not available in the context." Do not provide a wrong answer.

    Context:
    {context}

    Question:
    {question}
    """,
    )

# Answer a user question from the saved FAISS index using Llama-3.2-1B-Instruct
def handle_user_query(user_question, max_new_tokens=5):
    """Retrieve context for ``user_question``, generate a reply and print it.

    Steps: load the ``faiss_index`` store, run a similarity search for the
    question, fill the prompt template with the retrieved text, and generate
    with the ``meta-llama/Llama-3.2-1B-Instruct`` text-generation pipeline.

    Args:
        user_question: The question to answer.
        max_new_tokens: Upper bound on generated tokens. The default keeps the
            previously hard-coded value of 5, which allows only a few words;
            pass a larger value for a full answer.

    Returns:
        None. The question and reply are printed; if the vector store cannot
        be loaded an error is printed and the function returns early.
    """
    # This cell's import block does not import torch, so import it here to keep
    # the function usable on a fresh kernel.
    import torch

    # Must be the same embedding model that was used to build the index.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    try:
        # NOTE(review): allow_dangerous_deserialization=True opts in to loading
        # pickled data -- only acceptable for an index this notebook wrote itself.
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        print(f"Error loading vector store: {e}. Please make sure 'faiss_index' exists and was created correctly.")
        return

    docs = new_db.similarity_search(user_question)

    # Combine the retrieved documents into one context string.
    context = "\n\n".join(doc.page_content for doc in docs)

    formatted_prompt = create_prompt_template().format(context=context, question=user_question)

    # Half precision is only requested when a GPU is present; on CPU float16 is
    # presumably slow or poorly supported, so fall back to float32 there.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    generator = pipeline(
        'text-generation',
        model='meta-llama/Llama-3.2-1B-Instruct',
        torch_dtype=dtype,
        device_map="auto",
    )

    # return_full_text=False makes the pipeline return only the continuation,
    # replacing the fragile startswith()-based prompt stripping.
    response = generator(
        formatted_prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        return_full_text=False,
    )

    reply_text = response[0]['generated_text'].strip() if response else "No response generated."

    print("Question:", user_question)
    print("Reply:", reply_text)

# Script entry point: build the index if needed, then answer one question
def main():
    """Run the pipeline once without any UI.

    If no ``faiss_index`` path exists, the listed PDFs are read, split and
    indexed first; when no text could be extracted the function stops without
    querying. Otherwise the existing index is reused. Finally the hard-coded
    question is passed to ``handle_user_query``.
    """
    user_question = "What is the main topic of these documents?" # Replace with your question
    pdf_file_paths = [
        "/content/sample.pdf", # Replace with your PDF file paths
    ]

    if os.path.exists("faiss_index"):
        print("Vector store already exists. Loading existing store.")
    else:
        print("Processing PDF and creating vector store...")
        raw_text = extract_pdf_text(pdf_file_paths)
        if not raw_text:
            # Nothing to index, so there is nothing to query either.
            print("No text extracted from PDFs. Vector store creation skipped.")
            return
        create_and_save_vector_store(split_text_into_chunks(raw_text))

    handle_user_query(user_question)


if __name__ == "__main__":
    main()

Vector store already exists. Loading existing store.


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
# Generate a response with the text-generation pipeline, allowing up to 500 new tokens.
# NOTE(review): in the cells shown, `formatted_prompt` is only assigned inside
# handle_user_query and `generator` is first assigned at notebook scope in the
# following cell, so this cell presumably relies on leftover kernel state --
# confirm it survives Restart & Run All.
response = generator(formatted_prompt, max_new_tokens=500, num_return_sequences=1) # max_new_tokens bounds only the generated part

In [None]:
# torch is referenced below but no earlier cell imports it; import it here so
# the cell does not depend on leftover kernel state.
import torch

# Build the Llama-3.2-1B-Instruct text-generation pipeline in half precision,
# letting device_map="auto" choose where the model is placed.
generator = pipeline(
           'text-generation',
           model='meta-llama/Llama-3.2-1B-Instruct',
           torch_dtype=torch.float16,
           device_map="auto"
       )

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from transformers import pipeline
import torch
# Load environment variables (if you still need them for other parts, otherwise can be removed)
load_dotenv()

# Extract text from PDFs given as file paths (no Streamlit uploader involved)
def extract_pdf_text(pdf_file_paths):
    """Return the concatenated page text of every readable PDF in the list.

    Args:
        pdf_file_paths: Iterable of filesystem paths to PDF files.

    Returns:
        The page texts joined with no separator. A file that is missing or
        that fails while being read is reported with ``print`` and skipped;
        text already collected before the failure is kept.
    """
    collected = ""
    for path in pdf_file_paths:
        try:
            # PdfReader needs a binary file handle.
            with open(path, 'rb') as handle:
                for page in PdfReader(handle).pages:
                    collected += page.extract_text()
        except Exception as exc:
            # Best effort: report the problem and move on to the next file.
            if isinstance(exc, FileNotFoundError):
                print(f"Error: PDF file not found at path: {path}")
            else:
                print(f"Error processing PDF file at {path}: {exc}")
    return collected

# Split raw text into overlapping chunks for embedding
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. The default
            keeps the previously hard-coded value of 10000.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. The
            default keeps the previously hard-coded value of 500.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    # NOTE(review): 10000-character chunks are presumably much longer than what
    # the embedding model and the generator can take in at once; consider
    # passing a smaller chunk_size -- confirm against the model limits.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# Embed the chunks and persist them as a FAISS index on disk
def create_and_save_vector_store(text_chunks):
    """Build a FAISS store from ``text_chunks`` and save it to ``faiss_index``.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        None. The index is written with ``save_local("faiss_index")`` and a
        confirmation line is printed afterwards.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    FAISS.from_texts(text_chunks, embedding=embedder).save_local("faiss_index")
    print("Vector store created and saved locally.")

# Build the question-answering prompt template
def create_prompt_template():
    """Return the PromptTemplate used to ask a question about retrieved context.

    The template has two placeholders, ``context`` and ``question``. Its text
    begins with a newline and every line keeps the four-space indentation of
    the literal below, so the formatted prompt contains that whitespace too.
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template="""
    Answer the question as detailed as possible from the provided context. If the answer contains any structured data like tables or lists, respond in the same format.
    If the answer is not in the provided context, just say, "The answer is not available in the context." Do not provide a wrong answer.

    Context:
    {context}

    Question:
    {question}
    """,
    )

# Answer a user question from the saved FAISS index using Llama-3.2-1B-Instruct
def handle_user_query(user_question, max_new_tokens=5):
    """Retrieve context for ``user_question``, generate a reply and print it.

    Steps: load the ``faiss_index`` store, run a similarity search for the
    question, fill the prompt template with the retrieved text, and generate
    with the ``meta-llama/Llama-3.2-1B-Instruct`` text-generation pipeline.

    Args:
        user_question: The question to answer.
        max_new_tokens: Upper bound on generated tokens. The default keeps the
            previously hard-coded value of 5, which allows only a few words;
            pass a larger value for a full answer.

    Returns:
        None. The question and reply are printed; if the vector store cannot
        be loaded an error is printed and the function returns early.
    """
    # Must be the same embedding model that was used to build the index.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    try:
        # NOTE(review): allow_dangerous_deserialization=True opts in to loading
        # pickled data -- only acceptable for an index this notebook wrote itself.
        new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        print(f"Error loading vector store: {e}. Please make sure 'faiss_index' exists and was created correctly.")
        return

    docs = new_db.similarity_search(user_question)

    # Combine the retrieved documents into one context string.
    context = "\n\n".join(doc.page_content for doc in docs)

    formatted_prompt = create_prompt_template().format(context=context, question=user_question)

    # Half precision is only requested when a GPU is present; on CPU float16 is
    # presumably slow or poorly supported, so fall back to float32 there.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    generator = pipeline(
        'text-generation',
        model='meta-llama/Llama-3.2-1B-Instruct',
        torch_dtype=dtype,
        device_map="auto",
    )

    # return_full_text=False makes the pipeline return only the continuation,
    # replacing the fragile startswith()-based prompt stripping.
    response = generator(
        formatted_prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        return_full_text=False,
    )

    reply_text = response[0]['generated_text'].strip() if response else "No response generated."

    print("Question:", user_question)
    print("Reply:", reply_text)

# Script entry point: build the index if needed, then answer one question
def main():
    """Run the pipeline once without any UI.

    If no ``faiss_index`` path exists, the listed PDFs are read, split and
    indexed first; when no text could be extracted the function stops without
    querying. Otherwise the existing index is reused. Finally the hard-coded
    question is passed to ``handle_user_query``.
    """
    user_question = "What is the main topic of these documents?" # Replace with your question
    pdf_file_paths = [
        "/content/sample.pdf", # Replace with your PDF file paths
    ]

    if os.path.exists("faiss_index"):
        print("Vector store already exists. Loading existing store.")
    else:
        print("Processing PDF and creating vector store...")
        raw_text = extract_pdf_text(pdf_file_paths)
        if not raw_text:
            # Nothing to index, so there is nothing to query either.
            print("No text extracted from PDFs. Vector store creation skipped.")
            return
        create_and_save_vector_store(split_text_into_chunks(raw_text))

    handle_user_query(user_question)


if __name__ == "__main__":
    main()

Vector store already exists. Loading existing store.


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Same HuggingFace embeddings
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


https://huggingface.co/spaces/Raijin-ASR/RAG-chat-pdf/blob/main/app.py