<a href="https://colab.research.google.com/github/varunpenumudi/Chat-with-PDF-Using-RAG-Pipeline/blob/main/PDF_RAG_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Necessary Packages/Dependencies Installation

In [None]:
!pip install langchain_huggingface langchain_chroma langchain-groq
!pip  install langchain_community pypdf pymupdf pytesseract

In [None]:
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev

## Imports

In [None]:
import getpass
import os

# Imports for working with data
from langchain_community.document_loaders import PyPDFLoader        # Extract PDF
from langchain_text_splitters import RecursiveCharacterTextSplitter # Split PDF into chunks
from langchain_core.documents import Document                       # Convert To Langchain Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings  # Embedding the data
from langchain_chroma import Chroma                                 # Vector Store

# Import LLM
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# Retrieval chain to create the pipeline
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Imports for ocr of images
import fitz                            # PyMuPDF
from PIL import Image
from io import BytesIO
from pytesseract import pytesseract

# Extract Data from PDF and Split Text

In [None]:
# Load the PDF text into LangChain documents.

# Path of the PDF to chat with; it is reused later as metadata for the OCR documents.
file_path = "example.pdf"

loader = PyPDFLoader(file_path)
docs = loader.load()

# Sanity check: number of documents loaded (presumably one per PDF page - confirm).
print(len(docs))

19


In [None]:
# Functions to extract image contents from the PDF

def extract_images_from_pdf(pdf_path):
    """Collect every image embedded in a PDF as a PIL image.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        list: One PIL image per entry reported by ``page.get_images(full=True)``,
        ordered by page and then by position in that page's image list.
        Empty when the PDF contains no images.

    Side effects:
        Prints one line for every page on which images were found.
    """
    all_images = []

    # The context manager closes the document even when extraction or decoding
    # raises part-way through, so a bad image no longer leaks the open handle.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)

            # Every image referenced on the current page
            image_list = page.get_images(full=True)
            if not image_list:
                continue

            print(f"[+] Found {len(image_list)} image(s) on page {page_num + 1}")

            for img in image_list:
                xref = img[0]  # Image reference
                base_image = pdf_document.extract_image(xref)

                # Decode the raw image bytes with PIL and keep the result
                all_images.append(Image.open(BytesIO(base_image["image"])))

    return all_images

def images_to_text(imgs):
    """Run Tesseract OCR on each image and return the recognised strings.

    Args:
        imgs: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        list: One OCR result per input image, in the same order.
    """
    return [pytesseract.image_to_string(picture) for picture in imgs]


# Run OCR over every image embedded in the PDF. `texts` has to be built here:
# the helpers above are only defined, never called, so without this step the
# name is undefined on a fresh kernel.
texts = images_to_text(extract_images_from_pdf(file_path))

# Wrap each OCR result as a Document tagged with its source file.
image_text_docs = [Document(page_content=text, metadata={"file_name": file_path}) for text in texts]
# NOTE: this extends `docs` in place, so re-running the cell appends the OCR documents again.
docs.extend(image_text_docs)

In [None]:
# Split the loaded documents into overlapping chunks for retrieval.
# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) - confirm before tuning.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed the chunks with the default HuggingFaceEmbeddings model and store them
# in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, embedding=HuggingFaceEmbeddings())
# Retriever with default search settings; used later to build the RAG chain.
retriever = vectorstore.as_retriever()

## Using the Groq LLM

Get your API key for the Groq LLM from this URL:
<br>
https://console.groq.com/keys

In [None]:
# Prompt for the Groq API key only when it is not already in the environment,
# so the key is never hardcoded in the notebook.
if os.getenv("GROQ_API_KEY") is None:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API Key: ")

# Chat model served by Groq.
llm = ChatGroq(model="llama3-8b-8192")

# System message template. `{context}` is a template variable; it is expected
# to be filled with the retrieved document chunks by the stuff-documents chain.
system_prompt = (
    """
    You are an advanced language model equipped with a Retrieval-Augmented Generation (RAG) pipeline. Your task is to assist in answering user queries related to documents containing semi-structured data such as text, tables, and charts.

    Context: Below is the relevant context retrieved from the document based on the user's query. This context includes text, tables, or other data extracted from the PDF.

    {context}

    Please use the context provided to answer the user's query accurately and thoroughly. Ensure that the response incorporates specific details from the retrieved context, especially when dealing with factual or numerical information.
    """
)

# Two-message chat prompt: the system instructions above, followed by the
# user's question supplied through the `{input}` variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


# Chain that passes the prompt (with documents inserted) to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Create the Chain/Pipeline

In [None]:
# Full pipeline: the retriever fetches chunks for the question, then the
# question-answer chain generates the reply from them.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


from pprint import pprint

def get_resp(question):
    """Ask the RAG chain a question and pretty-print the answer line by line.

    Args:
        question: The user query, sent to the chain under the ``"input"`` key.

    Returns:
        None. The answer is split on newlines and printed with ``pprint``.
    """
    answer = rag_chain.invoke({"input": question})['answer']
    pprint(answer.split("\n"))

In [None]:
# Sample query: ask the chain to compare US GDP figures for two years.
get_resp("compare us gdp in 2011 and 2015?")

In [None]:
# Sample query about unemployment rates; the answer is pretty-printed as a list of lines.
get_resp("What is Unemployement rates?")

['According to the provided context, the unemployment rate in 2013 varies by '
 'educational attainment. Here are the specific unemployment rates mentioned:',
 '',
 '* All workers: 6.1%',
 '',
 'Additionally, the context provides unemployment rates for different '
 'educational attainment levels:',
 '',
 '* Doctoral degree: (no specific rate mentioned)',
 '* Professional degree: (no specific rate mentioned)',
 "* Master's degree: (no specific rate mentioned)",
 "* Bachelor's degree: (no specific rate mentioned)",
 "* Associate's degree: (no specific rate mentioned)",
 '* Some college, no degree: (no specific rate mentioned)',
 '* High school diploma: (no specific rate mentioned)',
 '* Less than a high school diploma: (no specific rate mentioned)',
 '',
 'Please note that the context only provides unemployment rates for all '
 'workers and by educational attainment, but not for specific demographics '
 'such as men and women.']
