<a href="https://colab.research.google.com/github/varunpenumudi/Chat-with-PDF-Using-RAG-Pipeline/blob/main/PDF_RAG_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Necessary Packages/Dependencies Installation

In [None]:
!pip install langchain_huggingface langchain_chroma langchain-groq
!pip  install langchain_community pypdf pymupdf pytesseract

In [None]:
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev

## Imports

In [4]:
import getpass
import os

# Imports for working with data
from langchain_community.document_loaders import PyPDFLoader        # Extract PDF
from langchain_text_splitters import RecursiveCharacterTextSplitter # Split PDF into chunks
from langchain_core.documents import Document                       # Convert To Langchain Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings  # Embedding the data
from langchain_chroma import Chroma                                 # Vector Store

# Import LLM
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# Retrieval chain to create the pipeline
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Imports for ocr of images
import fitz                            # PyMuPDF
from PIL import Image
from io import BytesIO
from pytesseract import pytesseract

# Extract Data from PDF and Split Text

In [21]:
# Load the PDF
# `file_path` is reused by the image-extraction cell further down, and `docs`
# is the document list that the later cells extend, split and index.

file_path = "example.pdf"

# PyPDFLoader reads the file and returns a list of LangChain documents
# (presumably one per page - the recorded run printed 19; TODO confirm).
loader = PyPDFLoader(file_path)
docs = loader.load()

# Quick sanity check: how many documents were loaded
print(len(docs))

19


In [9]:
# Functions to extract the embedded images from the PDF and OCR their text

def extract_images_from_pdf(pdf_path):
    """Collect every embedded image of a PDF as PIL image objects.

    Each page is queried with ``page.get_images(full=True)``; every listed
    image is pulled out of the document through its xref and opened with PIL.
    One progress line is printed for each page that lists at least one image.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        list: The opened images, in page order and then in listing order
        within a page. No de-duplication is done, so an image listed on
        several pages appears once per listing.

    Raises:
        Any exception raised by ``fitz`` or PIL while reading the file. The
        document is closed before the exception propagates.
    """
    pdf_document = fitz.open(pdf_path)
    all_images = []

    # try/finally guarantees the PDF handle is released even when a page or
    # an image turns out to be unreadable half-way through the scan.
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)

            # Entries describing the images referenced by this page
            image_list = page.get_images(full=True)
            if not image_list:
                continue

            print(f"[+] Found {len(image_list)} image(s) on page {page_num + 1}")

            for img in image_list:
                xref = img[0]  # first field of the entry is the image reference
                base_image = pdf_document.extract_image(xref)

                # Wrap the raw bytes in an in-memory stream that PIL can open
                image = Image.open(BytesIO(base_image["image"]))
                all_images.append(image)
    finally:
        pdf_document.close()

    return all_images

def images_to_text(imgs):
    """OCR each image and return the recognised strings.

    Args:
        imgs: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        list: One OCR result per input image, in the same order.
    """
    return [pytesseract.image_to_string(picture) for picture in imgs]

In [11]:
# OCR every embedded image and wrap each recognised text in a LangChain Document
imgs = extract_images_from_pdf(file_path)
image_texts = images_to_text(imgs)
image_text_docs = [
    Document(page_content=text, metadata={"file_name": file_path})
    for text in image_texts
]

# Rebuild `docs` instead of extending it in place, so re-running this cell does
# not append the OCR documents a second time. OCR documents from an earlier run
# are recognised by the "file_name" metadata key set above.
# NOTE(review): assumes the documents produced by PyPDFLoader never carry a
# "file_name" metadata key - confirm against the loader's metadata.
docs = [doc for doc in docs if "file_name" not in doc.metadata] + image_text_docs

[+] Found 1 image(s) on page 2
[+] Found 1 image(s) on page 8
[+] Found 13 image(s) on page 9
[+] Found 1 image(s) on page 16
[+] Found 1 image(s) on page 17
[+] Found 1 image(s) on page 18
[+] Found 3 image(s) on page 19


In [12]:
# Split into Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Store splits in vector DB
# using HuggingFace Embeddings
vectorstore = Chroma.from_documents(documents=splits, embedding=HuggingFaceEmbeddings())
retriever = vectorstore.as_retriever()

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Groq LLM

Get your Groq API key from this URL:
<br>
https://console.groq.com/keys

In [15]:
# Prompt for the Groq key only when the environment does not already provide one,
# so the key never has to be written into the notebook itself.
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API Key: ")

# Chat model served through Groq
llm = ChatGroq(model="llama3-8b-8192")

# System message template; the {context} placeholder marks where the retrieved
# document text is inserted.
system_prompt = """
    You are an advanced language model equipped with a Retrieval-Augmented Generation (RAG) pipeline. Your task is to assist in answering user queries related to documents containing semi-structured data such as text, tables, and charts.

    Context: Below is the relevant context retrieved from the document based on the user's query. This context includes text, tables, or other data extracted from the PDF.

    {context}

    Please use the context provided to answer the user's query accurately and thoroughly. Ensure that the response incorporates specific details from the retrieved context, especially when dealing with factual or numerical information.
    """

# Two-message chat prompt: the system instructions plus the user's question ({input})
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Chain that feeds the prompt (with documents placed in {context}) to the LLM
question_answer_chain = create_stuff_documents_chain(llm, prompt)

Enter API Key: ··········


# Create the Chain/Pipeline

In [16]:
# Wire the vector-store retriever and the question-answering chain together
# into the single chain that get_resp() invokes below.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


from pprint import pprint

def get_resp(question):
    """Send `question` through the RAG chain and pretty-print the answer.

    The chain is invoked with ``{"input": question}``; the text under the
    ``'answer'`` key of its result is cut at every newline and the resulting
    list of lines is handed to ``pprint``. Nothing is returned.

    Args:
        question: The user's question as a string.
    """
    answer_text = rag_chain.invoke({"input": question})['answer']
    pprint(answer_text.split("\n"))

In [17]:
# Example query 1: a question about the GDP figures in the PDF
get_resp("Tell me about the us GDP?")

["Based on the retrieved context, here's what I can tell you about the US GDP:",
 '',
 'According to the provided chart, the 2015 U.S. GDP was $31,397,023 million '
 'dollars. The chart also breaks down the GDP by industry, showing the '
 'percentage of the total GDP that each industry contributed.',
 '',
 'Here are the percentages for each industry:',
 '',
 '* Manufacturing: 19%',
 '* Finance, insurance, real estate, rental, and leasing: 18%',
 '* Arts, entertainment, recreation, accommodation, and food services: 4%',
 '* Other: 59%',
 '',
 'In terms of the total GDP by industry, here are the numbers for each year '
 'from 2010 to 2015:',
 '',
 '* All Industries: $26,093,515 (2010), $27,535,971 (2011), $28,663,246 '
 '(2012), $29,601,191 (2013), $30,894,407 (2014), and $31,397,023 (2015)',
 '* Manufacturing: $4,992,521 (2010), $5,581,942 (2011), $5,841,608 (2012), '
 '$5,953,299 (2013), $6,047,477 (2014), and $5,829,554 (2015)',
 '* Finance, insurance, real estate, rental, and leasing

In [18]:
# Example query 2: a question about the unemployment-rate data in the PDF
# NOTE(review): the query text contains a spelling/grammar slip; it is left
# untouched because it is the runtime input that produced the recorded output.
get_resp("What is Unemployement rates?")

['According to the provided context, the unemployment rate is the percentage '
 'of people who are unemployed and actively seeking employment. The context '
 'provides the unemployment rates for different levels of educational '
 'attainment in 2013. For example, the unemployment rate for all workers with '
 'a doctoral degree was 4.4%, while for those with a high school diploma it '
 'was 12.1%.',
 '',
 'Here is a specific breakdown of the unemployment rates by educational '
 'attainment in 2013, as mentioned in the context:',
 '',
 '* All workers: 6.1%',
 '* Doctoral degree: no specific rate mentioned',
 '* Professional degree: no specific rate mentioned',
 '* Master’s degree: no specific rate mentioned',
 '* Bachelor’s degree: no specific rate mentioned',
 '* Associate’s degree: no specific rate mentioned',
 '* Some college, no degree: no specific rate mentioned',
 '* High school diploma: no specific rate mentioned',
 '* Less than a high school diploma: no specific rate mentioned',


In [19]:
# Example query 3: a question about choosing between chart types
get_resp("When to use a Line Graph, Pie Chart, or Bar Graph?")

["Based on the provided context, here's a breakdown of when to use each type "
 'of graph:',
 '',
 '**Pie Chart**: Use a pie chart to compare parts of a whole. In the example '
 'given, a pie chart was used to compare components of US GDP.',
 '',
 '**Line Graph**: Use a line graph when you want to show how a variable '
 'changes over time. In this case, a line graph was used to show how GDP '
 'changed over time.',
 '',
 '**Bar Graph**: Use a bar graph to compare different groups of variables. In '
 'the example, a bar graph was used to compare different components of US GDP. '
 'Additionally, a bar graph was used as an alternative to a pie chart, '
 'depending on the purpose of the visualization.',
 '',
 'In summary, the choice of graph depends on the specific purpose of the '
 'visualization:',
 '',
 '* Pie chart: Compare parts of a whole',
 '* Line graph: Show how a variable changes over time',
 '* Bar graph: Compare different groups of variables']
