# Ollama PDF RAG Notebook

## Import Libraries


In [20]:
# Imports
import pdfplumber
from PIL import Image
import io
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
import warnings
import os

# Suppress warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Set environment variable for protobuf
# NOTE(review): this runs after all of the imports above; presumably it has to
# be set before protobuf is first imported to have any effect — confirm.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"


## Load PDF

In [21]:
# Path of the PDF to load with PyPDFLoader (relative path, read by load_pdf)
local_path = "Tables_Charts_Graphs.pdf"

def load_pdf(pdf_path=None):
    """Load a PDF with PyPDFLoader and return the loaded documents.

    Args:
        pdf_path: Path of the PDF to load. When omitted (``None``), the
            module-level ``local_path`` is used, so existing ``load_pdf()``
            calls keep working.

    Returns:
        The result of ``PyPDFLoader(path).load()``, or ``None`` when the
        resolved path is empty (a hint is printed in that case).
    """
    # Resolve the path once so that the message and the loader always agree,
    # and so an explicit argument is never overridden by the global.
    path = local_path if pdf_path is None else pdf_path
    if not path:
        print("Upload a PDF file")
        return None

    loader = PyPDFLoader(path)
    data = loader.load()
    print(f"PDF loaded successfully: {path}")
    return data

# Load the PDF once; `data` is the input to the text-splitting step
data = load_pdf()


PDF loaded successfully: Tables_Charts_Graphs.pdf


In [39]:
import pdfplumber
from PIL import Image
import io

# Extract Tables and Images with pdfplumber
def _open_embedded_image(raw_bytes):
    """Validate raw image bytes and return them as an opened PIL image."""
    # verify() leaves the image object unusable afterwards, so validate on a
    # throwaway handle and open a fresh one to hand back.
    Image.open(io.BytesIO(raw_bytes)).verify()
    return Image.open(io.BytesIO(raw_bytes))


def extract_tables_and_images(pdf_path):
    """Collect the tables and embedded images of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(tables, images)`` pair. ``tables`` is a list of
        ``(page_number, table)`` tuples and ``images`` is a list of
        ``(page_number, PIL image)`` tuples; page numbers are 1-based.
        Images whose bytes cannot be decoded are reported and skipped.
    """
    tables = []
    images = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            print(f"Processing page {page_num}")

            # extract_table() only yields the largest table on a page;
            # extract_tables() returns all of them, so none are dropped.
            for table in page.extract_tables():
                if table:
                    tables.append((page_num, table))  # Store page number with table

            # Extract images (graphs, charts)
            for img in page.images:
                try:
                    img_stream = img.get('stream')
                    if not img_stream:
                        print(f"Skipping non-image content on page {page_num}")
                        continue
                    img_bytes = img_stream.get_data()  # Raw (possibly non-decodable) data
                except Exception as e:
                    print(f"Error processing image on page {page_num}: {e}")
                    continue

                if not img_bytes:
                    continue

                try:
                    images.append((page_num, _open_embedded_image(img_bytes)))
                except Exception as e:
                    # Best effort: streams PIL cannot identify are reported, not fatal.
                    print(f"Error extracting image on page {page_num}: {e}")

    return tables, images

# Extract tables and images from the same file that load_pdf() read.
# `local_path` is reused instead of retyping the filename so the text, table
# and image pipelines cannot drift apart (a differently-cased copy of the name
# fails on case-sensitive filesystems).
tables, images = extract_tables_and_images(local_path)
print(f"Extracted {len(tables)} tables and {len(images)} images")


Processing page 1
Processing page 2
Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Error extracting image on page 8: cannot identify image file <_io.BytesIO object at 0x000001EE9032F510>
Processing page 9
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error extracting image on page 9: cannot identify image file <_io.BytesIO object at 0x000001EE90792BB0>
Error 

## Split text into chunks

In [40]:
# Split text into chunks
def split_text(data):
    """Break the loaded documents into overlapping chunks.

    A RecursiveCharacterTextSplitter configured with chunk_size=1000 and
    chunk_overlap=200 is applied to ``data``; the number of resulting chunks
    is printed and the chunks are returned.
    """
    splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
    pieces = RecursiveCharacterTextSplitter(**splitter_settings).split_documents(data)
    print(f"Text split into {len(pieces)} chunks")
    return pieces

# Chunk the documents returned by load_pdf(); `chunks` feeds the vector store
chunks = split_text(data)


Text split into 18 chunks


## Create vector database

In [41]:
# Create vector database
def create_vector_db(chunks):
    """Embed the document chunks and store them in a Chroma collection.

    The chunks are embedded with the "nomic-embed-text" Ollama model and
    written to the collection named "local-rag". Prints a confirmation and
    returns the resulting vector store.
    """
    embedder = OllamaEmbeddings(model="nomic-embed-text")
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        collection_name="local-rag",
    )
    print("Vector database created successfully")
    return store

# Build the vector store from the text chunks; the retriever is created from it
vector_db = create_vector_db(chunks)


Vector database created successfully


## Set up LLM and Retrieval

In [42]:
# Chat model shared by the multi-query retriever (to rephrase questions) and
# by the RAG chain (to write the final answer)
local_model = "llama3.2"  # Or whichever model you prefer
llm = ChatOllama(model=local_model)


In [43]:

# Prompt that asks the LLM for two rephrasings of the user's question
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Multi-query retriever: wraps the vector store's retriever and uses the LLM
# with QUERY_PROMPT to generate the alternative queries
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)


## Create chain

In [44]:
# RAG prompt template for answering questions.
# {context} is filled with the retrieved documents and {question} with the
# user's question when the chain runs.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


In [45]:

# Create the chain: the incoming question is handed to the retriever to fill
# {context} and is also forwarded as {question}; the filled prompt goes to the
# LLM and StrOutputParser turns the model reply into a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


# NOTE(review): `prompt` was already built from the same `template` before the
# chain was created; rebinding it here does not change the existing `chain`.
prompt = ChatPromptTemplate.from_template(template)

## Chat with PDF

In [46]:
# Chat with PDF - Enhance with Tables and Images
def chat_with_pdf(question):
    """
    Answer a question about the PDF, routing on keywords in the question.

    - Contains "table": print every extracted table, row by row, under a
      heading with its page number.
    - Contains "graph" or "chart": open every extracted image with PIL's
      show() method.
    - Otherwise: send the question through the RAG chain and print the reply.

    Reads the module-level ``tables`` and ``images`` (lists of
    ``(page_number, item)`` tuples) and ``chain``. Page numbers mentioned in
    the question are not interpreted. Nothing is returned; results are
    printed or displayed.
    """
    lowered = question.lower()

    # If the question asks for tables
    if "table" in lowered:
        if tables:
            print("Table data from the document:")
            # Each entry is a (page_number, rows) pair; unpack it so the rows
            # are iterated rather than the pair itself.
            for page_num, table in tables:
                print(f"Page {page_num}:")
                for row in table:
                    print(row)  # Print table row-by-row
        else:
            print("No tables found in the document.")

    # If the question asks for graphs (images/charts)
    elif "graph" in lowered or "chart" in lowered:
        if images:
            # Entries are (page_number, image) pairs; only the image has show().
            for _page_num, img in images:
                img.show()  # Show the image using PIL's show() method
        else:
            print("No images (graphs/charts) found in the document.")

    else:
        # For regular text-based queries, use the RAG chain
        response = chain.invoke(question)
        print(f"Response: {response}")

In [31]:

# Example question: it has no "table"/"graph"/"chart" keyword, so
# chat_with_pdf answers it through the RAG chain
chat_with_pdf("What is the main idea of this document?")


Response: The main idea of this document appears to be an introduction to visual representations of data, specifically tables, charts, and graphs. The document provides examples from various fields such as history, economics, education, psychology, urban affairs, and everyday life, with a focus on explaining when to use different types of graphs (line graph, pie chart, bar graph) to visualize data.


In [32]:
# No "table"/"graph"/"chart" keyword here, so this goes to the RAG chain and
# is answered only from the text chunks held in the vector store
chat_with_pdf("Extract the unemployment details for master's degree holders from page 2 of the document.")


Response: There is no information about unemployment details for master's degree holders in the provided context. The text only discusses various types of graphs and charts, provides examples from history, economics, education, psychology, urban affairs, and everyday life, but does not mention unemployment or specific data related to master's degree holders.


In [33]:
# Also routed to the RAG chain: the question contains none of the
# "table"/"graph"/"chart" keywords that chat_with_pdf checks for
chat_with_pdf("Can you provide the unemployment information based on degree type from page 2?")


Response: No, I couldn't find any unemployment information based on degree type in the provided context. The text only discusses examples and tables related to GDP, Education, Psychology, Urban Affairs, and Everyday Life, but does not mention unemployment rates or any specific data regarding degree types.


In [34]:
# Contains "table", so chat_with_pdf prints the tables extracted by pdfplumber
# instead of calling the RAG chain.
# NOTE(review): chat_with_pdf does not parse the "page 6" part of the question.
chat_with_pdf("Extract the table data from page 6.")


Table data from the document:
['Year', '2010', '2011', '2012', '2013', '2014', '2015']
['All Industries', '26093515', '27535971', '28663246', '29601191', '30895407', '31397023']
['Manufacturing', '4992521', '5581942', '5841608', '5953299', '6047477', '5829554']
['Finance,\nInsurance, Real\nEstate, Rental,\nLeasing', '4522451', '4618678', '4797313', '5031881', '5339678', '5597018']
['Arts,\nEntertainment,\nRecreation,\nAccommodation,\nand Food Service', '964032', '1015238', '1076249', '1120496', '1189646', '1283813']
['Other', '15614511', '16320113', '16948076', '17495515', '18318606', '18686638']
['', '', '', '', None, None, None]
['', '', '', None, None, None, None]
['', '', None, None, None, None, None]
['', None, None, None, None, None, None]
['', '', None, None, None, None, None]
['', None, None, None, None, None, None]
['', '', '', '', '', '', '']
['', '', '', '', '', '', None]


In [16]:
# Example question (same question as the first example; answered by the RAG chain)
chat_with_pdf("What is the main idea of this document?")

The main idea of this document appears to be an introduction or guide on using charts and graphs to visualize data. The document covers various types of visual representations (tables, line graphs, pie charts, bar graphs), when to use each one, and provides examples from different fields such as economics, education, psychology, and urban affairs.


In [54]:
# Example 2: no "table"/"graph"/"chart" keyword, so the RAG chain answers it
chat_with_pdf("Extract the unemployment details from page 2 of the document on type of degree input.")

Response: There is no mention of "unemployment" or a specific page number related to "degree input" in the provided context. The documents only contain information about tables, charts, and graphs with examples from various fields such as history, economics, education, psychology, urban affairs, and everyday life. There is no relevant content regarding unemployment or degree input on any of the pages mentioned.


In [19]:
# Example 3: a plain text question, handled by the RAG chain branch
chat_with_pdf("Can you explain the case study highlighted in the document?")

There is no clear "case study" highlighted in the document. The document appears to be a guide or tutorial on using tables, charts, and graphs to visualize data, with examples from various fields such as education, urban affairs, psychology, economics, and history. It provides explanations, guidelines, and illustrations on when to use different types of charts (line graph, pie chart, bar graph) and how to plot data. However, it does not present a specific case study or scenario that requires explanation.


## Clean up (optional)

In [None]:
# Optional: Clean up when done
# Deletes the collection behind `vector_db`; the retriever and chain built on
# it should not be used afterwards without re-creating the vector store.
vector_db.delete_collection()
print("Vector database deleted successfully")

Vector database deleted successfully
