# PDF and Image Content Extraction for Large Language Models

In [None]:
!pip install -U pypdfium2

In [None]:
import pypdfium2 as pdfium
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO

In [None]:
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG-encoded bytes.

    Args:
        file_path: Location of the PDF; handed to ``pdfium.PdfDocument``.
        scale: Render scale forwarded to ``page.render``. The default of
            300/72 presumably targets 300 DPI (PDF user space is 72 units
            per inch) -- confirm against the pypdfium2 docs.

    Returns:
        A list with one single-entry dict per page, in page order, each
        mapping the zero-based page index to that page's JPEG bytes.
    """
    pdf_file = pdfium.PdfDocument(file_path)

    list_final_images = []

    # Pages are rendered one at a time through the page-level API
    # (page.render(...).to_pil()).
    # NOTE(review): the document-level PdfDocument.render(converter, ...)
    # helper used here before is, as far as I know, deprecated/removed in
    # newer pypdfium2 releases, and the notebook installs the latest version
    # with `pip install -U` -- confirm against the pypdfium2 changelog.
    for page_index in range(len(pdf_file)):
        image = pdf_file[page_index].render(scale=scale).to_pil()

        # Encode in memory so callers get plain bytes rather than PIL objects.
        image_byte_array = BytesIO()
        image.save(image_byte_array, format='jpeg', optimize=True)
        list_final_images.append({page_index: image_byte_array.getvalue()})

    return list_final_images

In [None]:
def display_images(list_dict_final_images):
    """Show each page image in its own matplotlib figure.

    Args:
        list_dict_final_images: List of single-entry dicts whose first value
            is the encoded image bytes of one page.

    Each figure is sized to the image's pixel width and height divided by 100
    and titled with the 1-based position of the page in the list.
    """
    # Pull out the byte payloads up front, before any figure is drawn.
    payloads = [tuple(entry.values())[0] for entry in list_dict_final_images]

    for page_number, payload in enumerate(payloads, start=1):
        picture = Image.open(BytesIO(payload))
        plt.figure(figsize=(picture.width / 100, picture.height / 100))

        plt.title(f"----- Page Number {page_number} -----")
        plt.imshow(picture)
        plt.axis("off")
        plt.show()

In [None]:
# NOTE(review): this assignment rebinds the name `convert_pdf_to_images` from
# the function to the list it returns. Later cells pass this name as the
# page-image list, and running this cell a second time would try to call that
# list instead of the function.
convert_pdf_to_images = convert_pdf_to_images('downloads/virendra_cv.pdf')

In [None]:
#convert_pdf_to_images

In [None]:
# `convert_pdf_to_images` is, at this point, the list of page images produced
# by the conversion cell (the name was rebound there), not the function.
display_images(convert_pdf_to_images)

## EasyOCR

In [None]:
#!pip install easyocr

from easyocr import Reader

# Build one EasyOCR Reader for English ("en") at module level; it is stored in
# `language_reader`, which extract_text_with_easyocr reads as a global.
language_reader = Reader(["en"])

In [None]:
def extract_text_with_easyocr(list_dict_final_images):
    """Run the module-level EasyOCR reader over each page image.

    Args:
        list_dict_final_images: List of single-entry dicts whose first value
            is the encoded image bytes of one page.

    Returns:
        One string: for each page, element 1 of every ``readtext`` result
        joined by newlines, with the pages themselves joined by newlines.
        An empty input list yields an empty string.
    """
    payloads = [tuple(entry.values())[0] for entry in list_dict_final_images]

    def _ocr_page(payload):
        # Decode the bytes with PIL and hand the image to the shared reader.
        detections = language_reader.readtext(Image.open(BytesIO(payload)))
        return "\n".join([detection[1] for detection in detections])

    return "\n".join([_ocr_page(payload) for payload in payloads])

In [None]:
# `convert_pdf_to_images` here is the page-image list assigned by the
# conversion cell (the name was rebound there), not the function.
text_with_easy_ocr = extract_text_with_easyocr(convert_pdf_to_images)
print(text_with_easy_ocr)

## LangChain

In [None]:
!pip install langchain

In [None]:
from langchain.document_loaders.image import UnstructuredImageLoader

def extract_text_with_langchain_image(list_dict_final_images):
    """Extract text from page images with LangChain's UnstructuredImageLoader.

    Args:
        list_dict_final_images: List of single-entry dicts whose first value
            is the encoded image bytes of one page.

    Returns:
        The ``page_content`` of every document the loader returns for every
        page, joined by newlines. An empty input list yields an empty string.
    """
    # Local imports keep this cell self-contained.
    import os
    import tempfile

    image_content = []

    for page in list_dict_final_images:
        image_bytes = list(page.values())[0]

        # The loader is documented as taking a file path, so the bytes are
        # written to a temporary file first. The file is closed before the
        # loader opens it (delete=False) so this also works on Windows.
        # NOTE(review): the ".jpg" suffix assumes the bytes are JPEG, as
        # produced by convert_pdf_to_images -- confirm for other callers.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            tmp.write(image_bytes)
            temp_path = tmp.name

        try:
            documents = UnstructuredImageLoader(temp_path).load()
        finally:
            os.remove(temp_path)

        # Each load() call covers a single image, so take all of its
        # documents rather than indexing them with the page counter (which
        # raised IndexError from the second page on).
        image_content.append("\n".join(doc.page_content for doc in documents))

    return "\n".join(image_content)

In [None]:
!pip install pdfminer.six

In [None]:
!pip uninstall pdfminer.six

In [None]:
# `convert_pdf_to_images` here is the page-image list assigned by the
# conversion cell (the name was rebound there), not the function.
text_with_langchain_image = extract_text_with_langchain_image(convert_pdf_to_images)
print(text_with_langchain_image)

In [None]:
from langchain.document_loaders import UnstructuredFileLoader

def extract_text_with_langchain_pdf(pdf_file):
    """Load a file with UnstructuredFileLoader and return its text.

    Args:
        pdf_file: Value passed straight to ``UnstructuredFileLoader``.

    Returns:
        The ``page_content`` of every loaded document, joined by newlines.
    """
    loaded_docs = UnstructuredFileLoader(pdf_file).load()
    contents = [doc.page_content for doc in loaded_docs]
    return '\n'.join(contents)

In [None]:
# A forward slash avoids the invalid "\s" escape sequence of the backslash
# spelling and resolves on Windows and POSIX alike.
text_with_langchain_files = extract_text_with_langchain_pdf("downloads/specialization_certificate.pdf")
print(text_with_langchain_files)

In [None]:
!pip install python-magic-bin

In [None]:
import magic
import pandas as pd

In [None]:
# A forward slash avoids the invalid "\s" escape sequence of the backslash
# spelling and resolves on Windows and POSIX alike.
file_path = "downloads/specialization_certificate.pdf"  # Replace with the actual path to your file

In [None]:
# Create a python-magic detector with mime=True and run it on `file_path`;
# the result is printed below (presumably a MIME type string such as
# "application/pdf" -- confirm against the python-magic docs).
mime = magic.Magic(mime=True)
file_type = mime.from_file(file_path)
print(file_type) 

In [None]:
!pip install "unstructured[pdf-fast]"

In [None]:
!pip install "unstructured[pdf]"

In [None]:
!pip uninstall pdfminer.six unstructured -y

In [None]:
!pip install "pdfminer.six<20221105"

In [None]:
!pip install "unstructured[pdf]"

In [None]:
!pip uninstall pdfminer.six unstructured pdf2image pypdf2 -y --user
!pip install "pdfminer.six<20221105" --user
!pip install "unstructured[pdf]" "unstructured[local-inference]" --user
!pip install pdf2image pypdf2 --user

In [None]:
!pip uninstall pdfminer.six unstructured pdf2image pypdf2 -y

In [None]:
!pip install "pdfminer.six<20221105" --user

In [None]:
!pip install "unstructured[pdf]" "unstructured[local-inference]" --user

In [None]:
!pip install pdf2image pypdf2 --user

In [None]:
!pip install onnx==1.12.0

In [None]:
!pip install "unstructured[pdf]" --user

In [None]:
from unstructured.partition.pdf import partition_pdf

# Replace with your PDF file path. A forward slash avoids the invalid "\s"
# escape sequence of the backslash spelling and resolves on Windows and POSIX.
elements = partition_pdf("downloads/specialization_certificate.pdf")
# Print the text of each element returned by partition_pdf.
for element in elements:
    print(element.text)

In [None]:
!pip install PyPDF2

In [None]:
from PyPDF2 import PdfReader
# A forward slash avoids the invalid "\s" escape sequence of the backslash
# spelling and resolves on Windows and POSIX alike.
file_path = "downloads/specialization_certificate.pdf"
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF.

    Args:
        file_path: Value passed straight to ``PdfReader``.

    Returns:
        The ``extract_text()`` output of each page in order, each followed by
        a newline, concatenated into one string.
    """
    pages = PdfReader(file_path).pages
    return "".join([page.extract_text() + "\n" for page in pages])

# Use it like this (a forward slash avoids the invalid "\s" escape sequence
# of the backslash spelling and resolves on Windows and POSIX alike):
text = read_pdf("downloads/specialization_certificate.pdf")
print(text)