## **PDF extraction with Unstructured & Pandas**

### **1. Import Libraries**
- Import required libraries for file handling, data processing, and PDF extraction.

### **2. Define Directories**
- Set paths for PDFs, output files, and extracted images.  
- Create directories if they don’t exist.

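### **3. Helper Functions**
- `serialize_image()`: Convert an image file to a base64 string for storage.  
- `serialize_object()`: Pickle an object and encode it as a base64 string (used for tables).  
- `categorize_elements()`: Separate text and tables from extracted elements.  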

### **4. Process PDFs**
- Use `partition_pdf()` to extract text, tables, and images.  
- Store extracted text and tables in a structured format.  
- Save images as base64-encoded strings.

### **5. Process All PDFs in Directory**
- Iterate through all PDFs and process each file.

### **6. Save Extracted Data**
- Convert extracted data into a Pandas DataFrame.  
- Save the DataFrame as a Parquet file for efficient storage.

<span style="color:red;">### Issue: In multi-column PDFs the extracted text follows the wrong reading order: after the first paragraph of the first column it jumps to the first paragraph of the second column, instead of continuing with the second paragraph of the first column.</span>


In [1]:
import os
import json
import pandas as pd
import base64
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Table, Image, Title, NarrativeText
import unstructured.documents.elements
import pickle

# Locations of the input PDFs, the generated output and the extracted figures
pdf_dir = "./data/"
output_dir = "processed_output/"
image_dir = "./figures/"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)

# Start from an empty figures directory: plain files are deleted,
# sub-directories are left alone.
if not os.path.isdir(image_dir):
    print(f"{image_dir} is not a valid directory.")
else:
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        if not os.path.isfile(file_path):
            continue
        os.remove(file_path)
    print(f"All files in {image_dir} have been deleted.")

# One dict per extracted element; process_pdf() appends to this list
data_entries = []

# Function to serialize image to base64
def serialize_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw = handle.read()
    encoded = base64.b64encode(raw)
    # b64encode yields ASCII bytes; decode so the value can be stored as a str
    return encoded.decode("utf-8")

def serialize_object(obj):
    """Pickle *obj* and return the pickled bytes as a base64 string.

    The result can be restored with ``pickle.loads(base64.b64decode(...))``;
    only do that with data this notebook produced itself, since unpickling
    untrusted input is unsafe.
    """
    return base64.b64encode(pickle.dumps(obj)).decode('utf-8')


def categorize_elements(raw_pdf_elements):
    """Split extracted elements into text chunks and tables.

    Returns a ``(texts, tables)`` tuple. ``Table`` instances go to *tables*,
    ``CompositeElement`` instances go to *texts*, and every other element
    type is dropped. The relative order of the input is preserved.
    """
    tables = []
    texts = []
    for item in raw_pdf_elements:
        # Tables are checked first, mirroring the precedence of the two tests
        if isinstance(item, unstructured.documents.elements.Table):
            tables.append(item)
            continue
        # NOTE(review): CompositeElement is presumably the chunk type emitted
        # when a chunking strategy is used -- confirm against unstructured docs
        if isinstance(item, unstructured.documents.elements.CompositeElement):
            texts.append(item)
    return texts, tables

def process_pdf(pdf_path):
    """Extract text, tables and images from one PDF into ``data_entries``.

    The PDF is partitioned with ``partition_pdf`` (hi_res strategy, chunked
    by title). One entry is appended to the module-level ``data_entries``
    list per text chunk, per table and per extracted image. Each entry is a
    dict with the keys ``pdf_name``, ``element_type`` and ``content``:

    * text   -> the chunk's plain text
    * table  -> ``table.to_dict()`` pickled and base64-encoded
    * image  -> the image file's bytes, base64-encoded

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. Results are appended to ``data_entries``.

    Side effects:
        Image files already present in ``image_dir`` are deleted before the
        PDF is partitioned. Without this, figures extracted from a previously
        processed PDF would still be in the directory and would be recorded
        again under this PDF's name.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    pdf_name = os.path.basename(pdf_path)

    # Make sure only figures belonging to *this* PDF end up in image_dir.
    os.makedirs(image_dir, exist_ok=True)
    for stale_name in os.listdir(image_dir):
        stale_path = os.path.join(image_dir, stale_name)
        if stale_name.lower().endswith(image_extensions) and os.path.isfile(stale_path):
            os.remove(stale_path)

    # NOTE(review): `extract_images_in_pdf` / `image_output_dir_path` may be
    # named differently in newer unstructured releases -- confirm against the
    # installed version.
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        hi_res_model_name="yolox",
        infer_table_structure=True,
        extract_images_in_pdf=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=image_dir  # Storing images in the ./figures directory
    )

    # Categorize the elements into texts and tables
    texts, tables = categorize_elements(elements)

    # Text chunks are stored as plain text
    for text in texts:
        data_entries.append(
            {"pdf_name": pdf_name, "element_type": "text", "content": text.text}
        )

    # Tables keep their full dict form (text + metadata), serialized to a string
    for table in tables:
        data_entries.append(
            {
                "pdf_name": pdf_name,
                "element_type": "table",
                "content": serialize_object(table.to_dict()),
            }
        )

    # Images written by the partition step; sorted for a deterministic row order
    for image_filename in sorted(os.listdir(image_dir)):
        if not image_filename.lower().endswith(image_extensions):
            continue
        image_path = os.path.join(image_dir, image_filename)
        data_entries.append(
            {
                "pdf_name": pdf_name,
                "element_type": "image",
                "content": serialize_image(image_path),
            }
        )

# ###--Uncomment for processing PDF again & save df----#####
# # Process all PDFs in the directory
# for pdf_file in os.listdir(pdf_dir):
#     if pdf_file.endswith(".pdf"):
#         pdf_path = os.path.join(pdf_dir, pdf_file)
#         print(f"Processing: {pdf_file}")
#         process_pdf(pdf_path)
#         # Clear the files in the figures directory so the next PDF does not
#         # pick up this PDF's images (image_dir is a directory, so it must be
#         # emptied file by file; os.path.isfile(image_dir) is always False).
#         for leftover in os.listdir(image_dir):
#             leftover_path = os.path.join(image_dir, leftover)
#             if os.path.isfile(leftover_path):
#                 os.remove(leftover_path)
            
# # Convert extracted data to a Pandas DataFrame
# df = pd.DataFrame(data_entries)
# # Save DataFrame to a Parquet file (efficient binary format)
# df.to_parquet(os.path.join(output_dir, "extracted_data_raw.parquet"), index=False)

# NOTE: this message is printed even while the processing block above is
# commented out, i.e. when no PDF was processed in this run.
print("Processing complete. Data saved in Pandas DataFrame!")

All files in ./figures/ have been deleted.
Processing complete. Data saved in Pandas DataFrame!


In [2]:
import pandas as pd

# Reload the raw extraction results from disk
df = pd.read_parquet(
    os.path.join(output_dir, "extracted_data_raw.parquet")
)
# Bare expression so the notebook renders the DataFrame
df

Unnamed: 0,pdf_name,element_type,content
0,Sample_Table.pdf,table,gASV6wIAAAAAAAB9lCiMBHR5cGWUjAVUYWJsZZSMCmVsZW...
1,chap_1_content.pdf,text,1 www.tntextbooks.in LAWS OF MOTION\n\nLearnin...
2,chap_1_content.pdf,text,in this unit.\n\n1\n\n| | 10th_Science_Unit-1....
3,chap_1_content.pdf,text,1 .2 .1 T ypes of I nertia\n\na) Inertia of re...
4,chap_1_content.pdf,text,forces.\n\n(a) Like parallel forces: Two or mo...
5,chap_1_content.pdf,text,1 .4 .5 Rotating Effect of Force\n\nThe door c...
6,chap_1_content.pdf,text,1 .4 .7 Application of T orque\n\n1. Gears:\n\...
7,chap_1_content.pdf,text,1 .5 NE W T ON’ S S E C OND L A W OF MOT I ON\...
8,chap_1_content.pdf,text,1 .7 NE W T ON’ S T H I R D L A W OF MOT I ON\...
9,chap_1_content.pdf,text,Figure 1.7 Conservation of\n\nlinear momentum\...


# Local Llama 3.2 11B/3B Summarization and Image Analysis

This notebook demonstrates how to use local Llama 3.2 models (the 11B Vision model for images and the 3B Instruct model for text) to summarize text and analyze images stored in a DataFrame.

## Steps:



<span style="color:red;">### Issue: How to summarize table input?</span>


In [3]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Vision model plus its tokenizer/processor object; analyze_image() uses both
# to describe the extracted images.
modelV, tokenizerV = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

# Text-only model and tokenizer; get_summary() uses both to summarize text chunks.
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Mllama vision patching. Transformers: 4.46.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti SUPER. Max memory: 15.693 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.46.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti SUPER. Max memory: 15.693 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [73]:
from unsloth.chat_templates import get_chat_template
import re
import base64
from PIL import Image
import io
import numpy as np
from torchvision import transforms

# set the model for inference
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
FastVisionModel.for_inference(modelV) # Enable for inference!

# Configure tokenizer with chat template
# NOTE(review): the text model is a Llama 3.2 Instruct checkpoint, and ChatML
# is presumably not the format it was trained on -- confirm whether a Llama 3
# template would be a better fit. After any template change, verify that
# get_summary() still extracts the assistant reply correctly.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

def get_summary(text):
    """Summarize *text* with the text-only Llama model.

    The prompt is built with the module-level ``tokenizer`` chat template and
    generated with the module-level ``model`` on CUDA (up to 512 new tokens).

    Args:
        text: The text to summarize.

    Returns:
        The model's reply as a stripped string. ``None`` is returned, without
        calling the model, when *text* is ``None`` or blank.
    """
    # Nothing to summarize: avoid a pointless (and misleading) generation
    if text is None or not str(text).strip():
        return None

    messages = [
        {"from": "system", "value": "You are a helpful AI assistant."},
        {"from": "human", "value": f"Summarize the following text: {text}"},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(input_ids=inputs, max_new_tokens=512, use_cache=True)

    # generate() returns prompt + continuation. Decode only the continuation
    # so the reply is recovered even when generation stops at max_new_tokens
    # before an end-of-turn marker is produced (a regex that requires the
    # closing marker would find no match and lose the summary).
    prompt_length = inputs.shape[-1]
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Defensive: cut at the ChatML end marker if it was decoded as plain text
    return reply.split("<|im_end|>", 1)[0].strip()
            
def base64_to_tensor(base64_string):
    """Decode a base64-encoded image and return it as an RGB PIL image.

    Despite the function name, the return value is a ``PIL.Image`` object,
    not a tensor.
    """
    raw_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(raw_bytes)
    # Normalise to 3-channel RGB regardless of the source image mode
    return Image.open(buffer).convert("RGB")
    
def analyze_image(image_tensor):
    """Describe an image with the vision model.

    Args:
        image_tensor: The image to describe. summarize_row() passes the RGB
            PIL image returned by base64_to_tensor().

    Returns:
        The model's description: the text after the first ``"assistant\\n\\n"``
        marker in the decoded output, stripped. If the marker is absent, the
        whole decoded output is returned unchanged.
    """
    instruction = "Describe accurately and concisely what you see in this image."

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

    input_text = tokenizerV.apply_chat_template(messages, add_generation_prompt=True)

    inputs = tokenizerV(
        image_tensor,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")

    # The full output is decoded in one go (no streaming)
    with torch.no_grad():
        outputs = modelV.generate(**inputs, max_new_tokens=300, temperature=0.5)
        response = tokenizerV.decode(outputs[0], skip_special_tokens=True)

    # Split only at the FIRST marker: a plain split()[1] would cut the
    # description short if the generated text itself contained the marker.
    _, marker, reply = response.partition("assistant\n\n")
    return reply.strip() if marker else response
    
def summarize_row(row):
    """Return a summary for one DataFrame row, chosen by its ``element_type``.

    ``"text"`` rows are summarized with get_summary(), ``"image"`` rows are
    decoded and described with analyze_image(). Any other type (for example
    ``"table"``) yields ``None``.
    """
    kind = row["element_type"]
    if kind == "text":
        return get_summary(row["content"])
    if kind == "image":
        decoded_image = base64_to_tensor(row["content"])
        return analyze_image(decoded_image)
    # Tables and unknown element types are not summarized
    return None
        
# ###--Uncomment for summarizing using llama----#####
# df["summary"] = df.apply(summarize_row, axis=1)
# df.to_parquet(os.path.join(output_dir, "extracted_data_summary_llama.parquet"), index=False)


In [74]:
# Reload the summarized results from disk
df = pd.read_parquet(
    os.path.join(output_dir, "extracted_data_summary_llama.parquet")
)
# Bare expression so the notebook renders the DataFrame
df

Unnamed: 0,pdf_name,element_type,content,summary
0,Sample_Table.pdf,table,gASV6wIAAAAAAAB9lCiMBHR5cGWUjAVUYWJsZZSMCmVsZW...,
1,chap_1_content.pdf,text,1 www.tntextbooks.in LAWS OF MOTION\n\nLearnin...,Here's a summary of the text:\n\nThe text disc...
2,chap_1_content.pdf,text,in this unit.\n\n1\n\n| | 10th_Science_Unit-1....,The text describes the concepts of mechanics i...
3,chap_1_content.pdf,text,1 .2 .1 T ypes of I nertia\n\na) Inertia of re...,The text appears to be a summary of concepts r...
4,chap_1_content.pdf,text,forces.\n\n(a) Like parallel forces: Two or mo...,The text describes the concept of forces and r...
5,chap_1_content.pdf,text,1 .4 .5 Rotating Effect of Force\n\nThe door c...,Here's a summary of the text:\n\nThe rotating ...
6,chap_1_content.pdf,text,1 .4 .7 Application of T orque\n\n1. Gears:\n\...,The text appears to be a summary of the laws o...
7,chap_1_content.pdf,text,1 .5 NE W T ON’ S S E C OND L A W OF MOT I ON\...,The given text is a summary of Newton's Second...
8,chap_1_content.pdf,text,1 .7 NE W T ON’ S T H I R D L A W OF MOT I ON\...,The text explains Newton's third law of motion...
9,chap_1_content.pdf,text,Figure 1.7 Conservation of\n\nlinear momentum\...,The text is a proof of the law of conservation...
