## **PDF extraction with Unstructured & Pandas**

### **1. Import Libraries**
- Import required libraries for file handling, data processing, and PDF extraction.

### **2. Define Directories**
- Set paths for PDFs, output files, and extracted images.  
- Create directories if they don’t exist.

### **3. Helper Functions**
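- `serialize_image()`: Convert an image file to a base64 string for storage.  
- `serialize_object()`: Pickle a Python object (used for table dictionaries) and encode it as a base64 string.  
- `categorize_elements()`: Separate text and tables from extracted elements.  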

### **4. Process PDFs**
- Use `partition_pdf()` to extract text, tables, and images.  
- Store extracted text and tables in a structured format.  
- Save images as base64-encoded strings.

### **5. Process All PDFs in Directory**
- Iterate through all PDFs and process each file.

### **6. Save Extracted Data**
- Convert extracted data into a Pandas DataFrame.  
- Save the DataFrame as a Parquet file for efficient storage.



In [3]:
import os
import json
import pandas as pd
import base64
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Table, Image, Title, NarrativeText
import unstructured.documents.elements
import pickle

# Define directories
pdf_dir = "./data/"
output_dir = "processed_output/"
image_dir = "./figures/"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(image_dir, exist_ok=True)


def clear_directory(directory):
    """Delete every regular file directly inside ``directory``.

    Subdirectories are left untouched, and a path that is not an existing
    directory is treated as already empty.

    Returns:
        The number of files that were removed.
    """
    if not os.path.isdir(directory):
        return 0
    removed = 0
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            removed += 1
    return removed


# Clear the files in the figures directory so images left over from an
# earlier run are not picked up again. Checking os.path.isfile(image_dir)
# cannot do this: image_dir is a directory, so that check is always False.
clear_directory(image_dir)

# Pandas DataFrame storage
data_entries = []

# Function to serialize image to base64
def serialize_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        encoded = base64.b64encode(handle.read())
    # b64encode yields ASCII bytes; decode them so the result is a plain str.
    return encoded.decode("utf-8")

def serialize_object(obj):
    """Pickle ``obj`` and return the pickle bytes as a base64 string."""
    pickled = pickle.dumps(obj)
    encoded = base64.b64encode(pickled)
    # The base64 alphabet is ASCII, so decoding as UTF-8 cannot fail here.
    return str(encoded, "utf-8")


def categorize_elements(raw_pdf_elements):
    """Split extracted PDF elements into text blocks and tables.

    Args:
        raw_pdf_elements: Iterable of elements to sort.

    Returns:
        A ``(texts, tables)`` tuple of lists. Elements that are neither a
        ``Table`` nor a ``CompositeElement`` are dropped.
    """
    texts, tables = [], []
    for item in raw_pdf_elements:
        # The Table check runs first, so an element matching both classes
        # is recorded as a table only.
        if isinstance(item, unstructured.documents.elements.Table):
            bucket = tables
        elif isinstance(item, unstructured.documents.elements.CompositeElement):
            bucket = texts
        else:
            continue
        bucket.append(item)
    return texts, tables

def process_pdf(pdf_path):
    """Extract text, tables and images from one PDF into ``data_entries``.

    One dict is appended to the module-level ``data_entries`` list per
    extracted element, with the keys ``pdf_name``, ``element_type``
    (``"text"``, ``"table"`` or ``"image"``) and ``content``:

    * text   -> the element's ``text``
    * table  -> base64-encoded pickle of ``table.to_dict()``
    * image  -> base64-encoded bytes of each image file in ``image_dir``

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. Results are accumulated in ``data_entries``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    pdf_name = os.path.basename(pdf_path)

    # Remove images left behind by a previously processed PDF (or an earlier
    # run). The image loop below reads everything in image_dir, so stale
    # files would otherwise be stored again under this PDF's name.
    os.makedirs(image_dir, exist_ok=True)
    for stale_name in os.listdir(image_dir):
        stale_path = os.path.join(image_dir, stale_name)
        if stale_name.lower().endswith(image_suffixes) and os.path.isfile(stale_path):
            os.remove(stale_path)

    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        hi_res_model_name="yolox",
        infer_table_structure=True,
        extract_images_in_pdf=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=image_dir  # Storing images in the ./figures directory
    )

    # Categorize the elements into texts and tables
    texts, tables = categorize_elements(elements)

    # Process text elements
    data_entries.extend(
        {"pdf_name": pdf_name, "element_type": "text", "content": text.text}
        for text in texts
    )

    # Process table elements (pickled because the table dict is nested)
    data_entries.extend(
        {"pdf_name": pdf_name, "element_type": "table", "content": serialize_object(table.to_dict())}
        for table in tables
    )

    # Process images in the ./figures directory; sorted so the row order is
    # reproducible (os.listdir returns entries in arbitrary order).
    for image_filename in sorted(os.listdir(image_dir)):
        if not image_filename.lower().endswith(image_suffixes):  # Filter for image files
            continue
        image_path = os.path.join(image_dir, image_filename)
        image_base64 = serialize_image(image_path)  # Convert the image to base64 string
        data_entries.append(
            {"pdf_name": pdf_name, "element_type": "image", "content": image_base64}
        )

####--Uncomment for processing PDF again & save df----#####
# NOTE(review): while the block below stays commented out, nothing is
# processed or saved in this cell; the print at the end still runs.
# # Process all PDFs in the directory
# for pdf_file in os.listdir(pdf_dir):
#     if pdf_file.endswith(".pdf"):
#         pdf_path = os.path.join(pdf_dir, pdf_file)
#         print(f"Processing: {pdf_file}")
#         process_pdf(pdf_path)
#         # Clear the files in the figures directory.
#         # NOTE(review): image_dir is a directory, so os.path.isfile(image_dir)
#         # is False and this cleanup never removes anything.
#         if os.path.isfile(image_dir):
#                 os.remove(image_dir)
            
# # Convert extracted data to a Pandas DataFrame
# df = pd.DataFrame(data_entries)
# # Save DataFrame to a Parquet file (efficient binary format)
# df.to_parquet(os.path.join(output_dir, "extracted_data.parquet"), index=False)

print("Processing complete. Data saved in Pandas DataFrame!")

Processing complete. Data saved in Pandas DataFrame!


In [5]:
import pandas as pd
# Read the previously saved extraction results back from disk.
parquet_path = os.path.join(output_dir, "extracted_data.parquet")
df = pd.read_parquet(parquet_path)
# Print the loaded frame as a quick sanity check.
print(df)

              pdf_name element_type  \
0     Sample_Table.pdf        table   
1     Sample_Table.pdf        image   
2     Sample_Table.pdf        image   
3     Sample_Table.pdf        image   
4     Sample_Table.pdf        image   
..                 ...          ...   
90  chap_1_content.pdf        image   
91  chap_1_content.pdf        image   
92  chap_1_content.pdf        image   
93  chap_1_content.pdf        image   
94  chap_1_content.pdf        image   

                                              content summary  
0   gASV6wIAAAAAAAB9lCiMBHR5cGWUjAVUYWJsZZSMCmVsZW...    None  
1   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...    None  
2   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...    None  
3   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...    None  
4   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...    None  
..                                                ...     ...  
90  /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...    None  
91  /9j/4AAQSkZJRgA

# OpenAI Summarization and Image Analysis

This notebook demonstrates how to use the OpenAI API to summarize text and analyze images within a DataFrame.

## Steps:

1. **Setup**: 
   - Load the OpenAI API key using `dotenv`.
   - Initialize the `openai` client.

2. **Text Summarization**: 
   - `get_summary` function sends text to OpenAI's GPT-4 model for summarization.

3. **Image Analysis**: 
   - `analyze_image` function sends base64-encoded images to GPT-4 vision model (`gpt-4o`) for analysis.

4. **Apply Functions**: 
   - Summarization is applied to rows with `element_type == "text"`.
   - Image analysis is applied to rows with `element_type == "image"`.

5. **Save and Display**:
   - Save the processed DataFrame as a Parquet file.
   - Print the DataFrame with summaries.



In [18]:
import openai
import pandas as pd
import time
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Shared OpenAI client used by the helper functions defined below.
client = openai.OpenAI(
    api_key=openai_api_key,
)

# Function to get summary from OpenAI API
def get_summary(text, *, model="gpt-4", max_tokens=100):
    """Return a concise summary of ``text`` produced by an OpenAI chat model.

    Args:
        text: The content to summarize.
        model: Chat model name; defaults to ``"gpt-4"``
            (``"gpt-3.5-turbo"`` can be passed if needed).
        max_tokens: Upper bound on the length of the generated summary.

    Returns:
        The summary with surrounding whitespace stripped, or ``None`` when
        ``text`` is not a non-blank string, the model returns no content, or
        the API call fails (the error is printed, not raised).
    """
    # Nothing to summarize: skip the API call instead of paying for a
    # request that can only fail or return filler.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Summarize the following content concisely."},
                {"role": "user", "content": text}
            ],
            temperature=0.5,
            max_tokens=max_tokens
        )
        content = response.choices[0].message.content
        # The message content can be None; return None rather than failing
        # on .strip().
        return content.strip() if content is not None else None
    except Exception as e:
        # Deliberate best-effort: one failed row must not abort a whole
        # DataFrame.apply run.
        print(f"Error: {e}")
        return None
        
# Function to send base64 image to OpenAI API for analysis
def _guess_image_mime(base64_image):
    """Infer the image MIME type from the leading characters of base64 data."""
    # Base64 encodings of the JPEG, PNG, GIF and BMP file signatures.
    known_prefixes = (
        ("/9j/", "image/jpeg"),
        ("iVBORw0KGgo", "image/png"),
        ("R0lGOD", "image/gif"),
        ("Qk", "image/bmp"),
    )
    for prefix, mime in known_prefixes:
        if base64_image.startswith(prefix):
            return mime
    # Unknown signature: fall back to JPEG.
    return "image/jpeg"


def analyze_image(base64_image):
    """Ask a vision-capable OpenAI model to describe a base64-encoded image.

    Args:
        base64_image: The image file's bytes encoded as a base64 string.

    Returns:
        The model's description with surrounding whitespace stripped, or
        ``None`` when ``base64_image`` is not a non-empty string, the model
        returns no content, or the API call fails (the error is printed,
        not raised).
    """
    # No image data: skip the API call.
    if not isinstance(base64_image, str) or not base64_image:
        return None

    # Declare the real image type in the data URL rather than assuming JPEG.
    data_url = f"data:{_guess_image_mime(base64_image)};base64,{base64_image}"

    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # Vision-enabled model
            messages=[
                {"role": "system", "content": "Analyze the image and describe its contents."},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What does this image contain?"},
                        {"type": "image_url", "image_url": {"url": data_url}}
                    ],
                },
            ],
            temperature=0.5,
            max_tokens=300
        )
        content = response.choices[0].message.content
        # The message content can be None; return None rather than failing
        # on .strip().
        return content.strip() if content is not None else None
    except Exception as e:
        # Deliberate best-effort: one failed image must not abort a whole
        # DataFrame.apply run.
        print(f"Error: {e}")
        return None

####--Uncomment for processing PDF again & save df----#####
# # Apply summarization to text elements only
# df["summary"] = df.apply(
#     lambda row: get_summary(row["content"]) if row["element_type"] == "text" else None, axis=1
# )

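# # Apply image analysis to image elements only.
# # NOTE: this assigns df["summary"] a second time, so the summaries written for
# # text rows by the step above are replaced with None; combine both cases in a
# # single apply (or only fill the image rows) to keep them.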
# df["summary"] = df.apply(
#     lambda row: analyze_image(row["content"]) if row["element_type"] == "image" else None, axis=1
# )

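# # TODO: tables are not summarized correctly yet. Their "content" is a base64-encoded
# # pickle of the table dict, not an image, so analyze_image() cannot interpret it; needs rework.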
# # # Apply summarization to table elements only
# # df["summary"] = df.apply(
# #     lambda row: analyze_image(row["content"]) if row["element_type"] == "table" else None, axis=1
# # )

# # Save DataFrame to a Parquet file (efficient binary format)
# df.to_parquet(os.path.join(output_dir, "extracted_data.parquet"), index=False)


              pdf_name element_type  \
0     Sample_Table.pdf        table   
1     Sample_Table.pdf        image   
2     Sample_Table.pdf        image   
3     Sample_Table.pdf        image   
4     Sample_Table.pdf        image   
..                 ...          ...   
90  chap_1_content.pdf        image   
91  chap_1_content.pdf        image   
92  chap_1_content.pdf        image   
93  chap_1_content.pdf        image   
94  chap_1_content.pdf        image   

                                              content  \
0   gASV6wIAAAAAAAB9lCiMBHR5cGWUjAVUYWJsZZSMCmVsZW...   
1   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
2   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
3   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
4   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
..                                                ...   
90  /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
91  /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
92  /9j/4AAQSkZJRg

In [20]:
import pandas as pd
# Read the saved Parquet file back from disk.
parquet_path = os.path.join(output_dir, "extracted_data.parquet")
df = pd.read_parquet(parquet_path)
# Print the loaded frame to confirm what the file contains.
print(df)

              pdf_name element_type  \
0     Sample_Table.pdf        table   
1     Sample_Table.pdf        image   
2     Sample_Table.pdf        image   
3     Sample_Table.pdf        image   
4     Sample_Table.pdf        image   
..                 ...          ...   
90  chap_1_content.pdf        image   
91  chap_1_content.pdf        image   
92  chap_1_content.pdf        image   
93  chap_1_content.pdf        image   
94  chap_1_content.pdf        image   

                                              content  \
0   gASV6wIAAAAAAAB9lCiMBHR5cGWUjAVUYWJsZZSMCmVsZW...   
1   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
2   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
3   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
4   /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
..                                                ...   
90  /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
91  /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...   
92  /9j/4AAQSkZJRg

In [21]:
df

Unnamed: 0,pdf_name,element_type,content,summary
0,Sample_Table.pdf,table,gASV6wIAAAAAAAB9lCiMBHR5cGWUjAVUYWJsZZSMCmVsZW...,
1,Sample_Table.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image depicts a simplified human figure wi...
2,Sample_Table.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image depicts a diagram illustrating gravi...
3,Sample_Table.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image shows an astronaut in a spacesuit fl...
4,Sample_Table.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image is a table explaining the action of ...
...,...,...,...,...
90,chap_1_content.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image contains a partial view of an orange...
91,chap_1_content.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image appears to be a diagram involving ph...
92,chap_1_content.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image contains two diagrams illustrating c...
93,chap_1_content.pdf,image,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,The image shows a partial view of an orange ci...
