In [1]:
import base64
import json
import os
import io
from pdf2image import convert_from_path
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def extract_and_store(pdf_path, output_json_path, image_output_dir):
    # 1. Setup
    images = convert_from_path(pdf_path)
    exam_data = {"questions": []}
    
    # Ensure image directory exists
    os.makedirs(image_output_dir, exist_ok=True)

    # 2. Iterate Pages (Skip cover)
    for i, page_img in enumerate(images[2:], start=3):
        print(f"Processing Page {i}...")
        
        # Resize for token efficiency
        page_img.thumbnail((1024, 1024))
        
        # Encode
        buff = io.BytesIO()
        page_img.save(buff, format="JPEG")
        b64_img = base64.b64encode(buff.getvalue()).decode("utf-8")

        # 3. AI Extraction
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Extract math questions. "
                        "If the question relies on a diagram (graph, geometry) visible in the image, set 'needs_diagram': true. "
                        "Estimate 'canvas_height' (200-800px) based on answer space."
                    )
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Extract questions to JSON."},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"}}
                    ]
                }
            ],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        
        # 4. Handle Diagrams
        for q in data.get("questions", []):
            if q.get("needs_diagram"):
                # Ideally, you'd ask the AI for bounding boxes and crop perfectly.
                # For this simplified version, we save the WHOLE page as the diagram reference
                # so the user can see the context.
                img_filename = f"{q['id']}_context.jpg"
                page_img.save(os.path.join(image_output_dir, img_filename))
                q["image_url"] = f"images/{img_filename}"
            else:
                q["image_url"] = None
                
            exam_data["questions"].append(q)

    # 5. Save Data
    with open(output_json_path, "w") as f:
        json.dump(exam_data, f, indent=4)

In [3]:
extract_and_store("test.pdf", "data/questions/2025_paper1.json", "data/questions/images")

Processing Page 3...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [8]:
import os
import json
import base64
import io
import time
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image

# --- CONFIGURATION ---
PDF_PATH = "test.pdf" 
OUTPUT_DIR = "data/question_images"
OUTPUT_JSON = "data/questions.json"
API_KEY = os.getenv("OPENAI_API_KEY")

# 1. USE THE CHEAPER MODEL
MODEL_ID = "gpt-4o-mini" 

client = OpenAI(api_key=API_KEY)

def encode_image(image):
    """Encodes PIL image to base64."""
    # Resize to 768px (smaller = cheaper/faster)
    image = image.copy()
    image.thumbnail((768, 768))
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def get_bounding_boxes_safe(image_b64, page_num):
    prompt = f"""
    Analyze Page {page_num}. Identify bounding boxes for questions (e.g. 1(a), 2).
    Include the question text and the blank answer space below it.
    Return 0-1000 normalized coordinates: [y_min, x_min, y_max, x_max].
    
    OUTPUT JSON:
    {{ "items": [ {{ "id": "q1a", "label": "Q1(a)", "box_2d": [100, 50, 300, 950] }} ] }}
    """
    
    try:
        response = client.chat.completions.create(
            model=MODEL_ID,
            messages=[
                {
                    "role": "user", 
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
                    ]
                }
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"   ‚ö†Ô∏è API Error on Page {page_num}: {e}")
        return None

def main():
    if not os.path.exists(PDF_PATH):
        print("‚ùå PDF not found.")
        return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 2. LOAD EXISTING PROGRESS (Checkpointing)
    existing_data = []
    processed_pages = set()
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, "r") as f:
                existing_data = json.load(f)
                processed_pages = {item['page'] for item in existing_data}
            print(f"üîÑ Resuming... Skipping {len(processed_pages)} pages already done.")
        except:
            pass

    print(f"üöÄ Loading PDF...")
    images = convert_from_path(PDF_PATH)
    
    # Iterate Pages (Skip covers 0-2)
    for i, page_img in enumerate(images[2:], start=3):
        
        # SKIP if already done
        if i in processed_pages:
            continue

        print(f"üìÑ Processing Page {i}...")
        
        # Call API
        img_b64 = encode_image(page_img)
        data = get_bounding_boxes_safe(img_b64, i)
        
        if data and "items" in data:
            width, height = page_img.size
            
            for item in data["items"]:
                try:
                    # Crop Logic
                    y_min, x_min, y_max, x_max = item["box_2d"]
                    left = (x_min / 1000) * width
                    top = (y_min / 1000) * height
                    right = (x_max / 1000) * width
                    bottom = (y_max / 1000) * height
                    
                    crop_box = (left, top, right, bottom)
                    cropped_img = page_img.crop(crop_box)
                    
                    filename = f"p{i}_{item['id']}.png"
                    save_path = os.path.join(OUTPUT_DIR, filename)
                    cropped_img.save(save_path)
                    
                    existing_data.append({
                        "id": f"2025_{item['id']}",
                        "title": item.get("label", item['id']),
                        "image_path": save_path,
                        "page": i
                    })
                except Exception as e:
                    print(f"      Crop Error: {e}")

            # 3. SAVE IMMEDIATELY (So you don't lose progress)
            with open(OUTPUT_JSON, "w") as f:
                json.dump(existing_data, f, indent=4)
                
            print(f"   ‚úÖ Saved Page {i}. Sleeping...")
        
        # 4. SLEEP TO PREVENT RATE LIMITS
        time.sleep(2) 

    print("üéâ Done!")

if __name__ == "__main__":
    main()

üîÑ Resuming... Skipping 0 pages already done.
üöÄ Loading PDF...
üìÑ Processing Page 3...
   ‚ö†Ô∏è API Error on Page 3: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
üìÑ Processing Page 4...
   ‚ö†Ô∏è API Error on Page 4: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
üìÑ Processing Page 5...
   ‚ö†Ô∏è API Error on Page 5: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information 

KeyboardInterrupt: 