In [1]:
%load_ext autoreload
%autoreload 2

import hashlib
import json
import os
import shutil
import sys
import uuid
from copy import deepcopy
from datetime import datetime

import PIL.Image
import torch

# PYTHONPATH=PPTAgent/src:$PYTHONPATH
# Make the project sources importable. Uses the imported `sys` module directly
# (rather than the incidental `os.sys` attribute) and is guarded so that
# re-running this cell does not keep appending duplicate entries.
if './src' not in sys.path:
    sys.path.append('./src')

# --- imports from your project ---
from FlagEmbedding import BGEM3FlagModel
from marker.models import create_model_dict
import induct
from llms import LLM, setup_models
import pptgen
from model_utils import get_image_model, parse_pdf
from multimodal import ImageLabler
from presentation import Presentation
from utils import Config, is_image_path, pjoin, ppt_to_images
from doc_handling import refine_document
from topic_gen import topic_generate

# Read the OpenAI credential from the local (untracked) api_key.json file.
with open('api_key.json', 'r') as f:
    api_key = json.load(f)
openai_api_key = api_key['openai_api_key']

# One LLM client; later cells use it as both the language and the vision model.
LLM_MODEL = LLM(model="gpt-4o-2024-11-20", api_key=openai_api_key)


# --- Pipeline constants ---
RUNS_DIR = "runs"
STAGES = ["PPT Parsing", "PDF/Topic Parsing", "Slide Induction", "PPT Generation", "Success!"]
CAPTION_PROMPT_PATH = "prompts/caption.txt"
# For demonstration, let's load exactly 1 model (instead of multiple).
NUM_MODELS = 1
# Falls through to the MPS device count when the CUDA device count is 0.
DEVICE_COUNT = torch.cuda.device_count() or torch.mps.device_count()


# --- **Task Configuration** ---
# Leave output_dir empty to have a unique run directory generated later.
output_dir = None
# Source document; when None, the topic string below is used instead.
# pdf = "./examples/DeepSeek-R1-short.pdf"
pdf = None
# Template presentation.
# ppt = "./examples/default_template_unedit.pptx"
ppt = "./examples/default_template-5p.pptx"
topic = "Introduce the history of OpenAI"  # won't be used if pdf is provided
# Number of slides to generate.
slides = 5

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# If user doesn't specify output_dir, create a unique one.
if not output_dir:
    dt_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_id = str(uuid.uuid4())[:8]
    project_id = f"{dt_str}_{unique_id}"
    output_dir = os.path.join(RUNS_DIR, project_id)
else:
    # normpath strips a trailing separator first, so "runs/foo/" yields "foo"
    # instead of the empty string that a plain split('/')[-1] would return;
    # basename also honours the platform's own path separator.
    project_id = os.path.basename(os.path.normpath(output_dir))

# Basic checks
# Truthiness (not `is None`) so that an empty-string pdf/topic is rejected too.
if not pdf and not topic:
    print("[ERROR] Must provide either --pdf or --topic.")
    sys.exit(1)
if pdf and not os.path.exists(pdf):
    print(f"[ERROR] PDF file not found: {pdf}")
    sys.exit(1)

print("[INFO] Setting up models...")
# The same LLM client serves as both the language and the vision model.
language_model = LLM_MODEL
vision_model = LLM_MODEL
setup_models(language_model, vision_model)

# Aliases used by the later pipeline cells (`topic` and `output_dir` are used
# under their existing names).
ppt_template_path = ppt
pdf_path = pdf
slides_count = slides

# Only configuration has happened at this point; generation runs in later cells.
print("[DONE] Model setup and task configuration complete.")

[INFO] Setting up models...


[DONE] Presentation generation complete.


In [3]:
"""
Given a PPT template and either a PDF file or a topic string,
generate a final presentation (pptx).
"""

# Make sure the run folder exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)
print(f"[INFO] Using run directory: {output_dir}")

# -- 1. Load models (text, image, marker) --
# Simple single-model approach: every model is requested on device index 0.
model_device = 0
text_model = BGEM3FlagModel(
    "BAAI/bge-m3",
    use_fp16=True,
    device=model_device,
)
image_model = get_image_model(device=model_device)
marker_model = create_model_dict(
    device=model_device,
    dtype=torch.float16,
)


[INFO] Using run directory: runs/20250202_215225_6eab0d6a


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 303202.70it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


Loaded layout model datalab-to/surya_layout on device mps with dtype torch.float16
Loaded texify model to mps with torch.float16 dtype


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded recognition model vikp/surya_rec2 on device mps with dtype torch.float16
Loaded table recognition model vikp/surya_tablerec on device mps with dtype torch.float16
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16


In [4]:
# -- 2. Prepare the config objects --
generation_config = Config(output_dir)

# We'll store the "pptx" in a unique subfolder for safety: the folder name is
# the MD5 of the template bytes. The file is read inside a `with` block so the
# handle is closed deterministically.
with open(ppt_template_path, "rb") as template_file:
    pptx_md5 = hashlib.md5(template_file.read()).hexdigest()
pptx_config = Config(pjoin(RUNS_DIR, project_id, "pptx", pptx_md5))
os.makedirs(pptx_config.RUN_DIR, exist_ok=True)

# Copy the PPT template into the "pptx" folder for caching (only once).
# shutil.copy replaces the shell `cp` call: it does not break on paths that
# contain quotes, works on every platform, and raises if the copy fails.
source_pptx_path = pjoin(pptx_config.RUN_DIR, "source.pptx")
if not os.path.exists(source_pptx_path):
    shutil.copy(ppt_template_path, source_pptx_path)

print(f"[INFO] PPT Template MD5: {pptx_md5}")


[INFO] PPT Template MD5: f94b6c959a5907804512df07ddfbbbd6


In [5]:
# -- 3. Parse the PPT Template into a Presentation object --
print("[STAGE] PPT Parsing")
source_pptx_path = pjoin(pptx_config.RUN_DIR, "source.pptx")
presentation = Presentation.from_file(source_pptx_path, pptx_config)
ppt_image_folder = pjoin(pptx_config.RUN_DIR, "slide_images")


def _slide_image_path(index):
    """Return the path of the rendered image for the slide numbered `index`."""
    return pjoin(ppt_image_folder, f"slide_{index:04d}.jpg")


# Render slide images only when the folder is missing or still empty.
slide_images_cached = os.path.exists(ppt_image_folder) and len(os.listdir(ppt_image_folder)) > 0
if not slide_images_cached:
    ppt_to_images(source_pptx_path, ppt_image_folder)
    # Downstream code expects images named slide_{i:04d}.jpg, so first drop the
    # images that belong to slides recorded in the presentation's error history...
    for err_idx, _ in presentation.error_history:
        err_path = _slide_image_path(err_idx)
        if os.path.exists(err_path):
            os.remove(err_path)
    # ...then renumber the remaining slides 1..N and rename each image from its
    # real_idx name to its slide_idx name.
    for i, slide in enumerate(presentation.slides, start=1):
        slide.slide_idx = i
        old_name = _slide_image_path(slide.real_idx)
        new_name = _slide_image_path(slide.slide_idx)
        if os.path.exists(old_name):
            os.rename(old_name, new_name)

# You may optionally caption each slide image:
labler = ImageLabler(
    vision_model=vision_model,
    presentation=presentation,
    config=pptx_config,
)
labler.caption_images()

[STAGE] PPT Parsing


{}

In [6]:
# -- 4. Parse PDF or use a text topic to get doc_json --
# Produces `doc_json` (the document structure) and `images` (caption info keyed
# by image path; empty when generating from a topic).
print("[STAGE] PDF/Topic Parsing")
if pdf_path:
    # PDF results are stored under <RUNS_DIR>/<project_id>/pdf/<md5>/...
    with open(pdf_path, "rb") as pdf_file:
        pdf_md5 = hashlib.md5(pdf_file.read()).hexdigest()
    parsedpdf_dir = pjoin(RUNS_DIR, project_id, "pdf", pdf_md5)
    os.makedirs(parsedpdf_dir, exist_ok=True)
    refined_doc_json_path = pjoin(parsedpdf_dir, "refined_doc.json")
    caption_json_path = pjoin(parsedpdf_dir, "caption.json")
    # If the refined document is not cached, parse the PDF text & refine
    if not os.path.exists(refined_doc_json_path):
        print("[INFO] Parsing PDF ...")
        text_content = parse_pdf(pdf_path, parsedpdf_dir, marker_model)
        # Optional: also caption images found in PDF
        if not os.path.exists(caption_json_path):
            with open(CAPTION_PROMPT_PATH, "r", encoding="utf-8") as f:
                caption_prompt = f.read()
            images_info = {}
            for k in os.listdir(parsedpdf_dir):
                if is_image_path(k):
                    img_path = pjoin(parsedpdf_dir, k)
                    try:
                        text_cap = vision_model(caption_prompt, [img_path])
                        with PIL.Image.open(img_path) as img:
                            size = img.size
                        images_info[img_path] = [text_cap, size]
                    except Exception as e:
                        # Best effort: one failing image must not abort the run.
                        print(f"[ERROR] Could not caption {k}: {e}")
            with open(caption_json_path, "w", encoding="utf-8") as f:
                json.dump(images_info, f, ensure_ascii=False, indent=4)

        # Now refine the markdown doc => JSON
        doc_json = refine_document(language_model, text_content)
        with open(refined_doc_json_path, "w", encoding="utf-8") as f:
            json.dump(doc_json, f, indent=4)
    else:
        print("[INFO] Using cached refined_doc.json")
        with open(refined_doc_json_path, "r", encoding="utf-8") as f:
            doc_json = json.load(f)
    # Also load the PDF's image captions (if you want them for generation).
    # caption.json is written as UTF-8 with ensure_ascii=False, so it must be
    # read back as UTF-8 rather than with the platform default encoding.
    if os.path.exists(caption_json_path):
        with open(caption_json_path, "r", encoding="utf-8") as f:
            images = json.load(f)
    else:
        images = {}

else:
    # If no PDF, we assume we have a text topic
    print(f"[INFO] Generating from topic: '{topic}'")
    # Name the folder after the topic's MD5 instead of the raw text: free-form
    # topics may contain path separators, characters that are invalid in file
    # names, or exceed the maximum file-name length.
    pdf_md5 = hashlib.md5(topic.encode("utf-8")).hexdigest()
    doc_json = topic_generate(language_model, topic)

    # save doc_json to file
    topic_doc_json_path = pjoin(RUNS_DIR, project_id, "text_topic", pdf_md5)
    os.makedirs(topic_doc_json_path, exist_ok=True)
    with open(pjoin(topic_doc_json_path, "topic_doc.json"), "w", encoding="utf-8") as f:
        json.dump(doc_json, f, indent=4)
    images = {}


[STAGE] PDF/Topic Parsing
[INFO] Generating from topic: 'Introduce the history of OpenAI'


In [7]:
# -- 5. Slide Induction (looking at the PPT structure) --
print("[STAGE] Slide Induction")
template_img_dir = pjoin(pptx_config.RUN_DIR, "template_images")
template_pptx_path = pjoin(pptx_config.RUN_DIR, "template.pptx")

# Rebuild template.pptx and its rendered images only when no images are cached.
template_images_cached = os.path.exists(template_img_dir) and len(os.listdir(template_img_dir)) > 0
if not template_images_cached:
    # Save a copy of the parsed presentation as template.pptx.
    # NOTE(review): the previous comment called this a "stripped version", yet
    # layout_only=False is passed — confirm which behaviour is intended.
    deepcopy(presentation).save(template_pptx_path, layout_only=False)
    ppt_to_images(template_pptx_path, template_img_dir)

template_presentation = Presentation.from_file(template_pptx_path, pptx_config)

slide_inducter = induct.SlideInducter(
    vision_model, language_model, presentation,
    ppt_image_folder, template_img_dir, pptx_config, image_model,
    "inference_script",  # or "backend", up to you
)
slide_induction = slide_inducter.content_induct()
print(f"Slide Induction:\n{slide_induction}")

[STAGE] Slide Induction


Slide Induction:
defaultdict(<function SlideInducter.__init__.<locals>.<lambda> at 0x3ea538360>, {'opening:text': defaultdict(<class 'list'>, {'slides': [1], 'template_id': 1, 'content_schema': {'slide_title': {'description': 'The main title describing the subject of the slide.', 'type': 'text', 'data': ['Tourism & Culture:']}, 'main_heading': {'description': 'The central or main heading that introduces the core theme or message of the slide.', 'type': 'text', 'data': ['APPRECIATING THE TANGIBLE & THE INTANGIBLE OF BHUBANESWAR THROUGH CULTURAL TOURISM']}, 'presenter': {'description': 'The name of the person or entity presenting the content of the slide.', 'type': 'text', 'data': ['By Ayona Bhaduri']}}}), 'Text-Centered Slide with Bullet Points:text': defaultdict(<class 'list'>, {'template_id': 3, 'slides': [3, 5, 4, 2], 'content_schema': {'main_title': {'description': 'The primary heading that introduces the main topic of the slide', 'type': 'text', 'data': ['What is Tourism?']}, 'cont

In [8]:
# -- 6. PPT Generation --
print("[STAGE] PPT Generation")
# Build the "crew" that handles text generation for slides.
crew = pptgen.PPTCrew(
    vision_model,
    language_model,
    text_model,
    error_exit=True,
    retry_times=3,
)

# Point the crew at the template presentation and its induced slide schemas.
crew.set_reference(template_presentation, slide_induction, generation_config)

# Actually generate the new PPT (expected to end up as final.pptx in output_dir).
# Arguments, in order: generation config, image captions (PDF or custom, may be
# empty), number of pages to generate, and the document JSON structure.
crew.generate_pres(generation_config, images, slides_count, doc_json)

print("[STAGE] Success!")
final_pptx_path = pjoin(output_dir, 'final.pptx')
print(f"[INFO] Output PPT stored at {final_pptx_path}")

[STAGE] PPT Generation


[STAGE] Success!
[INFO] Output PPT stored at runs/20250202_215225_6eab0d6a/final.pptx


### PPT gen test

In [9]:
# Dump every attribute of each parsed slide for manual inspection.
for slide in presentation.slides:
    print(slide.slide_idx, slide.slide_title)
    for key, value in vars(slide).items():
        print(key, value)
    print("\n\n")

# presentation cut off test
# Load into a separate name so the pipeline's `presentation` object is not
# replaced by a truncated, different template; otherwise re-running this cell
# or any earlier one would silently operate on the wrong presentation.
cutoff_presentation = Presentation.from_file("./examples/default_template_unedit.pptx", pptx_config)
cutoff_presentation.slides = cutoff_presentation.slides[:2]  # keep the first two slides only
cutoff_presentation.save("cutoff.pptx")


1 Tourism & Culture:
shapes [TextBox: shape 0 of slide 1, TextBox: shape 1 of slide 1]
slide_idx 1
real_idx 1
background_xml <p:bgPr xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <a:noFill/>
  <a:effectLst/>
</p:bgPr>

slide_notes None
slide_layout_name Title Slide
slide_title Tourism & Culture:
slide_width 960
slide_height 540



2 What is culture?
shapes [TextBox: shape 0 of slide 2, TextBox: shape 1 of slide 2]
slide_idx 2
real_idx 2
background_xml <p:bgPr xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <a:noFill/>
  <a:effectLst/>
</p:bgPr>

slide_notes None
slide_layout_name Title and Content
slide_title What is culture?
slide_width 960
slide_height 540