In [65]:
# Imports
import os
from pdf2image import convert_from_path
import logging
from io import BytesIO
import shutil
from pathlib import Path
from pptx import Presentation
import base64
from openai import OpenAI
from dotenv import load_dotenv
from typing import List

# Input/output folders for the pipeline.
# The defaults are the original local paths; set INSIGHTGEN_INPUT_DIR and/or
# INSIGHTGEN_OUTPUT_DIR in the environment to run the notebook on another
# machine without editing this cell.
input_dir = os.environ.get(
    "INSIGHTGEN_INPUT_DIR",
    "/Users/sumitkamra/code/sumitkamra20/insightgen/data/input",
)
output_dir = os.environ.get(
    "INSIGHTGEN_OUTPUT_DIR",
    "/Users/sumitkamra/code/sumitkamra20/insightgen/data/output",
)

# Configure logging to display INFO level logs in Jupyter Notebook.
# force=True removes handlers already attached to the root logger, so
# re-running this cell does not leave a stale configuration in place.
logging.basicConfig(level=logging.INFO, force=True)

# Test it
logging.info("Logging is now visible in Jupyter Notebook!")


INFO:root:Logging is now visible in Jupyter Notebook!


### Step 1: Extract slide metadata and initialize a dictionary to store slide metadata

In [66]:
# Extract slide metadata
import os
from pptx import Presentation
from pptx.enum.shapes import PP_PLACEHOLDER

def extract_slide_metadata(input_folder: str, non_content_layouts=("Header slide", "Divider")) -> dict:
    """
    Extracts metadata from each slide of the single PPTX file in a folder.

    Args:
        input_folder (str): Path to the folder containing the PPTX file.
        non_content_layouts: Layout names whose slides are flagged as
            non-content (``content_slide`` is False). Defaults to
            ("Header slide", "Divider").

    Returns:
        dict: Maps the 1-based slide number to a dictionary with the keys
        ``layout`` (layout name), ``content_slide`` (bool),
        ``has_placeholder`` (bool, True when the slide has a TITLE
        placeholder) and the empty string fields ``key_observations``,
        ``slide_headline`` and ``speaker_notes`` for later steps to fill.

    Raises:
        FileNotFoundError: If the folder contains no PPTX file.
        ValueError: If the folder contains more than one PPTX file.
    """
    # Find the PPTX file in the input folder. The extension is matched
    # case-insensitively, and "~$..." files are ignored: PowerPoint writes
    # such a lock file next to a deck while it is open, which would otherwise
    # be counted as a second presentation.
    pptx_files = [
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.pptx') and not f.startswith('~$')
    ]

    if not pptx_files:
        raise FileNotFoundError("No PPTX file found in the input folder.")
    if len(pptx_files) > 1:
        raise ValueError("Multiple PPTX files found. Please keep only one.")

    pptx_path = os.path.join(input_folder, pptx_files[0])
    presentation = Presentation(pptx_path)

    slide_data = {}

    # Iterate through slides and extract metadata (slide numbers are 1-based)
    for slide_number, slide in enumerate(presentation.slides, start=1):
        layout_name = slide.slide_layout.name
        # A slide counts as content unless its layout is one of the
        # configured non-content layouts
        content_slide = layout_name not in non_content_layouts

        # Check if a title placeholder exists
        has_placeholder = any(
            shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.TITLE
            for shape in slide.shapes
        )

        # Initialize empty fields for later functions to fill
        slide_data[slide_number] = {
            "layout": layout_name,
            "content_slide": content_slide,
            "has_placeholder": has_placeholder,
            "key_observations": "",
            "slide_headline": "",
            "speaker_notes": "",
        }

    return slide_data


### Step 2: Create slide images and store in slide metadata dictionary

In [71]:
import os
import logging
import base64
from io import BytesIO
from pdf2image import convert_from_path

def generate_slide_images_base64(input_folder: str, slide_data: dict, img_format="JPEG", dpi=200) -> dict:
    """
    Converts PDF pages to images, encodes them in base64, and updates the slide_data dictionary.
    Non-content slides (``content_slide`` is False) are marked as skipped and get no image.

    PDF page N is assumed to correspond to slide N. Pages that have no entry
    in ``slide_data`` are logged and ignored.

    Args:
        input_folder (str): Directory containing the input PDF file.
        slide_data (dict): Dictionary storing slide metadata, keyed by 1-based slide number.
        img_format (str): Image format passed to the image's ``save`` (default: JPEG).
        dpi (int): Resolution for image conversion.

    Returns:
        dict: The same ``slide_data`` object, updated in place. Content slides
        gain ``image_base64`` and ``status``; non-content slides gain ``status``.
        Returned unchanged when the folder holds zero or several PDF files.

    Raises:
        ValueError: If ``input_folder`` does not exist.
    """

    logging.info("Starting PDF to image conversion...")

    # Validate input directory
    if not os.path.exists(input_folder):
        raise ValueError(f"Input directory does not exist: {input_folder}")

    # Find PDF file in the input folder (extension matched case-insensitively)
    pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

    if not pdf_files:
        logging.error("No PDF file found in the input folder.")
        return slide_data

    if len(pdf_files) > 1:
        logging.error("Multiple PDF files found. Please keep only one.")
        return slide_data

    pdf_path = os.path.join(input_folder, pdf_files[0])

    # Convert PDF to images (in-memory)
    images = convert_from_path(pdf_path, dpi=dpi)

    logging.info(f"PDF successfully converted to {len(images)} images.")

    if len(images) != len(slide_data):
        logging.warning(
            f"PDF has {len(images)} pages but slide metadata has {len(slide_data)} slides."
        )

    # Process only content slides
    for slide_number, image in enumerate(images, start=1):  # page N <-> slide N
        slide = slide_data.get(slide_number)

        # A page without metadata cannot be annotated; writing a status for it
        # would index a missing key, so just report it and move on
        if slide is None:
            logging.warning(f"Slide {slide_number}: No slide metadata for this PDF page; skipped.")
            continue

        # Skip non-content slides
        if not slide["content_slide"]:
            slide["status"] = "Skipped (Non-content slide)"
            continue

        # Convert image to base64 (in-memory)
        buffer = BytesIO()
        image.save(buffer, format=img_format)
        base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8')

        # Store base64 image in slide_data dictionary
        slide["image_base64"] = base64_image
        slide["status"] = "Image processed"

        logging.info(f"Slide {slide_number}: Image converted and stored as base64.")

    logging.info("Base64 images stored successfully in slide metadata.")

    return slide_data

### Step 3: Use the OpenAI API to generate headlines and store them in the slide metadata dictionary

In [77]:
import logging
from openai import OpenAI
from dotenv import load_dotenv
import os

def generate_headlines_OPENAI(slide_data: dict, brand: str, model: str = "gpt-4o", max_tokens: int = 300) -> dict:
    """
    Generates AI-powered headlines for content slides and marks non-content slides as 'HEADER SLIDE'.

    Each slide dictionary is updated in place with ``slide_headline`` and ``status``:
      * non-content slide        -> "HEADER SLIDE" / "Skipped (Non-content slide)"
      * no title placeholder     -> "NO TITLE PLACEHOLDER" / "Skipped (No title placeholder)"
      * no ``image_base64``      -> "Error: Missing slide image" / "Error"
      * request fails or the model returns no text
                                 -> "Error: AI processing failed" / "Error"
      * otherwise                -> the generated headline / "Headline generated"

    Args:
        slide_data (dict): Dictionary storing slide metadata, including base64-encoded images.
        brand (str): Brand name for analysis.
        model (str): Chat model used for the request (default: "gpt-4o").
        max_tokens (int): Upper bound on tokens in the response (default: 300).

    Returns:
        dict: Updated slide dictionary with AI-generated headlines.
    """
    # Load environment variables and initialize OpenAI client
    # (the key is read from the OPENAI_API environment variable)
    load_dotenv()
    client = OpenAI(api_key=os.getenv('OPENAI_API'))

    prompt = f'''
    Act as an expert market analyst writing a brand health study for {brand}.
    For this slide:
    1. Analyze key performance metrics and trends
    2. Identify significant competitor movements or market dynamics
    3. Provide strategic implications or recommendations for {brand}

    Synthesize these into a clear, insightful headline that captures the main story and its business impact.
    Focus on actionable insights and quantitative findings when present.
    Keep it limited to 25 words or less and use plain text.
    Capitalize only brand names and proper nouns.
    '''

    # Process only content slides that have a placeholder
    for slide_number, slide in slide_data.items():
        if not slide["content_slide"]:
            slide["slide_headline"] = "HEADER SLIDE"
            slide["status"] = "Skipped (Non-content slide)"
            continue

        if not slide["has_placeholder"]:
            slide["slide_headline"] = "NO TITLE PLACEHOLDER"
            slide["status"] = "Skipped (No title placeholder)"
            continue

        base64_image = slide.get("image_base64", "")

        if not base64_image:
            logging.error(f"Slide {slide_number}: Missing base64 image.")
            slide["slide_headline"] = "Error: Missing slide image"
            slide["status"] = "Error"
            continue

        try:
            # Send request to OpenAI Vision API.
            # The data URL declares image/jpeg, i.e. it assumes the image was
            # encoded as JPEG (the default of generate_slide_images_base64).
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                        ]
                    }
                ],
                max_tokens=max_tokens
            )

            # The message content may be None or blank; an empty headline must
            # not be reported as a success, so route it to the error path below
            headline = (response.choices[0].message.content or "").strip()
            if not headline:
                raise ValueError("Model returned an empty response")

            slide["slide_headline"] = headline
            slide["status"] = "Headline generated"
            logging.info(f"Slide {slide_number}: Headline generated")

        except Exception as e:
            # Best effort per slide: record the failure and keep going so one
            # bad request does not abort the whole deck
            logging.error(f"Slide {slide_number}: Error generating headline: {str(e)}")
            slide["slide_headline"] = "Error: AI processing failed"
            slide["status"] = "Error"

    return slide_data

### Step 4: Insert slide headlines into the PPTX file

In [75]:
import os
import logging
from pptx import Presentation
from pptx.enum.shapes import PP_PLACEHOLDER

def insert_headlines_into_pptx(input_folder: str, output_folder: str, slide_data: dict, save_as_new: bool = True) -> str:
    """
    Inserts AI-generated headlines into the PPTX file and saves it in the output directory.

    For every slide with a usable headline, the text of the first TITLE
    placeholder is replaced. If the slide has no TITLE placeholder but has a
    notes slide, the headline is prepended to the speaker notes instead.
    Slides whose ``slide_headline`` is empty, "HEADER SLIDE",
    "NO TITLE PLACEHOLDER" or starts with "Error:" are left untouched, so
    status markers never end up in the deck.

    Args:
        input_folder (str): Path to the folder containing the input PPTX file.
        output_folder (str): Path to the folder where the modified PPTX file should be saved
            (created if missing).
        slide_data (dict): Dictionary storing slide metadata and headlines, keyed by
            1-based slide number.
        save_as_new (bool): Currently not used; the result is always written to a new
            "<name>_WITH_HEADLINES<ext>" file in ``output_folder``. Kept so existing
            callers keep working.

    Returns:
        str: Path to the saved PowerPoint file.

    Raises:
        FileNotFoundError: If the input folder contains no PPTX file.
        ValueError: If the input folder contains more than one PPTX file.
    """

    logging.info("Starting headline insertion into PowerPoint...")

    # Find the PPTX file in the input folder. The extension is matched
    # case-insensitively, and "~$..." lock files (written by PowerPoint while
    # a deck is open) are ignored.
    pptx_files = [
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.pptx') and not f.startswith('~$')
    ]

    if not pptx_files:
        raise FileNotFoundError("No PPTX file found in the input folder.")
    if len(pptx_files) > 1:
        raise ValueError("Multiple PPTX files found. Please keep only one.")

    pptx_path = os.path.join(input_folder, pptx_files[0])
    presentation = Presentation(pptx_path)

    for slide_number, slide in enumerate(presentation.slides, start=1):
        slide_info = slide_data.get(slide_number, {})
        headline = slide_info.get("slide_headline", "")

        if not headline or headline == "HEADER SLIDE":
            logging.info(f"Slide {slide_number}: Skipped (Header or non-content slide)")
            continue  # Skip non-content slides

        # These values are status markers set by the headline generation step,
        # not real headlines; never write them into the presentation
        if headline == "NO TITLE PLACEHOLDER" or headline.startswith("Error:"):
            logging.warning(f"Slide {slide_number}: Skipped (no usable headline: {headline})")
            continue

        updated = False  # Track if we update a title placeholder

        # Try to update title placeholder
        for shape in slide.shapes:
            if shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.TITLE:
                shape.text = headline  # Replace title text with headline
                updated = True
                logging.info(f"Slide {slide_number}: Title updated with headline.")
                break  # Stop after modifying the first title placeholder

        # If no title placeholder, add headline to speaker notes
        if not updated:
            # A notes slide can exist without a notes text frame
            notes_frame = slide.notes_slide.notes_text_frame if slide.has_notes_slide else None
            if notes_frame is not None:
                existing_notes = notes_frame.text
                # Headline goes first, followed by the notes that were already there
                notes_frame.text = f"SLIDE HEADLINE: {headline}\n\n{existing_notes}"
                logging.info(f"Slide {slide_number}: Headline added to speaker notes.")
            else:
                logging.warning(f"Slide {slide_number}: No title placeholder or speaker notes. Headline not saved.")

    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Define new filename and save in output directory. splitext keeps the
    # original extension (whatever its case) and only touches the final suffix.
    original_filename = os.path.basename(pptx_path)
    stem, extension = os.path.splitext(original_filename)
    new_filename = f"{stem}_WITH_HEADLINES{extension}"
    new_pptx_path = os.path.join(output_folder, new_filename)

    presentation.save(new_pptx_path)
    logging.info(f"PowerPoint file saved with headlines: {new_pptx_path}")

    return new_pptx_path

### Step 5: Run the code

In [76]:

# End-to-end run. This cell relies on the earlier cells having been executed:
# it uses input_dir / output_dir and the four step functions defined above.
# Each step receives the dictionary returned by the previous one.

# Metadata extraction and image generation
# (both the PPTX and the PDF are looked up in input_dir)
slide_metadata = extract_slide_metadata(input_dir)
slide_metadata = generate_slide_images_base64(input_dir, slide_metadata)

# Headline generation
# The brand name is interpolated into the prompt sent with each slide image.
brand = "Heineken"
slide_metadata = generate_headlines_OPENAI(slide_metadata, brand)

# Insert slide headlines into the PPTX file
# modified_pptx holds the path of the saved presentation in output_dir.
modified_pptx = insert_headlines_into_pptx(input_dir, output_dir, slide_metadata, save_as_new=True)


INFO:root:Starting PDF to image conversion...
INFO:root:PDF successfully converted to 17 images.
INFO:root:Slide 4: Image converted and stored as base64.
INFO:root:Slide 5: Image converted and stored as base64.
INFO:root:Slide 6: Image converted and stored as base64.
INFO:root:Slide 7: Image converted and stored as base64.
INFO:root:Slide 8: Image converted and stored as base64.
INFO:root:Slide 9: Image converted and stored as base64.
INFO:root:Slide 10: Image converted and stored as base64.
INFO:root:Slide 11: Image converted and stored as base64.
INFO:root:Slide 13: Image converted and stored as base64.
INFO:root:Slide 14: Image converted and stored as base64.
INFO:root:Slide 15: Image converted and stored as base64.
INFO:root:Slide 16: Image converted and stored as base64.
INFO:root:Slide 17: Image converted and stored as base64.
INFO:root:Base64 images stored successfully in slide metadata.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
I