In [84]:
# Imports
import os
from pdf2image import convert_from_path
import logging
from io import BytesIO
import shutil
from pathlib import Path
from pptx import Presentation
import base64
from openai import OpenAI
from dotenv import load_dotenv
from typing import List

# Input/output folders for the pipeline. The defaults keep the original local
# paths; set INSIGHTGEN_INPUT_DIR / INSIGHTGEN_OUTPUT_DIR in the environment to
# run the notebook on another machine without editing this cell.
input_dir = os.environ.get(
    "INSIGHTGEN_INPUT_DIR",
    "/Users/sumitkamra/code/sumitkamra20/insightgen/data/input",
)
output_dir = os.environ.get(
    "INSIGHTGEN_OUTPUT_DIR",
    "/Users/sumitkamra/code/sumitkamra20/insightgen/data/output",
)

# Configure logging to display INFO level logs in Jupyter Notebook.
# force=True replaces any handlers already attached to the root logger, so
# re-running this cell does not leave a stale configuration in place.
logging.basicConfig(level=logging.INFO, force=True)

# Test it
logging.info("Logging is now visible in Jupyter Notebook!")


INFO:root:Logging is now visible in Jupyter Notebook!


### Step 1: Extract slide metadata and initialize a dictionary to store slide metadata

In [90]:
# Extract slide metadata
import os
from pptx import Presentation
from pptx.enum.shapes import PP_PLACEHOLDER

def extract_slide_metadata(input_folder: str) -> dict:
    """
    Extracts metadata from each slide in a PPTX file.

    Exactly one presentation must be present in ``input_folder``. The
    extension match is case-insensitive, and PowerPoint lock files (names
    starting with ``~$``, created while a deck is open) are ignored so they
    are neither counted as a second deck nor opened as one.

    Args:
        input_folder (str): Path to the folder containing the PPTX file.

    Returns:
        dict: Maps the 1-based slide number to a dict with the keys
              ``layout`` (layout name), ``content_slide`` (False when the layout
              name starts with "HEADER"), ``has_placeholder`` (True when the
              slide has a TITLE placeholder), and the empty string fields
              ``key_observations``, ``slide_headline`` and ``speaker_notes``
              that later steps fill in.

    Raises:
        FileNotFoundError: If no PPTX file is found in ``input_folder``.
        ValueError: If more than one PPTX file is found.
    """
    # Find the PPTX file in the input folder (skip "~$name.pptx" lock files)
    pptx_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.pptx') and not f.startswith('~$')
    )

    if not pptx_files:
        raise FileNotFoundError("No PPTX file found in the input folder.")
    if len(pptx_files) > 1:
        raise ValueError("Multiple PPTX files found. Please keep only one.")

    pptx_path = os.path.join(input_folder, pptx_files[0])
    presentation = Presentation(pptx_path)

    slide_data = {}

    # Iterate through slides and extract metadata
    for slide_number, slide in enumerate(presentation.slides, start=1):
        layout_name = slide.slide_layout.name  # Extract layout name

        # Mark slide as non-content if its layout name starts with "HEADER" (case-insensitive)
        content_slide = not layout_name.upper().startswith("HEADER")

        # Check if a title placeholder exists (if any shape is a placeholder of type TITLE)
        has_placeholder = any(
            shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.TITLE
            for shape in slide.shapes
        )

        # Initialize empty fields for later functions to fill
        slide_data[slide_number] = {
            "layout": layout_name,
            "content_slide": content_slide,
            "has_placeholder": has_placeholder,
            "key_observations": "",
            "slide_headline": "",
            "speaker_notes": "",
        }

    return slide_data


### Step 2: Create slide images and store in slide metadata dictionary

In [71]:
import os
import logging
import base64
from io import BytesIO
from pdf2image import convert_from_path

def generate_slide_images_base64(input_folder: str, slide_data: dict, img_format="JPEG", dpi=200) -> dict:
    """
    Converts PDF slides to images, encodes them in base64, and updates the slide_data dictionary.
    Excludes non-content slides (e.g., Header or Divider) from image processing.

    PDF page N is matched to ``slide_data[N]`` (1-based). Pages that have no
    entry in ``slide_data`` (for example when the PDF has more pages than the
    deck has slides) are logged and skipped instead of raising ``KeyError``.

    Args:
        input_folder (str): Directory containing input PDF and PPTX files.
        slide_data (dict): Dictionary storing slide metadata, keyed by slide number.
        img_format (str): Image format (default: JPEG).
        dpi (int): Resolution for image conversion.

    Returns:
        dict: The same ``slide_data`` object, updated in place. Content slides
              gain ``image_base64`` and ``status``; non-content slides gain only
              ``status``. Returned unchanged if zero or several PDFs are found.

    Raises:
        ValueError: If ``input_folder`` does not exist.
    """

    logging.info("Starting PDF to image conversion...")

    # Validate input directory
    if not os.path.exists(input_folder):
        raise ValueError(f"Input directory does not exist: {input_folder}")

    # Find PDF file in the input folder (extension match is case-insensitive)
    pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

    if not pdf_files:
        logging.error("No PDF file found in the input folder.")
        return slide_data

    if len(pdf_files) > 1:
        logging.error("Multiple PDF files found. Please keep only one.")
        return slide_data

    pdf_path = os.path.join(input_folder, pdf_files[0])

    # Convert PDF to images (in-memory)
    images = convert_from_path(pdf_path, dpi=dpi)

    logging.info(f"PDF successfully converted to {len(images)} images.")

    # Process only content slides; slides and PDF pages are assumed to match 1:1
    for slide_number, image in enumerate(images, start=1):
        slide_entry = slide_data.get(slide_number)

        # A page without metadata cannot be classified, so there is nothing to update.
        if slide_entry is None:
            logging.warning(f"PDF page {slide_number}: No matching slide metadata; page skipped.")
            continue

        # Skip non-content slides
        if not slide_entry["content_slide"]:
            slide_entry["status"] = "Skipped (Non-content slide)"
            continue

        # Convert image to base64 (in-memory)
        img_buffer = BytesIO()
        image.save(img_buffer, format=img_format)
        base64_image = base64.b64encode(img_buffer.getvalue()).decode('utf-8')

        # Store base64 image in slide_data dictionary
        slide_entry["image_base64"] = base64_image
        slide_entry["status"] = "Image processed"

        logging.info(f"Slide {slide_number}: Image converted and stored as base64.")

    logging.info("Base64 images stored successfully in slide metadata.")

    return slide_data

### Step 3: Use Vision API and Assistant API for observations and headlines

- Step 1: Generate observations and store in slide metadata
- Step 2: Generate headlines through observations and store in slide metadata



In [82]:
# Global prompt variables (to be moved to a separate file later if desired)

# System prompt for the per-slide observation step. It is sent verbatim as the
# "system" message of the chat completion that also receives the slide image,
# so any edit to this text changes model behavior.
OBSERVATIONS_SYSTEM_PROMPT = """
You are an AI assistant specialized in analyzing market research report slides.

### What You Will Be Provided With:
- You will receive an image of a slide that may contain data, charts, and insights from a market research study conducted by Kantar.
- The **specific `<category>` and `<market>` will be provided by the user in their prompt.**
- The image might include data for different brands across various timeframes, or data for the category or user profiles, etc.
- **You might see Kantar and client brand logos at the very bottom of the slide. You can ignore them.**

### Instructions:
1. **Identify the Slide Topic:**
   - Determine what the slide is about based on the label above the charts.

2. **Analyze the Data Thoroughly:**
   - Carefully **read all the data** and identify notable trends or patterns.
   - If the data covers **multiple timeframes**, you **must always** analyze both:
     - **Short-term movement** (latest period vs. the preceding period).
     - **Long-term movement** (current period vs. the first period available on the slide).
     - Ensure both movements are included in your response when applicable. **If either movement is missing, your response is incomplete.**
   - If the slide **does not** include a timeframe:
     - Compare differences between brands or other labels visible in the slide.

3. **Ensure Objectivity and Completeness:**
   - **Base your response solely on the factual information** available in the slide.
   - Do **not** add any information beyond what is provided in the image.
   - **Always check that both short-term and long-term trends are covered if timeframes exist.**
"""

# Instruction template for the headline assistant. It is rendered with
# str.format(additional_system_instructions=...) in
# generate_observations_and_headlines, so {additional_system_instructions} is the
# only placeholder; any literal curly brace added to this text must be doubled
# ("{{" / "}}") or the format call will fail.
HEADLINE_SYSTEM_INSTRUCTIONS = """
You are an AI assistant specialized in **creating headlines** for Brand Health Tracking reports.

### **Task:**
You are given a textual description of data in a **brand health tracking slide**.
Your job is to **generate a concise headline** that summarizes the main idea with an **implication** for:
- **Client brands vs. competitors**, OR
- **The category**, OR
- **The market** *(depending on the slide content).*

#### **Headline Requirements:**
- **Length:** 30-50 words.
- **Avoid precise numeric figures.**
- **Plain text only** (no markdown symbols).
- **Use sentence case.**

### **Additional User Instructions (if provided):**
{additional_system_instructions}

### **You Understand:**
#### **1. Make Connections Between Measures and Brands**
- **Where possible, establish causal links between metrics.**
  - Example: **"Brand Power decline is driven by a drop in Meaningful connection and lower Salience."**
  - Example: **"A drop in Salience is reflected in weaker TOM awareness."**
  - Example: **"Tiger’s decline in Meaningful connection has strengthened rivals like 333 and Saigon Beer."**

#### **2. Identify Cross-Slide Insights Where Relevant**
- **Slides might be connected**—trends in one slide may explain or be reflected in another.
- If applicable, **link insights across slides** to **explain the cause-effect relationship.**
  - Example: **"Brand Power drop among young consumers aligns with weaker endorsement on key brand imageries."**

#### **3. Kantar’s Brand Power Framework**
- **Brand Power**: Core brand equity metric driven by:
  - **Meaningful** – Strength of emotional & functional connections.
  - **Difference** – Uniqueness and competitive edge.
  - **Salience** – How quickly a brand comes to mind in the category, influenced by availability, advertising, and usage.

#### **4. Other Key Metrics**
- **Brand Image**: Consumer endorsement on brand perceptions, often explaining shifts in Meaningful & Difference.
- **Trial & Regular Usage**: Trial is more relevant for smaller brands; regular usage reflects **consumer loyalty**.
- **BUMO (Brand Used Most Often)**: Measures **loyal usage base**—critical for big brands.

#### **5. Brand Sets & Context**
- The user prompt will specify **which brands are client brands and which are competitors**.
"""


In [102]:
import os
import logging
from dotenv import load_dotenv
import openai
from openai import OpenAI

def generate_observations_and_headlines(
    slide_data: dict,
    user_prompt: str,
    additional_system_instructions: str = ""
) -> dict:
    """
    Generates observations and a headline for every content slide.

    1) Generates textual observations for each content slide using a chat
       completion call with OBSERVATIONS_SYSTEM_PROMPT and the slide image.
    2) Feeds those observations to a headline Assistant (Assistants API) on a
       fresh thread per slide and stores the assistant's reply as the headline.

    A single Assistant is created lazily on the first slide that needs it and is
    reused for the remaining slides. The Assistant and each per-slide thread are
    deleted when no longer needed, so repeated runs do not accumulate remote
    objects in the OpenAI account.

    Args:
        slide_data (dict): Slide metadata keyed by slide number. Each entry is
            read for ``content_slide`` and ``image_base64``.
        user_prompt (str): Context appended to the observation request for every slide.
        additional_system_instructions (str): Extra text substituted into
            HEADLINE_SYSTEM_INSTRUCTIONS.

    Returns:
        dict: The same ``slide_data`` object, updated in place with
              ``slide_observations``, ``slide_headline`` and ``status`` per slide.
              Per-slide API failures are logged and recorded in those fields
              rather than raised.

    Raises:
        ValueError: If the OPENAI_API environment variable is not set.
    """
    load_dotenv()
    openai_api_key = os.getenv('OPENAI_API')
    if not openai_api_key:
        raise ValueError("Missing OPENAI_API key in environment variables.")

    # Initialize the OpenAI client (new format)
    client = OpenAI(api_key=openai_api_key)

    # Format the headline system instructions with any additional instructions provided.
    formatted_headline_instructions = HEADLINE_SYSTEM_INSTRUCTIONS.format(
        additional_system_instructions=additional_system_instructions
    )

    # Created on first use so that a run with no processable slides makes no
    # Assistants API call, and a creation failure is reported per slide.
    headline_assistant = None

    try:
        for slide_number, slide in slide_data.items():
            # Process only content slides
            if not slide.get("content_slide"):
                slide["slide_observations"] = ""
                slide["slide_headline"] = "HEADER SLIDE"
                slide["status"] = "Skipped (Non-content slide)"
                continue

            base64_image = slide.get("image_base64", "")
            if not base64_image:
                logging.error(f"Slide {slide_number}: Missing base64 image.")
                slide["slide_observations"] = ""
                slide["slide_headline"] = "Error: Missing slide image"
                slide["status"] = "Error"
                continue

            # ---------------------------
            # STEP 1: Generate Observations via ChatCompletion (updated format)
            # ---------------------------
            try:
                obs_response = client.chat.completions.create(
                    model="gpt-4o",
                    temperature=0.6,
                    max_tokens=4000,
                    messages=[
                        {"role": "system", "content": OBSERVATIONS_SYSTEM_PROMPT},
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": f"(Slide {slide_number}) {user_prompt}"},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{base64_image}",
                                        "detail": "high"
                                    }
                                }
                            ]
                        }
                    ]
                )
                observations_text = obs_response.choices[0].message.content.strip()
                slide["slide_observations"] = observations_text
            except Exception as e:
                logging.error(f"Slide {slide_number}: Error generating observations: {str(e)}")
                slide["slide_observations"] = "Error in observations generation"
                slide["slide_headline"] = ""
                slide["status"] = "Error"
                continue

            # ---------------------------
            # STEP 2: Generate Headline via Assistants API
            # ---------------------------
            thread = None
            try:
                if headline_assistant is None:
                    headline_assistant = client.beta.assistants.create(
                        name="Headline Assistant API",
                        instructions=formatted_headline_instructions,
                        model="gpt-4o",
                        temperature=0.7
                    )

                # One thread per slide keeps each headline request independent.
                thread = client.beta.threads.create()

                # Modify the message format to be more direct
                user_headline_message = f"Generate a headline based on these observations:\n{observations_text}"

                client.beta.threads.messages.create(
                    thread_id=thread.id,
                    role="user",
                    content=user_headline_message
                )

                run = client.beta.threads.runs.create_and_poll(
                    thread_id=thread.id,
                    assistant_id=headline_assistant.id
                )

                if run.status == "completed":
                    messages = client.beta.threads.messages.list(thread_id=thread.id)
                    final_assistant_msg = messages.data[0]  # Get the most recent message

                    # Extract only the assistant's response, removing any echoed input
                    headline = final_assistant_msg.content[0].text.value.strip()

                    # Clean up the headline by removing any "Assistant:" prefix and extra whitespace
                    headline = headline.replace("Assistant:", "").strip()

                    slide["slide_headline"] = headline
                    slide["status"] = "Headline generated"
                else:
                    slide["slide_headline"] = f"Error: Run status: {run.status}"
                    slide["status"] = f"Run incomplete ({run.status})"

            except Exception as e:
                logging.error(f"Slide {slide_number}: Error generating headline: {str(e)}")
                slide["slide_headline"] = "Error in headline generation"
                slide["status"] = "Error"
            finally:
                # Best-effort cleanup: a failed delete must not change the slide result.
                if thread is not None:
                    try:
                        client.beta.threads.delete(thread.id)
                    except Exception as e:
                        logging.warning(f"Slide {slide_number}: Could not delete thread: {str(e)}")
    finally:
        # Remove the shared assistant even if the loop was interrupted.
        if headline_assistant is not None:
            try:
                client.beta.assistants.delete(headline_assistant.id)
            except Exception as e:
                logging.warning(f"Could not delete headline assistant: {str(e)}")

    return slide_data

### Step 4: Insert slide headlines and observations into the PPTX file

In [99]:
import os
import logging
from pptx import Presentation
from pptx.enum.shapes import PP_PLACEHOLDER

def insert_headlines_into_pptx(input_folder: str, output_folder: str, slide_data: dict, save_as_new: bool = True):
    """
    Inserts AI-generated headlines into slide title placeholders and observations into speaker notes.

    Slides with an empty headline or the "HEADER SLIDE" marker are left untouched.
    Slides whose ``status`` marks a failed headline ("Error" or "Run incomplete ...")
    keep their existing title, so error placeholder text never ends up in the deck;
    any observations recorded for them are still written to the speaker notes.

    Args:
        input_folder (str): Path to the folder containing the input PPTX file.
        output_folder (str): Path to the folder where the modified PPTX file should be saved.
        slide_data (dict): Dictionary storing slide metadata, headlines, and observations,
            keyed by 1-based slide number.
        save_as_new (bool): Accepted for backward compatibility; currently ignored.
            The result is always saved as "<name>_WITH_HEADLINES<ext>" in ``output_folder``.

    Returns:
        str: Path to the saved PowerPoint file.

    Raises:
        FileNotFoundError: If no PPTX file is found in ``input_folder``.
        ValueError: If more than one PPTX file is found.
    """
    logging.info("Starting headline and observations insertion into PowerPoint...")

    # Case-insensitive extension match; "~$name.pptx" lock files are not decks.
    pptx_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.pptx') and not f.startswith('~$')
    )
    if not pptx_files:
        raise FileNotFoundError("No PPTX file found in the input folder.")
    if len(pptx_files) > 1:
        raise ValueError("Multiple PPTX files found. Please keep only one.")

    pptx_path = os.path.join(input_folder, pptx_files[0])
    presentation = Presentation(pptx_path)

    for slide_number, slide in enumerate(presentation.slides, start=1):
        slide_info = slide_data.get(slide_number, {})
        headline = slide_info.get("slide_headline", "")
        observations = slide_info.get("slide_observations", "")
        status = str(slide_info.get("status", ""))

        if not headline or headline == "HEADER SLIDE":
            logging.info(f"Slide {slide_number}: Skipped (Header or non-content slide)")
            continue

        # In these states the headline field holds an error placeholder, not a headline.
        headline_failed = status == "Error" or status.startswith("Run incomplete")

        if headline_failed:
            logging.warning(f"Slide {slide_number}: Headline generation failed ({status}); title left unchanged.")
        else:
            # Update title placeholder with headline
            title_updated = False
            for shape in slide.shapes:
                if shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.TITLE:
                    shape.text = headline
                    title_updated = True
                    logging.info(f"Slide {slide_number}: Title updated with headline.")
                    break

            if not title_updated:
                logging.warning(f"Slide {slide_number}: No title placeholder found for headline.")

        # Add observations to speaker notes
        if observations:
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is None:
                logging.warning(f"Slide {slide_number}: Notes slide has no text frame; observations not added.")
            else:
                notes_frame.text = observations
                logging.info(f"Slide {slide_number}: Observations added to speaker notes.")

    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Save the modified presentation. splitext keeps the original extension and
    # only touches the end of the name, whatever its case.
    original_filename = os.path.basename(pptx_path)
    stem, extension = os.path.splitext(original_filename)
    new_filename = f"{stem}_WITH_HEADLINES{extension}"
    new_pptx_path = os.path.join(output_folder, new_filename)

    presentation.save(new_pptx_path)
    logging.info(f"PowerPoint file saved with headlines and observations: {new_pptx_path}")

    return new_pptx_path

### Step 5: Running the code with the two-step process


In [104]:
# Metadata extraction and image generation
# Both steps read from input_dir: the first opens the single PPTX there, the
# second the single PDF. The image step updates the metadata dict and returns it.
slide_metadata = extract_slide_metadata(input_dir)
slide_metadata = generate_slide_images_base64(input_dir, slide_metadata)

# Headline generation (using the new two-step function)
# user_prompt is sent with every slide's observation request; it names the
# market and which brands are client brands vs. competitors.
user_prompt = """
Market: Vietnam,
Client brands: Heineken, Tiger, Bia Viet, Larue, Bivina
Competitors: 333, Saigon Beer, Hanoi Beer

"""
# NOTE: this call makes OpenAI API requests for each content slide (network, billed).
slide_metadata = generate_observations_and_headlines(slide_metadata, user_prompt)

# Insert slide headlines into the PPTX file
# Returns the path of the saved copy written to output_dir.
modified_pptx = insert_headlines_into_pptx(input_dir, output_dir, slide_metadata, save_as_new=True)

INFO:root:Starting PDF to image conversion...
INFO:root:PDF successfully converted to 10 images.
INFO:root:Slide 3: Image converted and stored as base64.
INFO:root:Slide 5: Image converted and stored as base64.
INFO:root:Slide 6: Image converted and stored as base64.
INFO:root:Slide 7: Image converted and stored as base64.
INFO:root:Slide 8: Image converted and stored as base64.
INFO:root:Slide 9: Image converted and stored as base64.
INFO:root:Slide 10: Image converted and stored as base64.
INFO:root:Base64 images stored successfully in slide metadata.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/assistants "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_6foZpeFUNo8USwROc5v24WGC/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/thr