In [1]:
import json
import base64
import mimetypes
import os
import sys
from dotenv import load_dotenv
load_dotenv()

from langchain_core.runnables import RunnableLambda
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import BaseOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

# Put the parent of the current working directory at the front of sys.path so
# that the project-local `src` package (imported in a later cell) resolves.
sys.path.insert(0, os.path.abspath(".."))

In [2]:
# System prompt template. It is rendered with str.format() in build_messages,
# so `{format_instructions}` is a placeholder; any literal braces added to this
# template itself would need to be doubled ("{{" / "}}").
SYSTEM_PROMPT = (
    "You are a product analyst extracting structured app information from a demo "
    "video. Use only what is shown or said without assumptions. "
    "{format_instructions}"
)

# User-turn instruction sent alongside the video content block.
# NOTE(review): timestamps are requested as HH:MM:SS, but a recorded run of this
# notebook returned MM:SS-style values (e.g. '00:11'), so the format is not
# guaranteed by the prompt alone.
HUMAN_PROMPT = (
    "Analyze the demo video and extract the app details and the features discussed. "
    "Include start_timestamp and end_timestamp for each feature in HH:MM:SS format. "
    "Use empty strings when timestamps are not clear."
)


In [None]:
from dataclasses import dataclass

from src.schemas.product import _extract_json, _coerce_str


@dataclass
class AppInfo:
    """App-level details extracted from a demo video.

    Instances are built by ``AppFeaturesParser.parse`` from the ``app`` object
    of the model's JSON reply.
    """

    id: str  # identifier for the app, as returned by the model
    name: str  # display name of the app
    description: str  # free-text summary of the app


@dataclass
class FeatureInfo:
    """A single feature demonstrated in the video, with its time span.

    Instances are built by ``AppFeaturesParser.parse`` from each object in the
    ``features`` list of the model's JSON reply.
    """

    id: str  # identifier for the feature, as returned by the model
    name: str  # display name of the feature
    description: str  # free-text summary of what the feature does
    # Start/end of the segment covering this feature. The prompt asks for
    # HH:MM:SS (or an empty string when unclear); no format validation is
    # visible in this notebook — TODO confirm whether _coerce_str enforces one.
    start_timestamp: str
    end_timestamp: str


@dataclass
class ExtractionResult:
    """Parsed output of the extraction chain: one app plus its features."""

    app: AppInfo  # app-level details
    features: list[FeatureInfo]  # features in the order the model listed them


class AppFeaturesParser(BaseOutputParser[ExtractionResult]):
    """Output parser that turns the model's JSON reply into an ``ExtractionResult``.

    The parser is lenient about the *contents* of the reply (missing keys and
    wrongly-typed ``app`` / ``features`` values degrade to empty data) but
    strict about its overall shape: the reply must decode to a JSON object.
    """

    def get_format_instructions(self) -> str:
        """Return the JSON-shape instructions to embed in the system prompt."""
        return (
            "Return valid JSON with keys: app, features. "
            "app: {id, name, description}. "
            "features: list of feature objects. "
            "Each feature has id, name, description, start_timestamp, end_timestamp."
        )

    def parse(self, text: str) -> ExtractionResult:
        """Parse raw model text into an ``ExtractionResult``.

        Args:
            text: Raw text returned by the model. It is passed through
                ``_extract_json`` before decoding (presumably to isolate the
                JSON document from surrounding text — confirm against
                ``src.schemas.product``).

        Returns:
            An ``ExtractionResult``. A missing or non-object ``app`` yields an
            ``AppInfo`` built from absent values; a missing or non-list
            ``features`` yields an empty list; non-object feature entries are
            skipped.

        Raises:
            ValueError: If the decoded JSON is not an object. Malformed JSON
                raises ``json.JSONDecodeError`` (a ``ValueError`` subclass)
                from ``json.loads``.
        """
        payload = json.loads(_extract_json(text))
        if not isinstance(payload, dict):
            # Without this guard a JSON array/scalar reply would fail later
            # with an opaque AttributeError on ``.get``.
            raise ValueError(
                "Expected a JSON object with keys 'app' and 'features', "
                f"got {type(payload).__name__}"
            )

        # Treat a wrongly-typed ``app`` the same way ``features`` is treated:
        # ignore it rather than crash on ``.get``.
        app_payload = payload.get("app")
        if not isinstance(app_payload, dict):
            app_payload = {}
        app = AppInfo(
            id=_coerce_str(app_payload.get("id")),
            name=_coerce_str(app_payload.get("name")),
            description=_coerce_str(app_payload.get("description")),
        )

        features_payload = payload.get("features")
        if not isinstance(features_payload, list):
            features_payload = []
        features: list[FeatureInfo] = [
            FeatureInfo(
                id=_coerce_str(item.get("id")),
                name=_coerce_str(item.get("name")),
                description=_coerce_str(item.get("description")),
                start_timestamp=_coerce_str(item.get("start_timestamp")),
                end_timestamp=_coerce_str(item.get("end_timestamp")),
            )
            for item in features_payload
            if isinstance(item, dict)  # skip stray strings/nulls in the list
        ]

        return ExtractionResult(app=app, features=features)


# Shared parser instance: supplies the format instructions for the system
# prompt and is the final step of the chain.
parser = AppFeaturesParser()

In [4]:
# Chat model used for extraction; temperature=0 to keep sampling as
# deterministic as the backend allows.
# NOTE(review): credentials are presumably read from the environment populated
# by load_dotenv() in the first cell — confirm the expected variable name.
model = ChatGoogleGenerativeAI(
    model="gemini-3-pro-preview",
    temperature=0,
)


def _guess_video_mime_type(video_path: str) -> str:
    mime_type, _ = mimetypes.guess_type(video_path)
    return mime_type or "video/mp4"


def _load_video_base64(video_path: str) -> tuple[str, str]:
    """Read a video file and return ``(mime_type, base64_text)``.

    The whole file is read into memory and encoded as base64 text; the MIME
    type comes from ``_guess_video_mime_type``.
    """
    detected_mime = _guess_video_mime_type(video_path)
    with open(video_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_text = base64.b64encode(raw_bytes).decode("utf-8")
    return detected_mime, encoded_text


def build_messages(inputs: dict[str, str]):
    """Build the system + human messages for one video extraction request.

    Args:
        inputs: Mapping that must contain ``"video_path"``, the path of the
            video file to analyse.

    Returns:
        A two-item list: a ``SystemMessage`` holding ``SYSTEM_PROMPT`` rendered
        with the parser's format instructions, and a ``HumanMessage`` whose
        content is the text instruction followed by a base64 file block
        carrying the video.
    """
    clip_path = inputs["video_path"]
    print('video_path  =', clip_path)
    clip_mime, clip_b64 = _load_video_base64(clip_path)

    rendered_system_prompt = SYSTEM_PROMPT.format(
        format_instructions=parser.get_format_instructions()
    )
    system_message = SystemMessage(content=rendered_system_prompt)

    instruction_block = {"type": "text", "text": HUMAN_PROMPT}
    video_block = {
        "type": "file",
        "source_type": "base64",
        "mime_type": clip_mime,
        "data": clip_b64,
    }
    human_message = HumanMessage(content=[instruction_block, video_block])

    return [system_message, human_message]


# Pipeline: {"video_path": ...} -> prompt messages -> chat model -> ExtractionResult.
chain = RunnableLambda(build_messages) | model | parser

In [5]:
# Run the extraction on a sample clip; the path is relative to the notebook's
# working directory.
video_path = "../data/clips/send_receive_mails.mp4"
result = chain.invoke({"video_path": video_path})

video_path  = ../data/clips/send_receive_mails.mp4


In [6]:
# Display the parsed ExtractionResult from the previous cell.
result

ExtractionResult(app=AppInfo(id='zapmail', name='Zapmail', description='An email client designed for sending and receiving emails with productivity features like draft stacking.'), features=[FeatureInfo(id='compose-and-send-email', name='Compose and Send Email', description="Users can send emails by clicking the compose button, entering the recipient's address, adding a subject, and writing the message.", start_timestamp='00:11', end_timestamp='00:22'), FeatureInfo(id='stack-drafts', name='Stack Drafts', description='Users can minimize and stack their current email draft to a sidebar, allowing them to view and reference other emails while writing.', start_timestamp='00:23', end_timestamp='00:30')])