In [29]:
from langgraph.graph import END, START, StateGraph
from langgraph.types import Send, interrupt, Command
from typing import TypedDict
import subprocess
from openai import OpenAI
import textwrap
from langchain.chat_models import init_chat_model
from typing_extensions import Annotated
import operator
import base64
from langgraph.checkpoint.memory import InMemorySaver

memory = InMemorySaver()

llm = init_chat_model("openai:gpt-4o-mini")


class State(TypedDict):

    video_file: str
    audio_file: str
    transcription: str
    summaries: Annotated[list[str], operator.add]
    thumbnail_prompts: Annotated[list[str], operator.add]
    thumbnail_sketches: Annotated[list[str], operator.add]
    final_summary: str
    user_feedback: str
    chosen_prompt: str

In [38]:
def extract_audio(state: State):
    output_file = state["video_file"].replace("mp4", "mp3")
    command = [
        "ffmpeg",
        "-i",
        state["video_file"],
        "-filter:a",
        "atempo=2.0",
        "-y",
        output_file,
    ]
    subprocess.run(command)
    return {
        "audio_file": output_file,
    }


def transcribe_audio(state: State):
    client = OpenAI()
    with open(state["audio_file"], "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            response_format="text",
            file=audio_file,
            language="ko",
            prompt="생수,손잡이,거름망,보리,알갱이,보리차",
        )
        return {
            "transcription": transcription,
        }

In [39]:
def dispatch_summarizers(state: State):
    transcription = state["transcription"]
    chunks = []
    for i, chunk in enumerate(textwrap.wrap(transcription, 500)):
        chunks.append({"id": i + 1, "chunk": chunk})
    return [Send("summarize_chunk", chunk) for chunk in chunks]


def summarize_chunk(chunk):
    chunk_id = chunk["id"]
    chunk = chunk["chunk"]

    response = llm.invoke(
        f"""
        Please summarize the following text.

        Text: {chunk}
        """
    )
    summary = f"[Chunk {chunk_id}] {response.content}"
    return {
        "summaries": [summary],
    }

In [40]:
def mega_summary(state: State):

    all_summaries = "\n".join(state["summaries"])

    prompt = f"""
        You are given multiple summaries of different chunks from a video transcription.

        Please create a comprehensive final summary that combines all the key points.

        Individual summaries:

        {all_summaries}
    """

    response = llm.invoke(prompt)

    return {
        "final_summary": response.content,
    }


def dispatch_artists(state: State):
    return [
        Send(
            "generate_thumbnails",
            {
                "id": i,
                "summary": state["final_summary"],
            },
        )
        for i in [1, 2, 3, 4, 5]
    ]


def generate_thumbnails(args):
    concept_id = args["id"]
    summary = args["summary"]

    prompt = f"""
    Based on this video summary, create a detailed visual prompt for a YouTube thumbnail.

    Create a detailed prompt for generating a thumbnail image that would attract viewers. Include:
        - Main visual elements
        - Color scheme
        - Text overlay suggestions (Korean Text)
        - Overall composition
    
    Summary: {summary}
    """

    response = llm.invoke(prompt)

    thumbnail_prompt = response.content

    client = OpenAI()

    result = client.images.generate(
        model="gpt-image-1",
        prompt=thumbnail_prompt,
        quality="low",
        moderation="low",
        size="auto",
    )

    image_bytes = base64.b64decode(result.data[0].b64_json)

    filename = f"thumbnail_{concept_id}.jpg"

    with open(filename, "wb") as file:
        file.write(image_bytes)

    return {
        "thumbnail_prompts": [thumbnail_prompt],
        "thumbnail_sketches": [filename],
    }

In [41]:
def human_feedback(state: State):
    answer = interrupt(
        {
            "chosen_thumbnail": "Which thumbnail do you like the most?",
            "feedback": "Provide any feedback or changes you'd like for the final thumbnail.",
        }
    )
    user_feedback = answer["user_feedback"]
    chosen_prompt = answer["chosen_prompt"]
    return {
        "user_feedback": user_feedback,
        "chosen_prompt": state["thumbnail_prompts"][chosen_prompt - 1],
    }


def generate_hd_thumbnail(state: State):

    chosen_prompt = state["chosen_prompt"]
    user_feedback = state["user_feedback"]

    prompt = f"""
    You are a professional YouTube thumbnail designer. Take this original thumbnail prompt and create an enhanced version that incorporates the user's specific feedback.

    ORIGINAL PROMPT:
    {chosen_prompt}

    USER FEEDBACK TO INCORPORATE:
    {user_feedback}

    Create an enhanced prompt that:
        1. Maintains the core concept from the original prompt
        2. Specifically addresses and implements the user's feedback requests
        3. Adds professional YouTube thumbnail specifications:
            - High contrast and bold visual elements
            - Clear focal points that draw the eye
            - Professional lighting and composition
            - Optimal text placement and readability with generous padding from edges
            - Colors that pop and grab attention
            - Elements that work well at small thumbnail sizes
            - IMPORTANT: Always ensure adequate white space/padding between any text and the image borders
    """

    response = llm.invoke(prompt)

    final_thumbnail_prompt = response.content

    client = OpenAI()

    result = client.images.generate(
        model="gpt-image-1",
        prompt=final_thumbnail_prompt,
        quality="high",
        moderation="low",
        size="auto",
    )

    image_bytes = base64.b64decode(result.data[0].b64_json)

    with open("thumbnail_final.jpg", "wb") as file:
        file.write(image_bytes)

In [42]:
graph_builder = StateGraph(State)

graph_builder.add_node("extract_audio", extract_audio)
graph_builder.add_node("transcribe_audio", transcribe_audio)
graph_builder.add_node("summarize_chunk", summarize_chunk)
graph_builder.add_node("mega_summary", mega_summary)
graph_builder.add_node("generate_thumbnails", generate_thumbnails)
graph_builder.add_node("human_feedback", human_feedback)
graph_builder.add_node("generate_hd_thumbnail", generate_hd_thumbnail)

graph_builder.add_edge(START, "extract_audio")
graph_builder.add_edge("extract_audio", "transcribe_audio")
graph_builder.add_conditional_edges(
    "transcribe_audio", dispatch_summarizers, ["summarize_chunk"]
)
graph_builder.add_edge("summarize_chunk", "mega_summary")
graph_builder.add_conditional_edges(
    "mega_summary", dispatch_artists, ["generate_thumbnails"]
)
graph_builder.add_edge("generate_thumbnails", "human_feedback")
graph_builder.add_edge("human_feedback", "generate_hd_thumbnail")
graph_builder.add_edge("generate_hd_thumbnail", END)

graph = graph_builder.compile(checkpointer=memory)

In [43]:
config = {
    "configurable": {
        "thread_id": "1",
    },
}

In [45]:
graph.invoke(
    {"video_file": "video.mp4"},
    config=config,
)

ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

{'video_file': 'video.mp4',
 'audio_file': 'video.mp3',
 'transcription': '이제 저희 실생수는 이손잡이로 편하게 따라요. 물 따를 때도 편리하고 무거워서 물리는 일도 없어요. 거름망도 있어서 보리알갱이만 있으면 끝이 필요없이 보리차까지 만들어줘서 추천해요.\n',
 'summaries': ['[Chunk 1] 저희 실생수는 편리한 손잡이로 쉽게 사용할 수 있으며, 물 따르기에도 편하고 무겁지 않아 안전합니다. 거름망이 있어 보리차를 쉽게 만들 수 있어 추천합니다.',
  '[Chunk 1] 저희 실생수는 이손잡이로 사용이 편리하고, 물 따를 때 무거워서 물이 쏟아지지 않습니다. 거름망이 있어 보리알갱이를 사용하면 보리차도 쉽게 만들 수 있어 추천합니다.',
  '[Chunk 1] 이 실생수는 이손잡이로 사용이 편리하고, 무거워서 물이 쏟아지는 일이 없으며, 거름망이 있어서 보리차를 쉽게 만들 수 있어 추천합니다.'],
 'thumbnail_prompts': ['### YouTube Thumbnail Visual Prompt\n\n#### Main Visual Elements:\n1. **Central Image**: A clear, close-up shot of the practical water dispenser. The dispenser should be depicted at an angle that showcases its convenient handle and lightweight design. Include the filter prominently in the shot, perhaps even showing some barley tea brewing inside the dispenser.\n  \n2. **Background**: A bright, refreshing background that implies cleanliness and hydration. Consider using image

In [46]:
response = {
    "user_feedback": "Give it a more vibrant color palette and make the text more accurate.",
    "chosen_prompt": 2,
}

graph.invoke(
    Command(resume=response),
    config=config,
)

{'video_file': 'video.mp4',
 'audio_file': 'video.mp3',
 'transcription': '이제 저희 실생수는 이손잡이로 편하게 따라요. 물 따를 때도 편리하고 무거워서 물리는 일도 없어요. 거름망도 있어서 보리알갱이만 있으면 끝이 필요없이 보리차까지 만들어줘서 추천해요.\n',
 'summaries': ['[Chunk 1] 저희 실생수는 편리한 손잡이로 쉽게 사용할 수 있으며, 물 따르기에도 편하고 무겁지 않아 안전합니다. 거름망이 있어 보리차를 쉽게 만들 수 있어 추천합니다.',
  '[Chunk 1] 저희 실생수는 이손잡이로 사용이 편리하고, 물 따를 때 무거워서 물이 쏟아지지 않습니다. 거름망이 있어 보리알갱이를 사용하면 보리차도 쉽게 만들 수 있어 추천합니다.',
  '[Chunk 1] 이 실생수는 이손잡이로 사용이 편리하고, 무거워서 물이 쏟아지는 일이 없으며, 거름망이 있어서 보리차를 쉽게 만들 수 있어 추천합니다.'],
 'thumbnail_prompts': ['### YouTube Thumbnail Visual Prompt\n\n#### Main Visual Elements:\n1. **Central Image**: A clear, close-up shot of the practical water dispenser. The dispenser should be depicted at an angle that showcases its convenient handle and lightweight design. Include the filter prominently in the shot, perhaps even showing some barley tea brewing inside the dispenser.\n  \n2. **Background**: A bright, refreshing background that implies cleanliness and hydration. Consider using image