## Multi-Agent System with OpenAI SORA Video Generation Model


In [None]:
%pip install semantic-kernel==1.30.0, azure-identity, python-dotenv, azure-ai-projects==1.0.0b8

In [None]:
from semantic_kernel.agents import ChatCompletionAgent
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion, AzureChatPromptExecutionSettings
from semantic_kernel.functions import KernelFunctionFromPrompt
from semantic_kernel.kernel import Kernel
from dotenv import load_dotenv
import os
import math
from typing import Annotated
from semantic_kernel.functions import KernelArguments

from semantic_kernel.functions.kernel_function_decorator import kernel_function

# Load settings from a local .env file so the os.getenv(...) calls in later cells can read them.
load_dotenv()

### Creating the "Prompt-Refiner" Agent

In [None]:
# Chat-completion deployment name; passed as `deployment_name` to every AzureChatCompletion
# service created in this notebook.
model = os.getenv("AZURE_OPENAI_CHAT_COMPLETION_MODEL")

# System instructions for the Prompt Refiner agent. They tell the model to turn the user's
# visual_style / length / description inputs into one detailed prompt for a video generation model.
system_prompt_for_prompt_refiner_agent = (
    "You are a Prompt Refiner Agent in a multi-agent system that generates high-quality videos based on user inputs. "
    "Your task is to take partial or vague inputs from the user—such as a short description of the expected video, "
    "visual style preferences, and target duration—and convert them into a refined, detailed, and structured prompt ready for a video generation model.\n\n"
    "You will receive the following fields:\n"
    "a) visual_style: A word or phrase indicating the desired look (e.g., 'anime', 'cinematic', 'Ghibli-style', 'hyper-realistic').\n"
    "b) length: A time estimate or label like 'short', '30 seconds', 'reel'.\n"
    "c) description: A loosely written idea, goal, or scene the user wants, possibly vague or fragmented.\n\n"
    "Your job is to:\n"
    "- Interpret and expand the user’s vague description into a vivid and coherent scene, adding relevant visual, emotional, and narrative details.\n"
    "- Ensure consistency with the chosen visual style and length.\n"
    "- Structure the prompt using natural, expressive language appropriate for guiding a video generation model like Sora.\n"
    "- Clarify or infer environment, mood, camera angles, characters, motion, lighting, and scene progression when not explicitly given.\n"
    "- Format your output as a single refined prompt that clearly describes the entire video scene in a way that maximizes visual creativity without ambiguity. "
    "Avoid generic words like 'nice' or 'cool'; be cinematic and descriptive."
)

# Dedicated kernel for the Prompt Refiner agent, backed by one Azure OpenAI chat service
# registered under the service id "default".
kernel_for_prompt_refiner_agent = Kernel()
kernel_for_prompt_refiner_agent.add_service(
    AzureChatCompletion(
        service_id="default",
        deployment_name=model,
        endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    )
)

# Look up the execution settings of the "default" service and switch them to
# automatic function choice.
settings = kernel_for_prompt_refiner_agent.get_prompt_execution_settings_from_service_id(service_id="default")
settings.function_choice_behavior = FunctionChoiceBehavior.Auto()

# Chat-completion agent that runs on the kernel above with the refiner system prompt.
prompt_refiner_agent = ChatCompletionAgent(
    name="Prompt_Refiner_Agent",
    instructions=system_prompt_for_prompt_refiner_agent,
    kernel=kernel_for_prompt_refiner_agent,
    arguments=KernelArguments(settings=settings),
)



In [None]:
import requests
from typing import Any, Callable, Set, Dict, List, Optional
import json
import os
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv
import uuid
import time

class VideoGeneration:
    """Semantic Kernel plugin that generates a video through a Sora video-generation REST endpoint.

    Connection settings are read from the environment on every call:
    ``SORA_MODEL_ENDPOINT``, ``SORA_MODEL_API_KEY``, ``SORA_DEPLOYMENT_NAME`` and
    ``SORA_API_VERSION``.
    """

    # Pause between two job-status requests, in seconds.
    POLL_INTERVAL_SECONDS = 5
    # Upper bound on the total polling time so a stuck job cannot block the agent forever.
    MAX_POLL_SECONDS = 900
    # Timeout applied to every individual HTTP request, in seconds.
    REQUEST_TIMEOUT_SECONDS = 60
    # Job statuses after which polling stops.
    # NOTE(review): "cancelled" is assumed to be a terminal status of the service - confirm against the API reference.
    TERMINAL_STATUSES = ("succeeded", "failed", "cancelled")

    # NOTE(review): the leading "z" in the registered function name looks like a typo; it is kept
    # because the name is the identifier under which the tool is exposed.
    @kernel_function(
        description="Generates a video by making a call to OpenAI Sora Video Generation model.",
        name = "zvideoGenerationBySORA"
    )
    def use_video_generation_by_SORA(
        self,
        video_generation_prompt: Annotated[str, "the detailed prompt/description according to which the video needs to be generated"],
        n_seconds: Annotated[int, "the length of the video"], 
        height: Annotated[int, "the height of the video (in px)"], 
        width: Annotated[int, "the width of the video (in px)"]) -> str:
        """Submit a video-generation job, wait for it to finish and save the resulting MP4.

        Args:
            video_generation_prompt: Text prompt describing the video to generate.
            n_seconds: Requested video length, sent as ``n_seconds`` in the request body.
            height: Requested video height in pixels.
            width: Requested video width in pixels.

        Returns:
            A human-readable status message. On success it names the ``output<uuid>.mp4`` file
            written to the current working directory; every failure path returns a message as
            well, so the calling agent always receives a string.
        """
        azure_openai_endpoint = os.getenv("SORA_MODEL_ENDPOINT")
        azure_openai_api_key = os.getenv("SORA_MODEL_API_KEY")
        sora_deployment_name = os.getenv("SORA_DEPLOYMENT_NAME")
        sora_api_version = os.getenv("SORA_API_VERSION")

        print("AZURE_OPENAI_ENDPOINT:", azure_openai_endpoint)
        print("SORA_DEPLOYMENT_NAME:", sora_deployment_name)
        print("SORA_API_VERSION:", sora_api_version)

        # Fail early with a readable message instead of a TypeError from `None + str` below.
        required_settings = {
            "SORA_MODEL_ENDPOINT": azure_openai_endpoint,
            "SORA_MODEL_API_KEY": azure_openai_api_key,
            "SORA_DEPLOYMENT_NAME": sora_deployment_name,
            "SORA_API_VERSION": sora_api_version,
        }
        missing_settings = [key for key, value in required_settings.items() if not value]
        if missing_settings:
            print("Missing environment variables:", ", ".join(missing_settings))
            return "❌ Video generation failed: missing configuration (" + ", ".join(missing_settings) + ")."

        # Normalise the endpoint so it works with or without a trailing slash.
        base_url = azure_openai_endpoint.rstrip("/") + "/openai/v1/video/generations"
        params = f"?api-version={sora_api_version}"
        constructed_url = f"{base_url}/jobs{params}"

        print("Constructed URL:", constructed_url)

        headers = {
            'Api-Key': azure_openai_api_key,
            'Content-Type': 'application/json',
        }

        body = {
            "prompt": video_generation_prompt,
            "n_seconds": n_seconds,
            "height": height,
            "width": width,
            "model": sora_deployment_name,
        }

        print("Request body:", body)

        # --- 1. Create the generation job -------------------------------------------------
        try:
            job_response = requests.post(
                constructed_url, headers=headers, json=body, timeout=self.REQUEST_TIMEOUT_SECONDS
            )
        except requests.RequestException as exc:
            print("API call failed!")
            print("Error:", exc)
            return "❌ Video generation failed."

        if not job_response.ok:
            print("API call failed!")
            print("Status code:", job_response.status_code)
            print("Response:", job_response.text)
            return "❌ Video generation failed."

        try:
            job = job_response.json()
        except ValueError:
            print("API call returned a non-JSON body:", job_response.text)
            return "❌ Video generation failed."

        print(json.dumps(job, sort_keys=True, indent=4, separators=(',', ': ')))
        job_id = job.get("id")
        status = job.get("status")
        if not job_id:
            # Without an id there is nothing to poll.
            return "❌ Video generation failed: the service did not return a job id."

        # --- 2. Poll until the job reaches a terminal status or the deadline passes -------
        status_url = f"{base_url}/jobs/{job_id}{params}"
        print(f"⏳ Polling job status for ID: {job_id}")
        deadline = time.monotonic() + self.MAX_POLL_SECONDS
        while status not in self.TERMINAL_STATUSES:
            if time.monotonic() >= deadline:
                return f"⚠️ Video generation job {job_id} did not finish within {self.MAX_POLL_SECONDS} seconds."
            time.sleep(self.POLL_INTERVAL_SECONDS)
            try:
                poll_response = requests.get(
                    status_url, headers=headers, timeout=self.REQUEST_TIMEOUT_SECONDS
                )
                poll_response.raise_for_status()
                job = poll_response.json()
            except (requests.RequestException, ValueError) as exc:
                # An error body has no usable "status"; stop instead of polling it forever.
                print("Polling the job status failed:", exc)
                return f"❌ Video generation failed: could not read the status of job {job_id}."
            status = job.get("status")
            print(f"Status: {status}")

        if status == "failed":
            return "❌ Video generation failed."
        if status != "succeeded":
            return f"❌ Video generation ended with status '{status}'."

        # --- 3. Download the first generation and store it on disk ------------------------
        print(job)
        generations = job.get("generations") or []
        if not generations:
            return "⚠️ Status is succeeded, but no generations were returned."

        print("✅ Video generation succeeded.")
        generation_id = generations[0].get("id")
        video_url = f"{base_url}/{generation_id}/content/video{params}"
        try:
            video_response = requests.get(
                video_url, headers=headers, timeout=self.REQUEST_TIMEOUT_SECONDS
            )
        except requests.RequestException as exc:
            print("Video download failed:", exc)
            return "⚠️ Video generation succeeded, but downloading the video failed."

        if not video_response.ok:
            print("Video download failed!")
            print("Status code:", video_response.status_code)
            print("Response:", video_response.text)
            return "⚠️ Video generation succeeded, but downloading the video failed."

        # A random suffix keeps repeated runs from overwriting earlier videos.
        output_filename = "output" + str(uuid.uuid4()) + ".mp4"
        with open(output_filename, "wb") as file:
            file.write(video_response.content)
        return f'Video Generation succeeded and Generated video saved as "{output_filename}"'

### Creating the "SORA VIDEO GENERATOR" Agent

In [None]:
# System instructions for the video generation agent. They describe the refined prompt the agent
# receives and ask it to produce a matching video.
# NOTE(review): the text never mentions the video-generation plugin function or its
# n_seconds/height/width arguments - confirm the model reliably calls the tool with values the
# video service accepts.
sora_system_prompt = (
    "You are the SORA Video Generator Agent, responsible for transforming refined narrative prompts into stunning, coherent video sequences. "
    "You receive structured, vivid, and detailed scene descriptions crafted to guide cinematic video generation. "
    "Your task is to interpret these prompts and generate videos that visually and emotionally match the provided descriptions.\n\n"
    "The input you receive is a single, highly descriptive prompt containing:\n"
    "- Environment details (e.g., time of day, weather, architecture, terrain)\n"
    "- Character descriptions and actions\n"
    "- Scene progression, including beginning, middle, and end (if applicable)\n"
    "- Visual style (e.g., Ghibli-style, hyper-realistic, anime, cinematic)\n"
    "- Mood and lighting\n"
    "- Motion dynamics and camera movements (e.g., dolly zoom, aerial shot, panning, slow motion)\n\n"
    "Your job is to:\n"
    "- Translate the prompt into a dynamic and immersive video with fluid motion, coherent transitions, and fidelity to the described visual style.\n"
    "- Ensure that lighting, textures, character design, and animation align with the mood and scene tone.\n"
    "- Adapt shot composition and camera angles to emphasize emotion, pacing, and storytelling.\n"
    "- Stay within the inferred or specified video length constraints.\n\n"
    "You do not generate text or audio—focus entirely on the **visual sequence**. "
    "Your goal is to produce a visually stunning and story-rich video that feels intentional and cinematic in every frame."
)

# Separate kernel for the video generation agent; the chat service is registered under `service_id`.
kernel_for_SORA_agent = Kernel()
service_id = "default"
kernel_for_SORA_agent.add_service(
    AzureChatCompletion(
        service_id=service_id,
        deployment_name=model,
        endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    )
)

# Execution settings of that service, switched to automatic function choice so the
# agent can invoke the plugin registered below.
settings = kernel_for_SORA_agent.get_prompt_execution_settings_from_service_id(service_id=service_id)
settings.function_choice_behavior = FunctionChoiceBehavior.Auto()

# Expose the VideoGeneration methods to the kernel as a plugin.
kernel_for_SORA_agent.add_plugin(VideoGeneration(), plugin_name="video_generation_plugin")

# Chat-completion agent that runs on the kernel above with the SORA system prompt.
sora_video_generation_agent = ChatCompletionAgent(
    name="Video_Generation_Agent",
    instructions=sora_system_prompt,
    kernel=kernel_for_SORA_agent,
    arguments=KernelArguments(settings=settings),
)

In [None]:
goal = f""" Generate a video of the Music Band - Daft Punk - travelling the inter-galactic space for 5 seconds. """

response_from_prompt_refiner_agent = await prompt_refiner_agent.get_response(
    messages=goal
)
print("Response from Prompt Refiner Agent: \n")
print(response_from_prompt_refiner_agent)

response_from_video_generation_agent = await sora_video_generation_agent.get_response(
    messages = str(response_from_prompt_refiner_agent)
)
print("Response from Video Generation Agent \n")
print(response_from_video_generation_agent)