#### This is a very simple workflow to generate a short video entirely using AI. The Following models are used:
#### - DeepSeekV3 for text and image prompt generation
#### - StabilityAI for image generation  
#### - Kokoro for text to speech

#### User inputs

In [5]:
# ==== USER INPUTS ====
DeepSeekToken = "My_DeepSeek_Token"
video_title =  "The Amazing world of The Jurassic Era" # "The Fascinating History of the Mayan Civilization" 
narration_style = "educational and informative"
video_duration = "30 seconds"


#### installation

In [2]:
%%time
# ==== INSTALL REQUIRED LIBRARIES ====
!!pip install torch transformers diffusers moviepy opencv-python numpy accelerate -q
!pip install openai -q


CPU times: user 42.8 ms, sys: 20.7 ms, total: 63.5 ms
Wall time: 7.86 s


#### imports

In [4]:
%%time
import os
import cv2
import numpy as np
import torch
import random
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips
from TTS.api import TTS

# import ollama

CPU times: user 25 µs, sys: 0 ns, total: 25 µs
Wall time: 28.8 µs


#### text generation using DeepSeekV3 API call

In [8]:
%%time

from openai import OpenAI

client = OpenAI(api_key=DeepSeekToken, 
                base_url="https://api.deepseek.com")

content = """ You are a highly knowledgeable AI assistant and expert of generating texts for videos. 
            Below is the title of a video. 
            Generate an appropriate script for it, in the form of a sequence of texts and corresponding images. 
            Begin with an engaging, introductory remark.
            End with a thought-provoking, concluding remark. 
            Only use reliable sources. 
            The narration style should be {narration_style} and the duration of the video is {video_duration}. 
            Do not mention anything about accompanying music. Only the text and the scene. 
            Your response should have the following format:  
            image: " ... ", text: " ...", image: "..." , text: "...", image: "..." , text: "...", ....
            Do not include any text in the images. The images must be aesthetically pleasing, visually brilliant 
            with vivid colors, hyper-photo realistic. The scenes described should be static"""

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": content },
        {"role": "user", "content": video_title},
    ],
    stream=False
)

print(response.choices[0].message.content)

image: "A breathtaking view of a lush, prehistoric jungle with towering ferns, dense foliage, and a misty atmosphere. In the distance, a herd of massive Brachiosaurus grazes peacefully, their long necks reaching for the treetops.", text: "Welcome to the Jurassic Era, a time when Earth was ruled by giants. Imagine a world where the air was thick with the calls of ancient creatures, and the landscape was a vibrant tapestry of life.",  
image: "A close-up of a Stegosaurus, its spiked tail and iconic back plates glistening under the sunlight. The background shows a rocky terrain with scattered vegetation.", text: "The Jurassic Period, spanning from 201 to 145 million years ago, was a golden age for dinosaurs. Among them, the Stegosaurus roamed, a herbivore with a unique defense mechanism—its formidable tail spikes.",  
image: "A dramatic scene of an Allosaurus mid-hunt, its sharp teeth bared as it chases a smaller dinosaur through a dense forest. The sunlight filters through the trees, cas

#### organize the texts and image prompts

In [10]:
import re
input_text = response.choices[0].message.content

scenes = re.findall(r'image:\s*"([^"]+)"', input_text, re.DOTALL)
texts = re.findall(r'text:\s*"([^"]+)"', input_text)

# Print results
print("Scenes:", scenes)
print("Texts:", texts)
print(len(scenes), len(texts))

Scenes: ['A breathtaking view of a lush, prehistoric jungle with towering ferns, dense foliage, and a misty atmosphere. In the distance, a herd of massive Brachiosaurus grazes peacefully, their long necks reaching for the treetops.', 'A close-up of a Stegosaurus, its spiked tail and iconic back plates glistening under the sunlight. The background shows a rocky terrain with scattered vegetation.', 'A dramatic scene of an Allosaurus mid-hunt, its sharp teeth bared as it chases a smaller dinosaur through a dense forest. The sunlight filters through the trees, casting dynamic shadows.', 'A serene image of a shallow Jurassic sea, with a Plesiosaur gliding gracefully through the water. The surface reflects the golden hues of a setting sun, and coral reefs teem with life below.', 'A panoramic view of a volcanic landscape, with a massive eruption in the background. Lava flows carve through the land, and ash clouds fill the sky, while a lone dinosaur silhouette stands in the foreground.', 'A fo

#### image generation

In [11]:
from io import BytesIO
import IPython
import json
import os
from PIL import Image
import requests
import time

import getpass
# @markdown To get your API key visit https://platform.stability.ai/account/keys
STABILITY_KEY = getpass.getpass('Enter your API Key')


Enter your API Key ········


#### functions to generate and save images

In [12]:
def send_generation_request(
    host,
    params,
    files = None
):
    headers = {
        "Accept": "image/*",
        "Authorization": f"Bearer {STABILITY_KEY}"
    }

    if files is None:
        files = {}

    # Encode parameters
    image = params.pop("image", None)
    mask = params.pop("mask", None)
    if image is not None and image != '':
        files["image"] = open(image, 'rb')
    if mask is not None and mask != '':
        files["mask"] = open(mask, 'rb')
    if len(files)==0:
        files["none"] = ''

    # Send request
    print(f"Sending REST request to {host}...")
    response = requests.post(
        host,
        headers=headers,
        files=files,
        data=params
    )
    if not response.ok:
        raise Exception(f"HTTP {response.status_code}: {response.text}")

    return response

def send_async_generation_request(
    host,
    params,
    files = None
):
    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {STABILITY_KEY}"
    }

    if files is None:
        files = {}

    # Encode parameters
    image = params.pop("image", None)
    mask = params.pop("mask", None)
    if image is not None and image != '':
        files["image"] = open(image, 'rb')
    if mask is not None and mask != '':
        files["mask"] = open(mask, 'rb')
    if len(files)==0:
        files["none"] = ''

    # Send request
    print(f"Sending REST request to {host}...")
    response = requests.post(
        host,
        headers=headers,
        files=files,
        data=params
    )
    if not response.ok:
        raise Exception(f"HTTP {response.status_code}: {response.text}")

    # Process async response
    response_dict = json.loads(response.text)
    generation_id = response_dict.get("id", None)
    assert generation_id is not None, "Expected id in response"

    # Loop until result or timeout
    timeout = int(os.getenv("WORKER_TIMEOUT", 500))
    start = time.time()
    status_code = 202
    while status_code == 202:
        print(f"Polling results at https://api.stability.ai/v2beta/results/{generation_id}")
        response = requests.get(
            f"https://api.stability.ai/v2beta/results/{generation_id}",
            headers={
                **headers,
                "Accept": "*/*"
            },
        )

        if not response.ok:
            raise Exception(f"HTTP {response.status_code}: {response.text}")
        status_code = response.status_code
        time.sleep(10)
        if time.time() - start > timeout:
            raise Exception(f"Timeout after {timeout} seconds")

    return response

#### Specify parameters for image generator and generate the images one by one

In [14]:
negative_prompt = "worst quality, normal quality, low quality, low res, blurry, distortion, text, watermark, logo, banner, extra digits, cropped, jpeg artifacts, signature, username, error, sketch, duplicate, ugly, monochrome, horror, geometry, mutation, disgusting, bad anatomy, bad proportions, bad quality, deformed, disconnected limbs, out of frame, out of focus, dehydrated, disfigured, extra arms, extra limbs, extra hands, fused fingers, gross proportions, long neck, jpeg, malformed limbs, mutated, mutated hands, mutated limbs, missing arms, missing fingers, picture frame, poorly drawn hands, poorly drawn face, collage, pixel, pixelated, grainy, color aberration, amputee, autograph, bad illustration, beyond the borders, blank background, body out of frame, boring background, branding, cut off, dismembered, disproportioned, distorted, draft, duplicated features, extra fingers, extra legs, fault, flaw, grains, hazy, identifying mark, improper scale, incorrect physiology, incorrect ratio, indistinct, kitsch, low resolution, macabre, malformed, mark, misshapen, missing hands, missing legs, mistake, morbid, mutilated, off-screen, outside the picture, poorly drawn feet, printed words, render, repellent, replicate, reproduce, revolting dimensions, script, shortened, sign, split image, squint, storyboard, tiling, trimmed, unfocused, unattractive, unnatural pose, unreal engine, unsightly, written language"
aspect_ratio = "16:9" #@param ["21:9", "16:9", "3:2", "5:4", "1:1", "4:5", "2:3", "9:16", "9:21"]
seed = 0 #@param {type:"integer"}
output_format = "jpeg" #@param ["webp", "jpeg", "png"]
model_name = "sdxl" #sd3

### generate images
for i, scene in enumerate(scenes):
    print( " At scene "  + str(i) )
    prompt =  scenes[i] + ", photorealistic, vibrant colors, ultraHD, high quality photography"
    params = {
    "prompt" : prompt,
    "negative_prompt" : negative_prompt,
    "aspect_ratio" : aspect_ratio,
    "seed" : i+1,
    "output_format": output_format
}

    response = send_generation_request(
        host,
        params
    )
    
    # Decode response
    output_image = response.content
    finish_reason = response.headers.get("finish-reason")
    seed = response.headers.get("seed")
    
    # Check for NSFW classification
    if finish_reason == 'CONTENT_FILTERED':
        raise Warning("Generation failed NSFW classifier")
    
    # Save and display result
    generated = f"generated_{seed}.{output_format}"
    with open(generated, "wb") as f:
        f.write(output_image)
    print(f"Saved image {generated}")

    gc.collect()
    torch.cuda.empty_cache()

 At scene 0
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_323159090.jpeg
 At scene 1
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_1.jpeg
 At scene 2
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_2.jpeg
 At scene 3
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_3.jpeg
 At scene 4
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_4.jpeg
 At scene 5
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_5.jpeg
 At scene 6
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image generated_6.jpeg
 At scene 7
Sending REST request to https://api.stability.ai/v2beta/stable-image/generate/sd3...
Saved image g

#### text to speech conversion using Kokoro

In [42]:
# 1️⃣ Install kokoro
!pip install -q kokoro>=0.8.2 soundfile
# 2️⃣ Install espeak, used for English OOD fallback and some non-English languages
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
# 🇪🇸 'e' => Spanish es
# 🇫🇷 'f' => French fr-fr
# 🇮🇳 'h' => Hindi hi
# 🇮🇹 'i' => Italian it
# 🇧🇷 'p' => Brazilian Portuguese pt-br

# 3️⃣ Initalize a pipeline
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
pipeline = KPipeline(lang_code='a') # <= make sure lang_code matches voice


input_text = "\n".join(texts)

# 4️⃣ Generate, display, and save audio files in a loop.
generator = pipeline(
    text, voice='af_heart', # <= change voice here
    speed=1, split_pattern=r'\n+'
)
for i, (gs, ps, audio) in enumerate(generator):
    print(i)  # i => index
    print(gs) # gs => graphemes/text
    print(ps) # ps => phonemes
    display(Audio(data=audio, rate=24000, autoplay=i==0))
    sf.write(f'voiceover_chunk_{i}.wav', audio, 24000) # save each audio file


#### all images and audio files are being saved in the current directory 

In [60]:
import os
current_dir = os.getcwd()
image_paths = [f for f in os.listdir(current_dir) if f.endswith('.jpeg')]
audio_paths = [f for f in os.listdir(current_dir) if ".wav" in f]

image_paths.sort()
audio_paths.sort()

print(image_paths)
print(audio_paths)

['generated_0.jpeg', 'generated_1.jpeg', 'generated_2.jpeg', 'generated_3.jpeg', 'generated_4.jpeg', 'generated_5.jpeg', 'generated_6.jpeg', 'generated_7.jpeg']
['voiceover_chunk_0.wav', 'voiceover_chunk_1.wav', 'voiceover_chunk_2.wav', 'voiceover_chunk_3.wav', 'voiceover_chunk_4.wav', 'voiceover_chunk_5.wav', 'voiceover_chunk_6.wav', 'voiceover_chunk_7.wav']


#### get durations of audio clips

In [61]:
import os
from pydub import AudioSegment

durations = []
# Iterate over all .wav files in the directory
for filename in audio_paths:
    
    file_path = os.path.join(current_dir, filename)
    audio = AudioSegment.from_wav(file_path)  # Load the audio
    duration_sec = len(audio) / 1000  # Convert from milliseconds to seconds
    print(f"{filename}: {duration_sec:.2f} seconds")
    durations.append(duration_sec)
    
#durations = list(map(int, durations))
print(durations) 


voiceover_chunk_0.wav: 11.14 seconds
voiceover_chunk_1.wav: 13.15 seconds
voiceover_chunk_2.wav: 11.07 seconds
voiceover_chunk_3.wav: 11.14 seconds
voiceover_chunk_4.wav: 11.58 seconds
voiceover_chunk_5.wav: 9.22 seconds
voiceover_chunk_6.wav: 10.62 seconds
voiceover_chunk_7.wav: 12.54 seconds
[11.136, 13.152, 11.072, 11.136, 11.584, 9.216, 10.624, 12.544]


#### A function for basic animation (zoom in)

In [62]:
def zoom_effect(image_path, duration, fps=24):
    img = cv2.imread(image_path)
    h, w, _ = img.shape
    zoom_levels = np.linspace(1, 1.4, num= int(duration * fps)  ) # Gradual zoom-in

    frames = []
    for zoom in zoom_levels:
        center = (w // 2, h // 2)
        size = (int(w / zoom), int(h / zoom))
        cropped = img[center[1]-size[1]//2:center[1]+size[1]//2, center[0]-size[0]//2:center[0]+size[0]//2]
        resized = cv2.resize(cropped, (w, h))
        frames.append(resized)

    return frames

#### Create video, clip by clip

In [63]:
%%time
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips, VideoFileClip  # Import VideoFileClip here
# Create animated video clips
video_clips = []
fps = 24
video_filename = video_title.replace(" ", "_")+".mp4"

for i, img in enumerate(image_paths):
    img_path = os.path.join(current_dir , img)
    
    # Generate video frames with zoom effect
    frames = zoom_effect(img_path, durations[i], fps) 
    clip_path = f"{img_path}.mp4"

    # Save the video clip
    out = cv2.VideoWriter(clip_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (frames[0].shape[1], frames[0].shape[0]))
    for frame in frames:
        out.write(frame)
    out.release()

    # Load the generated video clip
    video = VideoFileClip(clip_path)
    
    audio_path = os.path.join(current_dir , audio_paths[i]) 
    # Load and trim the corresponding audio
    audio = AudioFileClip(audio_path).set_duration(video.duration)
    
    # Attach audio to the video clip
    video = video.set_audio(audio)

    # Store processed video clips
    video_clips.append(video)
    print(f" clip {i} finished")

# ==== STEP 5: Merge Processed Video Clips ====
print("\nMerging video clips...")
final_video = concatenate_videoclips(video_clips, method="compose")

# Save the final video
output_path = os.path.join(current_dir, video_filename)
final_video.write_videofile(output_path, codec="libx264", fps=24)

print(f"\n✅ Video generated successfully: {output_path}")

 clip 0 finished
 clip 1 finished
 clip 2 finished
 clip 3 finished
 clip 4 finished
 clip 5 finished
 clip 6 finished



[A                                                  
100%|██████████| 3.80M/3.80M [1:08:31<00:00, 925iB/s][A
[A                                                  
100%|██████████| 3.80M/3.80M [1:08:31<00:00, 925iB/s][A

 clip 7 finished

Merging video clips...
Moviepy - Building video /kaggle/working/The_Amazing_world_of_The_Jurassic_Era.mp4.
MoviePy - Writing audio in The_Amazing_world_of_The_Jurassic_EraTEMP_MPY_wvf_snd.mp3


                                                                      
[A                                                  
100%|██████████| 3.80M/3.80M [1:08:32<00:00, 925iB/s][A
[A                                                  
100%|██████████| 3.80M/3.80M [1:08:32<00:00, 925iB/s][A

MoviePy - Done.
Moviepy - Writing video /kaggle/working/The_Amazing_world_of_The_Jurassic_Era.mp4



                                                                
[A                                                  
100%|██████████| 3.80M/3.80M [1:09:45<00:00, 908iB/s][A
[A                                                  
100%|██████████| 3.80M/3.80M [1:09:45<00:00, 908iB/s][A

Moviepy - Done !
Moviepy - video ready /kaggle/working/The_Amazing_world_of_The_Jurassic_Era.mp4

✅ Video generated successfully: /kaggle/working/The_Amazing_world_of_The_Jurassic_Era.mp4
CPU times: user 51.3 s, sys: 7.2 s, total: 58.5 s
Wall time: 1min 32s
