# Video Generation Pipeline

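This notebook generates a narrated slideshow video from a given topic or piece of text. It uses Ollama to write the script, Stable Diffusion (via Diffusers, with an LCM LoRA for fast sampling) to illustrate each scene, Pillow to lay out the slides, Edge-TTS for the narration audio, and MoviePy to assemble the final video.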

In [5]:
import json
import os
import textwrap
import requests
import asyncio
from PIL import Image, ImageDraw, ImageFont
import edge_tts
from moviepy import ImageClip, AudioFileClip, concatenate_videoclips, VideoFileClip
import torch
from diffusers import StableDiffusionPipeline, LCMScheduler
from peft import get_peft_model

In [6]:
# Environment sanity check: print the installed PyTorch version, the CUDA
# version reported by torch.version.cuda, and whether CUDA is usable here.
import torch

for _env_fact in (torch.__version__, torch.version.cuda, torch.cuda.is_available()):
    print(_env_fact)


2.5.1+cu121
12.1
True


In [7]:
# Configuration
# Generate endpoint of the Ollama server running on this machine.
OLLAMA_API_URL = "http://localhost:11434/api/generate"
# Any model already installed in Ollama can be used here (e.g. 'llama3').
OLLAMA_MODEL = "phi3:mini"

# Output layout: every generated artifact lives under OUTPUT_DIR.
OUTPUT_DIR = "output"
SCENE_DIR, AUDIO_DIR, FINAL_VIDEO_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("scenes", "audio", "video")
)

# Create the folders up front so later cells can write into them directly.
for _folder in (SCENE_DIR, AUDIO_DIR, FINAL_VIDEO_DIR):
    os.makedirs(_folder, exist_ok=True)

## Step 0: Initialize Stable Diffusion Model

In [8]:
# ===== SETTINGS =====
model_id = "runwayml/stable-diffusion-v1-5"
lora_id = "latent-consistency/lcm-lora-sdv1-5"

device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading Stable Diffusion on {device}...")

# ===== LOAD PIPELINE =====
# Half precision on GPU to save VRAM; full precision on CPU.
# NOTE: the pipeline is intentionally NOT moved with .to("cuda") here. On GPU
# the model-offload hooks enabled below take care of device placement, so an
# explicit move beforehand would only be a redundant round trip.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    safety_checker=None
)

# ===== LOAD LCM LoRA =====
# Attach the LoRA and switch to the LCM scheduler before offloading is set up.
pipe.load_lora_weights(lora_id)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# Enable memory optimizations
pipe.enable_attention_slicing()
pipe.enable_vae_slicing()
if device == "cuda":
    # Sub-models are moved to the GPU on demand and returned to CPU afterwards.
    pipe.enable_model_cpu_offload()
else:
    pipe.to(device)

def generate_image_sd(prompt_text, output_path):
    """Render one illustration with the module-level ``pipe`` and save it.

    Args:
        prompt_text: Subject of the illustration; it is embedded in a fixed
            "flat illustration" style template.
        output_path: Where the generated image is written.

    Returns:
        ``output_path`` on success, or ``None`` if generation or saving raised
        (the error is printed, not re-raised).
    """
    print(f"Generating image for: {prompt_text}...")

    # The template is passed to the pipeline exactly as written, whitespace included.
    styled_prompt = f"""
    simple flat illustration of {prompt_text},
    minimal design,
    clean white background,
    educational graphic,
    vector style,
    no text
    """

    sampling_options = {
        "num_inference_steps": 6,   # VERY LOW = FAST
        "guidance_scale": 1.5,      # LCM works best low
        "height": 512,
        "width": 512,
    }

    try:
        result = pipe(prompt=styled_prompt, **sampling_options)
        result.images[0].save(output_path)
    except Exception as err:
        print(f"Error generating image: {err}")
        return None
    return output_path

Loading Stable Diffusion on cuda...


Loading weights: 100%|██████████| 196/196 [00:00<00:00, 377.02it/s, Materializing param=text_model.final_layer_norm.weight]
[1mCLIPTextModel LOAD REPORT[0m from: C:\Users\navee\.cache\huggingface\hub\models--runwayml--stable-diffusion-v1-5\snapshots\451f4fe16113bff5a5d2269ed5ad43b0592e9a14\text_encoder
Key                                | Status     |  | 
-----------------------------------+------------+--+-
text_model.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Loading pipeline components...: 100%|██████████| 6/6 [00:04<00:00,  1.48it/s]
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the 

## Step 1: Script Generation with Ollama

In [9]:
def generate_script(topic, timeout=(10, 600)):
    """Ask the Ollama model for a structured, scene-by-scene video plan.

    Args:
        topic: Topic or source text to turn into a video plan.
        timeout: Timeout handed to ``requests.post``: a single number or a
            ``(connect, read)`` tuple in seconds. The read timeout is generous
            because the request is non-streaming and only returns once the
            model has finished generating.

    Returns:
        The parsed plan as a dict (expected to contain a ``"scenes"`` list), or
        ``None`` if the request failed, the reply was not valid JSON, or the
        reply did not have the expected structure. Failures are printed rather
        than raised.
    """
    prompt = f"""
    Convert this topic into a structured video plan.
    Topic: {topic}
    Return JSON only:
    {{
      "scenes": [
        {{
          "title": "",
          "bullets": [],
          "narration": "",
          "image_prompt": "visual description for illustration"
        }}
      ]
    }}
    """

    print(f"Generating script for: {topic}...")
    try:
        response = requests.post(
            OLLAMA_API_URL,
            json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "format": "json",
                "stream": False
            },
            # Without a timeout an unresponsive server would block the notebook forever.
            timeout=timeout,
        )
        response.raise_for_status()
        payload = response.json()
        script = json.loads(payload['response'])
    except json.JSONDecodeError:
        # Listed first so an invalid JSON body from response.json() (which can
        # also be a RequestException subclass) gets the JSON-specific message.
        print("Error decoding JSON response from Ollama.")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to Ollama: {e}")
        return None
    except (KeyError, TypeError) as e:
        # Reply was valid JSON but lacked a usable 'response' field.
        print(f"Unexpected response structure from Ollama: {e!r}")
        return None

    # Callers use dict methods on the plan, so reject lists/strings/numbers here.
    if not isinstance(script, dict):
        print("Ollama returned JSON that is not an object; expected a dict with a 'scenes' key.")
        return None
    return script

## Step 2: Slide Generation (PIL)

In [10]:
def create_slide(scene, index, image_path=None):
    """Render one scene as a 1280x720 PNG slide and return its file path.

    The slide shows the scene title and its bullet points. When ``image_path``
    points to an existing file, the picture is placed on the right half and the
    text is wrapped more tightly so it stays on the left half.

    Args:
        scene: Dict describing the scene; the ``'title'`` and ``'bullets'``
            keys are read. A missing or empty title falls back to
            ``"Scene <index>"``.
        index: Scene number, used for the fallback title and the file name.
        image_path: Optional path of an illustration to embed.

    Returns:
        Path of the saved slide: ``SCENE_DIR/scene_<index>.png``.
    """
    width, height = 1280, 720
    half_width = width // 2
    img = Image.new('RGB', (width, height), color='white')
    draw = ImageDraw.Draw(img)

    # Fonts
    try:
        title_font = ImageFont.truetype("arial.ttf", 60)
        text_font = ImageFont.truetype("arial.ttf", 35)
    except OSError:
        # truetype() raises OSError when the font file cannot be opened;
        # fall back to Pillow's built-in font instead of failing the slide.
        title_font = ImageFont.load_default()
        text_font = ImageFont.load_default()

    # Layout Configuration
    margin = 50
    content_width = width - (2 * margin)

    # If image exists, we use split layout: Text (Left) | Image (Right)
    if image_path and os.path.exists(image_path):
        try:
            # Context manager releases the file handle as soon as the resized
            # copy exists.
            with Image.open(image_path) as raw_img:
                # Target height 600, keeping the aspect ratio.
                target_ih = 600
                aspect = raw_img.width / raw_img.height
                target_iw = int(target_ih * aspect)

                # Wide images would overflow into the text column; cap the
                # width and shrink the height to match.
                max_iw = 600
                if target_iw > max_iw:
                    target_iw = max_iw
                    target_ih = max(1, int(max_iw / aspect))

                sd_img = raw_img.resize((target_iw, target_ih), Image.Resampling.LANCZOS)

            # Center the picture inside the right half of the slide.
            img_x = half_width + (half_width - target_iw) // 2
            img_y = (height - target_ih) // 2
            img.paste(sd_img, (img_x, img_y))

            # Constrain text to left half
            content_width = 580  # 640 - margin - padding
        except Exception as e:
            # Best effort: a broken image must not prevent a text-only slide.
            print(f"Error placing image: {e}")

    narrow_layout = content_width < 600

    # Draw Title (None/empty titles use the fallback; non-strings are stringified)
    title_text = str(scene.get('title') or f"Scene {index}")
    title_lines = textwrap.wrap(title_text, width=20 if narrow_layout else 40)
    ty = 50
    for line in title_lines:
        draw.text((margin, ty), line, fill='black', font=title_font)
        ty += 70

    # Draw Bullets
    y = ty + 30
    bullets = scene.get('bullets') or []
    if isinstance(bullets, str):
        # A bare string would otherwise be iterated character by character.
        bullets = [bullets]

    bullet_mark = "• "
    bullet_x = margin + 30
    # Continuation lines are shifted by the width of the marker so wrapped text
    # lines up under the first line instead of repeating the bullet glyph.
    hanging_indent = draw.textlength(bullet_mark, font=text_font)
    for bullet in bullets:
        lines = textwrap.wrap(str(bullet), width=30 if narrow_layout else 50)
        for line_no, line in enumerate(lines):
            if line_no == 0:
                draw.text((bullet_x, y), f"{bullet_mark}{line}", fill='black', font=text_font)
            else:
                draw.text((bullet_x + hanging_indent, y), line, fill='black', font=text_font)
            y += 45

    filename = os.path.join(SCENE_DIR, f"scene_{index}.png")
    img.save(filename)
    return filename

## Step 3: Audio Generation (Edge-TTS)

In [11]:
async def generate_audio(text, index):
    """Synthesize narration for one scene with Edge-TTS and save it as MP3.

    Args:
        text: Narration text to speak.
        index: Scene number, used in the log line and the output file name.

    Returns:
        Path ``AUDIO_DIR/scene_<index>.mp3`` on success, or ``None`` if
        synthesis raised (the error is printed, not re-raised).
    """
    mp3_path = os.path.join(AUDIO_DIR, f"scene_{index}.mp3")

    print(f"Generating audio for scene {index}...")
    try:
        speech = edge_tts.Communicate(text, "en-US-ChristopherNeural")
        await speech.save(mp3_path)
    except Exception as err:
        print(f"Error generating audio: {err}")
        return None
    return mp3_path

## Step 4: Video Assembly (MoviePy)

In [21]:
def create_video_clip(image_path, audio_path, index):
    """Combine a still slide and its narration into one MP4 scene clip.

    The slide is shown for exactly as long as the audio lasts.

    Args:
        image_path: Path of the slide image.
        audio_path: Path of the narration audio file.
        index: Scene number, used in log lines and the output file name.

    Returns:
        Path ``FINAL_VIDEO_DIR/scene_<index>.mp4`` on success, or ``None`` if
        MoviePy raised (the error is printed, not re-raised).
    """
    output_path = os.path.join(FINAL_VIDEO_DIR, f"scene_{index}.mp4")

    print(f"Creating video clip for scene {index} using MoviePy...")
    audio_clip = None
    video_clip = None
    try:
        audio_clip = AudioFileClip(audio_path)
        video_clip = ImageClip(image_path).with_duration(audio_clip.duration)
        video_clip = video_clip.with_audio(audio_clip)
        video_clip.write_videofile(output_path, fps=24, codec='libx264', audio_codec='aac')
        return output_path
    except Exception as e:
        print(f"MoviePy failed for scene {index}: {e}")
        return None
    finally:
        # Always release the readers behind the clips; otherwise the audio file
        # stays open for the lifetime of the kernel.
        for clip in (video_clip, audio_clip):
            if clip is None:
                continue
            try:
                clip.close()
            except Exception as close_err:
                # Cleanup is best effort and must not mask the real result.
                print(f"Warning: could not close clip for scene {index}: {close_err}")

## Step 5: Merge All Scenes

In [22]:
def merge_scenes(video_files):
    """Concatenate the per-scene clips, in order, into ``final_video.mp4``.

    Args:
        video_files: Paths of the scene clips, in playback order.

    Returns:
        ``"final_video.mp4"`` (written to the current working directory) on
        success, or ``None`` if there was nothing to merge or MoviePy raised
        (the error is printed, not re-raised).
    """
    output_filename = "final_video.mp4"

    if not video_files:
        print("No scene clips to merge.")
        return None

    print("Merging all scenes into final video...")
    clips = []
    final_clip = None
    try:
        # Opened one at a time so clips that did open are still closed below
        # if a later file fails to load.
        for path in video_files:
            clips.append(VideoFileClip(path))
        final_clip = concatenate_videoclips(clips)
        final_clip.write_videofile(output_filename, fps=24, codec='libx264', audio_codec='aac')
        print(f"Done! Output: {output_filename}")
        return output_filename
    except Exception as e:
        print(f"Error merging scenes: {e}")
        return None
    finally:
        # Release every reader so the scene files are not left open.
        for clip in [final_clip, *clips]:
            if clip is None:
                continue
            try:
                clip.close()
            except Exception as close_err:
                # Cleanup is best effort and must not mask the real result.
                print(f"Warning: could not close a clip after merging: {close_err}")

## Execution Pipeline

In [23]:
async def main(topic):
    """Run the full pipeline for ``topic``: script, images, slides, audio, video.

    The generated plan is also written to ``video_plan.json``. Scenes without
    narration, or whose audio or clip could not be produced, are skipped. The
    function returns ``None`` in every case; progress is reported via prints.
    """
    # 1. Script
    script_data = generate_script(topic)
    if not script_data:
        return

    # Keep a copy of the plan on disk for reference.
    with open("video_plan.json", "w") as plan_file:
        json.dump(script_data, plan_file, indent=2)

    finished_clips = []
    for scene_no, scene in enumerate(script_data.get('scenes', []), start=1):
        print(f"Processing Scene {scene_no}: {scene.get('title')}")

        # 1.5 Illustration (Stable Diffusion), only when the plan has a prompt.
        image_prompt = scene.get('image_prompt')
        if image_prompt:
            raw_img_path = os.path.join(SCENE_DIR, f"scene_{scene_no}_raw.png")
            generated_img_path = generate_image_sd(image_prompt, raw_img_path)
        else:
            generated_img_path = None

        if not generated_img_path:
            # create_slide accepts None and produces a text-only slide.
            print("No image generated, using text-only layout.")

        # 2. Slide
        slide_path = create_slide(scene, scene_no, image_path=generated_img_path)

        # 3. Narration -> audio
        narration = scene.get('narration', '')
        if not narration:
            print(f"Warning: No narration for scene {scene_no}")
            continue

        audio_path = await generate_audio(narration, scene_no)
        if not audio_path:
            continue

        # 4. Slide + audio -> scene clip
        clip_path = create_video_clip(slide_path, audio_path, scene_no)
        if clip_path:
            finished_clips.append(clip_path)

    # 5. Merge
    if not finished_clips:
        print("No video clips were created.")
        return
    merge_scenes(finished_clips)

# Example Usage


In [24]:
# Run the whole pipeline for a sample topic. The bare top-level `await` relies
# on the notebook kernel's support for awaiting in a cell; in a plain Python
# script this would be `asyncio.run(main(...))` instead.
await main("The history of the internet")

Generating script for: The history of the internet...
Processing Scene 1: The Origins of the Internet
Generating image for: 1960's era computer with ARPANET equipment....


100%|██████████| 6/6 [00:29<00:00,  4.91s/it]


Generating audio for scene 1...
Creating video clip for scene 1 using MoviePy...
MoviePy - Building video output\video\scene_1.mp4.
MoviePy - Writing audio in scene_1TEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
MoviePy - Writing video output\video\scene_1.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready output\video\scene_1.mp4
Processing Scene 2: The Birth of the Internet
Generating image for: First webpage with hyperlinks in 1990's era computer....


100%|██████████| 6/6 [00:29<00:00,  4.92s/it]


Generating audio for scene 2...
Creating video clip for scene 2 using MoviePy...
MoviePy - Building video output\video\scene_2.mp4.
MoviePy - Writing audio in scene_2TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
MoviePy - Writing video output\video\scene_2.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready output\video\scene_2.mp4
Processing Scene 3: Evolution and Expansion of the Internet
Generating image for: Early FTP server interface on computer with internet connection....


100%|██████████| 6/6 [00:29<00:00,  4.91s/it]


Generating audio for scene 3...
Creating video clip for scene 3 using MoviePy...
MoviePy - Building video output\video\scene_3.mp4.
MoviePy - Writing audio in scene_3TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
MoviePy - Writing video output\video\scene_3.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready output\video\scene_3.mp4
Processing Scene 4: The Internet Today
Generating image for: Smartphone displaying a web browser with popular social media icons....


100%|██████████| 6/6 [00:29<00:00,  4.92s/it]


Generating audio for scene 4...
Creating video clip for scene 4 using MoviePy...
MoviePy - Building video output\video\scene_4.mp4.
MoviePy - Writing audio in scene_4TEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
MoviePy - Writing video output\video\scene_4.mp4



                                                                         

MoviePy - Done !
MoviePy - video ready output\video\scene_4.mp4
Merging all scenes into final video...
MoviePy - Building video final_video.mp4.
MoviePy - Writing audio in final_videoTEMP_MPY_wvf_snd.mp4


                                                                      

MoviePy - Done.
MoviePy - Writing video final_video.mp4



                                                                           

MoviePy - Done !
MoviePy - video ready final_video.mp4
Done! Output: final_video.mp4


