<a href="https://colab.research.google.com/github/uth-ara/single-video-generator/blob/main/videogen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers diffusers accelerate moviepy pillow numpy python-dotenv




In [1]:
%%writefile main.py
#!/usr/bin/env python3
"""
main.py — Single-Video Generator CLI

Modes:
- --mock     : creates a simple MP4 locally (text on white background) for testing/demo.
- --cogvideo : uses an open-source text-to-video model via Diffusers
               (damo-vilab/text-to-video-ms-1.7b; not CogVideo, despite the flag name).

Inputs (CLI):
  --prompt   : text prompt (subject + action)
  --style    : style string (e.g. cartoon, cinematic)
  --angle    : camera angle string (e.g. top-down, side)
  --duration : duration in seconds (values > 10 are capped at 10; non-positive values fall back to 3)
"""

import argparse
from pathlib import Path


# Output directory for generated videos.
# The path is relative, so it resolves against the current working directory.
# The directory is created at module load time (no error if it already exists).
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)


# Prompt construction

def build_prompt(subject_action: str, style: str, angle: str) -> str:
    """
    Combine the different CLI fields into a single text prompt.

    Surrounding whitespace is stripped from every field. A period is appended
    to the subject only when it does not already end with sentence-ending
    punctuation, so "A dog runs." does not become "A dog runs..".

    Args:
        subject_action: Subject + action text (e.g. "A dog chasing a frisbee").
        style: Style string (e.g. "cartoon", "cinematic").
        angle: Camera angle string (e.g. "top-down", "side").

    Returns:
        A prompt of the form "<subject>. Style: <style>. Camera angle: <angle>."
    """
    subject = subject_action.strip()
    # Only terminate the sentence ourselves if the user did not already do so.
    if not subject.endswith((".", "!", "?")):
        subject += "."
    return f"{subject} Style: {style.strip()}. Camera angle: {angle.strip()}."

# Mock generator

def generate_mock_video(prompt: str, duration: float, out_path: Path) -> Path:
    """
    Simple local mock generator: writes a white background video with the prompt text.
    For testing the pipeline without a GPU or heavy model.

    The clip is 640x360 at 24 fps and contains the same still frame repeated
    for the whole duration. At least one frame is always written, so very
    small (or non-positive) durations still produce a valid file.

    Args:
        prompt: Text drawn in black onto the white frame.
        duration: Clip length in seconds.
        out_path: Destination of the MP4 file; its parent directory is created
            if missing.

    Returns:
        ``out_path``, for convenience.
    """
    import textwrap

    import numpy as np
    from PIL import Image, ImageDraw, ImageFont

    # Import from the submodule rather than ``moviepy.editor``: the ``editor``
    # namespace was removed in moviepy 2.x, while this path exists in 1.x and 2.x.
    from moviepy.video.io.ImageSequenceClip import ImageSequenceClip

    w, h = 640, 360
    fps = 24
    # An empty frame list cannot be encoded, so always emit at least one frame.
    total_frames = max(1, int(duration * fps))

    # Wrap long lines so the text stays on the canvas; explicit newlines in the
    # prompt are preserved. The width is a conservative character estimate for
    # the small default font, not a pixel-accurate measurement.
    wrapped = "\n".join(
        textwrap.fill(line, width=80) for line in prompt.splitlines()
    )

    # Every frame is identical, so render it once and reuse the same array.
    font = ImageFont.load_default()
    img = Image.new("RGB", (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.multiline_text((20, 20), wrapped, fill=(0, 0, 0), font=font)
    frame = np.array(img)
    frames = [frame] * total_frames

    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(str(out_path), fps=fps, codec="libx264", audio=False)
    return out_path



# Real text-to-video model (damo-vilab/text-to-video-ms-1.7b)

def call_text2video_ms(prompt: str, duration: float, out_path: Path) -> Path:
    """
    Generate a short video using damo-vilab/text-to-video-ms-1.7b via Diffusers.

    For Colab T4 and similar GPUs we:
    - use half precision when possible
    - enable CPU offload & VAE slicing to reduce VRAM usage
    - keep the clip short (1–2 seconds) so it fits comfortably

    Because of the last point, the requested duration is clamped to 8–16
    frames at 8 fps. When the request falls outside that window a notice is
    printed so the shorter (or longer) clip is not a surprise.

    Args:
        prompt: Full text prompt passed to the pipeline.
        duration: Requested clip length in seconds (clamped, see above).
        out_path: Destination of the video file; its parent directory is
            created if missing.

    Returns:
        ``out_path``, for convenience.
    """
    import torch
    from diffusers import DiffusionPipeline
    from diffusers.utils import export_to_video

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device != "cuda":
        print("⚠ Warning: no CUDA GPU detected, this may be extremely slow or fail.")

    model_id = "damo-vilab/text-to-video-ms-1.7b"

    if device == "cuda":
        pipe = DiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            variant="fp16",
        )
        # Memory optimizations for lower VRAM GPUs (eg T4)
        pipe.enable_model_cpu_offload()
        pipe.enable_vae_slicing()
    else:
        pipe = DiffusionPipeline.from_pretrained(model_id)

    fps = 8
    min_frames, max_frames = 8, 16  # ~1–2 seconds at 8 fps

    requested_frames = int(duration * fps)
    num_frames = max(min_frames, min(requested_frames, max_frames))
    if num_frames != requested_frames:
        # Tell the user when the clip will not match the requested duration.
        print(
            f"Note: requested {duration:g}s, but this mode renders between "
            f"{min_frames} and {max_frames} frames; generating "
            f"{num_frames / fps:g}s instead."
        )

    print(f"Generating video with text-to-video-ms-1.7b: {num_frames} frames at {fps} fps.")
    result = pipe(prompt, num_frames=num_frames)
    # NOTE(review): frames[0] presumably selects the first (only) video of a
    # batched output — confirm against the installed diffusers version.
    frames = result.frames[0]

    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    export_to_video(frames, str(out_path), fps=fps)
    return out_path



# CLI entrypoint

def main() -> None:
    """
    Parse CLI arguments and write a single video to ``OUTPUT_DIR / "output.mp4"``.

    Duration handling: values that are not strictly positive (including NaN)
    fall back to 3 seconds; values above 10 are capped at 10.

    Mode handling: ``--cogvideo`` runs the real text-to-video model and wins
    when both flags are given; otherwise the mock generator is used, whether
    ``--mock`` was passed explicitly or no mode was selected.
    """
    parser = argparse.ArgumentParser(
        description="Single-video generator CLI (mock or text-to-video model)."
    )

    parser.add_argument(
        "--prompt",
        required=True,
        help="Subject + action (e.g. 'A dog chasing a frisbee')",
    )
    parser.add_argument(
        "--style",
        default="cinematic",
        help="Style string (e.g. cartoon, cinematic, aesthetic)",
    )
    parser.add_argument(
        "--angle",
        default="side",
        help="Camera angle (e.g. top-down, side, wide)",
    )
    parser.add_argument(
        "--duration",
        type=float,
        default=3.0,
        help="Duration in seconds (recommended <= 10)",
    )
    parser.add_argument(
        "--mock",
        action="store_true",
        help="Use mock generator (no heavy model)",
    )
    parser.add_argument(
        "--cogvideo",
        action="store_true",
        help="Use real text-to-video model via Diffusers "
             "(damo-vilab/text-to-video-ms-1.7b)",
    )

    args = parser.parse_args()

    # Build combined prompt
    full_prompt = build_prompt(args.prompt, args.style, args.angle)
    out_path = OUTPUT_DIR / "output.mp4"

    # Simple guard against silly durations.
    # `not (x > 0)` rather than `x <= 0` so that NaN (which `type=float`
    # accepts) is rejected here instead of crashing later in int(duration * fps).
    if not (args.duration > 0):
        print("Duration must be positive. Using default of 3 seconds.")
        args.duration = 3.0
    elif args.duration > 10:
        print("Duration capped at 10 seconds as per task requirement.")
        args.duration = 10.0

    # Mode selection
    if args.mock and args.cogvideo:
        print("Both --mock and --cogvideo given; using --cogvideo (real model).")

    if args.cogvideo:
        print("Using damo-vilab/text-to-video-ms-1.7b (real text-to-video model) via Diffusers...")
        call_text2video_ms(full_prompt, args.duration, out_path)
        print("Text-to-video output written to", out_path)
        return

    # Explicit --mock and "no mode given" share the same generator; only the
    # announcement differs.
    if args.mock:
        print("Using mock generator...")
    else:
        print("No mode selected. Defaulting to mock mode.")
    generate_mock_video(full_prompt, args.duration, out_path)
    print("Mock video written to", out_path)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Writing main.py


In [None]:
!python main.py --prompt "Clouds moving slowly in the sky" --style natural --angle wide --duration 2 --cogvideo


Using damo-vilab/text-to-video-ms-1.7b (real text-to-video model) via Diffusers...
2025-12-04 18:01:30.602742: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764871290.627909    3220 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764871290.635296    3220 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764871290.654207    3220 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764871290.654248    3220 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W000

In [None]:
from IPython.display import Video
Video("outputs/output.mp4", embed=True)
