<a href="https://colab.research.google.com/github/surajit93/open-university-video/blob/main/open_university_video_renderer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import os
from TTS.api import TTS
import random

# Point both espeak lookups at the system binary (required for phonemizer in CI).
os.environ["PHONEMIZER_ESPEAK_PATH"] = "/usr/bin/espeak"
os.environ["ESPEAK_PATH"] = "/usr/bin/espeak"

# Load the VCTK VITS model once for the whole kernel session.
# `gpu=False` requests CPU inference; `progress_bar=False` keeps the cell output quiet.
tts = TTS(
    model_name="tts_models/en/vctk/vits",
    progress_bar=False,
    gpu=False
)

import re

# Speaker ids that may be picked for a narration.
VOICE_POOL = ["p225","p226","p227","p228","p229","p230"]

# A run of periods that closes a sentence (followed by whitespace or end of text).
# Periods inside numbers or identifiers ("3.5", "v1.2") are deliberately not matched.
_SENTENCE_END = re.compile(r"\.+(?=\s|$)")
# A comma glued to the next word. Commas inside numbers ("1,000") are left alone.
_TIGHT_COMMA = re.compile(r",(?=[^\s\d])")


def _add_speech_pauses(text):
    """Return `text` with sentence ends stretched to "... " and tight commas spaced out."""
    text = _SENTENCE_END.sub("... ", text)
    return _TIGHT_COMMA.sub(", ", text)


def generate_dynamic_voice(text, out_file, speaker=None):
    """Synthesize `text` to `out_file` using the module-level `tts` engine.

    Args:
        text: Narration text. Sentence-ending periods are turned into "... " and
            commas glued to the following word get a space, to slow the delivery.
        out_file: Path of the audio file to write.
        speaker: Optional speaker id. When omitted, one is drawn at random from
            VOICE_POOL (the previous behaviour); pass an id for reproducible output.
    """
    if speaker is None:
        speaker = random.choice(VOICE_POOL)

    tts.tts_to_file(
        text=_add_speech_pauses(text),
        file_path=out_file,
        speaker=speaker
    )

print("Dynamic TTS engine ready")

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
Dynamic TTS engine ready


In [8]:
from pathlib import Path
import json
import os
# Show where the kernel is running and which files are present, to make a
# missing input file easy to spot in the cell output.
print(os.getcwd())
print(os.listdir())

# Both inputs are read relative to the current working directory.
slide_plan = json.loads(Path("slide_plan.json").read_text())
audio_map  = json.loads(Path("slide_audio_map.json").read_text())

# Only the counts are compared here; the slide ids themselves are not cross-checked.
assert len(slide_plan["slides"]) == len(audio_map)
print("Slides loaded:", len(slide_plan["slides"]))


/content
['.config', 'raw_2.wav', 'voice_2.wav', 'voice_5.wav', 'tts_env', 'voice_4.wav', 'raw_10.wav', 'voice_9.wav', 'raw_4.wav', 'raw_7.wav', 'voice_6.wav', 'raw_8.wav', 'voice_8.wav', 'voice_7.wav', 'voice_10.wav', 'slide_plan.json', 'raw_6.wav', 'raw_1.wav', 'raw_3.wav', 'voice_3.wav', 'voice_1.wav', 'asset_cache', 'slide_audio_map.json', 'raw_9.wav', 'drive', 'raw_5.wav', 'sample_data']
Slides loaded: 10


In [9]:
import librosa
import numpy as np

def detect_energy_curve(audio_file):
    """Return the mean RMS energy of `audio_file` as a single number.

    The file is loaded at its native sample rate (`sr=None`). Despite the name,
    the per-frame curve is averaged and only the mean is returned.
    """
    samples, _native_rate = librosa.load(audio_file, sr=None)
    rms_per_frame = librosa.feature.rms(y=samples)[0]
    return np.mean(rms_per_frame)

def adaptive_chunk_duration(audio_file):
    """Pick a scene length in seconds from the narration's mean energy.

    Returns 3 when the mean energy is above 0.08, 4 when it is above 0.05,
    and 5 otherwise, so louder narration gets shorter scenes.
    """
    mean_energy = detect_energy_curve(audio_file)
    # Thresholds are checked from loudest to quietest; the first match wins.
    for threshold, seconds in ((0.08, 3), (0.05, 4)):
        if mean_energy > threshold:
            return seconds
    return 5


In [10]:
import re

# Keywords that are highlighted in the kinetic text layer.
IMPORTANT_WORDS = [
    "money","power","rich","elite","secret","danger",
    "control","future","mistake","warning","truth",
    "hidden","win","lose","status","psychology"
]

def word_style(word):
    """Return the colour and scale used to render `word`.

    Matching is case-insensitive and ignores punctuation around the word, so
    tokens produced by `str.split()` such as "Money." or '"power!"' are still
    recognised as important.

    Returns:
        dict with keys "color" and "scale": gold at 1.3x for an important word,
        white at 1.0x otherwise.
    """
    # Trim leading/trailing non-word characters (quotes, commas, periods, ...).
    core = re.sub(r"^\W+|\W+$", "", word.lower())
    if core in IMPORTANT_WORDS:
        return {"color":"#FFD700","scale":1.3}
    return {"color":"white","scale":1.0}


In [11]:
%%bash
# Synthesize one narration track per entry of slide_audio_map.json.
# For every slide this writes raw_<slide_id>.wav (TTS output) and
# voice_<slide_id>.wav (tempo-shifted, 22050 Hz, mono) into the working directory.
source /content/tts_env/bin/activate

export PHONEMIZER_ESPEAK_PATH=/usr/bin/espeak
export ESPEAK_PATH=/usr/bin/espeak

# The heredoc delimiter is quoted so bash hands the Python source over verbatim:
# no $-expansion, backtick substitution or backslash processing inside the script.
python - <<'EOF'
import json
import random
import re
import subprocess
from pathlib import Path
from TTS.api import TTS

slide_plan = json.loads(Path("slide_plan.json").read_text())
audio_map  = json.loads(Path("slide_audio_map.json").read_text())

tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=False, gpu=False)

VOICE_POOL = ["p225","p226","p227","p228","p229","p230"]

# A run of periods that closes a sentence (followed by whitespace or end of text);
# periods inside numbers such as "3.5" are not touched.
SENTENCE_END = re.compile(r"\.+(?=\s|$)")
# A comma glued to the next word; commas inside numbers ("1,000") are left alone.
TIGHT_COMMA = re.compile(r",(?=[^\s\d])")

for item in audio_map:
    sid  = item["slide_id"]
    text = item["spoken_text"]

    raw   = f"raw_{sid}.wav"
    final = f"voice_{sid}.wav"

    # NOTE(review): voice and tempo are drawn from an unseeded RNG, so every run
    # produces different audio. Seed `random` here if builds must be reproducible.
    speaker = random.choice(VOICE_POOL)

    # Stretch sentence ends into "... " and space out tight commas to add pauses.
    text = SENTENCE_END.sub("... ", text)
    text = TIGHT_COMMA.sub(", ", text)

    tts.tts_to_file(text=text, file_path=raw, speaker=speaker)

    speed = random.uniform(0.95,1.1)

    # Re-time the raw track and normalise it to 22050 Hz mono.
    # check=True aborts the script if ffmpeg fails.
    subprocess.run([
        "ffmpeg","-y",
        "-i",raw,
        "-filter:a",f"atempo={speed}",
        "-ar","22050",
        "-ac","1",
        final
    ],check=True)

print("Voice files ready")
EOF

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > Text splitted to sentences.
['Most people never become rich...', 'Not because they lack intelligence...', 'But because they lack coura

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [12]:
from moviepy.editor import AudioFileClip

def emotional_music(audio_file):
    """Return a music volume factor chosen from the narration's mean energy.

    Gives 0.25 when the mean energy is above 0.08 and 0.15 otherwise.
    Only the number is returned; no audio is loaded or modified here.
    """
    is_energetic = detect_energy_curve(audio_file) > 0.08
    return 0.25 if is_energetic else 0.15


In [13]:
import os
import requests
import hashlib
import numpy as np
from PIL import Image, ImageFilter, ImageEnhance
from moviepy.editor import ImageClip, CompositeVideoClip
from io import BytesIO

# Output frame size in pixels (width, height); every image and frame builder below uses it.
W, H = 1280, 720

# Downloaded background images are stored here, relative to the working directory.
CACHE_DIR = "asset_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Fixed seed for deterministic CI builds: it is mixed into each slide's image seed.
VIDEO_SEED = 482731


# =========================================
# INTELLIGENT QUERY BUILDER
# =========================================

# Maps a slide's visual strategy to the (style, mood) keywords appended to its image query.
VISUAL_STRATEGY_MAP = {
    "CONCEPT_DIAGRAM": ("abstract illustration", "cinematic lighting"),
    "FLOW_DIAGRAM": ("modern infographic workflow", "clean professional"),
    "SYSTEM_DIAGRAM": ("digital network architecture", "dark technology"),
    "TEXT_ONLY": ("cinematic photography", "professional documentary"),
    "GAME_SCENE": ("3d game environment", "epic dramatic lighting"),
    "CASE_STUDY": ("corporate office real people", "neutral business tone"),
    "PSYCHOLOGY_BREAKDOWN": ("emotional portrait conceptual art", "moody shadows"),
    "HISTORICAL_ANALYSIS": ("historical setting realistic photography", "dramatic atmosphere"),
    "FUTURE_PREDICTION": ("futuristic cityscape sci fi", "high contrast cinematic"),
    "DATA_INSIGHT": ("data visualization abstract background", "minimal modern")
}


def build_visual_query(slide):
    """Build the image-search query for a slide.

    The query is the slide's title, right-panel gist and left-panel description,
    followed by the style and mood keywords of its visual strategy. Unknown or
    missing strategies fall back to "TEXT_ONLY".

    Missing, null or blank fields are skipped, so the result never contains
    "None", doubled separators or leading whitespace. A `left_panel_plan` that
    is absent, null or not a dict is treated as empty.

    Args:
        slide: dict describing one slide.

    Returns:
        A single space-separated query string.
    """
    plan = slide.get("left_panel_plan")
    if not isinstance(plan, dict):
        plan = {}

    fragments = (
        slide.get("title"),
        slide.get("right_panel_gist"),
        plan.get("description"),
    )

    style, mood = VISUAL_STRATEGY_MAP.get(
        slide.get("visual_strategy", "TEXT_ONLY"),
        VISUAL_STRATEGY_MAP["TEXT_ONLY"]
    )

    parts = [str(fragment).strip() for fragment in fragments if fragment is not None]
    parts.extend((style, mood))
    return " ".join(part for part in parts if part)



# =========================================
# CACHE SYSTEM
# =========================================

def get_cache_path(key):
    """Return the cache file path for `key`: `<CACHE_DIR>/<md5 hex of key>.jpg`."""
    digest = hashlib.md5(key.encode()).hexdigest()
    filename = f"{digest}.jpg"
    return os.path.join(CACHE_DIR, filename)


# =========================================
# IMAGE FETCHERS
# =========================================

def fetch_unsplash(query, seed):
    """Try to download a 1600x900 image for `query` from source.unsplash.com.

    Args:
        query: Search text. It is percent-encoded so characters such as "&",
            "#" or "?" in a slide title cannot break the URL.
        seed: Value sent as the `sig` parameter.

    Returns:
        An RGB `PIL.Image`, or None when the request fails, times out (10 s),
        returns a non-200 status or the payload cannot be decoded. Failures are
        reported on stdout instead of being silently discarded.
    """
    from urllib.parse import quote

    # NOTE(review): confirm this endpoint is still served. If it is not, every
    # call returns None and the caller falls through to its next image source.
    try:
        url = f"https://source.unsplash.com/1600x900/?{quote(str(query))}&sig={seed}"
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            return Image.open(BytesIO(r.content)).convert("RGB")
        print(f"fetch_unsplash: HTTP {r.status_code} (seed {seed})")
    except Exception as exc:
        # `Exception` (not a bare except) keeps the fetch best-effort while
        # still letting KeyboardInterrupt / SystemExit stop a long render.
        print(f"fetch_unsplash: failed ({exc!r})")
    return None


def fetch_picsum(seed):
    """Try to download the 1600x900 picsum.photos image selected by `seed`.

    Returns:
        An RGB `PIL.Image`, or None when the request fails, times out (10 s),
        returns a non-200 status or the payload cannot be decoded. Failures are
        reported on stdout instead of being silently discarded.
    """
    try:
        url = f"https://picsum.photos/seed/{seed}/1600/900"
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            return Image.open(BytesIO(r.content)).convert("RGB")
        print(f"fetch_picsum: HTTP {r.status_code} (seed {seed})")
    except Exception as exc:
        # `Exception` (not a bare except) keeps the fetch best-effort while
        # still letting KeyboardInterrupt / SystemExit stop a long render.
        print(f"fetch_picsum: failed ({exc!r})")
    return None


# =========================================
# FETCH + CACHE WITH REDUNDANCY
# =========================================

def fetch_and_cache(query, seed):
    """Return a W x H RGB background image for `query` / `seed`.

    Lookup order:
      1. the local cache file for "<query>-<seed>";
      2. `fetch_unsplash(query, seed)`;
      3. `fetch_picsum(seed)`;
      4. a plain dark (20, 20, 25) image, which is not cached.

    An unreadable cache file (for example one left half-written by an
    interrupted run) is ignored and the image is fetched again. A downloaded
    image is written to a temporary file and then renamed into place. If the
    cache cannot be written, the image is still returned.
    """
    path = get_cache_path(f"{query}-{seed}")

    # 1. Local cache first.
    if os.path.exists(path):
        try:
            with Image.open(path) as cached:
                return cached.convert("RGB").resize((W, H))
        except (OSError, SyntaxError, ValueError) as exc:
            print(f"fetch_and_cache: ignoring unreadable cache file {path} ({exc!r})")

    # 2. Primary source, then 3. secondary source.
    img = fetch_unsplash(query, seed)
    if img is None:
        img = fetch_picsum(seed)

    # 4. Hard fallback when nothing could be downloaded.
    if img is None:
        return Image.new("RGB", (W, H), (20, 20, 25))

    img = img.resize((W, H))

    # Write to a temporary name and rename, so a crash mid-write cannot leave a
    # truncated file at the final cache path. The temporary name has no ".jpg"
    # suffix, so the format must be given explicitly.
    tmp_path = f"{path}.part"
    try:
        img.save(tmp_path, format="JPEG")
        os.replace(tmp_path, path)
    except OSError as exc:
        print(f"fetch_and_cache: could not write cache file {path} ({exc!r})")

    return img


# =========================================
# CINEMATIC COLOR GRADING (LUT STYLE)
# =========================================

def cinematic_grade(img):
    """Apply the fixed colour grade to `img` and return a new RGB image.

    Steps: saturation x0.85, contrast x1.25, brightness x0.9, then a dark
    blue-grey overlay (10, 15, 25) at alpha 120/255.

    The overlay is sized from the image itself, so the function accepts images
    of any size. `Image.alpha_composite` requires both images to match.
    """
    img = ImageEnhance.Color(img).enhance(0.85)
    img = ImageEnhance.Contrast(img).enhance(1.25)
    img = ImageEnhance.Brightness(img).enhance(0.9)

    # Dark cinematic overlay, matched to the image's own dimensions.
    overlay = Image.new("RGBA", img.size, (10, 15, 25, 120))
    img = img.convert("RGBA")
    img = Image.alpha_composite(img, overlay).convert("RGB")

    return img


# =========================================
# PARALLAX DEPTH SYSTEM
# =========================================

def create_parallax_layers(img):
    """Derive the three layers used for the parallax effect.

    Returns:
        (base, mid, front): a Gaussian-blurred copy (radius 2.5), the image
        itself, and an unsharp-masked copy (radius 2, 150 percent).
    """
    blurred = img.filter(ImageFilter.GaussianBlur(radius=2.5))
    sharpened = img.filter(ImageFilter.UnsharpMask(radius=2, percent=150))
    return blurred, img, sharpened


# =========================================
# MASTER BACKGROUND CLIP BUILDER
# =========================================

def build_background_clip(slide, duration, slide_index):
    """Build the animated background for one scene.

    The image query comes from `build_visual_query(slide)`. The image seed is
    derived from VIDEO_SEED, the query and `slide_index`, so the same inputs
    always select the same image. The graded image is split into three layers
    that zoom at different rates over `duration` seconds.

    Returns:
        A CompositeVideoClip stacking the base, mid and front layers.
    """
    query = build_visual_query(slide)

    # Stable per-slide seed in the range 0..99999.
    digest = hashlib.md5(f"{VIDEO_SEED}-{query}-{slide_index}".encode()).hexdigest()
    seed = int(digest, 16) % 100000

    graded = cinematic_grade(fetch_and_cache(query, seed))

    base_clip, mid_clip, front_clip = (
        ImageClip(np.array(layer)).set_duration(duration)
        for layer in create_parallax_layers(graded)
    )

    def ease_in_zoom(t):
        # Quadratic ease-in: the zoom accelerates towards 1.07 at the end.
        return 1 + 0.07 * ((t / duration) ** 2)

    def linear_zoom(amount):
        # Zoom factor growing linearly from 1 to 1 + amount.
        return lambda t: 1 + amount * (t / duration)

    # NOTE(review): all three layers look opaque and full-frame, so the upper
    # layers appear to cover the lower ones. Confirm the parallax is actually
    # visible in the rendered output.
    return CompositeVideoClip([
        base_clip.resize(ease_in_zoom),
        mid_clip.resize(linear_zoom(0.045)),
        front_clip.resize(linear_zoom(0.02)),
    ])


In [14]:
from moviepy.editor import AudioFileClip, CompositeAudioClip

from pathlib import Path

BACKGROUND_MUSIC = "background.mp3"  # Put a royalty free mp3 in repo

def add_background_music(video, volume=0.15):
    """Mix the background music file under the video's own audio.

    Args:
        video: Clip to add music to.
        volume: Gain applied to the music (default 0.15, the previous fixed value).

    Returns:
        `video` unchanged when BACKGROUND_MUSIC does not exist. Otherwise a
        copy of `video` whose audio is its original track mixed with the music,
        or the music alone when the video has no audio track.
    """
    if not Path(BACKGROUND_MUSIC).exists():
        return video

    music = AudioFileClip(BACKGROUND_MUSIC).volumex(volume)

    # Trim music that outlasts the video. Otherwise the mixed track, and with
    # it the rendered file's audio, would run past the last frame.
    if video.duration is not None and music.duration > video.duration:
        music = music.subclip(0, video.duration)

    # A clip without audio has `audio is None`, which must not be passed to
    # CompositeAudioClip.
    if video.audio is None:
        return video.set_audio(music)

    final_audio = CompositeAudioClip([video.audio, music])
    return video.set_audio(final_audio)


In [15]:
# ---- PURE PIL TEXT ENGINE (NO IMAGEMAGICK) ----

from PIL import Image, ImageDraw, ImageFont
import numpy as np
from moviepy.editor import ImageClip

FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"

def create_text_image(text, fontsize=90, color="white"):
    """Render `text` centred on a transparent W x H canvas.

    Args:
        text: String to draw.
        fontsize: Font size passed to `ImageFont.truetype` for FONT_PATH.
        color: Fill colour of the glyphs.

    Returns:
        The RGBA canvas as a numpy array.
    """
    img = Image.new("RGBA", (W, H), (0,0,0,0))
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(FONT_PATH, fontsize)

    left, top, right, bottom = draw.textbbox((0,0), text, font=font)
    text_width = right - left
    text_height = bottom - top

    # The bounding box usually does not start at the drawing origin (fonts
    # leave space above and beside the glyphs). Subtract that offset so the
    # visible ink, not the text origin, ends up centred.
    position = (
        (W - text_width) // 2 - left,
        (H - text_height) // 2 - top,
    )
    draw.text(position, text, font=font, fill=color)

    return np.array(img)

def pillow_text_clip(text, duration, fontsize=90, color="white"):
    """Return an ImageClip of `text`, rendered by `create_text_image`, lasting `duration` seconds."""
    frame = create_text_image(text, fontsize, color)
    clip = ImageClip(frame)
    return clip.set_duration(duration)


In [16]:

def animated_counter(start, end, duration):
    """Return a clip showing a yellow number counting from `start` to `end`.

    The displayed value is interpolated linearly over `duration` seconds and
    truncated to an integer. `create_text_image` produces RGBA frames, so the
    colour channels become the clip's frames and the alpha channel becomes its
    mask. This keeps the frames 3-channel and the area around the digits
    transparent when the clip is composited.
    """
    # Imported here so the function does not rely on a star import from another cell.
    from moviepy.editor import VideoClip

    def render(t):
        # Guard against a zero-length clip: show the final value.
        progress = t / duration if duration else 1.0
        value = int(start + (end - start) * progress)
        return create_text_image(str(value), fontsize=120, color="yellow")

    def make_frame(t):
        return render(t)[:, :, :3]

    def make_mask(t):
        # Mask frames are 2-D floats in [0, 1].
        return render(t)[:, :, 3] / 255.0

    counter = VideoClip(make_frame, duration=duration)
    return counter.set_mask(VideoClip(make_mask, ismask=True, duration=duration))


In [17]:
def infographic_bar(value, duration):
    """Return a clip of a horizontal bar growing from the left edge.

    Args:
        value: Final bar length as a fraction of the frame width. The drawn
            width is clamped to [0, W], so a negative value draws nothing and a
            value above 1 fills the full width.
        duration: Clip length in seconds. The bar grows linearly over this time.

    Returns:
        A VideoClip of W x H frames: a 40 px tall (255, 200, 0) bar on black.
    """
    from moviepy.editor import VideoClip
    def make_frame(t):
        img = np.zeros((H,W,3),dtype=np.uint8)
        # Guard against a zero-length clip: show the finished bar.
        progress = t / duration if duration else 1.0
        # Clamp: a negative slice end would otherwise count from the right
        # edge and fill most of the bar.
        width = max(0, min(W, int(progress*W*value)))
        img[H//2-20:H//2+20,0:width] = (255,200,0)
        return img
    return VideoClip(make_frame,duration=duration)


In [18]:
def moving_arrow(duration):
    """Return a clip of a red bar sliding left to right across the frame centre.

    The bar is 100 px wide and 10 px tall and crosses the full width once in
    `duration` seconds. The clip carries a mask that is opaque only where the
    bar is. Without it the black remainder of each frame is opaque and hides
    every layer placed underneath when the clip is used as an overlay.
    """
    # Imported here so the function does not rely on a star import from another cell.
    from moviepy.editor import VideoClip

    def bar_columns(t):
        # Guard against a zero-length clip: keep the bar at the left edge.
        progress = t / duration if duration else 0.0
        x = int(progress * W)
        return x, x + 100

    def make_frame(t):
        img = np.zeros((H,W,3),dtype=np.uint8)
        x0, x1 = bar_columns(t)
        img[H//2:H//2+10,x0:x1] = (255,0,0)
        return img

    def make_mask(t):
        # Mask frames are 2-D floats: 1.0 where the bar is, 0.0 elsewhere.
        mask = np.zeros((H,W),dtype=float)
        x0, x1 = bar_columns(t)
        mask[H//2:H//2+10,x0:x1] = 1.0
        return mask

    arrow = VideoClip(make_frame,duration=duration)
    return arrow.set_mask(VideoClip(make_mask,ismask=True,duration=duration))


In [19]:
from moviepy.editor import *
import math

def kinetic_text_layer(text, duration):
    """Build a layer that flashes the words of `text` one after another.

    Each word is shown for `duration / word_count` seconds (at least 0.25 s),
    coloured and scaled by `word_style`, pulsing slightly and fading in and out.
    Words that would start after `duration` are cut off by the final
    `set_duration`.

    Args:
        text: Text to animate. Empty or whitespace-only text yields a fully
            transparent layer instead of an error.
        duration: Length of the returned layer in seconds.
    """
    words = text.split()

    if not words:
        # CompositeVideoClip cannot be built from an empty clip list.
        blank = np.zeros((H, W, 4), dtype=np.uint8)
        return ImageClip(blank).set_duration(duration)

    clips = []
    per = max(0.25, duration / len(words))
    t_cursor = 0

    for w in words:
        style = word_style(w)

        clip = (pillow_text_clip(
                    w,
                    per,
                    fontsize=90,
                    color=style["color"]
                )
                .set_start(t_cursor)
                # Bind this word's scale as a default argument. The resize
                # function runs lazily at render time, when a plain closure
                # over `style` would see the last word's style for every clip.
                .resize(lambda t, scale=style["scale"]: scale + 0.15*math.sin(t*6))
                .fadein(0.1)
                .fadeout(0.1)
                .set_position(("center","center"))
        )

        clips.append(clip)
        t_cursor += per

    return CompositeVideoClip(clips).set_duration(duration)


In [20]:
# Install the DejaVu fonts, rebuild the font cache, and list the bold face to
# confirm that the file the text renderer loads (FONT_PATH) is present.
!apt-get install -y fonts-dejavu-core
!fc-cache -fv
!ls /usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-dejavu-core is already the newest version (2.37-2build1).
0 upgraded, 0 newly installed, 0 to remove and 76 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/dejavu: caching, new cache contents: 6 fonts, 0 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/dejavu: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truet

In [21]:
# Build one composited scene per audio chunk of every slide and collect them in
# `segments`. The list is reset first, so re-running the cell does not append duplicates.
segments=[]
import math
from pathlib import Path
import json
from moviepy.editor import AudioFileClip, CompositeVideoClip

# Reload the inputs so this cell does not depend on the earlier loading cell.
slide_plan = json.loads(Path("slide_plan.json").read_text())
audio_map  = json.loads(Path("slide_audio_map.json").read_text())
# Only the counts are compared; narration files are looked up by slide id below.
assert len(slide_plan["slides"]) == len(audio_map)
print("Slides loaded:", len(slide_plan["slides"]))

for slide_index, slide in enumerate(slide_plan["slides"]):
    sid  = slide["slide_id"]
    text = slide.get("right_panel_gist","")
    # Narration file for this slide; it must already exist in the working directory.
    audio_file=f"voice_{sid}.wav"

    audio= AudioFileClip(audio_file)
    total=audio.duration
    # Scene length in seconds (3, 4 or 5), chosen from the narration's mean energy.
    chunk=adaptive_chunk_duration(audio_file)
    parts=math.ceil(total/chunk)

    for i in range(parts):
        start=i*chunk
        # The last chunk is clipped to the end of the narration, so it may be shorter.
        end=min((i+1)*chunk,total)
        dur=end-start

        # A fresh background clip is built for every chunk, with this chunk's duration.
        bg = build_background_clip(slide, dur, slide_index)
        # The full slide gist is animated again in every chunk of the slide.
        txt = kinetic_text_layer(text,dur)
        arrow = moving_arrow(dur)

        # Layers are listed bottom to top: background, arrow, text.
        scene = CompositeVideoClip([bg,arrow,txt])
        scene = scene.set_audio(audio.subclip(start,end))

        segments.append(scene.crossfadein(0.2))


Slides loaded: 10


In [22]:
# ---- PURE PIL TEXT ENGINE (NO IMAGEMAGICK) ----
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from moviepy.editor import ImageClip, AudioClip

# Assuming W and H are defined globally by earlier cells, as indicated by context
# W, H = 1280, 720 # If not global, uncomment and define them here

FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"

def create_text_image(text, fontsize=90, color="white"):
    """Render `text` centred on a transparent W x H canvas.

    Args:
        text: String to draw.
        fontsize: Font size passed to `ImageFont.truetype` for FONT_PATH.
        color: Fill colour of the glyphs.

    Returns:
        The RGBA canvas as a numpy array.
    """
    img = Image.new("RGBA", (W, H), (0,0,0,0))
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(FONT_PATH, fontsize)

    left, top, right, bottom = draw.textbbox((0,0), text, font=font)
    text_width = right - left
    text_height = bottom - top

    # The bounding box usually does not start at the drawing origin (fonts
    # leave space above and beside the glyphs). Subtract that offset so the
    # visible ink, not the text origin, ends up centred.
    position = (
        (W - text_width) // 2 - left,
        (H - text_height) // 2 - top,
    )
    draw.text(position, text, font=font, fill=color)

    return np.array(img)

def pillow_text_clip(text, duration, fontsize=90, color="white"):
    """Return an ImageClip of `text`, rendered by `create_text_image`, lasting `duration` seconds."""
    frame = create_text_image(text, fontsize, color)
    clip = ImageClip(frame)
    return clip.set_duration(duration)

# Title card: 3 seconds of white text, fading in and out over 1 second each.
intro = pillow_text_clip(
    "OPEN MEDIA UNIVERSITY",
    3,
    fontsize=100,
    color="white"
).fadein(1).fadeout(1).set_position("center")
# Attach an audio track that is constant 0 (silence) for the card's full length,
# so the card has audio like the narrated segments it is joined with.
intro = intro.set_audio(AudioClip(lambda t: 0, duration=intro.duration))

# End card: 4 seconds of yellow text, fading in over 1 second (no fade-out).
outro = pillow_text_clip(
    "SUBSCRIBE FOR NEXT EPISODE",
    4,
    fontsize=80,
    color="yellow"
).fadein(1).set_position("center")
# Same silent audio track as the title card, for the end card's full length.
outro = outro.set_audio(AudioClip(lambda t: 0, duration=outro.duration))


In [None]:
# Join the title card, all slide scenes and the end card into one timeline.
# `concatenate_videoclips` is available through the earlier
# `from moviepy.editor import *` cell, so that cell must have been run first.
final_video = concatenate_videoclips(
    [intro] + segments + [outro],
    method="compose"
)

# Returns the video unchanged when the background music file is absent.
final_video = add_background_music(final_video)
# Encode to final.mp4 in the working directory: 30 fps, H.264 video, AAC audio.
final_video.write_videofile("final.mp4", fps=30, codec="libx264", audio_codec="aac")



Moviepy - Building video final.mp4.
MoviePy - Writing audio in finalTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video final.mp4



t:   0%|          | 6/4123 [00:01<18:23,  3.73it/s, now=None]