<a href="https://colab.research.google.com/github/surajit93/open-university-video/blob/main/open_university_video_renderer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Remove WAV/PNG files and final.mp4 from the working directory so a fresh
# run does not pick up artifacts from a previous one.
!rm -rf *.wav *.png final.mp4

In [None]:
# System packages: eSpeak NG (program, data and shared library) plus ffmpeg.
# `-qq` keeps the package-index refresh quiet; `-y` auto-confirms the install.
!apt-get update -qq
!apt-get install -y espeak-ng espeak-ng-data libespeak-ng1 ffmpeg

In [None]:
# Python dependencies, installed quietly (-q) and without version pins.
!pip install -q torch torchaudio soundfile
!pip install -q "coqui-tts[codec]"
!pip install -q pillow moviepy requests

In [None]:
import os

# Both environment variables are pointed at the same eSpeak NG shared library
# (presumably the one installed by the libespeak-ng1 apt package).
ESPEAK_LIBRARY = "/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1"
os.environ.update(
    dict.fromkeys(("PHONEMIZER_ESPEAK_PATH", "ESPEAK_PATH"), ESPEAK_LIBRARY)
)

print("eSpeak configured")


In [None]:
from TTS.api import TTS

# Load the English VCTK VITS model with the progress bar disabled and
# gpu=False (CPU inference).
tts_options = {
    "model_name": "tts_models/en/vctk/vits",
    "progress_bar": False,
    "gpu": False,
}
tts = TTS(**tts_options)

print("TTS model loaded")


In [None]:
from pathlib import Path
import json

# Input files produced upstream: the slide layout plan and the per-slide
# narration map. Both are read as UTF-8 JSON from the working directory.
SLIDE_PLAN_PATH = Path("slide_plan.json")
AUDIO_MAP_PATH  = Path("slide_audio_map.json")

with SLIDE_PLAN_PATH.open(encoding="utf-8") as plan_fh:
    slide_plan = json.load(plan_fh)
with AUDIO_MAP_PATH.open(encoding="utf-8") as audio_map_fh:
    audio_map = json.load(audio_map_fh)

# Every slide needs exactly one narration entry.
slide_count = len(slide_plan["slides"])
narration_count = len(audio_map)
assert slide_count == narration_count, "Slide/audio mismatch"

print("Slides:", slide_count)
print("Narration blocks:", narration_count)


In [None]:
def safe_text(x):
    """Coerce a slide-plan value into a drawable string.

    None becomes "", dicts and lists are serialised as JSON (non-ASCII
    characters kept verbatim), and anything else goes through str().
    """
    if isinstance(x, (dict, list)):
        return json.dumps(x, ensure_ascii=False)
    return "" if x is None else str(x)


In [None]:
from TTS.api import TTS

import os

# Language to render, read from the environment (defaults to English).
RENDER_LANG = os.environ.get("RENDER_LANG", "en")

# language code -> (TTS model name, speaker id passed to tts_to_file)
LANG_MODELS = {
    "en": ("tts_models/en/vctk/vits", "p225"),
}

# Fail with an actionable message instead of a bare KeyError when the
# requested language has no model configured.
if RENDER_LANG not in LANG_MODELS:
    raise ValueError(
        f"Unsupported RENDER_LANG {RENDER_LANG!r}; "
        f"available languages: {sorted(LANG_MODELS)}"
    )

# Only the requested language is rendered.
LANG_MODELS = {RENDER_LANG: LANG_MODELS[RENDER_LANG]}


audio_files = []

for lang, (model, speaker) in LANG_MODELS.items():
    tts = TTS(model_name=model, progress_bar=False, gpu=False)

    # One WAV per narration entry, named voice_<lang>_<slide_id>.wav.
    for item in audio_map:
        slide_id = item["slide_id"]
        text = item["spoken_text"]

        out = f"voice_{lang}_{slide_id}.wav"
        tts.tts_to_file(
            text=text,
            file_path=out,
            speaker=speaker
        )
        audio_files.append(out)

print(len(audio_files), "voice files created (multi-language)")


In [None]:
from PIL import Image, ImageDraw, ImageFont
import requests
from io import BytesIO
import re


In [None]:
# Canvas geometry in pixels: header and footer bands each take 20% of the
# height and the body gets whatever is left.
W = 1280
H = 720
HEADER_H = int(0.20 * H)
FOOTER_H = int(0.20 * H)
BODY_H   = H - (HEADER_H + FOOTER_H)

# The body is split into two equal-width columns.
LEFT_W = RIGHT_W = W // 2
MARGIN = int(0.04 * W)

# Palette (hex colour strings)
WHITE, TEXT = "#ffffff", "#0f172a"
HEADER_BG, FOOTER_BG = "#1e3a8a", "#000000"
ACCENT, DIAGRAM_BG = "#2563eb", "#eef2ff"

# Fonts
def load_font(size, bold=False, mono=False):
    """Return a DejaVu font at `size`, falling back to Pillow's default font.

    Args:
        size: Font size passed to ImageFont.truetype.
        bold: Select DejaVuSans-Bold (ignored when `mono` is true).
        mono: Select DejaVuSansMono; takes precedence over `bold`.

    Returns:
        A Pillow font object. Loading is best-effort: if the DejaVu face
        cannot be loaded, Pillow's built-in default font is returned instead.
    """
    if mono:
        face = "DejaVuSansMono.ttf"
    elif bold:
        face = "DejaVuSans-Bold.ttf"
    else:
        face = "DejaVuSans.ttf"

    try:
        return ImageFont.truetype(face, size)
    except Exception:
        # Deliberate best-effort (missing font file, invalid size, no
        # FreeType support). `Exception` rather than a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        pass

    try:
        # Pillow >= 10.1 accepts a size here, so the fallback keeps the
        # requested scale instead of a small fixed-size font.
        return ImageFont.load_default(size=size)
    except Exception:
        # Older Pillow (no `size` argument) or a size the default font
        # rejects: use the plain built-in font.
        return ImageFont.load_default()


In [None]:
def fit_text(draw, text, box_w, box_h, max_size=96, min_size=28):
    """Pick the largest bold font at which the word-wrapped text fits the box.

    Font sizes are tried from `max_size` down to `min_size` (inclusive) in
    steps of 2. At each size the text is greedily wrapped to `box_w`, and the
    size is accepted when `len(lines) * (size + 10)` does not exceed `box_h`.

    Args:
        draw: Object providing `textlength(text, font=...)` for measuring.
        text: The string to lay out; split on whitespace.
        box_w: Maximum line width.
        box_h: Maximum total height.
        max_size: Largest font size to try.
        min_size: Smallest font size to try.

    Returns:
        (font, lines). Each line keeps the trailing space used while
        measuring. If nothing fits, the text wrapped at `min_size` is
        returned so it at least respects the box width.
    """
    def _wrap(font):
        """Greedy word wrap of `text` to `box_w` using `font` for measuring."""
        lines, line = [], ""
        for word in text.split():
            candidate = line + word + " "
            if draw.textlength(candidate, font=font) <= box_w:
                line = candidate
            else:
                # Do not emit an empty line when the very first word is
                # already wider than the box.
                if line:
                    lines.append(line)
                line = word + " "
        if line:
            lines.append(line)
        return lines

    # `min_size - 1` makes the lower bound inclusive; the original range
    # stopped before ever trying `min_size`.
    for size in range(max_size, min_size - 1, -2):
        font = load_font(size, bold=True)
        lines = _wrap(font)
        if len(lines) * (size + 10) <= box_h:
            return font, lines

    # Nothing fits vertically: wrap at the minimum size rather than
    # returning the whole text as one unwrapped line.
    font = load_font(min_size, bold=True)
    return font, _wrap(font)


In [None]:
def draw_diagram(draw, x, y, w, h, boxes):
    """Draw `boxes` as a vertical chain of labelled rounded rectangles.

    Each entry of `boxes` becomes one full-width rectangle starting at
    (x, y); consecutive rectangles are joined by a short vertical line at the
    horizontal centre. Returns False without drawing when `boxes` is empty
    or None, True otherwise.
    """
    if not boxes:
        return False

    count = len(boxes)
    box_h = int(h / (count * 1.4))
    gap = int(box_h * 0.35)
    label_font = load_font(int(box_h * 0.40), bold=True)

    right_edge = x + w
    centre_x = x + w // 2
    last_index = count - 1

    top = y
    for index, label in enumerate(boxes):
        bottom = top + box_h

        draw.rounded_rectangle(
            [x, top, right_edge, bottom],
            radius=18,
            fill=DIAGRAM_BG,
            outline=ACCENT,
            width=3,
        )
        # Label is left-aligned and vertically centred inside the box.
        draw.text(
            (x + 20, top + box_h // 2),
            safe_text(label),
            font=label_font,
            fill=TEXT,
            anchor="lm",
        )

        # Connector to the next box; none after the last one.
        if index != last_index:
            draw.line(
                [centre_x, bottom, centre_x, bottom + gap],
                fill=ACCENT,
                width=3,
            )

        top = bottom + gap

    return True


def fetch_photo(query):
    """Download a 900x700 photo for `query` from source.unsplash.com.

    Best-effort: returns an RGB PIL image on success, or None when the
    request fails, the server answers with an HTTP error status, or the
    response body cannot be decoded as an image.
    """
    from urllib.parse import quote  # stdlib; local so the cell stays self-contained

    # Percent-encode the query so characters such as '&', '#' or '?' cannot
    # change the structure of the URL; commas are left as-is.
    url = f"https://source.unsplash.com/900x700/?{quote(str(query), safe=',')}"

    try:
        response = requests.get(url, timeout=8)
        # Without this, an HTML error page would be handed to Image.open.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    except (requests.RequestException, OSError, ValueError):
        # Network/HTTP failures and undecodable payloads both mean "no photo".
        return None


In [None]:
image_files = []

# Strategies whose left panel is drawn by draw_diagram.
DIAGRAM_STRATEGIES = ("CONCEPT_DIAGRAM", "FLOW_DIAGRAM", "SYSTEM_DIAGRAM")

# Render one PNG per slide: header band, footer band, a strategy-dependent
# left panel and an auto-fitted text gist on the right.
for slide in slide_plan["slides"]:
    i = slide["slide_id"]

    img = Image.new("RGB", (W, H), WHITE)
    d = ImageDraw.Draw(img)

    # HEADER
    d.rectangle([0, 0, W, HEADER_H], fill=HEADER_BG)
    h_font = load_font(int(HEADER_H * 0.45), bold=True)
    d.text(
        (MARGIN, HEADER_H * 0.30),
        safe_text(slide.get("title")),
        font=h_font,
        fill="#ffffff"
    )

    # FOOTER
    d.rectangle([0, H - FOOTER_H, W, H], fill=FOOTER_BG)
    f_font = load_font(int(FOOTER_H * 0.30))
    d.text(
        (MARGIN, H - FOOTER_H * 0.6),
        f"Open Media University • Slide {i}",
        font=f_font,
        fill="#ffffff"
    )

    # BODY COORDS: each panel is its half of the body inset by MARGIN.
    lx, ly = MARGIN, HEADER_H + MARGIN
    lw, lh = LEFT_W - 2 * MARGIN, BODY_H - 2 * MARGIN

    rx, ry = LEFT_W + MARGIN, HEADER_H + MARGIN
    rw, rh = RIGHT_W - 2 * MARGIN, BODY_H - 2 * MARGIN

    # Optional plan fields are read with .get(), like title/gist, so one
    # incomplete slide does not abort the whole render.
    left = slide.get("left_panel_plan") or {}
    strategy = slide.get("visual_strategy")

    left_rendered = False

    # LEFT PANEL
    if strategy in DIAGRAM_STRATEGIES:
        left_rendered = draw_diagram(
            d, lx, ly, lw, lh, left.get("diagram_boxes")
        )

    elif strategy == "PHOTO_REFERENCE":
        photo_query = left.get("photo_query")
        photo = fetch_photo(photo_query) if photo_query else None
        if photo:
            # thumbnail() shrinks in place to fit the panel.
            photo.thumbnail((lw, lh))
            img.paste(photo, (lx, ly))
            left_rendered = True

    elif strategy == "CODE_BLOCK":
        font = load_font(int(lh * 0.07), mono=True)
        y = ly
        for line in safe_text(left.get("description")).splitlines():
            d.text((lx, y), line, font=font, fill=TEXT)
            y += font.size + 12
        left_rendered = True

    elif strategy == "MATH_FORMULA":
        formula = safe_text(left.get("math_formula"))
        font = load_font(int(lh * 0.18), bold=True)
        d.text(
            (lx, ly + lh // 2),
            formula,
            font=font,
            fill=TEXT
        )
        left_rendered = bool(formula)

    # Surface blank left panels instead of letting them pass silently.
    if not left_rendered:
        print(f"⚠ Slide {i}: left panel left blank (strategy={strategy!r})")

    # RIGHT PANEL (GIST ONLY, AUTO-FIT)
    gist = safe_text(slide.get("right_panel_gist"))
    font, lines = fit_text(d, gist, rw, rh)

    # Vertically centre the wrapped block inside the right panel.
    y = ry + (rh - len(lines) * (font.size + 10)) // 2
    for line in lines:
        d.text((rx, y), line, font=font, fill=TEXT)
        y += font.size + 10

    fname = f"slide_{i}.png"
    img.save(fname)
    image_files.append(fname)

print(len(image_files), "slides rendered")


In [None]:
# Ensure moviepy is available in the notebook kernel
import sys
import subprocess

try:
    # Probe the top-level package. The `moviepy.editor` submodule was removed
    # in MoviePy 2.x, so importing it would report an installed MoviePy as
    # missing and trigger a reinstall on every run.
    import moviepy  # noqa: F401
except ImportError:
    # Install into the interpreter that runs this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "moviepy"])


In [None]:
from pathlib import Path
import subprocess
import random
import json

# ============================================================
# BACKGROUND MUSIC (OPTIONAL)
# ============================================================

# Candidate background tracks, looked up in the working directory. One of
# the files that exist is picked at random; with none present the render is
# voice-only.
BG_FILES = ["bg.mp3", "bg1.mp3", "bg2.mp3", "bg3.mp3"]
bg_candidates = [track for track in map(Path, BG_FILES) if track.exists()]

if bg_candidates:
    bg_file = random.choice(bg_candidates)
    print(f"✔ Background music selected: {bg_file.name}")
else:
    bg_file = None
    print("✔ No background music (voice-only)")

# ============================================================
# AUDIO MIXING (FFMPEG – HARD STOP GUARANTEED)
# ============================================================

import os

final_audio_files = []

# Voice files are named voice_<lang>_<slide_id>.wav by the synthesis step,
# where <lang> comes from the RENDER_LANG environment variable (default "en").
voice_lang = os.environ.get("RENDER_LANG", "en")

# Produce voice_final_<slide_id>.wav for every slide: the narration alone,
# or the narration mixed over the looped background track when one exists.
for slide in slide_plan["slides"]:
    sid = slide["slide_id"]

    voice = Path(f"voice_{voice_lang}_{sid}.wav")
    final_audio = Path(f"voice_final_{sid}.wav")

    if not voice.exists():
        raise RuntimeError(f"Missing voice file: {voice}")

    mix_cmd = ["ffmpeg", "-y", "-i", str(voice)]

    if bg_file:
        mix_cmd += [
            # Loop the background track endlessly so it always covers the voice.
            "-stream_loop", "-1",
            "-i", str(bg_file),
            "-filter_complex",
            # duration=first ends the mix when the voice (input 0) ends.
            # amix defaults to duration=longest, which never ends with a
            # looped input; -shortest cannot help because it only compares
            # output streams and there is a single one here.
            "[1:a]volume=0.18[a_bg];[0:a][a_bg]amix=inputs=2:duration=first",
            "-shortest",
        ]

    # Keep the intermediate as plain PCM: the file is a .wav, and the segment
    # step encodes to AAC anyway, so this avoids a second lossy encode.
    mix_cmd += ["-c:a", "pcm_s16le", str(final_audio)]

    subprocess.run(mix_cmd, check=True)

    final_audio_files.append(final_audio)

# ============================================================
# BUILD PER-SLIDE VIDEOS (NO MOVIEPY)
# ============================================================

segment_files = []

# One MP4 per slide: the slide PNG looped as the video track, the mixed
# narration as the audio track, cut to the shorter of the two.
for slide in slide_plan["slides"]:
    sid = slide["slide_id"]

    img, audio, segment = (
        Path(f"slide_{sid}.png"),
        Path(f"voice_final_{sid}.wav"),
        Path(f"segment_{sid}.mp4"),
    )

    if not img.exists():
        raise RuntimeError(f"Missing slide image: {img}")

    segment_cmd = ["ffmpeg", "-y"]
    segment_cmd += ["-loop", "1", "-i", str(img)]   # still image as looping video input
    segment_cmd += ["-i", str(audio)]
    segment_cmd += ["-c:v", "libx264", "-preset", "medium", "-pix_fmt", "yuv420p"]
    segment_cmd += ["-c:a", "aac", "-b:a", "192k"]
    segment_cmd += ["-shortest", str(segment)]

    subprocess.run(segment_cmd, check=True)

    segment_files.append(segment)

# ============================================================
# CONCAT ALL SEGMENTS (FFMPEG CONCAT DEMUXER)
# ============================================================

# List file for ffmpeg's concat demuxer: one "file '<name>'" line per
# segment, in slide order.
concat_file = Path("concat.txt")
concat_entries = [f"file '{seg.name}'" for seg in segment_files]
concat_file.write_text("\n".join(concat_entries), encoding="utf-8")

# Stream-copy the segments into the final video without re-encoding.
concat_cmd = ["ffmpeg", "-y"]
concat_cmd += ["-f", "concat", "-safe", "0", "-i", str(concat_file)]
concat_cmd += ["-c", "copy", "final.mp4"]
subprocess.run(concat_cmd, check=True)

print("✔ final.mp4 rendered successfully")
