In [1]:
import json
import os
import shutil
import subprocess
import tempfile

import cv2
import requests
from IPython.display import Video

from predict import Predictor

In [2]:
def download_video(video_url):
    """Download ``video_url`` into a temporary ``.mp4`` file and return its path.

    The file is created with ``delete=False``, so it outlives this call; the
    caller is responsible for removing it when it is no longer needed.

    Args:
        video_url: URL of the video to fetch.

    Returns:
        Filesystem path of the temporary file holding the response body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or a timeout.
    """
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(video_url, timeout=60)
    # Fail loudly instead of saving an error page under an .mp4 name.
    response.raise_for_status()
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
        temp_file.write(response.content)
    return temp_file.name


def extract_audio(video_path: str, output_audio_path: str) -> bool:
    """Extract the audio track of ``video_path`` into an MP3 file using ffmpeg.

    ffmpeg is invoked with an argument list rather than a shell string, so
    paths containing spaces or shell metacharacters are passed through intact.

    Args:
        video_path: Path of the input video.
        output_audio_path: Path the MP3 file is written to.

    Returns:
        The result of ``check_audio_extraction_success(output_audio_path)``:
        True when a non-empty output file exists afterwards, False otherwise.
    """
    print(f"[~] Extracting audio from {video_path} to {output_audio_path}")
    command = [
        "ffmpeg",
        "-i", video_path,
        "-vn",            # drop the video stream
        "-ar", "44100",   # audio sample rate
        "-ac", "2",       # stereo
        "-ab", "192k",    # audio bitrate
        "-f", "mp3",
        output_audio_path,
    ]
    try:
        # A non-zero exit is expected for videos without audio, so the exit
        # status is not checked here; success is decided from the output file.
        subprocess.run(command, check=False)
    except OSError as exc:
        # ffmpeg missing or not executable: report it and treat as "no audio".
        print(f"[!] Could not run ffmpeg: {exc}")
    return check_audio_extraction_success(output_audio_path)


def check_audio_extraction_success(output_audio_path: str) -> bool:
    """Report whether an extracted audio file is present and non-empty.

    Args:
        output_audio_path: Path where the extracted audio is expected.

    Returns:
        True if the path exists and its size is greater than zero bytes;
        otherwise prints a warning and returns False.
    """
    extracted = (
        os.path.exists(output_audio_path)
        and os.path.getsize(output_audio_path) > 0
    )
    if not extracted:
        print("[!] No audio stream found in the video or extraction failed.")
    return extracted


def process_video_frames(video, data, output_frames_pattern):
    """Annotate and save every frame listed in ``data``.

    Args:
        video: An opened capture object providing ``set`` and ``read``.
        data: Iterable of dicts, each with a ``"frame_number"`` and a
            ``"faces"`` entry.
        output_frames_pattern: ``%``-style pattern that is formatted with the
            frame number to produce the output image path.

    Frames that cannot be read are skipped without writing an image.
    """
    for entry in data:
        index = entry["frame_number"]
        # Seek to the requested frame, then decode it.
        video.set(cv2.CAP_PROP_POS_FRAMES, index)
        ok, image = video.read()
        if not ok:
            continue
        draw_faces(image, entry["faces"])
        target = output_frames_pattern % index
        cv2.imwrite(target, image)


def draw_faces(frame, faces):
    """Draw a labelled bounding box on ``frame`` for every face in ``faces``.

    Speaking faces are outlined in green, silent ones in red. The label
    ``"Face <face_id>"`` is placed just above the box; when the box is too
    close to the top of the image for the text to fit there, the label is
    drawn just inside the top edge of the box instead so it stays visible.

    Args:
        frame: Image to draw on; it is modified in place.
        faces: Iterable of dicts with keys ``"x0"``, ``"y0"``, ``"x1"``,
            ``"y1"``, ``"speaking"`` and ``"face_id"``. Coordinates are
            passed straight to OpenCV as pixel positions.
    """
    # OpenCV colours are BGR, so red is the last channel.
    green = (0, 255, 0)
    red = (0, 0, 255)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    text_thickness = 2
    label_gap = 10  # pixels between the label baseline and the box edge

    for face in faces:
        x0, y0, x1, y1 = face["x0"], face["y0"], face["x1"], face["y1"]
        color = green if face["speaking"] else red
        cv2.rectangle(frame, (x0, y0), (x1, y1), color, 10)

        label = f"Face {face['face_id']}"
        (_, text_height), _ = cv2.getTextSize(
            label, font, font_scale, text_thickness
        )
        # putText anchors the text at its bottom-left corner; a baseline
        # higher than the text height would push the label off the image.
        label_y = y0 - label_gap
        if label_y < text_height:
            label_y = y0 + text_height + label_gap
        cv2.putText(
            frame, label, (x0, label_y), font, font_scale, color, text_thickness
        )


def visualize_output(video_path, json_output):
    """Render the detection result as an annotated video.

    Each frame listed in ``json_output`` is read from ``video_path``,
    annotated, and written to a temporary directory; the frames are then
    re-encoded (together with the original audio when it can be extracted)
    by ``generate_output_video``.

    Args:
        video_path: Path of the input video.
        json_output: JSON string holding the per-frame detection entries.

    Returns:
        The path returned by ``generate_output_video``.

    Raises:
        ValueError: if ``video_path`` cannot be opened as a video.
    """
    data = json.loads(json_output)
    temp_dir = tempfile.mkdtemp()
    try:
        output_frames_pattern = os.path.join(temp_dir, "frame_%05d.png")
        video = cv2.VideoCapture(video_path)
        try:
            if not video.isOpened():
                raise ValueError(f"Could not open video: {video_path}")
            # Keep the fractional frame rate (e.g. 29.97): truncating it to
            # an int makes the re-encoded video drift away from the audio.
            fps = video.get(cv2.CAP_PROP_FPS)
            process_video_frames(video, data, output_frames_pattern)
        finally:
            # Release the capture handle even if frame processing fails.
            video.release()
        audio_path = os.path.join(temp_dir, "audio.mp3")
        has_audio = extract_audio(video_path, audio_path)
        return generate_output_video(
            fps,
            output_frames_pattern,
            audio_path,
            has_audio,
        )
    finally:
        # Always remove the intermediate frames/audio, including on errors.
        shutil.rmtree(temp_dir, ignore_errors=True)


def generate_output_video(fps, output_frames_pattern, audio_path, has_audio):
    """Encode the annotated frames (and optional audio) into ``output_video.mp4``.

    ffmpeg is invoked with an argument list rather than a shell string, so
    paths containing spaces or shell metacharacters are passed through intact.

    Args:
        fps: Frame rate of the image sequence (int or float).
        output_frames_pattern: ``%``-style image sequence pattern understood
            by ffmpeg, e.g. ``.../frame_%05d.png``.
        audio_path: Path of the audio file to mux in; only used when
            ``has_audio`` is true.
        has_audio: Whether to add ``audio_path`` as an AAC audio track.

    Returns:
        The output path, ``"output_video.mp4"`` (relative to the current
        working directory).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        OSError: if the ffmpeg executable cannot be started.
    """
    output_video_path = "output_video.mp4"
    command = ["ffmpeg", "-y", "-framerate", str(fps), "-i", output_frames_pattern]
    if has_audio:
        command += ["-i", audio_path]
    command += ["-c:v", "libx264", "-pix_fmt", "yuv420p"]
    if has_audio:
        command += ["-c:a", "aac", "-strict", "experimental"]
    command.append(output_video_path)
    # check=True: with "-y" and a fixed output name, an unnoticed failure would
    # leave a stale file from an earlier run to be returned as the new result.
    subprocess.run(command, check=True)
    return output_video_path

In [3]:
# Download the sample video and save it to a temporary file.
# `video_path` is the local path that the prediction and visualization cells read from.
video_url = "https://replicate.delivery/pbxt/KUjguhc1e9L8mC40dd8Ub8lnLiQtjOLYvgsuuWNbQmLmfmXX/Untitled.mp4"
video_path = download_video(video_url)
print(f"Input video = {video_path}")

Input video = /tmp/tmpmao5i_kv.mp4


In [4]:
# Create an instance of the Predictor class (imported from the project-local `predict` module)
p = Predictor()

# Run the prediction on the downloaded video.
# NOTE(review): the meaning of the tuning arguments below (start, duration, min_track,
# crop_scale, ...) is defined by Predictor.predict, which is not shown in this notebook;
# confirm against that implementation before changing them.
result = p.predict(
    video=video_path,
    start=0,
    duration=0,
    min_track=10,
    crop_scale=0.4,
    min_face_size=1,
    face_det_scale=0.25,
    num_failed_det=10,
    return_json=True,  # the next cell reads the JSON via `result.json_str`
    return_boundingbox_percentages=False,  # Return pixel coords of faces
)

Command output: 05-01 17:34:34 Model para number = 15.01

Command errors: 2024-05-01 17:33:58 Extract the video and save in demo/tmpmao5i_kv/pyavi/video.avi 
2024-05-01 17:33:58 Extract the audio and save in demo/tmpmao5i_kv/pyavi/audio.wav 
2024-05-01 17:34:01 Extract the frames and save in demo/tmpmao5i_kv/pyframes 
VideoManager is deprecated and will be removed.
`base_timecode` argument is deprecated and has no effect.
demo/tmpmao5i_kv/pyavi/video.avi - scenes detected 1
2024-05-01 17:34:01 Scene detection and save in demo/tmpmao5i_kv/pywork 
2024-05-01 17:34:20 Face detection and save in demo/tmpmao5i_kv/pywork 
2024-05-01 17:34:20 Face track and detected 2 tracks 
100%|██████████| 2/2 [00:13<00:00,  6.81s/it]
2024-05-01 17:34:33 Face Crop and saved in demo/tmpmao5i_kv/pycrop tracks 
Model pretrain_TalkSet.model loaded from previous state! 
100%|██████████| 2/2 [00:01<00:00,  1.12it/s]
2024-05-01 17:34:35 Scores extracted and saved in demo/tmpmao5i_kv/pywork 
100%|██████████| 438/4

In [5]:
# Raw JSON string produced by the prediction; the visualization cell consumes it as-is.
json_output = result.json_str
data = json.loads(json_output)

# Show only the first few frames: the result holds one entry per frame, and
# printing all of them floods the notebook output.
PREVIEW_FRAMES = 3
print(f"{len(data)} frames in result; showing the first {min(PREVIEW_FRAMES, len(data))}")
print(json.dumps(data[:PREVIEW_FRAMES], indent=2))

[
  {
    "frame_number": 0,
    "faces": [
      {
        "face_id": 0,
        "x0": 287,
        "y0": 120,
        "x1": 450,
        "y1": 283,
        "speaking": false
      },
      {
        "face_id": 1,
        "x0": 1427,
        "y0": 86,
        "x1": 1603,
        "y1": 262,
        "speaking": false
      }
    ]
  },
  {
    "frame_number": 1,
    "faces": [
      {
        "face_id": 0,
        "x0": 287,
        "y0": 120,
        "x1": 450,
        "y1": 283,
        "speaking": false
      },
      {
        "face_id": 1,
        "x0": 1427,
        "y0": 86,
        "x1": 1603,
        "y1": 262,
        "speaking": false
      }
    ]
  },
  {
    "frame_number": 2,
    "faces": [
      {
        "face_id": 0,
        "x0": 287,
        "y0": 120,
        "x1": 450,
        "y1": 283,
        "speaking": false
      },
      {
        "face_id": 1,
        "x0": 1427,
        "y0": 86,
        "x1": 1603,
        "y1": 262,
        "speaking": false
      }
    

In [6]:
# Visualize the output and get the path to the output video.
# The video is written as "output_video.mp4" relative to the current working directory;
# ffmpeg is run with -y, so an existing file of that name is overwritten.
output_video_path = visualize_output(video_path, json_output)
print(f"Output video = {output_video_path}")

[~] Extracting audio from /tmp/tmpmao5i_kv.mp4 to /tmp/tmpuopx0_wy/audio.mp3


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Output video = output_video.mp4


frame=  418 fps= 88 q=-1.0 Lsize=    3660kB time=00:00:18.04 bitrate=1661.8kbits/s speed= 3.8x    
video:3370kB audio:276kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.385679%
[libx264 @ 0x558822a7a800] frame I:2     Avg QP:18.64  size:156782
[libx264 @ 0x558822a7a800] frame P:154   Avg QP:20.22  size: 16164
[libx264 @ 0x558822a7a800] frame B:262   Avg QP:26.12  size:  2470
[libx264 @ 0x558822a7a800] consecutive B-frames:  1.0% 38.3% 24.4% 36.4%
[libx264 @ 0x558822a7a800] mb I  I16..4: 10.5% 63.7% 25.8%
[libx264 @ 0x558822a7a800] mb P  I16..4:  0.6%  1.7%  0.4%  P16..4: 21.3%  7.1%  4.3%  0.0%  0.0%    skip:64.6%
[libx264 @ 0x558822a7a800] mb B  I16..4:  0.1%  0.1%  0.0%  B16..8: 15.8%  1.0%  0.2%  direct: 0.3%  skip:82.6%  L0:35.1% L1:61.2% BI: 3.6%
[libx264 @ 0x558822a7a800] 8x8 transform intra:63.3% inter:66.9%
[libx264 @ 0x558822a7a800] coded y,uvDC,uvAC intra: 42.3% 29.3% 9.0% inter: 3.9% 2.5% 0.3%
[libx264 @ 0x558822a7a800] i16 v,h,dc,p: 29% 29% 11% 31%
[

In [7]:
# Display the output video in Jupyter Notebook.
# Video comes from IPython.display; as the last expression of the cell it is rendered inline.
Video(output_video_path)