In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
import numpy as np
import pandas as pd
import torch
import torchaudio

from pathlib import Path
from pydub import AudioSegment
import  IPython.display as ipd
import decord
from decord import VideoReader, cpu
import matplotlib.pyplot as plt
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import re

In [67]:
vid_path = Path("dataset/train")
audio_dir = Path("dataset/audio")
metadata = pd.read_csv("dataset/filtered_metadata.csv")
masks = Path("dataset/lip_masks")
transcripts_paths = Path("/home/eliasfizesam/data_files")

In [10]:
video_paths = {video_path.stem for video_path in vid_path.iterdir() if video_path.suffix == ".mp4"}

In [68]:
transcripts = {}
for path in transcripts_paths.iterdir():
    if path.suffix == ".csv":
        video_name = path.stem
        transcripts[video_name] = pd.read_csv(path)


In [69]:
len(transcripts)

24019

In [12]:
avg_fps = 24
target_frames = 60

In [41]:
vid_id = "73vOfaiysR0_0"
audio, sr = torchaudio.load(audio_dir / f"{vid_id}.wav")

In [42]:
scaling_factor = 16000 / sr
resampler = torchaudio.transforms.Resample(sr, 16000)
audio = resampler(audio)

In [45]:
start = transcripts["73vOfaiysR0_0"].iloc[0]["start"]
end = transcripts["73vOfaiysR0_0"].iloc[-1]["end"]

total_duration = end - start
total_frames = total_duration * avg_fps

# Groups of N frames
total_groups = total_frames // target_frames

print(f"We can build {total_groups} groups of {target_frames} frames each")

# Check for words with unrealistic durations
long_words = []
for index, row in transcripts["73vOfaiysR0_0"].iterrows():
    word_duration = row["end"] - row["start"]
    if word_duration > 1:  # Threshold for unrealistically long word duration
        long_words.append((row["word"], word_duration))

if long_words:
    print("Words with unusually long durations:")
    for word, duration in long_words:
        print(f"  '{word}': {duration:.2f}s")

# Create frame groups and match words to them
valid_frame_groups = []
min_words_threshold = 1  # Minimum number of words required for a valid frame group

for i in range(int(total_groups)):
    group_start_time = start + (i * target_frames / avg_fps)
    group_end_time = group_start_time + (target_frames / avg_fps)
    
    # Find words that fall within this frame group
    group_words = []
    for index, row in transcripts["73vOfaiysR0_0"].iterrows():
        word_start = row["start"]
        word_end = row["end"]
        
        # Check if word is completely within this frame group
        if group_start_time <= word_start and word_end <= group_end_time:
            group_words.append(row["word"])
    
    # Only add groups that have enough words
    if len(group_words) >= min_words_threshold:
        valid_frame_groups.append({
            "frame_range": (i * target_frames, (i + 1) * target_frames - 1),
            "time_range": (group_start_time, group_end_time),
            "words": group_words
        })

print(f"\nCreated {len(valid_frame_groups)} valid frame groups with sufficient words:")
for i, group in enumerate(valid_frame_groups[:3]):  # Show first 3 groups
    print(f"Group {i}: Frames {group['frame_range']}, Time {group['time_range'][0]:.2f}s-{group['time_range'][1]:.2f}s")
    print(f"  Words: {' '.join(group['words'])}")


We can build 3.0 groups of 60 frames each
Words with unusually long durations:
  'Some': 2.96s

Created 2 valid frame groups with sufficient words:
Group 0: Frames (60, 119), Time 2.53s-5.03s
  Words: of the synthetics that travelled with you may have
Group 1: Frames (120, 179), Time 5.03s-7.53s
  Words: been coming here to harm humans. Do any


In [70]:
avg_fps = 24
target_frames = 60
min_words_threshold = 1  # Minimum number of words required for a valid frame group
output_file = "frame_group_mappings.csv"

# Process all videos in the transcripts dictionary
all_frame_groups = []

for video_name, transcript_df in transcripts.items():
    try:
        if transcript_df.empty:
            continue
        
        print()
        start = transcript_df.iloc[0]["start"]
        end = transcript_df.iloc[-1]["end"]
        
        total_duration = end - start
        total_frames = total_duration * avg_fps
        
        # Groups of N frames
        total_groups = total_frames // target_frames
        
        print(f"Video: {video_name} - Can build {total_groups:.1f} groups of {target_frames} frames each")
        
        # Check for words with unrealistic durations
        long_words = []
        for index, row in transcript_df.iterrows():
            word_duration = row["end"] - row["start"]
            if word_duration > 1:  # Threshold for unrealistically long word duration
                long_words.append((row["word"], word_duration))
        
        if long_words:
            print(f"  Words with unusually long durations in {video_name}:")
            for word, duration in long_words:
                print(f"    '{word}': {duration:.2f}s")
        
        # Create frame groups and match words to them
        valid_frame_groups = []
        
        for i in range(int(total_groups)):
            group_start_time = start + (i * target_frames / avg_fps)
            group_end_time = group_start_time + (target_frames / avg_fps)
            
            # Find words that fall within this frame group
            group_words = []
            for index, row in transcript_df.iterrows():
                word_start = row["start"]
                word_end = row["end"]
                
                # Check if word is completely within this frame group
                if group_start_time <= word_start and word_end <= group_end_time:
                    group_words.append(str(row["word"]))
            
            # Only add groups that have enough words
            if len(group_words) >= min_words_threshold:
                # Create new ID with _X suffix for multiple groups from same video
                new_id = f"{video_name}_{i}" if i > 0 else video_name
                
                valid_frame_groups.append({
                    "original_video_id": video_name,
                    "new_id": new_id,
                    "frame_range": (i * target_frames, (i + 1) * target_frames - 1),
                    "time_range": (group_start_time, group_end_time),
                    "start_time": group_start_time,
                    "end_time": group_end_time,
                    "words": " ".join(group_words),
                    "num_words": len(group_words)
                })
        
        print(f"  Created {len(valid_frame_groups)} valid frame groups with sufficient words for {video_name}")
        all_frame_groups.extend(valid_frame_groups)
    except KeyError as e:
        print(f"Error processing video {video_name}: KeyError - {e}")


Video: 7Cyzq1bWpZs_2 - Can build 1.0 groups of 60 frames each
  Created 1 valid frame groups with sufficient words for 7Cyzq1bWpZs_2

Video: lq0HERNHyV0_7 - Can build 0.0 groups of 60 frames each
  Created 0 valid frame groups with sufficient words for lq0HERNHyV0_7

Video: Jygs4gYq0Gs_23 - Can build 4.0 groups of 60 frames each
  Created 4 valid frame groups with sufficient words for Jygs4gYq0Gs_23

Video: -y6RPL5v1bU_3 - Can build 2.0 groups of 60 frames each
  Words with unusually long durations in -y6RPL5v1bU_3:
    'down-to-earth': 1.13s
    'England.': 1.03s
  Created 2 valid frame groups with sufficient words for -y6RPL5v1bU_3

Video: djlxq6gx6aI_0 - Can build 1.0 groups of 60 frames each
  Words with unusually long durations in djlxq6gx6aI_0:
    'ovary': 1.86s
  Created 1 valid frame groups with sufficient words for djlxq6gx6aI_0

Video: wR_e9lxh7Ds_0 - Can build 2.0 groups of 60 frames each
  Created 2 valid frame groups with sufficient words for wR_e9lxh7Ds_0

Video: 11-AlL

In [111]:

# Create and save the DataFrame
mapping_df = pd.DataFrame(all_frame_groups)
mapping_df['words'] = mapping_df['words'].apply(lambda x: re.sub(r'[.,"\-=?!]', '', x))
mapping_df.to_csv(output_file, index=False)
print(f"\nSaved {len(all_frame_groups)} frame group mappings to {output_file}")
print(f"Total videos processed: {len(transcripts)}")


Saved 43743 frame group mappings to frame_group_mappings.csv
Total videos processed: 24019


In [106]:
mapping_df.head()

Unnamed: 0,original_video_id,new_id,frame_range,time_range,start_time,end_time,words,num_words
0,7Cyzq1bWpZs_2,7Cyzq1bWpZs_2,"(0, 59)","(6.443, 8.943)",6.443,8.943,Her body is,3
1,Jygs4gYq0Gs_23,Jygs4gYq0Gs_23,"(0, 59)","(0.031, 2.531)",0.031,2.531,Peki yani o huzura erimek iin,6
2,Jygs4gYq0Gs_23,Jygs4gYq0Gs_23_1,"(60, 119)","(2.531, 5.031000000000001)",2.531,5.031,ne yapmal Vallahi insann,4
3,Jygs4gYq0Gs_23,Jygs4gYq0Gs_23_2,"(120, 179)","(5.031, 7.531)",5.031,7.531,yani kendiyle bark olmas lazm,5
4,Jygs4gYq0Gs_23,Jygs4gYq0Gs_23_3,"(180, 239)","(7.531, 10.030999999999999)",7.531,10.031,bark olmayan bir insanda zaten huzur olmaz ki,8


In [101]:
sample_vid = mapping_df[mapping_df['original_video_id'] == "JynX_WLlHOY_2"]

In [102]:
audio, sr = torchaudio.load(audio_dir / f"JynX_WLlHOY_2.wav")

for i, extract in sample_vid.iterrows():
    start = extract['start_time']
    end = extract['end_time']

    start_ind = int(sr * start)
    end_ind = int(sr * end)
    display(ipd.Audio(audio[:, start_ind:end_ind], rate=sr))
    print(f"Playing segment {i+1}/{len(sample_vid)}: {extract['words']}")

Playing segment 21510/7: I mean, obviously, you can tell by their work, are


Playing segment 21511/7: on a crazy run and are so


Playing segment 21512/7: and just on fire. And


Playing segment 21513/7: true. I think


Playing segment 21514/7: built their own little world between the two of them


Playing segment 21515/7: the past, however, 15 years they've been


Playing segment 21516/7: together. And it's
