In [8]:
# %% [code]
import re
import librosa
import soundfile as sf
from IPython.display import Audio, display

# Input files: the subtitle track and the video whose audio is sampled
srt_file = "The.Cocoanuts.1929.iNTERNAL.DVDRip.XVID-vRs.English.srt"
mp4_file = "The Cocoanuts.mp4"

# Tunables
caption_index = 700        # number of the caption to audition
time_shift = 0.3         # seconds added to the caption's timestamps (may be negative)

# Decode the complete audio track of the video in one go.
# sr=None asks librosa to keep the file's own sampling rate instead of resampling.
# (librosa reaches ffmpeg indirectly through `audioread`; a pip install is enough.)
y, sr = librosa.load(mp4_file, sr=None)

# Convert an SRT timestamp (HH:MM:SS,mmm) into seconds
def srt_time_to_seconds(ts):
    """Return the offset in seconds encoded by an ``HH:MM:SS,mmm`` string.

    The result is a float: whole seconds plus the millisecond field divided
    by 1000. A string that does not split into three ``:`` fields and one
    ``,`` field, or whose fields are not integers, raises ``ValueError``.
    """
    hours, minutes, tail = ts.split(":")
    secs, millis = tail.split(",")
    # Sum the integer fields first so only the millisecond term is fractional.
    whole_seconds = 3600 * int(hours) + 60 * int(minutes) + int(secs)
    return whole_seconds + int(millis) / 1000

# Read the whole subtitle file into memory
with open(srt_file, "r", encoding="utf-8") as srt_handle:
    content = srt_handle.read()

# One match per cue: (number, start stamp, end stamp, text up to the next
# blank line or the end of the file)
pattern = re.compile(
    r'(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3})\s+-->\s+(\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\n|\Z)',
    re.MULTILINE
)
matches = pattern.findall(content)

# Turn the raw string tuples into dicts with numeric index and times
captions = [
    {
        'index': int(cue_number),
        'start': srt_time_to_seconds(cue_start),
        'end': srt_time_to_seconds(cue_end),
        'text': cue_text.strip(),
    }
    for cue_number, cue_start, cue_end, cue_text in matches
]

# Locate the caption whose SRT number equals caption_index (None when absent)
selected_caption = next((cap for cap in captions if cap['index'] == caption_index), None)

if selected_caption is None:
    print(f"Caption index {caption_index} not found.")
else:
    print(f"Caption {caption_index}:\n{selected_caption['text']}\n")

    # Apply time shift; clamp at 0 so a negative shift cannot produce a
    # negative time (which would turn into a negative slice index below)
    start_time = max(0, selected_caption['start'] + time_shift)
    end_time = max(0, selected_caption['end'] + time_shift)

    # Convert seconds to sample offsets in the loaded audio array
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr)

    # Slicing silently truncates when the range runs past the end of `y`
    y_clip = y[start_sample:end_sample]

    print(f"Sampling rate: {sr} Hz")
    print(f"Start time: {start_time:.3f} s | End time: {end_time:.3f} s")
    print(f"Audio samples: {len(y_clip)}")

    if len(y_clip) == 0:
        # The shifted window is empty or lies beyond the audio; the Audio
        # widget cannot normalise an empty array, so report instead of raising.
        print("Selected time range contains no audio samples; nothing to play.")
    else:
        # Display the audio segment
        display(Audio(data=y_clip, rate=sr))


Caption 700:
Now, we'll take lot number 21.

Sampling rate: 48000 Hz
Start time: 3385.815 s | End time: 3388.682 s
Audio samples: 137616


In [10]:
# %% [code]
import os
import re
from moviepy.editor import VideoFileClip
import soundfile as sf
import librosa
from tqdm import tqdm

# This cell reuses time_shift, srt_file and mp4_file exactly as they were
# assigned in the first cell; run that cell before this one.

# Destination directory: every exported group gets its own sub-folder in here
output_parent_folder = "Exported_Clips"
os.makedirs(output_parent_folder, exist_ok=True)

# Grouping limits, in seconds: captions are merged until a group spans at
# least min_length, and a group longer than max_length is rejected
min_length, max_length = 30.0, 180.0

# Helper: SRT timestamp string (HH:MM:SS,mmm) -> seconds (float)
def srt_time_to_seconds(ts):
    """Convert an ``HH:MM:SS,mmm`` timestamp to a float number of seconds.

    Raises ``ValueError`` when the string does not have three ``:``-separated
    fields with a single ``,`` in the last one, or a field is not an integer.
    """
    hh, mm, rest = ts.split(":")
    ss, mmm = rest.split(",")
    # Weight hours/minutes/seconds as integers, then add the fractional part.
    whole = sum(int(field) * weight for field, weight in zip((hh, mm, ss), (3600, 60, 1)))
    return whole + int(mmm) / 1000

# Helper function to convert seconds (float) to SRT timestamp string (HH:MM:SS,mmm)
def seconds_to_srt_time(sec):
    """Format a non-negative number of seconds as ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds *before* being split into
    fields, so rounding carries into the seconds/minutes/hours fields.
    Rounding the fractional part on its own would turn e.g. 1.9996 into the
    malformed ``00:00:01,1000`` instead of ``00:00:02,000``.

    Args:
        sec: Time offset in seconds (int or float), expected to be >= 0.

    Returns:
        The timestamp string with a comma before the milliseconds.
    """
    total_ms = int(round(sec * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

# Helper function to convert seconds (float) to VTT timestamp string (HH:MM:SS.mmm)
def seconds_to_vtt_time(sec):
    """Format a non-negative number of seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds *before* being split into
    fields, so rounding carries into the seconds/minutes/hours fields.
    Rounding the fractional part on its own would turn e.g. 1.9996 into
    ``00:00:01.1000``, which has a four-digit millisecond field.

    Args:
        sec: Time offset in seconds (int or float), expected to be >= 0.

    Returns:
        The timestamp string with a dot before the milliseconds.
    """
    total_ms = int(round(sec * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

# Read the subtitle file and build the caption list, shifting every cue by time_shift
with open(srt_file, "r", encoding="utf-8") as srt_handle:
    srt_content = srt_handle.read()

# One match per cue: (number, start stamp, end stamp, text up to the next
# blank line or the end of the file)
pattern = re.compile(
    r'(\d+)\s+(\d{2}:\d{2}:\d{2},\d{3})\s+-->\s+(\d{2}:\d{2}:\d{2},\d{3})\s+([\s\S]*?)(?=\n\n|\Z)',
    re.MULTILINE
)
matches = pattern.findall(srt_content)

captions = []
for cue_number, raw_start, raw_end, cue_text in matches:
    # Unshifted times as written in the file
    orig_start = srt_time_to_seconds(raw_start)
    orig_end = srt_time_to_seconds(raw_end)
    captions.append(dict(
        index=int(cue_number),
        # Shifted times, clamped so they never go below zero
        start=max(0, orig_start + time_shift),
        end=max(0, orig_end + time_shift),
        text=cue_text.strip(),
        orig_start=orig_start,
        orig_end=orig_end,
    ))

# Walk the captions in order, merging consecutive ones into groups.
# A group grows until it spans at least min_length seconds or the captions
# run out; a finished group longer than max_length is reported and dropped.
groups = []
n = len(captions)
i = 0
while i < n:
    first = captions[i]
    group = [first]
    group_start = first['start']  # shifted start of the group's first caption
    group_end = first['end']      # shifted end of the most recently added caption
    j = i
    # Keep absorbing the next caption while the group is still too short
    while j < n - 1 and group_end - group_start < min_length:
        j += 1
        following = captions[j]
        group.append(following)
        group_end = following['end']
    group_duration = group_end - group_start
    if group_duration > max_length:
        # Too long: discard every caption in this group
        print(f"Skipping group from {seconds_to_srt_time(group_start)} to {seconds_to_srt_time(group_end)} "
              f"(duration {group_duration:.2f}s) because it exceeds max length.")
    else:
        groups.append({'captions': group, 'start': group_start, 'end': group_end})
    # Resume with the caption after the last one consumed
    i = j + 1

# Export every caption group as its own folder holding video.mp4, audio.mp3
# and captions.vtt (cue times re-based to the start of the clip).
if not groups:
    print("No caption groups found matching the specified criteria.")
else:
    # Load video once using MoviePy
    video = VideoFileClip(mp4_file)
    try:
        # Process each group with a progress bar
        for grp in tqdm(groups, desc="Exporting groups"):
            grp_start = grp['start']  # Already includes time_shift
            grp_end = grp['end']      # Already includes time_shift

            # Folder name based on group timestamps in SRT format.
            # NOTE(review): the name contains ':' and '>', which are not valid
            # in Windows file names; left as-is to keep existing output paths.
            folder_name = f"{seconds_to_srt_time(grp_start)} --> {seconds_to_srt_time(grp_end)}"
            output_folder = os.path.join(output_parent_folder, folder_name)
            os.makedirs(output_folder, exist_ok=True)

            # Export video clip
            clip = video.subclip(grp_start, grp_end)
            video_output_path = os.path.join(output_folder, "video.mp4")
            clip.write_videofile(video_output_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)

            # Export audio from clip as MP3 (skipped when the clip has no audio track)
            if clip.audio is not None:
                audio_output_path = os.path.join(output_folder, "audio.mp3")
                clip.audio.write_audiofile(audio_output_path, logger=None)

            # Create the VTT file with re-aligned (zero-based) timestamps within this clip
            vtt_lines = ["WEBVTT\n"]
            for cap in grp['captions']:
                # Calculate timestamps relative to clip start
                rel_start = cap['start'] - grp_start
                rel_end = cap['end'] - grp_start

                start_vtt = seconds_to_vtt_time(rel_start)
                end_vtt = seconds_to_vtt_time(rel_end)

                vtt_lines.append(f"{start_vtt} --> {end_vtt}")
                # Trailing newline leaves a blank line between cues after the join
                vtt_lines.append(cap['text'] + "\n")

            vtt_content = "\n".join(vtt_lines)
            vtt_output_path = os.path.join(output_folder, "captions.vtt")
            with open(vtt_output_path, "w", encoding="utf-8") as vtt_file:
                vtt_file.write(vtt_content)
    finally:
        # Always release the video and audio readers, even when an export
        # fails part-way through the loop.
        video.close()


Skipping group from 00:35:27,458 to 00:39:02,896 (duration 215.44s) because it exceeds max length.
Skipping group from 00:51:08,198 to 00:54:38,033 (duration 209.84s) because it exceeds max length.
Skipping group from 01:12:10,759 to 01:15:25,347 (duration 194.59s) because it exceeds max length.


Exporting groups: 100%|██████████| 133/133 [12:46<00:00,  5.77s/it]
