In [1]:
# # Download ong's data
# !gdown --id 1ZZJ23ejcMEsaLBkSbOZmlxXyhIVZqfJo

# # Download amm's data
# !gdown --id 1TRPRcDEyi6ogu8j4OE_d_ePJRxUw_X1m

In [1]:
from pathlib import Path
from itertools import chain
import re

import pysrt
import pandas as pd
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip, ffmpeg_extract_audio

In [2]:
# Dataset root, resolved relative to the directory the notebook is run from.
ROOT_PATH = Path.cwd().parent / "data" / "Opp Day SRTs"

# Directory that receives the per-subtitle video/audio chunks.
# Only the leaf directory is created; ROOT_PATH itself must already exist.
VIDEO_CHUNKS_PATH = ROOT_PATH / "chunks"
VIDEO_CHUNKS_PATH.mkdir(exist_ok=True)

In [3]:
# Recursively collect the subtitle files of both sources and the source videos.
fr_srts = [*(ROOT_PATH / "Freelancer SRT").rglob("*.srt")]
capcut_srts = [*(ROOT_PATH / "Capcut SRT").rglob("*.srt")]
vids = [*(ROOT_PATH / "Original Videos").rglob("*.mp4")]
# vids = list((ROOT_PATH).rglob("*.mp4"))

In [4]:
def gen_srt_map(ls: list[Path], exist_dict: "dict | None" = None) -> dict[str, Path]:
    """Build a ``symbol -> SRT path`` lookup from subtitle file names.

    The symbol is derived from each file name by ``clean_logic`` and then
    normalised by ``hard_mapping`` (upper-cased, with a few hand-written
    aliases applied).

    Args:
        ls: Subtitle files to index; only ``.name`` of each entry is inspected.
        exist_dict: Optional mapping to extend. It is not modified; entries
            derived from ``ls`` take precedence over it on a shared key.

    Returns:
        A new dict containing ``exist_dict`` plus one entry per derived symbol.
        When several files in ``ls`` yield the same symbol, the last one wins.
    """

    def clean_logic(s: str) -> str:
        # Drop the "FL-" marker wherever it occurs in the name.
        s = s.replace("FL-", "")
        if " " in s:
            candidate = s.split(" ")[0]
            if candidate == "Opp":
                # Names starting with "Opp ...": take the first run of 2+
                # capital letters instead of the first word.
                match = re.search("[A-Z]{2,}", s)
                # Without such a run, fall back to the first word rather than
                # failing on ``None.group``.
                return match.group(0) if match else candidate
            return candidate

        return s.split("-")[0]

    def hard_mapping(s: str) -> str:
        # Strip the extension and upper-case the cleaned name.
        s = Path(clean_logic(s)).stem.upper()
        # Hand-written aliases for names that are not the symbol itself.
        mapping = {"interpharma": "IP"}
        return mapping.get(s.lower(), s)

    base = {} if exist_dict is None else exist_dict
    # clean_logic is intentionally applied again inside hard_mapping: one pass
    # can leave a "-suffix" behind (e.g. "ADB-Q3 2023.srt" -> "ADB-Q3" -> "ADB").
    return base | {hard_mapping(clean_logic(l.name)): l for l in ls}

def gen_video_map(ls: list[Path]) -> dict[str, Path]:
    """Build a ``symbol -> video path`` lookup from video file names.

    The symbol is the first run of two or more capital ASCII letters found in
    the file name. Files without such a run are reported and skipped, so one
    unparsable name does not abort the whole mapping.

    Args:
        ls: Video files to index; only ``.name`` of each entry is inspected.

    Returns:
        Dict keyed by the extracted symbol. When several files yield the same
        symbol, the last one wins.
    """
    # NOTE(review): only the first capital run is used, so a name whose first
    # run is not the ticker produces a wrong key - check the printed keys.
    vid_map = {}
    for l in ls:
        match = re.search("[A-Z]{2,}", l.name)
        if match is None:
            print("no symbol found in video name", l.name)
            continue
        vid_map[match.group(0)] = l
    return vid_map

def cut_video_into_chunks(symbol: str, srt_maps: dict, vid_maps: dict) -> list[tuple[str, Path, str]]:
    """Cut one symbol's video into per-subtitle clips and extract their audio.

    For every entry of the symbol's SRT file, the matching time span of the
    video is written into ``VIDEO_CHUNKS_PATH`` as an ``.mp4`` clip, and the
    audio of that clip is then extracted next to it as an ``.mp3``.

    Args:
        symbol: Key used to look up both the subtitle file and the video.
        srt_maps: Mapping of symbol -> SRT file path.
        vid_maps: Mapping of symbol -> video file path.

    Returns:
        One ``(symbol, audio_path, transcript)`` tuple per subtitle entry, in
        subtitle order. An empty list when the symbol has no subtitle or no
        video, so the result can always be iterated or chained.

    Raises:
        Errors from ``pysrt.open`` and the moviepy ffmpeg helpers propagate
        unchanged (e.g. ``OSError`` when the ffmpeg command fails).
    """
    sub_pth, vid_pth = srt_maps.get(symbol), vid_maps.get(symbol)

    if sub_pth is None or vid_pth is None:
        print("sub or vid is not exist", sub_pth, vid_pth)
        # Return an empty list, not None: callers flatten the per-symbol results.
        return []

    subs = pysrt.open(sub_pth)

    metadata = []
    for i, sub in enumerate(subs):
        print(sub)
        # Subtitle times are divided by 1000 to get the seconds passed to ffmpeg
        # (assumes pysrt's ``ordinal`` is in milliseconds).
        start_time = sub.start.ordinal / 1000.0
        end_time = sub.end.ordinal / 1000.0

        # Chunk names are built from the full video file name (extension
        # included) plus a zero-padded subtitle index.
        # NOTE(review): kept as-is so new paths match any previously generated
        # chunks/metadata - confirm before switching to ``vid_pth.stem``.
        target_pth = VIDEO_CHUNKS_PATH / f"{vid_pth.name}_{i:04d}.mp4"
        target_audio_pth = VIDEO_CHUNKS_PATH / f"{vid_pth.name}_{i:04d}.mp3"

        # Cut the clip first; the audio is extracted from the clip, not from
        # the original video.
        ffmpeg_extract_subclip(vid_pth, start_time, end_time, targetname=target_pth)
        ffmpeg_extract_audio(target_pth, target_audio_pth)

        metadata.append((symbol, target_audio_pth, sub.text))

    return metadata
        

In [5]:
# Index the CapCut subtitles first, then layer the freelancer ones on top:
# when both sources provide a symbol, the freelancer file is the one kept.
srt_maps = gen_srt_map(fr_srts, gen_srt_map(capcut_srts))

In [6]:
# Symbol -> source video lookup, keyed by the symbol parsed from each file name.
vid_maps = gen_video_map(vids)

In [7]:
# Show both key sets side by side to spot symbols that only exist on one side.
print(sorted(srt_maps))
print(sorted(vid_maps))

['ADB', 'ADVANC', 'BGRIM', 'BGRIMM', 'EPG', 'GC', 'GLOBAL', 'HENG', 'III', 'IP', 'IRPC', 'MTC', 'MUANGTHAI', 'NOBLE', 'PSH', 'PTTEP', 'SAK', 'SPRC', 'SPVI', 'TAN', 'THAIKOM', 'THCOM', 'THREL', 'TRIPLEI']
['ADB', 'ADVANC', 'BGRIM', 'GC', 'HENG', 'III', 'IRPC', 'MINT', 'MTC', 'NOBLE', 'PSH', 'SAK', 'SAWAD', 'SPRC', 'SPVI', 'TAN', 'TH', 'THCOM', 'THREL']


In [8]:
# Load the metadata produced by earlier runs (tab-separated).
existing_df = pd.read_csv(ROOT_PATH / "train_metadata.tsv", sep="\t")

In [9]:
# Symbols that already have rows in the existing metadata.
existed_symbols = set(existing_df["symbol"].unique().tolist())

In [10]:
# Only symbols that have both a video and a subtitle file can be processed.
target_symbols = set(vid_maps).intersection(srt_maps)
print(target_symbols)
print(f"remaining symbols: {target_symbols - existed_symbols}")

{'SAK', 'MTC', 'SPVI', 'BGRIM', 'SPRC', 'THREL', 'IRPC', 'NOBLE', 'III', 'GC', 'THCOM', 'TAN', 'ADVANC', 'HENG', 'ADB', 'PSH'}
remaining symbols: {'III', 'BGRIM', 'THCOM'}


In [86]:
# Processable symbols that are not yet present in the existing metadata.
# The commented-out literal looks like a manual override for re-processing
# specific symbols - confirm before relying on it.
# remaining_symbols = {"ADB", "SPRC", "SPVI"}
remaining_symbols = target_symbols.difference(existed_symbols)

In [87]:
# Drop any existing rows for the symbols about to be (re)generated, so they do
# not appear twice once the new rows are concatenated.
existing_df = existing_df.loc[~existing_df["symbol"].isin(remaining_symbols)]

In [89]:
# Flatten the per-symbol chunk metadata into one list of rows.
# - Symbols are processed in sorted order so the row order is the same on every
#   run (iteration order of a set of strings can differ between kernels).
# - `or []` tolerates a symbol for which no chunk list was produced, instead of
#   failing while flattening.
results = [
    row
    for symbol in sorted(remaining_symbols)
    for row in (cut_video_into_chunks(symbol, srt_maps, vid_maps) or [])
]

1
00:00:11,110 --> 00:00:12,945
สวัสดีค่ะท่านนักลงทุนทุกท่าน

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
1
00:00:11,766 --> 00:00:14,800
สวัสดีครับท่านผู้นักลงทุนทั้งหลายนะครับ

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command returned an error


OSError: ffmpeg version 4.2.2-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 8 (Debian 8.3.0-6)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libvmaf --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librubberband --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libvorbis --enable-libopus --enable-libtheora --enable-libvidstab --enable-libvo-amrwbenc --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libdav1d --enable-libxvid --enable-libzvbi --enable-libzimg
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  5.100 / 55.  5.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from '/home/jupyter/set-speechtotext-poc/notebooks/whisper-v3/data/Opp Day SRTs/Original Videos/x2mate.com-Oppday Q3_2023 SPVI บมจ. เอส พี วี ไอ.mp4':
  Metadata:
    major_brand     : mp42
    minor_version   : 0
    compatible_brands: isommp42
    creation_time   : 2023-11-10T01:14:56.000000Z
  Duration: 00:50:43.44, start: 0.000000, bitrate: 360 kb/s
    Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 1280x720 [SAR 1:1 DAR 16:9], 228 kb/s, 30 fps, 30 tbr, 15360 tbn, 60 tbc (default)
    Metadata:
      creation_time   : 2023-11-10T01:14:56.000000Z
      handler_name    : ISO Media file produced by Google Inc. Created on: 11/09/2023.
    Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s (default)
    Metadata:
      creation_time   : 2023-11-10T01:14:56.000000Z
      handler_name    : ISO Media file produced by Google Inc. Created on: 11/09/2023.
/home/jupyter/set-speechtotext-poc/notebooks/whisper-v3/data/Opp Day SRTs/chunks/x2mate.com-Oppday Q3_2023 SPVI บมจ. เอส พี วี ไอ.mp4_0000.mp4: Permission denied


In [90]:
# Tabulate the new chunk rows; each row is (symbol, audio path, transcript).
df = pd.DataFrame(results, columns=["symbol", "path", "transcript"])

NameError: name 'results' is not defined

In [91]:
# Put the new rows ahead of the retained ones and write them back to the
# metadata TSV; this overwrites the file at that path.
pd.concat([df, existing_df]).to_csv(
    ROOT_PATH / "train_metadata.tsv",
    sep="\t",
    index=False,
)

NameError: name 'df' is not defined