In [136]:
import os
import pathlib
import datetime

import pysrt
import pandas as pd
from pytube import YouTube
from pydub import AudioSegment
from tqdm import tqdm
import librosa
import soundfile as sf


In [137]:
# Project root: the parent of the directory the notebook is run from.
ROOT_DIR = pathlib.Path.cwd().parent
# Data folders, resolved relative to the project root.
AUDIO_DOWNLOAD_DIR = ROOT_DIR.joinpath("data/audio.code-workspace")
SRT_DIR = ROOT_DIR.joinpath("data/srt_data")
# YouTube video whose audio track gets downloaded.
SAK_url = "https://www.youtube.com/watch?v=XSHwMnfW77o&list=PLQtlXHTArVHvkM-9XuDQZqwvgWASgH_qe&index=2"

# Download audio

In [138]:
def youtube_audio_download(video_url: str):
    """Download the first audio-only stream of a YouTube video.

    The file is saved into ``AUDIO_DOWNLOAD_DIR``. A failed download is
    reported on stdout rather than raised, so the notebook keeps running;
    the success message is printed only when the download really completed.

    Args:
        video_url: URL of the YouTube video to fetch.
    """
    video = YouTube(video_url)
    audio = video.streams.filter(only_audio=True).first()

    # No audio-only stream matched the filter: nothing to download.
    if audio is None:
        print("Failed to download audio")
        return

    try:
        audio.download(str(AUDIO_DOWNLOAD_DIR))
    except Exception as error:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts
        # still propagate, and show the reason instead of hiding it.
        print(f"Failed to download audio: {error}")
        return

    print("audio was downloaded successfully")

In [139]:
# Fetch the audio of the video at SAK_url into AUDIO_DOWNLOAD_DIR.
youtube_audio_download(SAK_url)

audio was downloaded successfully


# Open SRT file

In [140]:
# File name of the subtitle (.srt) file to read from SRT_DIR.
target_srt = "ADVANC-updated.srt"

In [141]:
def to_second(date_time: datetime.time, offset: float = 0.25) -> float:
    """Convert a time of day into seconds since 00:00:00, plus a fixed shift.

    Args:
        date_time: Time value to convert (hour, minute, second, microsecond
            are used).
        offset: Seconds added to the result. The default of 0.25 keeps the
            shift this function has always applied; the notebook does not
            record why that value was chosen.

    Returns:
        The shifted time as a float number of seconds.
    """
    # The fraction and the offset are summed first, then added to the whole
    # seconds, matching the original arithmetic order exactly.
    fractional = (date_time.microsecond / 1e6) + offset
    return date_time.hour * 3600 + date_time.minute * 60 + date_time.second + fractional

In [142]:
def split_audio(audio_dir, timestamps, target_folder, sampling_rate=16000):
    """Cut one audio file into a WAV chunk per (start, stop) timestamp.

    Chunks are written to ``<target_folder>/<audio stem>/`` and named
    ``<audio stem>_chunk_<5-digit index>.wav``.

    Args:
        audio_dir: Path of the audio file to load (a file, despite the name).
        timestamps: Iterable of ``(start, stop)`` pairs in seconds. It is
            consumed once, so a one-shot iterator such as ``zip`` is fine.
        target_folder: Folder under which the chunk folder is created.
        sampling_rate: Rate passed to ``librosa.load``; defaults to 16000.

    Returns:
        List of chunk paths relative to ``target_folder``
        (``<audio stem>/<chunk file name>``), in timestamp order.
    """
    samples, sr = librosa.load(audio_dir, sr=sampling_rate)

    # Convert seconds to sample indices up front. Holding them in a list
    # gives tqdm a length, so it can show a real progress bar with a total.
    sample_ranges = [(int(start * sr), int(stop * sr)) for start, stop in timestamps]

    audio_name = pathlib.Path(audio_dir).stem
    dest_dir = f"{target_folder}/{audio_name}"
    # exist_ok avoids the separate existence check before creating.
    os.makedirs(dest_dir, exist_ok=True)

    output_file_list = []
    for i, (start_index, end_index) in enumerate(tqdm(sample_ranges)):
        relative_name = f"{audio_name}/{audio_name}_chunk_{i:05d}.wav"
        # Local is called `chunk` so it does not shadow this function's name.
        chunk = samples[start_index:end_index]
        # Write with the rate the samples were actually loaded at.
        sf.write(f"{target_folder}/{relative_name}", chunk, sr)
        output_file_list.append(relative_name)
    return output_file_list

In [143]:
subs = pysrt.open(str(SRT_DIR / target_srt))
# Drop cues with empty text once, so the three lists below stay aligned.
spoken_subs = [sub for sub in subs if sub.text != ""]
start_list = [to_second(cue.start.to_time()) for cue in spoken_subs]
end_list = [to_second(cue.end.to_time()) for cue in spoken_subs]
text_list = [cue.text for cue in spoken_subs]

# Split audio

In [145]:
# Path (as a string) of the downloaded audio file; despite the name it points to a file, not a directory.
audio_dir = str(AUDIO_DOWNLOAD_DIR / "Oppday Q32023 ADVANC บมจ แอดวานซ์ อินโฟร์ เซอร์วิส.mp4")

In [146]:
# One chunk per (start, end) subtitle pair, written under <ROOT_DIR>/data.
output_dirs = split_audio(
    audio_dir=audio_dir,
    timestamps=zip(start_list, end_list),
    target_folder=str(ROOT_DIR / "data"),
)

  y, sr = librosa.load(audio_dir, sr=sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
1152it [00:02, 490.87it/s]


In [147]:
# One row per audio chunk: chunk path, subtitle text, and its start/end in seconds.
label_df = pd.DataFrame(
    {
        "filename": output_dirs,
        "text": text_list,
        "start": start_list,
        "end": end_list,
    }
)
label_df.head()

Unnamed: 0,filename,text,start,end
0,Oppday Q32023 ADVANC บมจ แอดวานซ์ อินโฟร์ เซอร...,สวัสดีครับ,11.883,12.683
1,Oppday Q32023 ADVANC บมจ แอดวานซ์ อินโฟร์ เซอร...,ท่านนักลงทุนและท่านผู้สนใจเข้ารับฟังทุกท่านนะครับ,12.75,16.216
2,Oppday Q32023 ADVANC บมจ แอดวานซ์ อินโฟร์ เซอร...,รายงานผลประกอบการของ บริษัท,17.35,18.983
3,Oppday Q32023 ADVANC บมจ แอดวานซ์ อินโฟร์ เซอร...,แอดวานซ์ อินโฟร์ เซอร์วิส มหาชน จํากัด,18.983,20.75
4,Oppday Q32023 ADVANC บมจ แอดวานซ์ อินโฟร์ เซอร...,ครับ ประจําไตรมาส 3,21.183,22.583


# Concatenate with the previous train labels

In [111]:
# Label set name; it is interpolated into the label CSV file name.
mode = "train"

In [148]:
# Today's date formatted as DD_MM_YYYY, used as a suffix in the label CSV file name.
date_today = datetime.date.today().strftime("%d_%m_%Y")

In [149]:
# Path of the existing label CSV to extend (a file path, despite the `_dir` name).
# NOTE(review): this is the same path the final cell writes to, so re-running
# the notebook on the same day appends the new labels again — confirm intended.
prev_label_dir = ROOT_DIR / f"data/labels/{mode}_label_{date_today}.csv"

In [150]:
# Load the existing labels and append the rows built from this audio file.
prev_label = pd.read_csv(prev_label_dir)
print("shape before merge", prev_label.shape)
# Keep only the four label columns; any other column in the file is dropped.
prev_label = prev_label[['filename', 'text', 'start', 'end']]
# ignore_index=True renumbers the rows 0..n-1. Without it both frames keep
# their own 0-based labels and the result carries duplicate index values.
current_label = pd.concat((prev_label, label_df), axis=0, ignore_index=True)
print("shape after merge", current_label.shape)

shape before merge (1651, 5)
shape after merge (2803, 4)


In [135]:
# index=False: the row index is not label data. Writing it adds an unnamed
# extra column to the CSV, which then shows up when the file is read back.
current_label.to_csv(ROOT_DIR / f"data/labels/{mode}_label_{date_today}.csv", index=False)