In [2]:
import os
import shutil
import random
import pandas as pd
from glob import glob
from datasets import Dataset


# Seed the stdlib RNG so any `random`-based step in this notebook is repeatable.
random.seed(42)

# Split name used by the cells that read the notebook-level `split` variable.
split = 'train'

# Source dataset tree.
src_dir = '/media/theodore/TRANSCEND/data/vietnamese-speaker-lip-clip'
src_metadata_dir, src_visual_dir = (
    os.path.join(src_dir, subdir) for subdir in ('metadata', 'visual')
)

# Destination dataset tree.
dest_dir = '/media/theodore/TRANSCEND/data/vasr'
(
    dest_old_metadata_dir,
    dest_new_metadata_dir,
    dest_visual_dir,
    dest_audio_dir,
) = (
    os.path.join(dest_dir, subdir)
    for subdir in ('metadata', 'pretrain', 'visual', 'audio')
)

# Split proportions.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read every metadata parquet in the old metadata directory exactly once and
# reuse the loaded frames both for counting rows and for building `old_df`.
metadata_files = os.listdir(dest_old_metadata_dir)
channel_dfs = {
    metadata_file: pd.read_parquet(os.path.join(dest_old_metadata_dir, metadata_file))
    for metadata_file in metadata_files
}

# Row count per file, largest first. `sorted` is stable (also with
# `reverse=True`), so files with equal counts keep their `os.listdir` order.
# The order matters: the ids assigned below are derived from row position.
channel_num_samples = dict(
    sorted(
        (
            (metadata_file, channel_df.shape[0])
            for metadata_file, channel_df in channel_dfs.items()
        ),
        key=lambda item: item[1],
        reverse=True,
    )
)

old_df = pd.concat(
    [channel_dfs[metadata_file] for metadata_file in channel_num_samples],
    ignore_index=True,
)
del channel_dfs  # per-file frames are not needed once concatenated

# Keep the original identifier in `video`, then replace `id` with the
# zero-padded (7 digits) row position in the concatenated frame.
old_df['video'] = old_df['id']
old_df['id'] = old_df.index.map(lambda x: f'{x:07d}')
old_df

Unnamed: 0,id,channel,duration,fps,sampling_rate,video
0,0000000,behomethansohoc7,3.0,25,16000,728521255754192000100000-0-3
1,0000001,behomethansohoc7,3.0,25,16000,728521255754192000100000-2-5
2,0000002,behomethansohoc7,3.0,25,16000,728521255754192000100000-4-7
3,0000003,behomethansohoc7,3.0,25,16000,728521255754192000100000-6-9
4,0000004,behomethansohoc7,3.0,25,16000,728521255754192000100000-8-11
...,...,...,...,...,...,...
1282237,1282237,vietrilieu1006,3.0,25,16000,730153677436544948000008-0-3
1282238,1282238,vietrilieu1006,3.0,25,16000,730153677436544948000009-2-5
1282239,1282239,vietrilieu1006,3.0,25,16000,730153677436544948000009-4-7
1282240,1282240,vietrilieu1006,3.0,25,16000,730153677436544948000004-0-3


In [4]:
# Load the completed metadata of every split and tag each row with its split.
# The loop variable is deliberately not named `split`: that name is the
# notebook-level configuration variable, and looping over it would leave it
# rebound to 'val' for every cell executed afterwards.
dfs = []
for split_name in ['train', 'test', 'val']:
    split_df = pd.read_parquet(os.path.join(dest_new_metadata_dir, f'{split_name}_completed.parquet'))
    # A scalar broadcasts to every row whatever the frame's index is; a
    # positional pd.Series would be aligned on the index and could produce NaN.
    split_df['split'] = split_name
    dfs.append(split_df)
new_df = pd.concat(dfs)
new_df

Unnamed: 0,id,fps,sampling_rate,transcript,shard,video_num_frames,audio_num_frames,split
0,0264850,25,16000,bất cứ một chút thu hút nào cả có nghĩa rằng là,1,76,48000,train
1,1249460,25,16000,độ từ vựng gần như cao nhất trong sử dụng tiến...,1,76,48000,train
2,1225299,25,16000,khi mà mình bắt đầu làm các vi đi ô hay là mìn...,1,76,48000,train
3,1039184,25,16000,kỳ kỳ cầm một cái sản phẩm kỳ không có bỏ tiền...,1,76,48000,train
4,0406289,25,16000,em gặp rất là nhiều đối tượng và không có phân...,1,76,48000,train
...,...,...,...,...,...,...,...,...
128205,0220647,25,16000,tử tế niềm vui và hạnh phúc đến với,1283,76,48000,val
128206,0220804,25,16000,hoài là không có ai quan tâm đến bạn ngoài anh ta,1283,76,48000,val
128207,0194363,25,16000,làm mẹ nhưng vẫn chưa được ở ngoài ấy rất nhiều,1283,76,48000,val
128208,0139143,25,16000,đang kết kỉnh giống như ở trên trang đứa phân ...,1283,76,48000,val


In [5]:
# Left-join the per-split metadata onto the old metadata through `id`, keep only
# the columns needed to map a sample to its original clip, and persist the result.
mapping_columns = ['id', 'shard', 'channel', 'video', 'split']
joined_df = new_df.merge(old_df, how='left', on='id')[mapping_columns]
joined_df.to_parquet('mapping.parquet')
joined_df

Unnamed: 0,id,shard,channel,video,split
0,0264850,1,laihofficial,713610947467836137000000-50-53,train
1,1249460,1,dr.hieuielts,722709729912458778100000-18-21,train
2,1225299,1,bahuy.henry,728901091546110694600007-0-3,train
3,1039184,1,unofficiallykyky,729115300077145625800005-7-10,train
4,0406289,1,cohoichoai,703184817938137421000010-1-4,train
...,...,...,...,...,...
1282237,0220647,1283,lifecoachtuean,726858208965252224200001-40-43,val
1282238,0220804,1283,lifecoachtuean,703374814094352716900000-29-32,val
1282239,0194363,1283,tamtinhmebim.mcv,718576990105792025800037-2-5,val
1282240,0139143,1283,genz.justdoit,725146313466005427300000-2-5,val


In [12]:
# Spot-check: show the mapping row(s) for one specific sample id.
joined_df.loc[joined_df['id'] == '0140186']

Unnamed: 0,id,shard,channel,video,split
1175269,140186,213,genz.justdoit,705892226323987174500000-12-15,val


In [6]:
def move_files(sample, src_visual_dir, dest_visual_dir):
    """Relocate one clip from the per-channel source tree into its split/shard folder.

    The clip is looked up at ``<src_visual_dir>/<channel>/<video>.mp4`` and placed
    at ``<dest_visual_dir>/<split>_<shard zero-padded to 4>/<id>.mp4``.

    Args:
        sample: Mapping with the keys ``id``, ``video``, ``channel``, ``shard``
            and ``split``.
        src_visual_dir: Root directory of the source tree.
        dest_visual_dir: Root directory of the destination tree.

    Returns:
        None. The function only has filesystem side effects:

        * source missing: nothing happens (no directory is created);
        * destination missing: the source file is moved there;
        * destination present with the same size as the source: the redundant
          source file is deleted;
        * destination present with a different size: both files are left
          untouched and a ``RuntimeWarning`` is emitted, because the destination
          may be an incomplete copy and the source could be the only good file.
    """
    import warnings

    sample_id = sample['id']  # avoid shadowing the builtin `id`
    shard = str(sample['shard']).zfill(4)
    src_path = os.path.join(src_visual_dir, sample['channel'], sample['video'] + '.mp4')
    dest_path = os.path.join(dest_visual_dir, sample['split'] + '_' + shard, sample_id + '.mp4')

    # Never create directories in the source tree: a missing source simply
    # means there is nothing to move for this sample.
    if not os.path.exists(src_path):
        return

    if not os.path.exists(dest_path):
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        shutil.move(src_path, dest_path)
    elif os.path.getsize(dest_path) == os.path.getsize(src_path):
        # Destination already holds a file of the same size: drop the duplicate.
        os.remove(src_path)
    else:
        warnings.warn(
            f'size mismatch between {src_path} and {dest_path}; source kept',
            RuntimeWarning,
        )


# Apply `move_files` to every row of the mapping table using 6 worker processes.
# Caching is disabled so the map is always executed.
move_kwargs = {'src_visual_dir': src_visual_dir, 'dest_visual_dir': dest_visual_dir}
ds = Dataset.from_pandas(joined_df)
ds.map(move_files, fn_kwargs=move_kwargs, num_proc=6, load_from_cache_file=False)

Map (num_proc=6): 100%|██████████| 1282242/1282242 [04:01<00:00, 5314.91 examples/s]


Dataset({
    features: ['id', 'shard', 'channel', 'video', 'split'],
    num_rows: 1282242
})

In [7]:
import os
import torchaudio
from torchvision.io import read_video

def check_video(video_path):
    """Return True if `read_video` loads `video_path`, False if it raises any Exception."""
    try:
        read_video(video_path, pts_unit='sec')
    except Exception:
        is_readable = False
    else:
        is_readable = True
    return is_readable

def check_audio(audio_path):
    """Return True if `torchaudio.load` (soundfile backend) loads `audio_path`, False on any Exception."""
    try:
        torchaudio.load(audio_path, backend='soundfile')
    except Exception:
        is_readable = False
    else:
        is_readable = True
    return is_readable

def check_sanity(sample, visual_dir, audio_dir):
    """Record whether the video and audio files of one sample can be loaded.

    Files are expected at ``<visual_dir>/<split>_<shard zero-padded to 4>/<id>.mp4``
    and ``<audio_dir>/<split>_<shard zero-padded to 4>/<id>.wav``.

    Args:
        sample: Mapping with the keys ``id``, ``shard`` and ``split``.
        visual_dir: Root directory of the video files.
        audio_dir: Root directory of the audio files.

    Returns:
        The same `sample`, with ``visual_status`` and ``audio_status`` set to the
        results of `check_video` and `check_audio`.
    """
    clip_id = sample['id']
    shard_folder = sample['split'] + '_' + str(sample['shard']).zfill(4)

    sample['visual_status'] = check_video(
        os.path.join(visual_dir, shard_folder, clip_id + '.mp4')
    )
    sample['audio_status'] = check_audio(
        os.path.join(audio_dir, shard_folder, clip_id + '.wav')
    )
    return sample

# Run the load check on every mapped sample, then keep only the samples for
# which the video or the audio (or both) failed to load.
sanity_kwargs = {'visual_dir': dest_visual_dir, 'audio_dir': dest_audio_dir}
ds = Dataset.from_pandas(joined_df)
checked_ds = ds.map(
    check_sanity,
    fn_kwargs=sanity_kwargs,
    num_proc=15,
    load_from_cache_file=False,
)
new_ds = checked_ds.filter(
    lambda sample: not (sample['visual_status'] and sample['audio_status']),
    num_proc=6,
    load_from_cache_file=False,
)

Map (num_proc=15):  35%|███▌      | 450772/1282242 [56:24<1:30:38, 152.88 examples/s]moov atom not found
Map (num_proc=15):  82%|████████▏ | 1052156/1282242 [2:11:34<28:36, 134.06 examples/s] moov atom not found
Map (num_proc=15): 100%|██████████| 1282242/1282242 [2:42:46<00:00, 131.29 examples/s] 
Filter (num_proc=6): 100%|██████████| 1282242/1282242 [00:03<00:00, 354502.36 examples/s]


In [14]:
# Channels that own at least one sample that failed the load check.
miss_df = new_ds.to_pandas()
failed_channels = miss_df.loc[:, 'channel'].unique()
failed_channels.tolist()

['tinmoinhat369',
 'phanhoangmyvietjetair',
 'laihofficial',
 'gioiielts98',
 'tuquyenlifecoach',
 'thedev_dad',
 'duyyy.real.channel',
 'vtvgiaitriofficial',
 'hoangngoctu68',
 'duongduongpodcast',
 'thangbancaytinikun',
 'zing.podcast',
 'khanh.sartorial',
 'doctorhousing',
 'hien.thichhat',
 'gaufamilyy',
 'thalicvoice',
 'dong.congnghe',
 'fisc.vn',
 'dongthaptv',
 'thiendi180119',
 'trungthucedu',
 'ngao_bao',
 'nguoilamphim69',
 'kenh14specialvn',
 'saigonteu',
 'madisonmediagroup',
 'truyenthongkhangofficial',
 'quynhcuaneee',
 'thayqueo',
 'vivodio',
 'mcnguyenkhang',
 'henantrua.mcv',
 'mcvtop.mcv',
 'longkhoahoc',
 'markusnguyen37',
 'pcm.studio',
 'thuyle2000',
 'duyluandethuong',
 'gearvn.store',
 'nonkhanran',
 'vietsuccess',
 'luonganhmyy',
 'tungbtkhoinghiep',
 'tanab.ne',
 'vuongtuhoayogi',
 'leo.cuong',
 'huydao',
 'dangdocgiday',
 'radiotamsu03',
 'dangthuhaf',
 'hocvientommy',
 'haitrieucareer',
 'lepthelittlekid',
 'fptlongchau',
 'syhuyvuituoi',
 'nkhanhm2605',
 'u

In [None]:
import os
import torchaudio
from datasets import load_dataset
from torchvision.io import read_video

def get_num_frames(sample: dict, visual_dir: str, audio_dir: str, split: str):
    """Measure the lengths of one sample's video and audio files.

    Files are read from ``<visual_dir>/<split>_<shard zero-padded to 4>/<id>.mp4``
    and ``<audio_dir>/<split>_<shard zero-padded to 4>/<id>.wav``.

    Args:
        sample: Mapping with the keys ``id``, ``shard``, ``fps``,
            ``sampling_rate`` and ``transcript``.
        visual_dir: Root directory of the video files.
        audio_dir: Root directory of the audio files.
        split: Split name used as the prefix of the shard folder.

    Returns:
        A dict with the sample's ``id``, ``shard``, ``fps``, ``sampling_rate`` and
        ``transcript`` plus ``video_num_frames`` and ``audio_num_frames``, taken
        from the first dimension of the loaded video and audio tensors.
    """
    shard_folder = split + '_' + str(sample['shard']).zfill(4)
    clip_id = sample['id']

    # output_format='THWC' puts the frame axis first.
    frames, _, _ = read_video(
        os.path.join(visual_dir, shard_folder, clip_id + '.mp4'),
        pts_unit='sec',
        output_format='THWC',
    )
    # channels_first=False is requested so that shape[0] is not the channel axis.
    waveform, _ = torchaudio.load(
        os.path.join(audio_dir, shard_folder, clip_id + '.wav'),
        channels_first=False,
    )

    result = {key: sample[key] for key in ('id', 'shard', 'fps', 'sampling_rate')}
    result['video_num_frames'] = frames.shape[0]
    result['audio_num_frames'] = waveform.shape[0]
    result['transcript'] = sample['transcript']
    return result

# Resolve the inputs from the notebook's configuration variables. Without these
# definitions the cell raises NameError when the notebook is run top to bottom.
# NOTE: `split` is the notebook-level variable; make sure it holds the split you
# intend to process before running this cell.
metadata_path = os.path.join(dest_new_metadata_dir, f'{split}.parquet')
visual_dir = dest_visual_dir
audio_dir = dest_audio_dir

# `split='train'` here selects the split that `load_dataset` creates for the
# given data file; it is unrelated to the notebook's `split` variable.
ds = load_dataset('parquet', data_files=metadata_path, split='train')

# Add frame counts to every sample and drop the `duration` column.
ds = ds.map(
    get_num_frames,
    fn_kwargs={
        'visual_dir': visual_dir,
        'audio_dir': audio_dir,
        'split': split,
    },
    num_proc=4,
    load_from_cache_file=False,
    remove_columns=['duration'],
)

# Write the result next to the input as `<split>_completed.parquet`.
ds.to_parquet(os.path.join(os.path.dirname(metadata_path), f'{split}_completed.parquet'))
ds[0]

In [1]:
import os
import torchaudio
from datasets import load_dataset
from torchvision.io import read_video

def get_num_frames(sample: dict, visual_dir: str, audio_dir: str, split: str):
    """Measure the lengths of one sample's video and audio files.

    Files are read from ``<visual_dir>/<split>_<shard zero-padded to 4>/<id>.mp4``
    and ``<audio_dir>/<split>_<shard zero-padded to 4>/<id>.wav``.

    Args:
        sample: Mapping with the keys ``id``, ``shard``, ``fps``,
            ``sampling_rate`` and ``transcript``.
        visual_dir: Root directory of the video files.
        audio_dir: Root directory of the audio files.
        split: Split name used as the prefix of the shard folder.

    Returns:
        A dict with the sample's ``id``, ``shard``, ``fps``, ``sampling_rate`` and
        ``transcript`` plus ``video_num_frames`` and ``audio_num_frames``, taken
        from the first dimension of the loaded video and audio tensors.
    """
    shard_folder = split + '_' + str(sample['shard']).zfill(4)
    clip_id = sample['id']

    # output_format='THWC' puts the frame axis first.
    frames, _, _ = read_video(
        os.path.join(visual_dir, shard_folder, clip_id + '.mp4'),
        pts_unit='sec',
        output_format='THWC',
    )
    # channels_first=False is requested so that shape[0] is not the channel axis.
    waveform, _ = torchaudio.load(
        os.path.join(audio_dir, shard_folder, clip_id + '.wav'),
        channels_first=False,
    )

    result = {key: sample[key] for key in ('id', 'shard', 'fps', 'sampling_rate')}
    result['video_num_frames'] = frames.shape[0]
    result['audio_num_frames'] = waveform.shape[0]
    result['transcript'] = sample['transcript']
    return result

# Split to process and the dataset locations derived from it.
split = 'train'
data_dir = '/media/theodore/TRANSCEND/data/vasr'
visual_dir, audio_dir = (
    os.path.join(data_dir, modality) for modality in ('visual', 'audio')
)
metadata_path = os.path.join(data_dir, 'pretrain', f'{split}.parquet')
# The result is written next to the input metadata file.
completed_path = os.path.join(os.path.dirname(metadata_path), f'{split}_completed.parquet')

# Add frame counts to every sample and drop the `duration` column.
frame_count_kwargs = {'visual_dir': visual_dir, 'audio_dir': audio_dir, 'split': split}
ds = load_dataset('parquet', data_files=metadata_path, split='train')
ds = ds.map(
    get_num_frames,
    fn_kwargs=frame_count_kwargs,
    num_proc=4,
    load_from_cache_file=False,
    remove_columns=['duration'],
)
ds.to_parquet(completed_path)
ds[0]

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=4): 100%|██████████| 1025921/1025921 [3:48:38<00:00, 74.78 examples/s]  
Creating parquet from Arrow format: 100%|██████████| 1026/1026 [00:00<00:00, 1601.23ba/s]


{'id': '0264850',
 'fps': 25,
 'sampling_rate': 16000,
 'transcript': 'bất cứ một chút thu hút nào cả có nghĩa rằng là',
 'shard': 1,
 'video_num_frames': 76,
 'audio_num_frames': 48000}