In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import shutil
import zipfile
import tarfile

This notebook assumes that the following files are located in `DATA_PATH`: `zippedFaces.tar.gz`, the merged `vox1_dev_wav.zip`, the test set `vox1_test_wav.zip`, the textual video frames `vox1_dev_txt.zip` and `vox1_test_txt.zip`, the splits archive `Splits.zip`, the test pairs `testpairs.zip`, and the meta information files (e.g. `vox1_meta.csv`).

In [2]:
# Root folder holding the downloaded VoxCeleb1 archives and metadata files.
# It can be overridden with the VOXCELEB1_DATA_PATH environment variable so the
# notebook is not tied to a single machine. Joining with '' guarantees a
# trailing separator, which matters because this notebook also builds paths by
# plain string concatenation onto DATA_PATH.
DATA_PATH = os.path.join(
    os.environ.get('VOXCELEB1_DATA_PATH', '/home/hdd/data/voxceleb1/'), ''
)

In [3]:
def _in_data_dir(relative_name):
    """Return `relative_name` joined onto DATA_PATH."""
    return os.path.join(DATA_PATH, relative_name)


# Archives expected inside DATA_PATH (inputs).
ZIPPED_FACES_PATH = _in_data_dir('zippedFaces.tar.gz')

AUDIO_DEV_PATH = _in_data_dir('vox1_dev_wav.zip')
AUDIO_TEST_PATH = _in_data_dir('vox1_test_wav.zip')
VIDEO_TXT_DEV_PATH = _in_data_dir('vox1_dev_txt.zip')
VIDEO_TXT_TEST_PATH = _in_data_dir('vox1_test_txt.zip')
SPLIT_ZIP_PATH = _in_data_dir('Splits.zip')
TESTPAIRS_ZIP_PATH = _in_data_dir('testpairs.zip')

# Folders used after extraction; the trailing slash is kept on purpose because
# other cells append folder names to these strings directly.
AUDIO_PATH = _in_data_dir('audio/')
VIDEO_PATH = _in_data_dir('video/')
VIDEO_TXT_PATH = _in_data_dir('txt/')

Unzip the content of the archives and change the folder names

In [4]:
# Unpack every archive into DATA_PATH.
# Context managers make sure each archive handle is closed even if extraction
# fails, and the builtin `zip` is no longer shadowed by a ZipFile object.
print('Starting to unpack zippedFaces.tar.gz')
with tarfile.open(ZIPPED_FACES_PATH) as faces_archive:
    faces_archive.extractall(DATA_PATH)

# Each stage: the progress message printed first, then the zip files to extract.
zip_stages = [
    ('Done. Starting to unpack vox1_dev_wav.zip', [AUDIO_DEV_PATH]),
    ('Done. Starting to unpack vox1_test_wav.zip', [AUDIO_TEST_PATH]),
    ('Done. Staring to unpack vox1_dev_txt.zip and vox1_test_txt.zip',
     [VIDEO_TXT_DEV_PATH, VIDEO_TXT_TEST_PATH]),
    ('Done. Staring to unpack Splits.zip and testpairs.zip',
     [SPLIT_ZIP_PATH, TESTPAIRS_ZIP_PATH]),
]

for stage_message, archive_paths in zip_stages:
    print(stage_message)
    for archive_path in archive_paths:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(DATA_PATH)

print('Done')

Starting to unpack zippedFaces.tar.gz
Done. Starting to unpack vox1_dev_wav.zip
Done. Starting to unpack vox1_test_wav.zip
Done. Staring to unpack vox1_dev_txt.zip and vox1_test_txt.zip
Done. Staring to unpack Splits.zip and testpairs.zip
Done


In [5]:
# Give the freshly extracted folders their final names:
# `wav` becomes AUDIO_PATH and `unzippedFaces` becomes VIDEO_PATH.
for extracted_name, final_path in (('wav', AUDIO_PATH), ('unzippedFaces', VIDEO_PATH)):
    os.rename(os.path.join(DATA_PATH, extracted_name), final_path)

Remove `1.6/` folder from `video` (prev. `unzippedFaces`)

In [6]:
# Move the content of every `<identity>/1.6/` folder one level up and delete
# the now-empty `1.6/` folder.
basedir = os.path.join(DATA_PATH, 'video/')

list_of_dirs = os.listdir(basedir)

for folder in tqdm(list_of_dirs):
    root = os.path.join(basedir, folder)
    nested = os.path.join(root, '1.6')

    # Entries without a `1.6` directory were already flattened (or are not
    # identity folders at all); skipping them makes the cell safe to re-run.
    if not os.path.isdir(nested):
        continue

    for folder2 in os.listdir(nested):
        shutil.move(os.path.join(nested, folder2), os.path.join(root, folder2))

    os.rmdir(nested)

100%|██████████| 1251/1251 [00:06<00:00, 180.77it/s]


Let's rename the identity folders (`id1****`) in the `txt` and `audio` folders to the real celebrity names. First, we need to load the metadata.

In [7]:
# Tab-separated metadata table shipped with the dataset.
meta_csv_path = os.path.join(DATA_PATH, 'vox1_meta.csv')
meta = pd.read_csv(meta_csv_path, delimiter='\t')
meta.head()

Unnamed: 0,VoxCeleb1 ID,VGGFace1 ID,Gender,Nationality,Set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev


Map the `id1****` folder names to the real celebrity names

In [8]:
# Lookup table: VoxCeleb1 ID -> celebrity name (the 'VGGFace1 ID' column).
id2name = meta.set_index('VoxCeleb1 ID')['VGGFace1 ID'].to_dict()

# The same renaming is applied to the audio folder and to the textual video
# (txt) folder, so both are handled by one loop.
for base_path in (AUDIO_PATH, VIDEO_TXT_PATH):
    list_of_dirs = os.listdir(base_path)

    for folder in tqdm(list_of_dirs):
        new_name = id2name.get(folder)

        # Entries that are not VoxCeleb1 IDs (e.g. folders renamed by an
        # earlier run of this cell) are left untouched instead of raising
        # KeyError, which makes the cell safe to re-run.
        if new_name is None:
            continue

        os.rename(os.path.join(base_path, folder), os.path.join(base_path, new_name))

100%|██████████| 1251/1251 [00:00<00:00, 109043.71it/s]
100%|██████████| 1251/1251 [00:01<00:00, 662.15it/s]


Just in case: map the real celebrity names back to `id1****`

In [19]:
# NOTE: this cell is intentionally left commented out. It is the inverse of the
# ID -> name renaming: it maps celebrity names ('VGGFace1 ID') back to
# 'VoxCeleb1 ID' and renames the folders under AUDIO_PATH and VIDEO_TXT_PATH
# accordingly. Uncomment only if the renaming has to be reverted.

# name2id = meta.set_index('VGGFace1 ID')['VoxCeleb1 ID'].to_dict()

# ### AUDIO ###
# list_of_dirs = os.listdir(AUDIO_PATH)

# for folder in tqdm(list_of_dirs):
#     root = AUDIO_PATH + folder
#     prev_name = folder
#     new_name = name2id[folder]
#     os.rename(AUDIO_PATH + prev_name, AUDIO_PATH + new_name)
    
# ### TEXTUAL VIDEO (txt) ###
# list_of_dirs = os.listdir(VIDEO_TXT_PATH)

# for folder in tqdm(list_of_dirs):
#     root = VIDEO_TXT_PATH + folder
#     prev_name = folder
#     new_name = name2id[folder]
#     os.rename(VIDEO_TXT_PATH + prev_name, VIDEO_TXT_PATH + new_name)

100%|██████████| 1251/1251 [00:00<00:00, 82966.88it/s]
100%|██████████| 1251/1251 [00:00<00:00, 101819.69it/s]


Merge `txt` and `video` folders

In [9]:
# Walk the txt tree and move every file into the mirrored location under the
# video tree, replacing any file of the same name that is already there.
for txt_dir, _subdirs, filenames in tqdm(os.walk(VIDEO_TXT_PATH)):
    # Same relative folder, but rooted at VIDEO_PATH instead of VIDEO_TXT_PATH.
    mirrored_dir = txt_dir.replace(VIDEO_TXT_PATH, VIDEO_PATH, 1)
    if not os.path.exists(mirrored_dir):
        os.makedirs(mirrored_dir)

    for filename in filenames:
        existing_target = os.path.join(mirrored_dir, filename)
        if os.path.exists(existing_target):
            os.remove(existing_target)
        shutil.move(os.path.join(txt_dir, filename), mirrored_dir)

23748it [00:26, 912.42it/s] 


Remove the tracks listed in `voice_set_labels.txt` (i.e. the dataset) that do not have frames in `zippedFaces.tar.gz`. The authors' remark, _"we should never have more than 20 frames in a sequence from the same track"_, is taken into account.

In [4]:
# Split definition file; it is read below as space-separated `path phase` rows.
SPLIT_PATH = os.path.join(DATA_PATH, 'Splits/voice_set_labels.txt')

In [30]:
## outputs unique tracks that have at least one frame missing.

# Only frame numbers divisible by FRAME_STEP are checked (presumably one frame
# per second of 25 fps video -- TODO confirm), and at most
# MAX_FRAMES_PER_TRACK of them per face-track (see the asterisk on the
# project page).
FRAME_STEP = 25
MAX_FRAMES_PER_TRACK = 20

tracks_to_filter_path = os.path.join(DATA_PATH, 'Splits/tracks_to_filter.txt')
# VIDEO_PATH ends with a separator, so cutting this many characters off an
# absolute track path leaves the path relative to the video folder.
index_to_cut_filepath = len(VIDEO_PATH)

tracks_to_filter = []

voice_set_labels = pd.read_table(SPLIT_PATH, sep=' ', names=['path', 'phase'])
# Turn `<...>_000NNNN.wav` into `<...>/0NNNN`; the dot is escaped so that only
# a literal `.wav` suffix is stripped.
voice_set_labels = voice_set_labels.replace({'_000': '/0', r'\.wav$': ''}, regex=True)

for path in tqdm(voice_set_labels['path']):
    video_path = os.path.join(VIDEO_PATH, path + '.txt')
    # Relative track name, still carrying the `.txt` extension.
    track_name = video_path[index_to_cut_filepath:]

    # Without the audio file there is nothing to check for this entry. Skip it
    # explicitly instead of re-using the frame table of the previous track.
    if not os.path.isfile(os.path.join(AUDIO_PATH, path + '.wav')):
        continue

    frames = pd.read_table(video_path, skiprows=6, usecols=['FRAME '])

    # A track description without any frame rows cannot have its frames on disk.
    if frames.empty:
        tracks_to_filter.append(track_name)
        continue

    earliest = frames['FRAME '].iloc[0]
    latest = frames['FRAME '].iloc[-1]
    frame_list = np.arange(earliest, latest + 1)
    frames_sec = frame_list[frame_list % FRAME_STEP == 0][:MAX_FRAMES_PER_TRACK]

    # `path[:-5]` drops the 5-character track number, leaving the folder that
    # holds the frames. `any` stops at the first missing frame.
    has_missing_frame = any(
        not os.path.isfile(
            os.path.join(VIDEO_PATH, path[:-5] + '{0:07d}.jpg'.format(frame_number))
        )
        for frame_number in frames_sec
    )
    if has_missing_frame:
        tracks_to_filter.append(track_name)

with open(tracks_to_filter_path, 'w') as out_f:
    out_f.writelines(track + '\n' for track in tracks_to_filter)

100%|██████████| 153486/153486 [03:35<00:00, 711.91it/s]


In [32]:
# Number of tracks that remain after filtering; derived from the loaded split
# table instead of a hard-coded row count.
len(voice_set_labels) - len(tracks_to_filter)

153333

In [41]:
# Rows of the split table whose track was flagged as having missing frames
# (entries in `tracks_to_filter` carry the `.txt` extension).
mask = (voice_set_labels['path'] + '.txt').isin(tracks_to_filter)
indices = voice_set_labels[mask].index
print('{}'.format(len(indices)))

filteredpath = os.path.join(DATA_PATH, 'Splits', 'filtered_voice_set_labels.txt')

# Copy the original split file line by line, leaving out the flagged rows.
# Line numbers are matched against the DataFrame's row labels; this assumes the
# split file has no blank lines, so both numberings agree -- TODO confirm.
rows_to_drop = set(indices.tolist())
with open(SPLIT_PATH, 'r') as r, open(filteredpath, 'w') as w:
    w.writelines(line for idx, line in enumerate(r) if idx not in rows_to_drop)

153


In [39]:
## which phases the missing files belong to (1 and 3 -- train and seen test)
missing_phase_counts = voice_set_labels.loc[mask, 'phase'].value_counts()
print(missing_phase_counts)

1    151
3      2
Name: phase, dtype: int64
