In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import shutil
import zipfile
import tarfile

This notebook assumes that `zippedFaces.tar.gz`, the merged `vox1_dev_wav.zip`, the test set `vox1_test_wav.zip`, the textual video-frame archives `vox1_dev_txt.zip` and `vox1_test_txt.zip`, as well as the identity split, verification list, and meta information files, are all located in `DATA_PATH`.

In [2]:
# Root folder holding the downloaded VoxCeleb1 archives and metadata files.
# Set the VC1_DATA_PATH environment variable to point the notebook at another
# location without editing this cell; otherwise the original default is used.
DATA_PATH = os.environ.get('VC1_DATA_PATH', '/home/nvme/data/vc1/')

In [3]:
# Archives as downloaded; all of them are expected directly inside DATA_PATH.
(
    ZIPPED_FACES_PATH,
    AUDIO_DEV_PATH,
    AUDIO_TEST_PATH,
    VIDEO_TXT_DEV_PATH,
    VIDEO_TXT_TEST_PATH,
) = (
    os.path.join(DATA_PATH, archive_name)
    for archive_name in (
        'zippedFaces.tar.gz',
        'vox1_dev_wav.zip',
        'vox1_test_wav.zip',
        'vox1_dev_txt.zip',
        'vox1_test_txt.zip',
    )
)

# Folders this notebook works with. The trailing slash is intentional: later
# cells treat these paths as plain strings (e.g. the str.replace() that maps
# a txt/ sub-path onto the matching video/ sub-path).
AUDIO_PATH, VIDEO_PATH, VIDEO_TXT_PATH = (
    os.path.join(DATA_PATH, folder_name)
    for folder_name in ('audio/', 'video/', 'txt/')
)

Unzip the content of the archives and change the folder names

In [4]:
# Unpack every archive into DATA_PATH. Each archive is opened in a `with`
# block so its file handle is closed even if extraction fails midway, and the
# handles get descriptive names instead of shadowing the built-in `zip`.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on archives obtained from the official dataset source.
print('Starting to unpack zippedFaces.tar.gz')
with tarfile.open(ZIPPED_FACES_PATH) as faces_tar:
    faces_tar.extractall(DATA_PATH)

print('Done. Starting to unpack vox1_dev_wav.zip')
with zipfile.ZipFile(AUDIO_DEV_PATH, 'r') as audio_dev_zip:
    audio_dev_zip.extractall(DATA_PATH)

print('Done. Starting to unpack vox1_test_wav.zip')
with zipfile.ZipFile(AUDIO_TEST_PATH, 'r') as audio_test_zip:
    audio_test_zip.extractall(DATA_PATH)

print('Done. Staring to unpack vox1_dev_txt.zip and vox1_test_txt.zip')
with zipfile.ZipFile(VIDEO_TXT_DEV_PATH, 'r') as txt_dev_zip:
    txt_dev_zip.extractall(DATA_PATH)
with zipfile.ZipFile(VIDEO_TXT_TEST_PATH, 'r') as txt_test_zip:
    txt_test_zip.extractall(DATA_PATH)

print('Done')

Starting to unpack zippedFaces.tar.gz
Done. Starting to unpack vox1_dev_wav.zip
Done. Starting to unpack vox1_test_wav.zip
Done. Staring to unpack vox1_dev_txt.zip and vox1_test_txt.zip
Done


In [5]:
# Give the extracted folders their final names:
#   wav/ -> audio/   and   unzippedFaces/ -> video/
for extracted_name, final_path in (('wav', AUDIO_PATH), ('unzippedFaces', VIDEO_PATH)):
    os.rename(os.path.join(DATA_PATH, extracted_name), final_path)

Remove the intermediate `1.6/` folder from every identity folder in `video` (previously `unzippedFaces`) by moving its contents one level up.

In [6]:
# Flatten video/<identity>/1.6/<entry> into video/<identity>/<entry>.
list_of_dirs = os.listdir(VIDEO_PATH)

for folder in tqdm(list_of_dirs):
    root = os.path.join(VIDEO_PATH, folder)
    version_dir = os.path.join(root, '1.6')

    # Skip entries without a 1.6/ level (already flattened, or a stray file) so
    # the cell can be re-run safely instead of crashing inside os.listdir().
    if not os.path.isdir(version_dir):
        continue

    for folder2 in os.listdir(version_dir):
        shutil.move(os.path.join(version_dir, folder2), os.path.join(root, folder2))

    # os.rmdir only succeeds on an empty folder, i.e. once everything was moved out.
    os.rmdir(version_dir)

100%|██████████| 1251/1251 [00:00<00:00, 1835.47it/s]


Let's rename the identity folders in the `video` folder from real celebrity names to their VoxCeleb1 IDs (`id1****`, e.g. `id10001`). First, we need to load the metadata.

In [7]:
# The metadata file is tab-separated; it links each VoxCeleb1 ID to a VGGFace1 ID (name).
meta_csv_path = os.path.join(DATA_PATH, 'vox1_meta.csv')
meta = pd.read_csv(meta_csv_path, sep='\t')
meta.head()

Unnamed: 0,VoxCeleb1 ID,VGGFace1 ID,Gender,Nationality,Set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev


Let's rename them

In [8]:
# Map each celebrity name (VGGFace1 ID) to its VoxCeleb1 ID,
# e.g. 'A.J._Buckley' -> 'id10001'.
name2id = meta.set_index('VGGFace1 ID')['VoxCeleb1 ID'].to_dict()
known_ids = set(name2id.values())
list_of_dirs = os.listdir(VIDEO_PATH)

for folder in tqdm(list_of_dirs):
    # Folder already carries a VoxCeleb1 ID (e.g. the cell is being re-run):
    # nothing to do, and looking it up in name2id would raise KeyError.
    if folder in known_ids:
        continue
    # A folder name missing from the metadata still raises KeyError, as before.
    os.rename(os.path.join(VIDEO_PATH, folder), os.path.join(VIDEO_PATH, name2id[folder]))

100%|██████████| 1251/1251 [00:00<00:00, 87259.27it/s]


Merge the `txt` folder into the `video` folder. Make sure both have the same folder structure.

In [9]:
# Walk the txt/ tree and move every file to the mirrored location under video/.
for txt_dir, _subdirs, txt_files in tqdm(os.walk(VIDEO_TXT_PATH)):
    # Same relative sub-path, but rooted at video/ instead of txt/.
    video_dir = txt_dir.replace(VIDEO_TXT_PATH, VIDEO_PATH, 1)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)

    for filename in txt_files:
        existing_copy = os.path.join(video_dir, filename)
        # Remove a file of the same name first so the move cannot collide with it.
        if os.path.exists(existing_copy):
            os.remove(existing_copy)
        shutil.move(os.path.join(txt_dir, filename), video_dir)

23748it [00:04, 5523.31it/s]


Just in case: rename the `id1****` folders to the celebrities' real names.

In [10]:
# Optional helper, disabled by default (uncomment to run; needs `meta`):
# renames the identity folders in both audio/ and video/ from VoxCeleb1 IDs
# to the corresponding VGGFace1 IDs (celebrity names).

# id2name = meta.set_index('VoxCeleb1 ID')['VGGFace1 ID'].to_dict()

# ### AUDIO ###
# list_of_dirs = os.listdir(AUDIO_PATH)

# for folder in tqdm(list_of_dirs):
#     root = AUDIO_PATH + folder
#     prev_name = folder
#     new_name = id2name[folder]
#     os.rename(AUDIO_PATH + prev_name, AUDIO_PATH + new_name)

# ### VIDEO ###
# list_of_dirs = os.listdir(VIDEO_PATH)

# for folder in tqdm(list_of_dirs):
#     root = VIDEO_PATH + folder
#     prev_name = folder
#     new_name = id2name[folder]
#     os.rename(VIDEO_PATH + prev_name, VIDEO_PATH + new_name)

100%|██████████| 1251/1251 [00:00<00:00, 101541.86it/s]
100%|██████████| 1251/1251 [00:00<00:00, 113025.04it/s]


And back: rename the folders from the celebrities' real names to `id1****`.

In [11]:
# Optional helper, disabled by default (uncomment to run; needs `meta`):
# renames the identity folders in both audio/ and video/ from VGGFace1 IDs
# (celebrity names) to the corresponding VoxCeleb1 IDs.

# name2id = meta.set_index('VGGFace1 ID')['VoxCeleb1 ID'].to_dict()

# ### AUDIO ###
# list_of_dirs = os.listdir(AUDIO_PATH)

# for folder in tqdm(list_of_dirs):
#     root = AUDIO_PATH + folder
#     prev_name = folder
#     new_name = name2id[folder]
#     os.rename(AUDIO_PATH + prev_name, AUDIO_PATH + new_name)
    
# ### VIDEO ###
# list_of_dirs = os.listdir(VIDEO_PATH)

# for folder in tqdm(list_of_dirs):
#     root = VIDEO_PATH + folder
#     prev_name = folder
#     new_name = name2id[folder]
#     os.rename(VIDEO_PATH + prev_name, VIDEO_PATH + new_name)

100%|██████████| 1251/1251 [00:00<00:00, 97310.40it/s]
100%|██████████| 1251/1251 [00:00<00:00, 101484.91it/s]


Clean up the mess

In [12]:
# Delete the leftover txt/ tree once its files have been merged into video/.
# The existence check keeps the cell re-runnable after the folder is gone
# (a bare rmtree would raise FileNotFoundError on the second run).
if os.path.isdir(VIDEO_TXT_PATH):
    shutil.rmtree(VIDEO_TXT_PATH)