In [None]:
# VoxCeleb1 dataset page: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html
# NOTE: these are shell commands, not Python -- run them in a terminal from the
# directory the notebook reads its data from.

# Metadata table, txt annotation archives and the test wav archive.
# `-c` resumes an interrupted download instead of starting from scratch.
wget -c https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/vox1_meta.csv
wget -c https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox1_dev_txt.zip
wget -c https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/vox1_test_txt.zip
wget -c https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip

# The dev wav archive is published as four parts (aa..ad).
wget -c https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa
wget -c https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab
wget -c https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac
wget -c https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad

# Append parts ab..ad onto part aa in order, deleting each part only after it
# was appended successfully, then give the joined file its .zip name.
cat vox1_dev_wav_partab >> vox1_dev_wav_partaa && rm vox1_dev_wav_partab
cat vox1_dev_wav_partac >> vox1_dev_wav_partaa && rm vox1_dev_wav_partac
cat vox1_dev_wav_partad >> vox1_dev_wav_partaa && rm vox1_dev_wav_partad
mv vox1_dev_wav_partaa vox1_dev_wav.zip

# Mount the archives instead of extracting them.
# `-y` keeps the install non-interactive; `mkdir -p` makes this step re-runnable.
sudo apt-get install -y fuse-zip
mkdir -p dev_wav test_wav dev_txt test_txt
# -r: mount read-only
fuse-zip -r vox1_dev_wav.zip dev_wav
fuse-zip -r vox1_dev_txt.zip dev_txt
fuse-zip -r vox1_test_wav.zip test_wav
fuse-zip -r vox1_test_txt.zip test_txt

In [None]:
!pip3 install torch librosa soundfile speechbrain numpy pandas tqdm

In [1]:
import torch
import librosa
import soundfile
import speechbrain
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm

In [14]:
# Read the tab-separated metadata table and replace its header with short
# column names.
meta_columns = ['id', 'name', 'gender', 'nationality', 'set']
df_meta = pd.read_csv('vox1_meta.csv', sep='\t').set_axis(meta_columns, axis=1)

In [15]:
# Index the first utterance (00001.wav) of every <id>/<video> folder in the dev wav mount.
files = glob('dev_wav/wav/*/*/00001.wav')
# Take the last three path components (<id>/<video>/<file>) so the parsing does
# not depend on the character length of the 'dev_wav/wav/' prefix.
parts = [f.split('/')[-3:] for f in files]
# With no matches, fall back to three empty columns instead of failing to unpack.
ids, videos, wavs = (zip(*parts) if parts else ((), (), ()))
df_dev_wav = pd.DataFrame({'id': ids, 'video': videos, 'wav': wavs})

In [16]:
# Index the first annotation file (00001.txt) of every <id>/<video> folder in the dev txt mount.
files = glob('dev_txt/txt/*/*/00001.txt')
# Take the last three path components (<id>/<video>/<file>) so the parsing does
# not depend on the character length of the 'dev_txt/txt/' prefix.
parts = [f.split('/')[-3:] for f in files]
# With no matches, fall back to three empty columns instead of failing to unpack.
ids, videos, txts = (zip(*parts) if parts else ((), (), ()))
df_dev_txt = pd.DataFrame({'id': ids, 'video': videos, 'txt': txts})

In [17]:
# Index the first utterance (00001.wav) of every <id>/<video> folder in the test wav mount.
files = glob('test_wav/wav/*/*/00001.wav')
# Take the last three path components (<id>/<video>/<file>) so the parsing does
# not depend on the character length of the 'test_wav/wav/' prefix.
parts = [f.split('/')[-3:] for f in files]
# With no matches, fall back to three empty columns instead of failing to unpack.
ids, videos, wavs = (zip(*parts) if parts else ((), (), ()))
df_test_wav = pd.DataFrame({'id': ids, 'video': videos, 'wav': wavs})

In [18]:
# Index the first annotation file (00001.txt) of every <id>/<video> folder in the test txt mount.
files = glob('test_txt/txt/*/*/00001.txt')
# Take the last three path components (<id>/<video>/<file>) so the parsing does
# not depend on the character length of the 'test_txt/txt/' prefix.
parts = [f.split('/')[-3:] for f in files]
# With no matches, fall back to three empty columns instead of failing to unpack.
ids, videos, txts = (zip(*parts) if parts else ((), (), ()))
df_test_txt = pd.DataFrame({'id': ids, 'video': videos, 'txt': txts})

In [19]:
# Stack the dev and test listings (dev rows first). DataFrame.append was removed
# in pandas 2.0; pd.concat gives the same result, keeping each frame's own index.
df_wav = pd.concat([df_dev_wav, df_test_wav])
df_txt = pd.concat([df_dev_txt, df_test_txt])

In [20]:
# Keep only the (id, video) pairs that have both a wav file and a txt file.
df_wav_txt = df_wav.merge(df_txt, how='inner', on=['id', 'video'])

In [21]:
# Attach the metadata columns to every wav/txt row whose id appears in the metadata table.
df = df_meta.merge(df_wav_txt, how='inner', on='id')

In [24]:
# Build each recording's path as <set>_wav/wav/<id>/<video>/<wav>.
wav_paths = df['set'] + '_wav/wav/' + df['id'] + '/' + df['video'] + '/' + df['wav']
# Read the duration (in seconds) from the file header with soundfile.
# librosa.get_duration(filename=...) is deprecated in newer librosa releases,
# whereas soundfile.info is already available through the notebook's imports.
df['duration'] = [soundfile.info(f).duration for f in tqdm(wav_paths)]

100%|████████████████████████████████████| 22496/22496 [01:16<00:00, 292.61it/s]


In [25]:
def extract_offset_and_frame(txt_path):
    """Parse two integers out of an annotation txt file.

    Returns a tuple ``(offset, frame)`` where ``offset`` is the integer found
    on the third line after its first 13 characters, and ``frame`` is the
    integer formed by the first 6 characters of the eighth line (presumably
    the 'Offset' header line and the first row of the frame table -- TODO
    confirm against the dataset files).

    Raises IndexError if the file has fewer than eight lines and ValueError
    if either field is not an integer.
    """
    with open(txt_path) as handle:
        lines = handle.read().splitlines()
    offset_field = lines[2][13:]
    frame_field = lines[7][:6]
    return int(offset_field), int(frame_field)

# Annotation file path for every row: <set>_txt/txt/<id>/<video>/<txt>.
txt_paths = df['set'] + '_txt/txt/' + df['id'] + '/' + df['video'] + '/' + df['txt']
# Parse every annotation file once, then split the (offset, frame) pairs into two columns.
offset_frame_pairs = [extract_offset_and_frame(path) for path in tqdm(txt_paths)]
df['offset'], df['frame'] = zip(*offset_frame_pairs)
# frame / 25 truncated to an integer (presumably 25 fps video -- TODO confirm).
# NOTE(review): `offset` is not added to `start`, and the table displayed further
# down shows fractional `start` values that this integer cast would truncate --
# confirm which variant is intended.
df['start'] = (df['frame'] / 25).astype(int)

100%|███████████████████████████████████| 22496/22496 [00:05<00:00, 4394.08it/s]


In [26]:
# Preview the first three rows of the assembled table.
df.head(3)

Unnamed: 0,id,name,gender,nationality,set,video,wav,txt,duration,offset,frame,start
0,id10001,A.J._Buckley,m,Ireland,dev,1zcIwhmdeo4,00001.wav,00001.txt,8.120062,-5,368,14.72
1,id10001,A.J._Buckley,m,Ireland,dev,7gWzIy6yIIk,00001.wav,00001.txt,8.640063,-1,5169,206.76
2,id10001,A.J._Buckley,m,Ireland,dev,7w0IBEWc9Qw,00001.wav,00001.txt,28.040063,-3,200,8.0


In [28]:
# Persist the assembled table to vox.csv (without the index column); a later
# cell reloads it from this file.
df.to_csv('vox.csv', index=False)

In [2]:
# Reload the table written to vox.csv and rebuild each recording's path as
# <set>_wav/wav/<id>/<video>/<wav>.
df = pd.read_csv('vox.csv')
wav_paths = df['set'] + '_wav/wav/' + df['id'] + '/' + df['video'] + '/' + df['wav']
# Arguments for the pretrained encoder: the model source identifier and run
# options selecting the CPU device.
params = {'source': 'speechbrain/spkrec-ecapa-voxceleb', 'run_opts': {'device': 'cpu'}}
# NOTE(review): newer SpeechBrain releases appear to expose this class under
# `speechbrain.inference` -- confirm against the installed version.
ecapa = speechbrain.pretrained.EncoderClassifier.from_hparams(**params)

In [3]:
def load_wav(path):
    """Load a fixed-length audio clip as a tensor with a leading axis of size 1.

    At most ``4 * 16000`` frames are read from ``path``. If the file yields
    fewer frames, the clip is extended at its end by repeating its own
    content (numpy ``'wrap'`` padding) until it reaches that length. The
    sample rate returned by soundfile is not checked (the constant presumably
    means 4 seconds at 16 kHz -- TODO confirm).
    """
    target_frames = 4 * 16000
    samples, _ = soundfile.read(path, frames=target_frames)
    missing = target_frames - samples.shape[0]
    if missing > 0:
        # Pad only at the end; 'wrap' restarts from the beginning of the clip.
        samples = np.pad(samples, (0, missing), mode='wrap')
    return torch.tensor(samples[np.newaxis, ...])

def get_embs(wav_paths):
    """Encode a batch of audio files and return the result as a NumPy array.

    Each path is loaded with ``load_wav``, the clips are stacked along the
    first axis, passed through ``ecapa.encode_batch``, and axis 1 of the
    encoder output is squeezed away before converting to NumPy.
    """
    clips = list(map(load_wav, wav_paths))
    batch = torch.vstack(clips)
    encoded = ecapa.encode_batch(batch)
    return encoded.squeeze(1).detach().numpy()

# Approximate number of files per batch.
batch_size = 32
# np.array_split expects an integer section count. Floor division reproduces the
# original split for large inputs, and max(1, ...) avoids requesting zero
# sections (a ValueError) when there are fewer than `batch_size` files.
n_batches = max(1, len(wav_paths) // batch_size)
# Split a plain NumPy array of paths rather than the pandas Series itself.
batches = np.array_split(np.asarray(wav_paths), n_batches)
# Encode batch by batch and stack the per-batch results in file order.
embs = np.vstack([get_embs(batch_paths) for batch_paths in tqdm(batches)])

# np.save appends the .npy extension, so this writes ecapa_vox.npy.
np.save('ecapa_vox', embs)

100%|███████████████████████████████████████| 703/703 [1:33:26<00:00,  7.97s/it]


In [4]:
# Show the dimensions of the stacked embedding array.
print(embs.shape)

(22496, 192)
