# VoxCeleb2 --- Stats

|                    | Dev       |
| ---                | ---       |
| Number of samples  | 1,092,009 |
| Number of speakers |     5,994 |

In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
# Move the working directory two levels up; the relative paths used later
# (e.g. DATASET_PATH) are resolved against it.
# NOTE: the path is relative, so re-running this cell moves up two more levels.
os.chdir('../..')
# Insert the directory two levels above sys.path[0] at position 1 of the module
# search path -- presumably so project modules can be imported; confirm.
sys.path.insert(1, os.path.join(sys.path[0], '../..'))

In [None]:
from glob import glob
from tqdm import tqdm
import soundfile as sf

from plotnine import *
import pandas as pd
import numpy as np

In [None]:
# Display name interpolated into the plot titles.
DATASET_NAME = 'VoxCeleb2'
# Dataset root directory, used as the prefix of the glob patterns below.
DATASET_PATH = 'data/voxceleb2'

In [None]:
# glob() is called without recursive=True, so each `**` matches exactly one
# directory level: .wav files are searched two levels below the dataset root.
wav_pattern = f'{DATASET_PATH}/**/**/*.wav'
# Every entry directly under the dataset root is counted as one speaker.
speaker_pattern = f'{DATASET_PATH}/*'

files = glob(wav_pattern)
speakers = glob(speaker_pattern)

print(f'Number of samples: {len(files)}')
print(f'Number of speakers: {len(speakers)}')

## Distribution of the number of utterances per speaker

In [None]:
# Group the utterance files by speaker.
#   speakers: speaker id -> number of utterances
#             (rebinds `speakers`, previously the list of speaker directories)
#   tmp:      speaker id -> list of that speaker's file paths, in `files` order
speakers = {}
tmp = {}

for file in files:
    # The speaker id is the first path component below DATASET_PATH. Deriving
    # it relative to the root (instead of a fixed split index) keeps this
    # correct when DATASET_PATH is nested deeper, absolute, or uses a
    # different path separator.
    speaker = os.path.relpath(file, DATASET_PATH).split(os.sep)[0]
    speakers[speaker] = speakers.get(speaker, 0) + 1
    tmp.setdefault(speaker, []).append(file)

In [None]:
# Per-speaker utterance counts as an array (one entry per key of `speakers`).
utt_per_speaker = np.array(list(speakers.values()))

# Fraction of speakers that have fewer than 133 utterances.
(utt_per_speaker < 133).sum() / utt_per_speaker.size

In [None]:
# Median utterance count among the speakers with fewer than 133 utterances.
below_threshold = utt_per_speaker < 133
np.median(utt_per_speaker[below_threshold])

In [None]:
# Median number of utterances per speaker over all speakers.
np.median(utt_per_speaker)

In [None]:
# Histogram (bin width 10) of the number of utterances each speaker has.
utt_label = 'Number of utterances per speaker'
utt_df = pd.DataFrame({utt_label: utt_per_speaker})

utt_histogram = geom_histogram(
    utt_df,
    aes(x=utt_label),
    binwidth=10,
    color='black',
    position='identity',
    size=0.25,
)

plot = (
    ggplot()
    + utt_histogram
    + theme_bw()
    + theme(figure_size=(10, 6), text=element_text(size=10))
    + xlab(utt_label)
    + ylab('Count')
    + ggtitle(f'Number of utterances per speaker distribution of {DATASET_NAME} samples')
)

# Last expression of the cell: renders the figure in the notebook.
plot

In [None]:
# Bring every speaker to exactly TARGET_UTTERANCES files: speakers with fewer
# files have their list repeated cyclically, speakers with more are truncated.
TARGET_UTTERANCES = 133

for speaker in tmp:
    utterances = tmp[speaker]

    if len(utterances) < TARGET_UTTERANCES:
        # ceil(TARGET / len) whole copies are enough to reach the target.
        # Repeating the list that many times yields the same cyclic prefix as
        # repeated doubling would, but without exponential growth (doubling a
        # 1-item list 133 times would need 2**133 entries).
        repeats = -(-TARGET_UTTERANCES // len(utterances))
        utterances = utterances * repeats
    utterances = utterances[:TARGET_UTTERANCES]

    # Debug output for a single speaker to sanity-check the resulting size.
    if speaker == 'id04313':
        print(speaker, len(utterances))

    tmp[speaker] = utterances

In [None]:
# Show the first 52 file paths stored for speaker 'id04313'.
tmp['id04313'][:52]

## Length distribution

In [None]:
# Collect the duration of every utterance from the file headers only.
# sf.info() reads the metadata without decoding the samples, which is far
# cheaper than sf.read() when only the frame count is needed, and it exposes
# each file's actual sample rate so durations do not rely on a fixed 16 kHz.
lengths = []    # frame count of each file, in the order of `files`
durations = []  # duration of each file, in seconds

for file in tqdm(files):
    info = sf.info(file)
    lengths.append(info.frames)
    durations.append(info.frames / info.samplerate)

df_length = pd.DataFrame({'Length': durations})
# Drop utterances longer than 20 s. Everything computed from df_length
# afterwards excludes them.
df_length = df_length.drop(df_length[df_length['Length'] > 20].index)

# Histogram of utterance durations with 1-second bins.
plot = (
    ggplot()
    + xlab('Length (s)')
    + ylab('Count')
    + ggtitle(f'Lengths distribution of {DATASET_NAME} samples')
    + theme_bw()
    + theme(figure_size=(10, 6), text=element_text(size=10))
    + geom_histogram(
        df_length,
        aes(x='Length'),
        binwidth=1,
        color='black',
        position='identity',
        size=0.25
    )
    + scale_x_continuous(breaks=list(range(4, 21, 1)))
)

plot

In [None]:
# Percentage of the rows kept in df_length whose length is at most 4 s.
short_clips = df_length[df_length['Length'] <= 4]
100 * len(short_clips) / len(df_length)

In [None]:
# Percentage of the rows kept in df_length whose length exceeds 10 s.
long_clips = df_length[df_length['Length'] > 10]
100 * len(long_clips) / len(df_length)