In [1]:
%matplotlib inline

import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import torch
import os
import sys
import glob
import pickle
import copy
import random
import time
import traceback
import shutil

from multiprocessing import Process, Manager

# Config

In [2]:
# Root of the project checkout; presumably where the `preprocessing` package
# imported further down lives.
PROJECT_ROOT = '/workspace/GREAT_ASV_system/'
# Guarded so re-running this cell does not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# Output root: every directory and file this notebook generates is placed
# somewhere underneath it.
OPT_INDEX = '/workspace/DATASET/server9/STD_VOX_EGS'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of `if not exists: mkdir`; makedirs also creates missing parents.
os.makedirs(OPT_INDEX, exist_ok=True)

In [4]:
# Root of the VoxCeleb audio; the trial-dict cells glob for .wav files under
# its 'vox1' subdirectory.
VOX_TRAIN_DIR = '/workspace/DATASET/server9/voxceleb'
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably MUSAN/RIR augmentation metadata — confirm before relying on it.
AUG_INFO_DIR = '/workspace/DATASET/server9/STD_musan&rir_info'

In [5]:
# Directory for the pickled trial dictionaries, the feature index files and
# the trial key lists.
INFO_DIR = os.path.join(OPT_INDEX, 'VOX_TRIAL_INFO')
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(INFO_DIR, exist_ok=True)

# Make Trials dict

In [31]:
# trial dict: maps an integer index to the path of every .wav file matching
# <VOX_TRAIN_DIR>/vox1/*/*/*/*/*.wav, then pickles the mapping.
trial_dir = os.path.join(VOX_TRAIN_DIR, 'vox1')
trial_dict_out = os.path.join(INFO_DIR, 'vox_trial_dict')

# glob returns paths in arbitrary order; sorting makes the index -> path
# mapping (and therefore the feature file names) reproducible across runs.
tmp_dir_list = sorted(glob.glob(trial_dir+'/*/*/*/*/*.wav'))
trial_dict = dict(enumerate(tmp_dir_list))

with open(trial_dict_out, 'wb') as handle:
    pickle.dump(trial_dict, handle)

In [32]:
# Sanity check: number of utterances currently held in trial_dict.
len(trial_dict)

153516

In [33]:
# trial dict o: same index -> path mapping, restricted to the .wav files
# matching <VOX_TRAIN_DIR>/vox1/test/*/*/*/*.wav.
# Note that this rebinds `trial_dict`, replacing the full mapping in memory.
trial_dir = os.path.join(VOX_TRAIN_DIR, 'vox1')
trial_dict_o_out = os.path.join(INFO_DIR, 'vox_trial_dict_o')

# glob returns paths in arbitrary order; sorting makes the index -> path
# mapping (and therefore the feature file names) reproducible across runs.
tmp_dir_list = sorted(glob.glob(trial_dir+'/test/*/*/*/*.wav'))
trial_dict = dict(enumerate(tmp_dir_list))

with open(trial_dict_o_out, 'wb') as handle:
    pickle.dump(trial_dict, handle)

In [34]:
# Sanity check: number of utterances currently held in trial_dict.
len(trial_dict)

4874

# Make trials data

In [6]:
from preprocessing.Utils.preprocessing_3type import ThreeTypes_IterableDataset

In [7]:
# Output directories, one per feature type, that trial_data_preload writes its
# pickled feature files into.
MFCC_O = os.path.join(OPT_INDEX, 'VOX_TRIALS_MFCC')
LOGMELFB_O = os.path.join(OPT_INDEX, 'VOX_TRIALS_FB')
RAWWAV_O = os.path.join(OPT_INDEX, 'VOX_TRIALS_RAW')
# Log files; this notebook only creates/truncates them.
# NOTE(review): presumably filled in by later evaluation code — confirm.
EER_LOG = os.path.join(OPT_INDEX, 'EER_LOG_VOX_TRIAL')
G_LOG = os.path.join(OPT_INDEX, 'G_LOG_VOX_TRIAL')

In [8]:
# Start from empty output directories: wipe anything left from a previous run,
# then always (re)create the directory. Removing without recreating would
# leave no directory for the preload step to write into on a second run.
for out_dir in (MFCC_O, LOGMELFB_O, RAWWAV_O):
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)

# Create the log files, or truncate them to zero length if they already exist.
for log_path in (EER_LOG, G_LOG):
    with open(log_path, 'w') as f:
        pass

In [9]:
# Settings handed to ThreeTypes_IterableDataset when the dataset is built.
config = {
    'sr': 16000,
    'repeats': 1,
    'batch_size': 1,
}

# Pickled index -> wav path mapping consumed by trial_data_preload
# (a file path, despite the "_dir" suffix).
trial_dict_dir = os.path.join(INFO_DIR, 'vox_trial_dict')

In [None]:
# NOTE(review): leftover scratch cell, kept only as a comment. None of these
# names is defined in the visible part of the notebook, so evaluating the
# tuple would raise NameError on a fresh kernel (Restart & Run All).
# batched_feats_rawwave, batched_feats_LogMelFB, batched_feats_MFCC

In [10]:
def trial_data_preload(dataset, i, trial_dict_dir,
                       raw_dir=None, fb_dir=None, mfcc_dir=None,
                       log_every=5000):
    """Extract and pickle three feature arrays for every utterance in a trial dict.

    Loads the pickled ``{index: wav_path}`` mapping stored at
    ``trial_dict_dir``, calls ``dataset.process_one_utt`` on each path and
    writes the three returned arrays, cast to float16, into ``raw_dir``,
    ``fb_dir`` and ``mfcc_dir`` respectively. Each output file is named after
    the utterance's index and holds the pickled tuple ``(features, [label])``.

    The label is the last three path components joined with '-', minus the
    final four characters (the '.wav' extension for the files indexed by this
    notebook), e.g. ``.../id10270/5r0dWxy17C8/00001.wav`` becomes
    ``'id10270-5r0dWxy17C8-00001'``.

    Args:
        dataset: object exposing ``process_one_utt(path)`` that returns three
            values, each supporting ``.astype``.
        i: worker index supplied by the launcher; not used by the function.
        trial_dict_dir: path of the pickled trial dictionary (a file, despite
            the name). It is unpickled, so it must come from a trusted source.
        raw_dir: output directory for the first returned array; defaults to
            the notebook-level ``RAWWAV_O``.
        fb_dir: output directory for the second returned array; defaults to
            the notebook-level ``LOGMELFB_O``.
        mfcc_dir: output directory for the third returned array; defaults to
            the notebook-level ``MFCC_O``.
        log_every: print the running utterance count every this many items;
            a falsy value disables progress output.

    The output directories must already exist.
    """
    # Resolve the defaults at call time so the notebook-level globals are only
    # required when the caller does not pass explicit directories.
    raw_dir = RAWWAV_O if raw_dir is None else raw_dir
    fb_dir = LOGMELFB_O if fb_dir is None else fb_dir
    mfcc_dir = MFCC_O if mfcc_dir is None else mfcc_dir

    with open(trial_dict_dir, 'rb') as handle:
        trial_dict = pickle.load(handle)

    for count, (utt_index, wav_path) in enumerate(trial_dict.items(), start=1):
        rw, fb, mfcc = dataset.process_one_utt(wav_path)
        # '<a>/<b>/<c>.wav' -> '<a>-<b>-<c>'
        label = '-'.join(wav_path.split('/')[-3:])[:-4]

        file_name = str(utt_index)
        for out_dir, feats in ((raw_dir, rw), (fb_dir, fb), (mfcc_dir, mfcc)):
            with open(os.path.join(out_dir, file_name), 'wb') as handle:
                pickle.dump((feats.astype(np.float16), [label]), handle)

        if log_every and count % log_every == 0:
            print(count)

# Build the dataset object from the config defined earlier in the notebook.
dataset = ThreeTypes_IterableDataset(config)

# range(1): a single worker process handles the whole trial dictionary.
processes = [Process(target = trial_data_preload, args = (dataset, i, trial_dict_dir)) for i in range(1)]
start_time = time.time()
for proc in processes:
    proc.start()
joined = [proc.join() for proc in processes]

# join() returns None whatever happened in the child, so check exit codes
# explicitly; otherwise a crashed worker would go unnoticed here.
failed_codes = [proc.exitcode for proc in processes if proc.exitcode != 0]
if failed_codes:
    raise RuntimeError('trial_data_preload worker(s) failed with exit code(s) %s' % failed_codes)

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
25268.819949150085


# Make trials index

In [11]:
# Index files (one per feature type) that list every generated feature file,
# one path per line. These are file paths, despite the "_dir" suffix.
trials_mfcc_dir = os.path.join(INFO_DIR, 'trials_mfcc.csv')
trials_fb_dir = os.path.join(INFO_DIR, 'trials_fb.csv')
trials_rw_dir = os.path.join(INFO_DIR, 'trials_raw.csv')

In [15]:
# Collect every entry in the three feature directories. Despite the "_len"
# suffix each variable holds the list of paths itself, not a count.
# Sorted because glob order is arbitrary; this keeps the index files written
# from these lists identical from run to run.
mfcc_len = sorted(glob.glob(MFCC_O+'/*'))
fb_len = sorted(glob.glob(LOGMELFB_O+'/*'))
raw_len = sorted(glob.glob(RAWWAV_O+'/*'))

In [16]:
# Report how many feature files were found for each feature type.
for tag, found_paths in (('mfcc', mfcc_len), ('fb', fb_len), ('raw', raw_len)):
    print(tag, len(found_paths))

mfcc 153516
fb 153516
raw 153516


In [17]:
# Write one index file per feature type: one feature-file path per line,
# followed by a printed count of the lines written.
for tag, index_path, feature_paths in (
        ('mfcc', trials_mfcc_dir, mfcc_len),
        ('fb', trials_fb_dir, fb_len),
        ('raw', trials_rw_dir, raw_len)):
    count = 0
    with open(index_path, 'w') as f:
        for path in feature_paths:
            # Refuse to index anything that is not a regular file.
            assert os.path.isfile(path)
            f.write(path+'\n')
            count += 1
    print(tag, count)

mfcc 153516
fb 153516
raw 153516


# Make trials data o

In [6]:
from preprocessing.Utils.preprocessing_3type import ThreeTypes_IterableDataset

In [39]:
# Rebinds the output locations to the "_O" variants, so every later cell that
# reads these names now targets the "_O" directories and log files.
MFCC_O = os.path.join(OPT_INDEX, 'VOX_TRIALS_MFCC_O')
LOGMELFB_O = os.path.join(OPT_INDEX, 'VOX_TRIALS_FB_O')
RAWWAV_O = os.path.join(OPT_INDEX, 'VOX_TRIALS_RAW_O')
# Log files; this notebook only creates/truncates them.
# NOTE(review): presumably filled in by later evaluation code — confirm.
EER_LOG = os.path.join(OPT_INDEX, 'EER_LOG_VOX_TRIAL_O')
G_LOG = os.path.join(OPT_INDEX, 'G_LOG_VOX_TRIAL_O')

In [40]:
# Start from empty output directories: wipe anything left from a previous run,
# then always (re)create the directory. Removing without recreating would
# leave no directory for the preload step to write into on a second run.
for out_dir in (MFCC_O, LOGMELFB_O, RAWWAV_O):
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)

# Create the log files, or truncate them to zero length if they already exist.
for log_path in (EER_LOG, G_LOG):
    with open(log_path, 'w') as f:
        pass

In [41]:
# Settings handed to ThreeTypes_IterableDataset when the dataset is built.
config = {
    'sr': 16000,
    'repeats': 1,
    'batch_size': 1,
}

# Pickled index -> wav path mapping for the "o" subset, consumed by
# trial_data_preload (a file path, despite the "_dir" suffix).
trial_dict_dir = os.path.join(INFO_DIR, 'vox_trial_dict_o')

In [42]:
def trial_data_preload(dataset, i, trial_dict_dir,
                       raw_dir=None, fb_dir=None, mfcc_dir=None,
                       log_every=1000):
    """Extract and pickle three feature arrays for every utterance in a trial dict.

    Loads the pickled ``{index: wav_path}`` mapping stored at
    ``trial_dict_dir``, calls ``dataset.process_one_utt`` on each path and
    writes the three returned arrays, cast to float16, into ``raw_dir``,
    ``fb_dir`` and ``mfcc_dir`` respectively. Each output file is named after
    the utterance's index and holds the pickled tuple ``(features, [label])``.

    The label is the last three path components joined with '-', minus the
    final four characters (the '.wav' extension for the files indexed by this
    notebook), e.g. ``.../id10270/5r0dWxy17C8/00001.wav`` becomes
    ``'id10270-5r0dWxy17C8-00001'``.

    Args:
        dataset: object exposing ``process_one_utt(path)`` that returns three
            values, each supporting ``.astype``.
        i: worker index supplied by the launcher; not used by the function.
        trial_dict_dir: path of the pickled trial dictionary (a file, despite
            the name). It is unpickled, so it must come from a trusted source.
        raw_dir: output directory for the first returned array; defaults to
            the notebook-level ``RAWWAV_O``.
        fb_dir: output directory for the second returned array; defaults to
            the notebook-level ``LOGMELFB_O``.
        mfcc_dir: output directory for the third returned array; defaults to
            the notebook-level ``MFCC_O``.
        log_every: print the running utterance count every this many items;
            a falsy value disables progress output.

    The output directories must already exist.
    """
    # Resolve the defaults at call time so the notebook-level globals are only
    # required when the caller does not pass explicit directories.
    raw_dir = RAWWAV_O if raw_dir is None else raw_dir
    fb_dir = LOGMELFB_O if fb_dir is None else fb_dir
    mfcc_dir = MFCC_O if mfcc_dir is None else mfcc_dir

    with open(trial_dict_dir, 'rb') as handle:
        trial_dict = pickle.load(handle)

    for count, (utt_index, wav_path) in enumerate(trial_dict.items(), start=1):
        rw, fb, mfcc = dataset.process_one_utt(wav_path)
        # '<a>/<b>/<c>.wav' -> '<a>-<b>-<c>'
        label = '-'.join(wav_path.split('/')[-3:])[:-4]

        file_name = str(utt_index)
        for out_dir, feats in ((raw_dir, rw), (fb_dir, fb), (mfcc_dir, mfcc)):
            with open(os.path.join(out_dir, file_name), 'wb') as handle:
                pickle.dump((feats.astype(np.float16), [label]), handle)

        if log_every and count % log_every == 0:
            print(count)

# Build the dataset object from the config defined earlier in the notebook.
dataset = ThreeTypes_IterableDataset(config)

# range(1): a single worker process handles the whole trial dictionary.
processes = [Process(target = trial_data_preload, args = (dataset, i, trial_dict_dir)) for i in range(1)]
start_time = time.time()
for proc in processes:
    proc.start()
joined = [proc.join() for proc in processes]

# join() returns None whatever happened in the child, so check exit codes
# explicitly; otherwise a crashed worker would go unnoticed here.
failed_codes = [proc.exitcode for proc in processes if proc.exitcode != 0]
if failed_codes:
    raise RuntimeError('trial_data_preload worker(s) failed with exit code(s) %s' % failed_codes)

1000
2000
3000
4000


# Make trials index o

In [43]:
# Index files for the "o" subset (one per feature type), each listing the
# generated feature files one path per line. File paths, despite the "_dir"
# suffix.
trials_mfcc_dir = os.path.join(INFO_DIR, 'trials_mfcc_o.csv')
trials_fb_dir = os.path.join(INFO_DIR, 'trials_fb_o.csv')
trials_rw_dir = os.path.join(INFO_DIR, 'trials_raw_o.csv')

In [44]:
# Collect every entry in the three feature directories. Despite the "_len"
# suffix each variable holds the list of paths itself, not a count.
# Sorted because glob order is arbitrary; this keeps the index files written
# from these lists identical from run to run.
mfcc_len = sorted(glob.glob(MFCC_O+'/*'))
fb_len = sorted(glob.glob(LOGMELFB_O+'/*'))
raw_len = sorted(glob.glob(RAWWAV_O+'/*'))

In [45]:
# Report how many feature files were found for each feature type.
for tag, found_paths in (('mfcc', mfcc_len), ('fb', fb_len), ('raw', raw_len)):
    print(tag, len(found_paths))

mfcc 4874
fb 4874
raw 4874


In [46]:
# Write one index file per feature type: one feature-file path per line,
# followed by a printed count of the lines written.
for tag, index_path, feature_paths in (
        ('mfcc', trials_mfcc_dir, mfcc_len),
        ('fb', trials_fb_dir, fb_len),
        ('raw', trials_rw_dir, raw_len)):
    count = 0
    with open(index_path, 'w') as f:
        for path in feature_paths:
            # Refuse to index anything that is not a regular file.
            assert os.path.isfile(path)
            f.write(path+'\n')
            count += 1
    print(tag, count)

mfcc 4874
fb 4874
raw 4874


# Make trial keys

In [24]:
# Destination files for the converted trial key lists (opened for writing by
# the conversion cells; file paths, despite the "_dir" suffix).
vox_trials_o_dir = os.path.join(INFO_DIR, 'vox_trials_o')
vox_trials_e_dir = os.path.join(INFO_DIR, 'vox_trials_e')
vox_trials_h_dir = os.path.join(INFO_DIR, 'vox_trials_h')

In [25]:
# Source trial lists; they must already exist in INFO_DIR, since nothing in
# this notebook creates them. Each line is parsed by the conversion cells as
# '<flag> <path_a> <path_b>' with '/'-separated paths.
# NOTE(review): presumably the official VoxCeleb1 trial lists — confirm origin.
ori_vox_trials_o_dir = os.path.join(INFO_DIR, 'ORI_veri_test2.txt')
ori_vox_trials_e_dir = os.path.join(INFO_DIR, 'ORI_list_test_all2.txt')
ori_vox_trials_h_dir = os.path.join(INFO_DIR, 'ORI_list_test_hard2.txt')

In [26]:
# Convert '<flag> <path_a> <path_b>' lines into '<key_a> <key_b> <verdict>'
# lines, where each key is the first three '/'-separated path components
# joined with '-' and the verdict is 'nontarget' for flag '0', else 'target'.
# NOTE(review): the keys keep the '.wav' suffix, whereas trial_data_preload
# strips the last four characters from its labels — confirm that whatever
# consumes both reconciles the two forms.
with open(ori_vox_trials_o_dir, 'r') as f:
    lines = f.readlines()

# Counted explicitly so the final print also works for an empty source file.
count = 0
with open(vox_trials_o_dir, 'w') as f:
    for line in lines:
        # split() rather than line[:-1].split(' '): slicing off the last
        # character corrupts the final path when the file does not end with a
        # newline, and split() also copes with '\r\n' line endings.
        fields = line.split()
        if not fields:
            continue  # skip blank lines instead of failing on them
        keys = []
        for wav_path in fields[1:3]:
            parts = wav_path.split('/')
            keys.append(parts[0]+'-'+parts[1]+'-'+parts[2])
        verdict = 'nontarget' if fields[0] == '0' else 'target'
        f.write(keys[0]+' '+keys[1]+' '+verdict+'\n')
        count += 1
print('vox_o', count)

vox_o 37611


In [27]:
# Convert '<flag> <path_a> <path_b>' lines into '<key_a> <key_b> <verdict>'
# lines, where each key is the first three '/'-separated path components
# joined with '-' and the verdict is 'nontarget' for flag '0', else 'target'.
# NOTE(review): the keys keep the '.wav' suffix, whereas trial_data_preload
# strips the last four characters from its labels — confirm that whatever
# consumes both reconciles the two forms.
with open(ori_vox_trials_e_dir, 'r') as f:
    lines = f.readlines()

# Counted explicitly so the final print also works for an empty source file.
count = 0
with open(vox_trials_e_dir, 'w') as f:
    for line in lines:
        # split() rather than line[:-1].split(' '): slicing off the last
        # character corrupts the final path when the file does not end with a
        # newline, and split() also copes with '\r\n' line endings.
        fields = line.split()
        if not fields:
            continue  # skip blank lines instead of failing on them
        keys = []
        for wav_path in fields[1:3]:
            parts = wav_path.split('/')
            keys.append(parts[0]+'-'+parts[1]+'-'+parts[2])
        verdict = 'nontarget' if fields[0] == '0' else 'target'
        f.write(keys[0]+' '+keys[1]+' '+verdict+'\n')
        count += 1
print('vox_e', count)

vox_e 579818


In [28]:
# Convert '<flag> <path_a> <path_b>' lines into '<key_a> <key_b> <verdict>'
# lines, where each key is the first three '/'-separated path components
# joined with '-' and the verdict is 'nontarget' for flag '0', else 'target'.
# NOTE(review): the keys keep the '.wav' suffix, whereas trial_data_preload
# strips the last four characters from its labels — confirm that whatever
# consumes both reconciles the two forms.
with open(ori_vox_trials_h_dir, 'r') as f:
    lines = f.readlines()

# Counted explicitly so the final print also works for an empty source file.
count = 0
with open(vox_trials_h_dir, 'w') as f:
    for line in lines:
        # split() rather than line[:-1].split(' '): slicing off the last
        # character corrupts the final path when the file does not end with a
        # newline, and split() also copes with '\r\n' line endings.
        fields = line.split()
        if not fields:
            continue  # skip blank lines instead of failing on them
        keys = []
        for wav_path in fields[1:3]:
            parts = wav_path.split('/')
            keys.append(parts[0]+'-'+parts[1]+'-'+parts[2])
        verdict = 'nontarget' if fields[0] == '0' else 'target'
        f.write(keys[0]+' '+keys[1]+' '+verdict+'\n')
        count += 1
print('vox_h', count)

vox_h 550894
