In [1]:
%matplotlib inline

import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import torch
import os
import sys
import glob
import pickle
import copy
import random
import time
import traceback
import shutil

from multiprocessing import Process, Manager

# Config

In [4]:
# Root of the project checkout; added to sys.path so that the project-local
# `preprocessing` package imported further down can be resolved.
PROJECT_ROOT = '/workspace/GREAT_ASV_system/'
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [5]:
# Root directory under which every artefact produced by this notebook is written.
OPT_INDEX = '/workspace/DATASET/server9_ssd/STD_SRE_EGS'
# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the directory is already there, so the cell is safe to re-run.
os.makedirs(OPT_INDEX, exist_ok=True)

In [6]:
# Root of the SRE16 test audio; the cells below glob `Enroll/*/*.wav` and
# `Eval/*.wav` underneath it.
SRE16_TEST_DIR = '/workspace/DATASET/server9/SRE16_TEST'

In [7]:
# Directory holding the trial metadata written by this notebook
# (trial dict pickle, index files, enroll model file, trial keys).
INFO_DIR = os.path.join(OPT_INDEX, 'SRE16_TRIAL_INFO')
# Idempotent creation; also tolerates a missing parent directory.
os.makedirs(INFO_DIR, exist_ok=True)

# Make trials dict

In [10]:
# Echo the test-set root so the path in use is visible in the cell output.
SRE16_TEST_DIR

'/workspace/DATASET/server9/SRE16_TEST'

In [12]:
# trial dict: maps an integer index to the path of one utterance
# (all enrollment wavs first, then all evaluation wavs) and pickles the mapping.
trial_dir = SRE16_TEST_DIR
trial_dict_out = os.path.join(INFO_DIR, 'sre16_test_trial_dict')

# glob returns matches in a filesystem-dependent order; sort each listing so
# that the index -> utterance mapping (and therefore the numbered feature file
# names derived from it) is the same on every run and every machine.
enroll_wavs = sorted(glob.glob(os.path.join(trial_dir, 'Enroll', '*', '*.wav')))
eval_wavs = sorted(glob.glob(os.path.join(trial_dir, 'Eval', '*.wav')))
tmp_dir_list = enroll_wavs + eval_wavs

trial_dict = dict(enumerate(tmp_dir_list))
with open(trial_dict_out, 'wb') as handle:
    pickle.dump(trial_dict, handle)

In [13]:
# Number of utterances (enroll + eval) collected into the trial dict.
len(trial_dict)

10496

# Make trials data for SRE16 TEST

In [14]:
from preprocessing.Utils.preprocessing_3type import ThreeTypes_IterableDataset

In [15]:
# Output directories, one per feature type written by trial_data_preload.
MFCC_O = os.path.join(OPT_INDEX, 'SRE16_TEST_TRIALS_MFCC')
LOGMELFB_O = os.path.join(OPT_INDEX, 'SRE16_TEST_TRIALS_FB')
RAWWAV_O = os.path.join(OPT_INDEX, 'SRE16_TEST_TRIALS_RAW')
# Log files; the next cell creates/truncates them to empty.
EER_LOG = os.path.join(OPT_INDEX, 'EER_LOG_SRE16_TEST_TRIAL')
G_LOG = os.path.join(OPT_INDEX, 'G_LOG_SRE16_TEST_TRIAL')

In [16]:
def _reset_dir(path):
    """Leave `path` as an existing, empty directory.

    Any previous content is discarded. The directory is always (re)created:
    removing it without recreating it would make every later write into it
    fail when the notebook is re-run.
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


# Start from empty feature directories so stale files from an earlier run
# cannot end up in the index files generated later.
for feature_dir in (MFCC_O, LOGMELFB_O, RAWWAV_O):
    _reset_dir(feature_dir)

# Create the log files, or truncate them if they already exist.
for log_path in (EER_LOG, G_LOG):
    with open(log_path, 'w'):
        pass

In [17]:
# Settings handed to ThreeTypes_IterableDataset further down.
# NOTE(review): 'sr' is presumably the sample rate in Hz -- confirm against
# the dataset class.
config = {
    'sr': 16000,
    'repeats': 1,
    'batch_size': 1,
}

# Pickled index -> utterance-path mapping that the preload worker reads back.
trial_dict_dir = os.path.join(INFO_DIR, 'sre16_test_trial_dict')

In [None]:
# NOTE(review): this cell was never executed (`In [None]`) and refers to three
# names that are not defined anywhere in the notebook, so evaluating it raises
# NameError and breaks "Restart Kernel -> Run All". Kept as a comment only:
# batched_feats_rawwave, batched_feats_LogMelFB, batched_feats_MFCC

In [19]:
def trial_data_preload(dataset, i, trial_dict_dir):
    """Extract and store the three feature types for every utterance in the trial dict.

    For each entry of the pickled trial dict, `dataset.process_one_utt` is
    called on the utterance path and its three results are written, as
    float16 arrays paired with a one-element label list, to RAWWAV_O,
    LOGMELFB_O and MFCC_O respectively. The output file name is the entry's
    key in the trial dict.

    Args:
        dataset: object exposing `process_one_utt(path)` that returns a
            3-tuple (raw wave, log-mel filterbank, MFCC) of array-likes
            supporting `.astype`.
        i: worker index supplied by the launcher. It is not used to shard the
            work: every worker walks the whole trial dict.
        trial_dict_dir: path of the pickle file holding the
            {index: utterance path} mapping.

    Progress is reported by printing the running count every 1000 utterances.
    """
    # The pickle is produced earlier in this notebook; do not point this at
    # untrusted files, since unpickling can execute arbitrary code.
    with open(trial_dict_dir, 'rb') as handle:
        trial_dict = pickle.load(handle)

    for count, key in enumerate(trial_dict):
        utt_path = trial_dict[key]
        rw, fb, mfcc = dataset.process_one_utt(utt_path)

        # The label is the utterance's own file name without its extension.
        # (It must come from the current entry, not from entry 0, otherwise
        # every stored sample carries the first utterance's label.)
        label = os.path.splitext(os.path.basename(utt_path))[0]

        file_name = str(key)
        for out_dir, feats in ((RAWWAV_O, rw), (LOGMELFB_O, fb), (MFCC_O, mfcc)):
            with open(os.path.join(out_dir, file_name), 'wb') as handle:
                pickle.dump((feats.astype(np.float16), [label]), handle)

        if (count+1) % 1000 == 0:
            print(count+1)

dataset = ThreeTypes_IterableDataset(config)

# A single worker process runs the whole preload.
processes = [Process(target = trial_data_preload, args = (dataset, i, trial_dict_dir)) for i in range(1)]
start_time = time.time()
for p in processes:
    p.start()
joined = [p.join() for p in processes]

# Process.join() returns None even when the child crashed, so inspect the exit
# codes explicitly; otherwise a failed preload would silently leave the
# feature directories incomplete.
failed_workers = [p.name for p in processes if p.exitcode != 0]
if failed_workers:
    raise RuntimeError('trial_data_preload failed in: ' + ', '.join(failed_workers))

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


# Make trials index for SRE16 TEST

In [20]:
# Index files (despite the `_dir` suffix these are file paths): each one gets
# one stored feature-file path per line, for one feature type.
trials_mfcc_dir = os.path.join(INFO_DIR, 'sre16_test_trials_mfcc.csv')
trials_fb_dir = os.path.join(INFO_DIR, 'sre16_test_trials_fb.csv')
trials_rw_dir = os.path.join(INFO_DIR, 'sre16_test_trials_raw.csv')

In [21]:
# Despite the `_len` suffix, each of these is the list of stored feature-file
# paths found in the corresponding output directory.
mfcc_len = glob.glob(os.path.join(MFCC_O, '*'))
fb_len = glob.glob(os.path.join(LOGMELFB_O, '*'))
raw_len = glob.glob(os.path.join(RAWWAV_O, '*'))

In [22]:
# Report how many feature files exist for each feature type.
for tag, found in (('mfcc', mfcc_len), ('fb', fb_len), ('raw', raw_len)):
    print(tag, len(found))

mfcc 10496
fb 10496
raw 10496


In [23]:
def _write_index(index_path, feature_paths):
    """Write one feature-file path per line to `index_path`.

    Every path is checked to be an existing regular file before it is written.

    Args:
        index_path: destination text file; overwritten if it exists.
        feature_paths: iterable of feature-file paths.

    Returns:
        The number of paths written.
    """
    written = 0
    with open(index_path, 'w') as f:
        for path in feature_paths:
            assert os.path.isfile(path), path
            f.write(path+'\n')
            written += 1
    return written


# One index file per feature type; the printed counts should match the number
# of feature files reported by the previous cell.
print('mfcc', _write_index(trials_mfcc_dir, mfcc_len))
print('fb', _write_index(trials_fb_dir, fb_len))
print('raw', _write_index(trials_rw_dir, raw_len))

mfcc 10496
fb 10496
raw 10496


# Make enroll model file

In [24]:
# Echo the test-set root again before building the enroll model file from it.
SRE16_TEST_DIR

'/workspace/DATASET/server9/SRE16_TEST'

In [25]:
# Enroll model file: one line per enrollment model, formatted as
# "<model> <utt_id> <utt_id> ...", where <model> is the name of a
# sub-directory of Enroll/ and each <utt_id> is the name of a wav file inside
# it without its extension.
trial_dir = SRE16_TEST_DIR
enroll_model_out = os.path.join(INFO_DIR, 'sre16_test_enroll_model')
spk2utt = {}

# glob order is filesystem-dependent; sort both levels so the generated file
# is identical from run to run.
label_dir = sorted(glob.glob(os.path.join(trial_dir, 'Enroll', '*')))
for spk_dir in label_dir:
    spk2utt[os.path.basename(spk_dir)] = sorted(glob.glob(os.path.join(spk_dir, '*.wav')))

print(len(spk2utt))

with open(enroll_model_out, 'w') as f:
    for spk, utt_paths in spk2utt.items():
        utt_ids = [os.path.splitext(os.path.basename(p))[0] for p in utt_paths]
        f.write(' '.join([spk] + utt_ids)+'\n')

802


# Make trial keys

In [26]:
# Output file for the converted trial keys: space-separated, three fields per line.
sre16_test_trials_dir = os.path.join(INFO_DIR, 'sre16_test_trials')

In [31]:
# Original tab-separated trial key file; it must already be present in INFO_DIR.
ori_sre16_test_trials_dir = os.path.join(INFO_DIR, 'ori_sre16_eval_trial_key.tsv')

In [33]:
# Convert the original tab-separated trial key into a space-separated file
# holding, per trial, columns 0, 1 and 3 of the source row; column 3 must be
# either 'target' or 'nontarget'.
written = 0
with open(ori_sre16_test_trials_dir, 'r') as src, open(sre16_test_trials_dir, 'w') as dst:
    # The first row is the header, not a trial.
    next(src, None)
    # Stream the input instead of loading every line into memory at once.
    for line in src:
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose its last character.
        cmp = line.rstrip('\r\n').split('\t')

        if len(cmp) > 3 and cmp[3] in ('target', 'nontarget'):
            dst.write(cmp[0]+' '+cmp[1]+' '+cmp[3]+'\n')
            written += 1
        else:
            # Malformed row: report it and skip it, rather than falling
            # through and re-writing the previous trial's line.
            print('err')

print('sre16_test', written)

sre16_test 1986728
