### <center>功能函数</center>

In [1]:
import os
import random

import numpy as np

# Set the random seeds
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)

### 文件后缀修改

In [2]:
def find_wavs(input_path):
    # find all wav files
    flac_files = []
    for root, folders, files in os.walk(input_path):
        for file in filter(lambda x: x.endswith('wav'), files):
            if '._S_' in file:# for IEEE
                continue
            else:
                flac_files.append(os.path.join(root, file))
    return flac_files

In [3]:
input_path = r'D:\workspace\nan\test\datasets\Corpus\LibriSpeech'
# find all files
flac_files = find_wavs(input_path)

In [4]:
len(flac_files)

28539

In [5]:
for flac_file in flac_files:
    wav_file = flac_file.replace('.flac', '.wav')
    os.rename(flac_file, wav_file)

### 将List写入txt

In [11]:
input_path = r'.\IEEE\wideband'
# find all files
wav_files = find_wavs(input_path)
list_to_txt(wav_files,'1.txt')

In [3]:
def list_to_txt(wav_list,txt_file): 
    with open(txt_file, 'w') as f:
        for wav_file in wav_list:
            #wav_file = wav_file.replace(input_path, '')
            f.write(wav_file+"\n")

### 随机分割

In [4]:
# 从列表中随机选择指定个数的元素
def cut_list(wav_list,*param):
    random.shuffle(wav_list)
    start = 0
    starts = []
    end = 0
    ends = []
    for cut_index in param:
        end=end+cut_index
        starts.append(start)
        ends.append(end)
        start=end
    return [wav_list[start:end] for start,end in zip(starts,ends)]

In [30]:
output=cut_list([1,2,3,4,5,6,7,8,9,10],3,1,6)
len(output)

3

### 按照指定名称写入列表

In [5]:
def write_txt(wav_lists,corpus):
    txt_files=[corpus+'-train-files.txt',corpus+'-val-files.txt',corpus+'-test-files.txt']
    for wav_list, txt_file in zip(wav_lists, txt_files):
        list_to_txt(wav_list,txt_file)

### 获得指定路径下文件夹名

### <center> timit <center>

In [7]:
timit_train = find_wavs(r'.\TIMIT\TRAIN')
len(timit_train)

4620

In [8]:
timit_test = find_wavs(r'.\TIMIT\TEST')
len(timit_test)

1680

In [9]:
output = cut_list(timit_test,1153, 192)
timit_val = output[0]
timit_test = output[1]
print(len(timit_val),len(timit_test))

1153 192


### <center> vctk </center>

In [10]:
vctk_train = find_wavs(r'.\VCTK')
total = len(vctk_train)
print(total)

44455


In [11]:
output = cut_list(vctk_train,int(total*0.88), int(total*0.06), int(total*0.06))
vctk_train = output[0]
vctk_val = output[1]
vctk_test = output[2]
print(len(vctk_train),len(vctk_val),len(vctk_test))

39120 2667 2667


### <center> WSJ </center>

In [12]:
path = r'.\WSJ'
dirs = os.listdir(path)
file_list = []
for file in dirs:
    file_list.append(file)

In [13]:
output = cut_list(file_list,100, 10, 8)
output[1]

['052', '026', '20g', '20q', '00d', '40l', '01n', '01g', '20n', '407']

In [14]:
def wsj_from(speakers):
    path = r'.\WSJ'
    total_list=[]
    for speaker in speakers:
        total_list += find_wavs(os.path.join(path,speaker))
    return total_list

In [15]:
wsj_train = wsj_from(output[0])
wsj_val = wsj_from(output[1])
wsj_test = wsj_from(output[2])
print(len(wsj_train),len(wsj_val),len(wsj_test))

26187 2022 2170


In [16]:
wsj_train = cut_list(wsj_train,12736)[0]
wsj_val = cut_list(wsj_val,1206)[0]
wsj_test = cut_list(wsj_test,651)[0]
print(len(wsj_train),len(wsj_val),len(wsj_test))

12736 1206 651


### <center> LIBRI </center>

In [17]:
libri_val = find_wavs(r'.\LIBRI\dev-clean')
len(libri_val)

2703

In [18]:
libri_test = find_wavs(r'.\LIBRI\test-clean')
len(libri_test)

2620

In [19]:
libri_train = find_wavs(r'.\LIBRI\train-clean-100')
len(libri_train)

28539

### <center> IEEE </center>

In [20]:
ieee_train = find_wavs(r'.\IEEE\wideband')
len(ieee_train)

720

In [21]:
output = cut_list(ieee_train,576, 72, 72)
ieee_train = output[0]
ieee_val = output[1]
ieee_test = output[2]
print(len(ieee_train),len(ieee_val),len(ieee_test))

576 72 72


### <center> 写入对应txt文件 <center>

In [22]:
write_txt([timit_train,timit_val,timit_test],'timit')

In [23]:
write_txt([vctk_train,vctk_val,vctk_test],'vctk-multispeaker')
write_txt([wsj_train,wsj_val,wsj_test],'wsj')
write_txt([libri_train,libri_val,libri_test],'libri')
write_txt([ieee_train,ieee_val,ieee_test],'ieee')

### <center> mir20 </center>

In [24]:
mirs = find_wavs(r'.\VintageMics')
mir20 = cut_list(mirs,20)[0]
list_to_txt(mir20,'mir20-files.txt')

### <center> mixed </center>

In [25]:
mixed1 = timit_train+wsj_train+libri_train+ieee_train
mixed_train = cut_list(mixed1,10000)[0]
list_to_txt(mixed_train,'mixed-train-files.txt')
len(mixed1)

46471

In [26]:
mixed2 = timit_val+wsj_val+libri_val+ieee_val
len(mixed2)

5134

In [27]:
mixed_val = cut_list(mixed2,int(10000/46471*5134))[0]
list_to_txt(mixed_val,'mixed-val-files.txt')

In [28]:
len(mixed_val)

1104

In [29]:
! dir /b

.ipynb_checkpoints
01-prep-txt.ipynb
IEEE
ieee-test-files.txt
ieee-train-files.txt
ieee-val-files.txt
LIBRI
libri-test-files.txt
libri-train-files.txt
libri-val-files.txt
mir20-files.txt
mixed-train-files.txt
mixed-val-files.txt
test.txt
TIMIT
timit-test-files.txt
timit-train-files.txt
timit-val-files.txt
VCTK
vctk-multispeaker-test-files.txt
vctk-multispeaker-train-files.txt
vctk-multispeaker-val-files.txt
vctk-speaker1-train-files.txt
vctk-speaker1-val-files.txt
VintageMics
WSJ
wsj-test-files.txt
wsj-train-files.txt
wsj-val-files.txt
