In [1]:
#!/usr/bin/python3

In [12]:
"""This is the main module that structures the audio directory for Kaldi to consume  
   This script will prepare the audio directory as audio/<test || train>/<speaker id>/<recording>  
   e.g. audio/train/NC03FBX/NC03FBX_020101.flac  
"""

'This is the main module that structures the audio directory for Kaldi to consume  \n   This script will prepare the audio directory as audio/<test || train>/<speaker id>/<recording>  \n   e.g. audio/train/NC03FBX/NC03FBX_020101.flac  \n'

In [3]:
__author__="Emily Hua"

In [4]:
import re
import os
import glob
from shutil import copyfile
from shutil import rmtree
from collections import defaultdict 

Go to the kaldi/egs/code-switch directory and create an interview_audio folder. In kaldi-trunk/egs/code-switch/interview_audio create two folders: train and test. Select ten speakers of your choice to represent the testing data set. Use each speaker's 'speakerID' as the name of a new folder in the kaldi-trunk/egs/code-switch/interview_audio/test directory, then put all the audio files related to that speaker there. Put the rest (84 speakers) into the train folder - this will be your training data set. Also create a subfolder for each speaker.

In [2]:
ls -l ../LDC2015S04/seame_d2/data/interview/audio | head -3

total 8991184
-rwxr-xr-x  1 yehua  staff  48047643 Feb 16 12:56 NI01MAX_0101.flac*
-rwxr-xr-x  1 yehua  staff  50547368 Feb 16 12:56 NI02FAX_0101.flac*


In [3]:
ls -l ../LDC2015S04/seame_d1/data/conversation/audio | head -3

total 7925480
-rwxr-xr-x  1 yehua  staff   95609160 Feb 16 12:51 01NC01FBX_0101.flac*
-rwxr-xr-x  1 yehua  staff  123991309 Feb 16 12:51 01NC02FBY_0101.flac*


In [14]:
%%bash
# look out for this speaker, who is an example of having multiple files under the name NC08FBY
find ../LDC2015S04/seame_d1/data/conversation/audio -type f -print | grep 'NC08FBY'

../LDC2015S04/seame_d1/data/conversation/audio/04NC08FBY_0101.flac
../LDC2015S04/seame_d1/data/conversation/audio/04NC08FBY_0201.flac


In [5]:
# Parent of the current working directory; every path used below is built from it,
# so the notebook has to be started from a direct sub-folder of the project root.
parent_path = os.path.split(os.getcwd())[0]
print ("the parent path is {}".format(parent_path))

the parent path is /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch


In [9]:
# Start from an empty audio/test directory so that files left over from an
# earlier run cannot end up in the new test set.
directory = parent_path+"/audio/test"
if os.path.exists(directory):
    # clean_up() is only defined in a later cell, so calling it here raises
    # NameError on a fresh kernel; empty the directory inline instead.
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            rmtree(entry_path)
        else:
            # plain files and symlinks cannot be removed with rmtree
            os.remove(entry_path)

In [10]:
ls -l ../audio/test

In [55]:
def get_file_list(audio_path, dirn):
    r"""List the .flac recordings in an audio directory and check the expected count.

    Parameters
    ----------
    audio_path : str
        directory that holds the recordings
    dirn : str
        'interview' selects the interview count check (210 files); any other
        value selects the conversation count check (87 files)

    Returns
    -------
    dir_list : list
        names (not full paths) of the .flac files found in audio_path,
        in the order os.listdir reports them

    Raises
    ------
    AssertionError
        if the number of .flac files differs from the expected count
    """
    # fullmatch anchors the pattern at both ends, so unwanted files like
    # .DS_Store, or names that merely contain '.flac' (e.g. 'x.flac.bak'),
    # do not show up here
    dir_list = [f for f in os.listdir(audio_path) if re.fullmatch(r'[^\\]+\.flac', f)]
    print ("there are total {} of files in {}\n".format(len(dir_list), audio_path))
    if dirn == 'interview':
        assert (len(dir_list) == 210), 'LDC2015S04/seame_d2/data/interview/audio should have 210 recordings, check if the directory is corrupted'
    else:
        assert (len(dir_list) == 87), 'LDC2015S04/seame_d1/data/conversation/audio should have 87 recordings, check if the directory is corrupted'
    return dir_list

# Interview recordings live under seame_d2, conversation recordings under seame_d1.
audio_path_i = parent_path + '/LDC2015S04/seame_d2/data/interview/audio'
audio_path_c = parent_path + '/LDC2015S04/seame_d1/data/conversation/audio'
# File-name lists for both parts of the corpus; get_file_list also asserts the expected counts.
dir_list_i = get_file_list(audio_path_i, 'interview')
dir_list_c = get_file_list(audio_path_c, 'conversation')

there are total 210 of files in /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/LDC2015S04/seame_d2/data/interview/audio

there are total 87 of files in /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/LDC2015S04/seame_d1/data/conversation/audio



In [31]:

def speaker_re_counts(dir_list):
    r"""Tally how many recordings share each file-name prefix.

    The prefix is the part of the file name in front of the first underscore
    and serves as a tentative speaker id.
    Returns
    -------
    id_dic : collections.defaultdict
        maps each prefix to the number of files in dir_list that carry it
        e.g. (interview) 'NI52MBQ': 2
        e.g. (conversation) '37NC45MBP': 1
    """
    id_dic = defaultdict(int)
    for recording in dir_list:
        prefix = recording.split('_', 1)[0]
        id_dic[prefix] = id_dic[prefix] + 1
    return id_dic
id_dic_i = speaker_re_counts(dir_list_i)
id_dic_c = speaker_re_counts(dir_list_c)
    

there are 95 unique prefix sets
there are 79 unique prefix sets


In [32]:
# no need to be included in the final script  
id_dic_cc = defaultdict(int)
for file in dir_list_c:
    id_dic_cc[re.split('_', file)[0][2:]] += 1
print ('there are {} unique prefix sets'.format(len(id_dic_cc)))

there are 64 unique prefix sets


In [40]:
# no need to be included in the final script  
print ("sample speaker ids and number of recordings in converstaion: {}".format(dict(list(id_dic_c.items())[:3])))

sample speaker ids and number of recordings in converstaion: {'28NC51MBP': 1, '37NC45MBP': 1, '36NC46FBQ': 1}


In [41]:
# no need to be included in the final script   
print ("sample speaker ids and number of recordings in interview: {}".format(dict(list(id_dic_i.items())[:3])))

sample speaker ids and number of recordings in interview: {'NI52MBQ': 2, 'UI26MAZ': 5, 'UI19MAZ': 5}


In [38]:
# Short ids of the interview test speakers: a recording prefix such as 'NI01MAX'
# with its first two and its last character dropped (train_test_split compares key[2:-1]).
test_short_ids_i = ['01MA', '03FA','08MA', '29FA','29MB','42FB','44MB','45FB','67MB','55FB']

In [39]:
# Short ids of the conversation test speakers: a recording prefix such as '01NC01FBX'
# without its last character (train_test_split compares key[:-1]).
test_short_ids_c = ['01NC01FB', '01NC02FB','06NC11MA', '06NC12MA']

In [66]:
def train_test_split(id_dic, test_short_ids, dirn):
    r"""Partition the recording prefixes into train and test speaker ids.

    A prefix is a test id when either the prefix without its last character,
    or the prefix without its first two and its last character, is listed in
    test_short_ids. Every other prefix is a train id. The sizes of both lists
    are asserted against the counts expected for the corpus part named by dirn
    ("interview", anything else is treated as conversation).
    Returns
    -------
    train_ids, test_ids : list, list
        prefixes in the iteration order of id_dic
        e.g. (interview train) 'NI52MBQ' (interview test) 'NI55FBP'
        e.g. (conversation train) '37NC45MBP' (conversation test) '01NC02FBY'
    """
    def is_test(prefix):
        return prefix[2:-1] in test_short_ids or prefix[:-1] in test_short_ids

    train_ids, test_ids = [], []
    for prefix in id_dic:
        bucket = test_ids if is_test(prefix) else train_ids
        bucket.append(prefix)

    n_train = len(train_ids)
    n_test = len(test_ids)
    print ('there are {} unprocessed speaker ids in the training set'.format(n_train))
    print ('there are {} unprocessed speaker ids in the testing set\n'.format(n_test))

    if dirn != "interview":
        assert (n_train == 75 and n_test == 4), "For conversation, there should be 75 speakers be moved to the training set, 4 speakers in test set"
    else:
        assert (n_train == 85 and n_test == 10), "For interview, there should be 85 speakers be moved to the training set, 10 speakers in test set"

    return train_ids, test_ids
train_ids_i, test_ids_i = train_test_split(id_dic_i, test_short_ids_i, "interview")
train_ids_c, test_ids_c = train_test_split(id_dic_c, test_short_ids_c, "conversation")

there are 85 unprocessed speaker ids in the training set
there are 10 unprocessed speaker ids in the testing set

there are 75 unprocessed speaker ids in the training set
there are 4 unprocessed speaker ids in the testing set



there are 85 speaker ids in the training set
there are 10 speaker ids in the testing set

there are 75 speaker ids (60 after process) in the training set
there are 4 speaker ids in the testing set

In [68]:
# no need to be included in the final script 
print ("speaker ids in the converation test set: {}\n".format(test_ids_c))
print ("speaker ids in the interview test set: {}".format(test_ids_i))

speaker ids in the converation test set: ['01NC02FBY', '06NC12MAY', '01NC01FBX', '06NC11MAX']

speaker ids in the interview test set: ['NI55FBP', 'UI08MAZ', 'NI67MBQ', 'NI45FBP', 'NI29MBP', 'NI42FBQ', 'UI29FAZ', 'NI01MAX', 'NI44MBQ', 'UI03FAZ']


In [70]:
# conversation train ids without their two-character leading prefix, de-duplicated
train_ids_c_short = {speaker[2:] for speaker in train_ids_c}

In [71]:
# no need to be included in the final script 
len(train_ids_c_short)

60

In [73]:
# no need to be included in the final script 
print (train_ids_c_short)

{'NC47MBP', 'NC35FBQ', 'NC17FBP', 'NC54FBQ', 'NC35FBP', 'NC23FBP', 'NC39MBP', 'NC07FBX', 'NC33FBP', 'NC59MAX', 'NC08FBY', 'NC28MBQ', 'NC22MBQ', 'NC20MBQ', 'NC16FBQ', 'NC03FBX', 'NC42MBQ', 'NC43FBQ', 'NC05FAX', 'NC09FAX', 'NC52FBQ', 'NC49FBQ', 'NC47MBQ', 'NC13MBP', 'NC36MBQ', 'NC46FBQ', 'NC40FBQ', 'NC25MBP', 'NC27MBP', 'NC15MBP', 'NC32FBQ', 'NC61FBQ', 'NC29FBP', 'NC43FBP', 'NC51MBP', 'NC56MBP', 'NC30MBQ', 'NC53MBP', 'NC44MBQ', 'NC37MBP', 'NC18MBQ', 'NC50FBP', 'NC04FBY', 'NC60FBQ', 'NC41MBP', 'NC34FBQ', 'NC38FBQ', 'NC26MBQ', 'NC14FBQ', 'NC45MBP', 'NC50XFB', 'NC57FBX', 'NC06FAY', 'NC24FBQ', 'NC48FBP', 'NC31FBP', 'NC10MAY', 'NC21FBP', 'NC19MBP', 'NC58FAY'}


In [74]:
# no need to be included in the final script 
print ("speaker ids in the conversation train set: \n{}\n".format(train_ids_c))
print ("speaker ids in the interview train set: \n{}".format(train_ids_i))

speaker ids in the conversation train set: 
['37NC45MBP', '28NC51MBP', '36NC46FBQ', '19NC37MBP', '39NC57FBX', '07NC13MBP', '26NC48FBP', '10NC19MBP', '25NC47MBP', '13NC25MBP', '02NC04FBY', '33NC37MBP', '07NC14FBQ', '43NC61FBQ', '31NC50XFB', '14NC28MBQ', '24NC35FBQ', '46NC41MBP', '09NC18MBQ', '45NC22MBQ', '28NC52FBQ', '11NC21FBP', '14NC27MBP', '21NC41MBP', '34NC37MBP', '26NC49FBQ', '17NC33FBP', '25NC43FBQ', '21NC42MBQ', '30NC48FBP', '22NC43FBP', '15NC30MBQ', '23NC45MBP', '27NC50FBP', '02NC03FBX', '17NC34FBQ', '12NC24FBQ', '33NC43FBQ', '05NC09FAX', '08NC16FBQ', '15NC29FBP', '22NC44MBQ', '09NC17FBP', '31NC35FBQ', '23NC35FBQ', '11NC22MBQ', '27NC47MBQ', '35NC56MBP', '18NC35FBP', '18NC36MBQ', '29NC54FBQ', '05NC10MAY', '08NC15MBP', '44NC44MBQ', '03NC06FAY', '41NC59MAX', '04NC08FBY', '12NC23FBP', '32NC50FBP', '03NC05FAX', '30NC49FBQ', '20NC39MBP', '04NC07FBX', '38NC50FBP', '24NC45MBP', '19NC38FBQ', '16NC32FBQ', '10NC20MBQ', '29NC53MBP', '40NC58FAY', '13NC26MBQ', '16NC31FBP', '20NC40FBQ', '32NC3

In [80]:
def create_test_wannabe(dir_list, test_ids, dirn):
    r"""Select the recordings that belong to the test speakers.

    Every file whose prefix (the text before the first underscore) appears in
    test_ids is kept, so a speaker with several recordings contributes all of
    them. The number of selected files is asserted against the count expected
    for dirn ("interview", anything else is treated as conversation).
    Returns
    -------
    test_wannabe : list
        file names to be moved to the test set, in dir_list order
        e.g. (interview) 'NI01MAX_0101.flac'
        e.g. (conversation) '01NC01FBX_0101.flac'
    """
    test_wannabe = [name for name in dir_list if name.split('_', 1)[0] in test_ids]
    selected = len(test_wannabe)
    print ("there are {} files ready to be moved into the test set".format(selected))
    if dirn != "interview":
        assert (selected == 4), "4 files from conversation should be moved into the test set"
    else:
        assert (selected == 16), "16 files from interview should be moved into the test set"
    return test_wannabe

test_wannabe_i = create_test_wannabe(dir_list_i, test_ids_i, "interview")
test_wannabe_c = create_test_wannabe(dir_list_c, test_ids_c, "conversation")

there are 16 files ready to be moved into the test set
there are 4 files ready to be moved into the test set


should be   
there are 16 files ready to be moved into the test set  
there are 4 files ready to be moved into the test set  


In [81]:
ls -l ../audio/

total 0
drwxr-xr-x   16 yehua  staff   544 Apr  6 22:12 [1m[36mtest[m[m/
drwxr-xr-x  148 yehua  staff  5032 Apr  7 21:59 [1m[36mtrain[m[m/


In [84]:
# load interview recordings into the test folder
    
# def load_test_interview():
#     directory += "/test"
#     print ("....loading interview recordings into {}".format(directory))
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#         for file in test_wannabe_i:
#             speaker_id = re.split('_',file)[0]
#             if not os.path.exists(directory + "/" + speaker_id):
#                 os.makedirs(directory + "/" + speaker_id)
#             src = audio_path_i + "/" + file
#             dst = directory + "/" + speaker_id + "/" + file
#             copyfile(src, dst)
#     print ("loading finished!")

In [7]:
def clean_up (dir_path):
    r"""Empty a directory without deleting the directory itself.

    Sub-directories are removed recursively; plain files and symlinks are
    unlinked. Hidden entries such as .DS_Store are removed as well.

    Parameters
    ----------
    dir_path : str
        directory to empty; nothing happens if it is not an existing directory

    Returns
    -------
    None
    """
    if not os.path.isdir(dir_path):
        return
    # os.listdir (unlike a '*' glob) also reports dot-files and is not confused
    # by glob metacharacters in dir_path
    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            rmtree(entry_path)
        else:
            # rmtree refuses plain files and symlinks
            os.remove(entry_path)

In [129]:
def load_audios_to_test(test_files_list,audio_path, dirn):
    r"""Copy the selected test recordings into <parent_path>/audio/test/<speaker id>/.

    Reads the module-level ``parent_path``. For interview files the speaker id
    is the text before the first underscore; for conversation files the first
    two characters of that text are dropped as well. Files keep their original
    name inside the speaker folder.
    NOTE(review): load_conversation_train renames conversation files while this
    function does not - confirm that the difference is intended.

    Parameters
    ----------
    test_files_list : list
        file names (relative to audio_path) to copy
    audio_path : str
        directory that holds the source recordings
    dirn : str
        "interview" or "conversation"

    Returns
    -------
    None

    Raises
    ------
    ValueError
        if dirn is neither "interview" nor "conversation"
    AssertionError
        if the number of copied files is not 16 (interview) or 4 (conversation)
    """
    # fail early: an unknown dirn previously left speaker_id undefined
    if dirn not in ("interview", "conversation"):
        raise ValueError("dirn must be 'interview' or 'conversation', got {!r}".format(dirn))

    directory = parent_path + "/audio" + "/test"
    print ("loading {} recordings into {}".format(dirn, directory))
    os.makedirs(directory, exist_ok=True)

    counter = 0
    for recording in test_files_list:
        prefix = re.split('_', recording)[0]
        speaker_id = prefix if dirn == "interview" else prefix[2:]
        speaker_dir = directory + "/" + speaker_id
        os.makedirs(speaker_dir, exist_ok=True)
        copyfile(audio_path + "/" + recording, speaker_dir + "/" + recording)
        counter += 1

    if dirn == "interview":
        assert (counter == 16), "should move 16 interview files to test folder, the number mismatched, investigate!"
    else:
        assert (counter == 4), "should move 4 conversation files to test folder, the number mismatched, investigate!"
    print ("loading {} to test finished!".format(dirn))

In [130]:
load_audios_to_test(test_wannabe_i, audio_path_i, "interview")
load_audios_to_test(test_wannabe_c, audio_path_c, "conversation")

loading interview recordings into /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/test
loading interview to test finished!
loading conversation recordings into /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/test
loading conversation to test finished!


In [21]:
# # load conversation recordings into the test set
# from shutil import copyfile
# directory = parent_path + "/audio"
# if not os.path.exists(directory):
#     os.makedirs(directory)
# print (directory)
# directory += "/test"
# print ("loading conversation recordings into {}".format(directory))

# for file in test_wannabe_c:
#     speaker_id = re.split('_',file)[0][2:]
#     if not os.path.exists(directory + "/" + speaker_id):
#         os.makedirs(directory + "/" + speaker_id)
#     src = audio_path_c + "/" + file
#     dst = directory + "/" + speaker_id + "/" + file
#     newname = directory + "/" + speaker_id + "/" + file[2:]
#     copyfile(src, dst)
#     os.rename(dst, newname)
# print ("loading finished")

/Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio
loading conversation recordings into /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/test
loading finished


In [131]:
ls -l ../audio/test

total 0
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC01FBX[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC02FBY[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC11MAX[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC12MAY[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI01MAX[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI29MBP[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI42FBQ[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI44MBQ[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI45FBP[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI55FBP[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI67MBQ[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mUI03FAZ[m[m/
drwxr-xr-x  6 yehua  staff  204 Apr 13 17:20 [1m[36mUI08MAZ[m[m/
drwxr-xr-x  6 yehua  staff  204 Apr 13 17:20 [1m[36mUI29FAZ[m[m/


In [132]:
ls -l ../audio/test

total 0
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC01FBX[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC02FBY[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC11MAX[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:19 [1m[36mNC12MAY[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI01MAX[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI29MBP[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI42FBQ[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI44MBQ[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI45FBP[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI55FBP[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mNI67MBQ[m[m/
drwxr-xr-x  3 yehua  staff  102 Apr 13 17:20 [1m[36mUI03FAZ[m[m/
drwxr-xr-x  6 yehua  staff  204 Apr 13 17:20 [1m[36mUI08MAZ[m[m/
drwxr-xr-x  6 yehua  staff  204 Apr 13 17:20 [1m[36mUI29FAZ[m[m/


In [134]:
ls -l ../audio/test/NC01FBX

total 186744
-rw-r--r--  1 yehua  staff  95609160 Apr 13 17:20 01NC01FBX_0101.flac


In [157]:
def load_interview_train(dir_list):
    r"""Copy the interview training recordings into <parent_path>/audio/train/<speaker id>/.

    Reads the module-level ``parent_path``, ``train_ids_i`` and ``audio_path_i``.
    An existing train directory is emptied with clean_up first; a missing one is
    created. Only files whose prefix (text before the first underscore) is in
    train_ids_i are copied, under their original name.
    Returns
    -------
    None
    """
    directory = parent_path + "/audio" + "/train"
    print ("loading interview recordings into {}".format(directory))

    if os.path.exists(directory):
        # remove whatever is already in the train directory
        clean_up(directory)
    else:
        os.makedirs(directory)

    for recording in dir_list:
        speaker_id = recording.split('_', 1)[0]
        if speaker_id not in train_ids_i:
            continue
        speaker_dir = directory + "/" + speaker_id
        if not os.path.exists(speaker_dir):
            os.makedirs(speaker_dir)
        copyfile(audio_path_i + "/" + recording, speaker_dir + "/" + recording)
    print ("loading finished")

In [136]:
load_interview_train(dir_list_i) 

loading interview recordings into /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/train
loading finished


In [137]:
ls -1 ../audio/train | wc -l 

      85


In [138]:
85+75

160

In [98]:
train_ids_c[:4]

['22NC44MBQ', '08NC16FBQ', '02NC03FBX', '37NC45MBP']

In [144]:
# `train` is not defined anywhere in the notebook; the conversation train ids are in train_ids_c
'04NC08FBY' in train_ids_c

True

In [143]:
# NOTE(review): the original line ("'04NC08FBY' inc") was a syntax error; presumably the
# intent was a membership test on the conversation prefixes - confirm the container.
'04NC08FBY' in id_dic_c

True

In [149]:
'NC08FBY' in speaker_multiple

True

In [146]:
for i in dir_list_c:
    if '04NC08FBY' in i:
        print (i)

04NC08FBY_0101.flac
04NC08FBY_0201.flac


In [148]:
dic['NC08FBY']

['04NC08FBY_0101.flac', '04NC08FBY_0201.flac']

In [169]:
len(speaker_multiple)

19

In [158]:
# find me conversation with different recordings under the same speaker 
# train = ['15NC29FBP', '29NC53MBP', '28NC52FBQ', '20NC39MBP', '04NC08FBY', '10NC20MBQ', '19NC37MBP', '40NC58FAY', '23NC45MBP', '46NC41MBP', '07NC14FBQ', '32NC36MBQ', '12NC24FBQ', '34NC37MBP', '13NC25MBP', '28NC51MBP', '31NC35FBQ', '25NC43FBQ', '26NC49FBQ', '31NC50XFB', '14NC28MBQ', '44NC44MBQ', '03NC06FAY', '33NC43FBQ', '09NC17FBP', '04NC07FBX', '45NC22MBQ', '26NC48FBP', '37NC45MBP', '27NC47MBQ', '18NC36MBQ', '15NC30MBQ', '02NC03FBX', '05NC10MAY', '03NC05FAX', '32NC50FBP', '36NC46FBQ', '11NC21FBP', '35NC56MBP', '11NC22MBQ', '19NC38FBQ', '22NC43FBP', '12NC23FBP', '29NC54FBQ', '07NC13MBP', '39NC57FBX', '25NC47MBP', '08NC16FBQ', '22NC44MBQ', '21NC41MBP', '17NC33FBP', '16NC31FBP', '08NC15MBP', '18NC35FBP', '05NC09FAX', '09NC18MBQ', '14NC27MBP', '42NC60FBQ', '17NC34FBQ', '30NC48FBP', '10NC19MBP', '41NC59MAX', '24NC35FBQ', '43NC61FBQ', '24NC45MBP', '27NC50FBP', '13NC26MBQ', '33NC37MBP', '30NC49FBQ', '16NC32FBQ', '02NC04FBY', '20NC40FBQ', '38NC50FBP', '23NC35FBQ', '21NC42MBQ']
def get_speaker_multiple(ids):
    r"""Find the conversation speakers that own more than one recording.

    Groups every file of the module-level ``dir_list_c`` by speaker id (the text
    before the first underscore, minus its first two characters) and keeps the
    ids that collect more than one file. These are the speakers whose recordings
    need to be renamed.

    Parameters
    ----------
    ids : list
        kept for backward compatibility; it does not influence the result
        (the original built a set from it that was never read)

    Returns
    -------
    speaker_multiple : list
        speaker ids that have multiple files under their name, in order of
        first appearance in dir_list_c
    """
    recordings_by_speaker = defaultdict(list)
    for file in dir_list_c:
        recordings_by_speaker[re.split("_",file)[0][2:]].append(file)

    speaker_multiple = [
        speaker for speaker, recordings in recordings_by_speaker.items()
        if len(recordings) > 1
    ]
    print ("speakers with multiple recordings:\n {}".format(speaker_multiple))
    return speaker_multiple

In [159]:
speaker_multiple = get_speaker_multiple(train_ids_c)

speakers with multiple recordings:
 ['NC35FBQ', 'NC06FAY', 'NC48FBP', 'NC08FBY', 'NC22MBQ', 'NC03FBX', 'NC49FBQ', 'NC43FBQ', 'NC09FAX', 'NC36MBQ', 'NC07FBX', 'NC44MBQ', 'NC37MBP', 'NC04FBY', 'NC41MBP', 'NC50FBP', 'NC45MBP', 'NC05FAX', 'NC10MAY']


In [161]:
# Sanity check: count the (file, train id) pairs where the id occurs in the file name.
# NOTE(review): presumably this is where the 83 asserted in load_conversation_train comes from - confirm.
c = 0
for f in dir_list_c:
    for n in train_ids_c:
        if n in f:
            c += 1
print (c)

83


In [162]:
def load_conversation_train():
    r"""Copy the conversation training recordings into <parent_path>/audio/train/<speaker id>/.

    Reads the module-level ``parent_path``, ``dir_list_c``, ``train_ids_c``,
    ``audio_path_c`` and ``speaker_multiple``. The speaker id is the file prefix
    (text before the first underscore) without its first two characters.

    Naming of the copied file:
    * speaker listed in speaker_multiple: the two-character prefix is moved
      behind the underscore, e.g. 02NC03FBX_0101.flac -> NC03FBX_020101.flac
    * otherwise the two-character prefix is simply dropped.

    Each file is copied straight to its final name instead of being copied and
    then renamed, so no intermediate file is left behind and re-runs overwrite
    cleanly.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        if the number of copied files is not 83
    """
    directory = parent_path + "/audio" + "/train"
    print ("loading conversation recordings into {}".format(directory))
    counter = 0
    for file in dir_list_c:
        parts = re.split('_', file)
        speaker_id = parts[0]
        if speaker_id not in train_ids_c:
            continue
        counter += 1
        short_id = speaker_id[2:]
        speaker_dir = directory + "/" + short_id
        os.makedirs(speaker_dir, exist_ok=True)
        if short_id in speaker_multiple:
            print (" {} has multiple recordings under its name".format(speaker_id))
            # keep the two-character prefix inside the recording number so the
            # speaker's files stay distinguishable after the prefix is dropped
            end = parts[1].split(".")[0]
            newname = speaker_dir + "/" + short_id + '_' + speaker_id[:2] + end + ".flac"
            print ("formated it from {} to {}".format(file, newname))
        else:
            newname = speaker_dir + "/" + file[2:]
        copyfile(audio_path_c + "/" + file, newname)
    assert (counter == 83), "should move 83 files from conversation to the train folder, the number mismatched, investigate"
    print ("loading finished")
    print ("loaded {} conversation recordings in to train set ".format(counter))

In [164]:
load_conversation_train()

loading conversation recordings into /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/train
 02NC03FBX has multiple recordings under its name
formated it from 02NC03FBX_0101.flac to /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/train/NC03FBX/NC03FBX_020101.flac
 02NC03FBX has multiple recordings under its name
formated it from 02NC03FBX_0201.flac to /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/train/NC03FBX/NC03FBX_020201.flac
 02NC04FBY has multiple recordings under its name
formated it from 02NC04FBY_0101.flac to /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/train/NC04FBY/NC04FBY_020101.flac
 02NC04FBY has multiple recordings under its name
formated it from 02NC04FBY_0201.flac to /Volumes/STARTRACK/deep-learning/code-switch/speech-to-text/codeswitch/audio/train/NC04FBY/NC04FBY_020201.flac
 03NC05FAX has multiple recordings under its name
formated it from 03NC05FAX

In [165]:
ls -1 ../audio/train | wc -l 

     145


In [166]:
ls -1 ../audio/test | wc -l

      14


In [168]:
ls -1 ../interview_audio/train/NC08FBY

NC08FBY_040101.flac
NC08FBY_040201.flac
