In [1]:
import pandas as pd
import os
import uuid
import json
from sklearn.model_selection import train_test_split
from pandarallel import pandarallel
import time
import shutil

# Start the pandarallel worker pool used by the `parallel_apply` calls below.
pandarallel.initialize(
    nb_workers=10,
    progress_bar=True,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Load the processed metadata table; the first CSV column becomes the index.
path = "data/processed_data/metadata.csv"
metadata = pd.read_csv(filepath_or_buffer=path, index_col=0)
metadata.head(n=5)

Unnamed: 0,user_id,id,question_content,score
0,52077.0,5580000,STATISTICS,"{""words"": [[""STATISTICS"", ""S T AH0 T IH1 S T I..."
1,88226.0,5580001,SEAT,"{""words"": [[""SEAT"", ""S IY1 T"", 53]], ""phonemes..."
2,199169.0,5580003,FOGGY,"{""words"": [[""FOGGY"", ""F AA1 G IY0"", 80]], ""pho..."
3,237799.0,5580004,SCHOOL,"{""words"": [[""SCHOOL"", ""S K UW1 L"", 69]], ""phon..."
4,88226.0,5580007,SEAT,"{""words"": [[""SEAT"", ""S IY1 T"", 68]], ""phonemes..."


In [3]:
def create_wav_scp(f, wav_id, wav_path):
    """Append one ``<wav_id><TAB><wav_path>`` line to the open file ``f``.

    Args:
        f: Writable text file object.
        wav_id: Identifier written in the first field.
        wav_path: Path written in the second field.
    """
    f.write(f"{wav_id}\t{wav_path}\n")
    
def create_text(f, utt_id, text):
    """Append one ``<utt_id><TAB><TEXT>`` line to the open file ``f``.

    The transcript is upper-cased before it is written.

    Args:
        f: Writable text file object.
        utt_id: Identifier written in the first field.
        text: Transcript string; must support ``.upper()``.
    """
    transcript = text.upper()
    f.write(f"{utt_id}\t{transcript}\n")
    
def create_utt2spk(f, utt_id, spk):
    """Append one ``<utt_id><TAB><spk>`` line to the open file ``f``.

    Args:
        f: Writable text file object.
        utt_id: Utterance identifier written in the first field.
        spk: Speaker identifier written in the second field.
    """
    f.write(f"{utt_id}\t{spk}\n")
    
def create_spk2utt(f, spk, utt_id):
    """Append one ``<spk><TAB><utt_id>`` line to the open file ``f``.

    Args:
        f: Writable text file object.
        spk: Speaker identifier written in the first field.
        utt_id: Utterance identifier written in the second field.
    """
    f.write(f"{spk}\t{utt_id}\n")
    
def gen_spk_id(*args):
    """Return a fresh ``uuid.uuid1()`` value.

    Any positional arguments are accepted and ignored, so the function can be
    used as a callback that receives a value it does not need.
    """
    speaker_uuid = uuid.uuid1()
    return speaker_uuid

def create_text_phoneme(f, utt_id, words):
    """Write one position-tagged phoneme line per word to the open file ``f``.

    ``words`` is a JSON string whose ``"words"`` entry is a list of items in
    which element 1 is a space-separated phoneme string. Each phoneme gets a
    suffix according to its place in the word: ``_S`` for a lone phoneme,
    ``_B`` for the first, ``_E`` for the last and ``_I`` for any in between.
    The line written for word number ``k`` is
    ``<utt_id>.<k><TAB><tagged phonemes joined by spaces>``.

    Args:
        f: Writable text file object.
        utt_id: Identifier used as the prefix of each line key.
        words: JSON-encoded string containing a ``"words"`` list.
    """
    entries = json.loads(words)["words"]
    for position, entry in enumerate(entries):
        phones = entry[1].split()
        count = len(phones)
        if count == 1:
            tagged = [phones[0] + "_S"]
        else:
            # Covers both the empty case (nothing to tag) and 2+ phonemes.
            last = count - 1
            tagged = [
                phone + ("_B" if k == 0 else "_E" if k == last else "_I")
                for k, phone in enumerate(phones)
            ]

        joined = " ".join(tagged)
        f.write(f"{utt_id}.{position}\t{joined}\n")
        
def check_downloaded(path):
    """Return ``True`` when ``path`` does NOT exist, ``False`` when it does.

    NOTE(review): the result is the opposite of what the name suggests
    (an existing file yields ``False``). Confirm against the callers before
    changing it.
    """
    return not os.path.exists(path)

In [4]:
def extract_score(score):
    """Return the third element of every item in the ``"words"`` list.

    Args:
        score: JSON-encoded string containing a ``"words"`` list whose items
            are indexable with at least three elements.

    Returns:
        A list with one value per word, in the original order.
    """
    parsed = json.loads(score)
    return [entry[2] for entry in parsed["words"]]

# Per-row list of word-level scores parsed out of the JSON `score` column.
metadata["word_score"] = metadata["score"].apply(extract_score)

In [5]:
# Derive the columns consumed by the Kaldi export below.
# Plain pandas is used on purpose: joining a path and copying a column are far
# cheaper than shipping the data to pandarallel worker processes and back.
wav_dir = "/working/data/processed_data/wavs"
metadata["wav_path"] = metadata["id"].map(lambda x: os.path.join(wav_dir, f'{x}.wav'))
metadata["text"] = metadata["question_content"]
# The speaker id is simply the utterance id, i.e. one "speaker" per utterance.
# NOTE(review): the `user_id` column is not used for speakers - confirm this is intended.
metadata["spk_id"] = metadata["id"]
metadata.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=95758), Label(value='0 / 95758')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=95758), Label(value='0 / 95758')))…

Unnamed: 0,user_id,id,question_content,score,word_score,wav_path,text,spk_id
0,52077.0,5580000,STATISTICS,"{""words"": [[""STATISTICS"", ""S T AH0 T IH1 S T I...",[90],/working/data/processed_data/wavs/5580000.wav,STATISTICS,5580000
1,88226.0,5580001,SEAT,"{""words"": [[""SEAT"", ""S IY1 T"", 53]], ""phonemes...",[53],/working/data/processed_data/wavs/5580001.wav,SEAT,5580001
2,199169.0,5580003,FOGGY,"{""words"": [[""FOGGY"", ""F AA1 G IY0"", 80]], ""pho...",[80],/working/data/processed_data/wavs/5580003.wav,FOGGY,5580003
3,237799.0,5580004,SCHOOL,"{""words"": [[""SCHOOL"", ""S K UW1 L"", 69]], ""phon...",[69],/working/data/processed_data/wavs/5580004.wav,SCHOOL,5580004
4,88226.0,5580007,SEAT,"{""words"": [[""SEAT"", ""S IY1 T"", 68]], ""phonemes...",[68],/working/data/processed_data/wavs/5580007.wav,SEAT,5580007


In [6]:
text_phoneme_path = 'data/processed_data/text-phone'

# One tagged-phoneme line per word of every utterance, in table order.
with open(text_phoneme_path, "w", encoding="utf-8") as f:
    for utt_id, score_json in zip(metadata["id"], metadata["score"]):
        create_text_phoneme(f, utt_id, score_json)

In [7]:
def convert_to_kaldi_format(metadata, data_dir):
    """Write ``wav.scp``, ``text``, ``utt2spk`` and ``spk2utt`` into ``data_dir``.

    Existing files with those names are overwritten. All four files are
    written in the same order: sorted by the *string* form of their key,
    because Kaldi compares keys as text. ``spk2utt`` gets one line per speaker
    listing all of that speaker's utterances separated by spaces.

    Args:
        metadata: DataFrame with the columns ``id`` (utterance id),
            ``wav_path``, ``text`` and ``spk_id``.
        data_dir: Output directory; created if it does not exist yet.
    """
    os.makedirs(data_dir, exist_ok=True)

    wavscp_path = os.path.join(data_dir, "wav.scp")
    text_path = os.path.join(data_dir, "text")
    spk2utt_path = os.path.join(data_dir, "spk2utt")
    utt2spk_path = os.path.join(data_dir, "utt2spk")

    # Sort once, by the textual utterance id, so every file shares one order
    # and ids of different widths are ordered the way a C-locale sort would.
    records = sorted(
        zip(metadata["id"], metadata["wav_path"], metadata["text"], metadata["spk_id"]),
        key=lambda record: str(record[0]),
    )

    # Group utterances per speaker (utterances stay in sorted order).
    utts_by_spk = {}
    for utt_id, _wav_path, _text, spk in records:
        utts_by_spk.setdefault(spk, []).append(str(utt_id))

    with open(wavscp_path, "w", encoding="utf-8") as f:
        for utt_id, wav_path, _text, _spk in records:
            create_wav_scp(f, utt_id, wav_path)

    with open(text_path, "w", encoding="utf-8") as f:
        for utt_id, _wav_path, text, _spk in records:
            create_text(f, utt_id, text)

    with open(spk2utt_path, "w", encoding="utf-8") as f:
        for spk in sorted(utts_by_spk, key=str):
            create_spk2utt(f, spk, " ".join(utts_by_spk[spk]))

    with open(utt2spk_path, "w", encoding="utf-8") as f:
        for utt_id, _wav_path, _text, spk in records:
            create_utt2spk(f, utt_id, spk)

    print("saved metadata to: ", data_dir)

In [8]:
import time

In [9]:
n = 100
data_dir = "/data/codes/prep_gopt/data/processed_data/train"

# Ceiling division: at most `n` chunks that together cover every row.
# Flooring the chunk size would leave the trailing `len % n` rows unexported,
# and a frame shorter than `n` would produce only empty chunks.
total = metadata.shape[0]
step = max(1, -(-total // n))
for start in range(0, total, step):
    # NOTE(review): every chunk is written to the same directory, so each
    # iteration overwrites the previous one; the pause presumably gives an
    # external consumer time to pick the chunk up - confirm.
    convert_to_kaldi_format(metadata=metadata.iloc[start:start + step], data_dir=data_dir)
    time.sleep(10)

saved metadata to:  /data/codes/prep_gopt/data/processed_data/train
saved metadata to:  /data/codes/prep_gopt/data/processed_data/train
saved metadata to:  /data/codes/prep_gopt/data/processed_data/train


KeyboardInterrupt: 