In [1]:
from pandarallel import pandarallel
from glob import glob
import soundfile as sf
import pandas as pd
import librosa
import random
import json
import re
import os

pandarallel.initialize(nb_workers=5, progress_bar=True)

INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [24]:
audio_dir = "/data/audio_data/prep_submission_audio/9"
metadata_path="/data/audio_data/pronunciation_scoring_result/info_question_type-9_01082022_18092023.csv"
metadata = pd.read_csv(metadata_path)
metadata.head(2)

Unnamed: 0,id,is_deleted,user_id,question_id,question_type,question_content,url,score,fidelity_class,created_at,total_time,word_count
0,5580125,0,105954.0,224272,9,"To be honest with you, I have never watched a ...",https://storage.googleapis.com/materials-eleme...,,,2023-09-18 21:18:30,23.66,62.0
1,5580126,0,105954.0,224271,9,Definitely yes. When you have been robot aroun...,https://storage.googleapis.com/materials-eleme...,,,2023-09-18 21:18:30,23.37,66.0


In [25]:
def check_audio_is_exist(audio_id):
    abs_path = os.path.join(audio_dir, f'{audio_id}.wav')
    if os.path.exists(abs_path):
        return True
    return False

metadata["is_exist"] =  metadata.id.parallel_apply(check_audio_is_exist)
print(metadata.shape)
metadata = metadata[metadata["is_exist"] == True]
metadata.reset_index(inplace=True)
print(metadata.shape)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=47612), Label(value='0 / 47612')))…

(238057, 13)
(237992, 14)


In [26]:
def normalize(text):
    text = re.sub('[\!@#$%^&*\(\)\\\.\'\"\,\?\;\:\+\-\_\/\|~`]', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.upper().strip()
    return text

def load_lexicon(path="resources/lexicon.txt"):
    with open(path, "r", encoding="utf-8") as f:
        content = f.readlines()
        lines = [line.strip() for line in content]
    lexicon = {}
    for line in lines:
        tmp = line.split()
        word, arpabet = tmp[0], " ".join(tmp[1:])

        if word not in lexicon:
            lexicon[word] = [arpabet, ]
        else:
            lexicon[word].append(arpabet)

    for key in lexicon.keys():
        lexicon[key] = set(lexicon[key])
    
    return lexicon


In [27]:
lexicon = load_lexicon(path="/data/codes/prep_gopt/egs/librispeech/s5/data/lexicon.txt")

In [28]:
def is_selected_word(word, phoneme_score=40, decision=["correct", "warning"], phoneme_error="normal"):
    text = normalize(word["text"])

    if text not in lexicon:
        return False
    else:
        if word["trans_arpabet"] not in lexicon[text]:
            return False
        
    if phoneme_score is not None:
        for phoneme in word["phonemes"]:
            if phoneme["nativeness_score"] < phoneme_score:
                return False
            
    if decision is not None:
        for phoneme in word["phonemes"]:
            if phoneme["decision"] not in decision:
                return False
            
    if phoneme_error is not None:
        for phoneme in word["phonemes"]:
            if phoneme["phoneme_error"] != phoneme_error:
                return False
        
    return True

In [29]:
def segment_sentence(sentence, min_num_word=5, phoneme_score=40, decision=["correct", "warning"], phoneme_error="normal"):
    count = 0
    segmented_sentence, tmp_words = [], []
    for word in sentence["words"]:
        assert count == len(tmp_words)
        if is_selected_word(word, phoneme_score, decision, phoneme_error):
            count += 1
            tmp_words.append(word)
        else:
            if len(tmp_words) >= min_num_word:
                segmented_sentence.append(tmp_words)
            count = 0
            tmp_words = []
    return segmented_sentence

def parse_segmented_sentence(selected_words):
    words, text = [], []
    start_time, end_time = selected_words[0]["start_time"], selected_words[-1]["end_time"]
    for word in selected_words:
        text.append(word["text"])
        words.append(
            {
                'text': word["text"],
                'arpabet': word["trans_arpabet"],
                'start_time': word["start_time"],
                'end_time': word["end_time"],
                'score': word["nativeness_score"]
            }
        )

    sentence = {
            "start_time": start_time,
            "end_time": end_time,
            "text": " ".join(text),
            "words": words,
        }
    
    return sentence

In [30]:
def preprocess_data(json_path):
    with open(json_path, "r") as f:
        raw_sample = json.load(f)
        
    spk_id = raw_sample["speaker_id"]

    sentences = []
    sample_id = os.path.basename(json_path).split(".json")[0]
    for index_1, utterance in enumerate(raw_sample["utterances"]):
        if utterance["result"] is None:
            continue
        segmented_sentences = segment_sentence(utterance["result"], min_num_word=4, phoneme_score=None, decision=["correct", "warning"], phoneme_error="normal")
        
        parsed_sentences = []
        for sentence in segmented_sentences:
            parsed_sentence = parse_segmented_sentence(sentence)
            parsed_sentences.append(parsed_sentence)

        for index, sent in enumerate(parsed_sentences):
            sent["utt_id"] = f'{sample_id}{index_1}{index}'
            sent["id"] = f'{sample_id}'
            sent["spk_id"] = spk_id
            sentences.append(sent)

    return sentences

json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/9"
tmp = metadata.id.parallel_apply(lambda x: preprocess_data(os.path.join(json_dir, f'{x}.json')))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=47599), Label(value='0 / 47599')))…

In [31]:
_id = 5581471
json_path = f'/data/audio_data/pronunciation_scoring_result/marking_data/9/{_id}.json'
with open(json_path, "r", encoding="utf-8") as f:
    content = json.load(f)
    print(content)



In [32]:
total_time = 0
for index in tmp.index:
    for sent in tmp.iloc[index]:
        total_time += (sent["end_time"] - sent["start_time"])

total_time / 3600

185.54153888889007

In [33]:
tmp_metadata = tmp.explode()

df = pd.DataFrame(tmp_metadata.values, columns=['sent'])
df.dropna(inplace=True)

df = df[0:10000]

In [34]:
in_dir = '/data/audio_data/prep_submission_audio/9'
out_dir = '/data/codes/prep_gopt/egs/librispeech/s5/data/prep/wav'
def get_audio(sent):
    utt_id = sent["utt_id"]

    in_path = f'{in_dir}/{sent["id"]}.wav'
    out_path = f'{out_dir}/{utt_id}.wav'

    waveform, sr = librosa.load(in_path, sr=16000)
    start_time = int(sent["start_time"] * sr)
    end_time = int(sent["end_time"] * sr)

    sf.write(out_path, waveform[start_time: end_time], samplerate=sr)

df.sent.parallel_apply(get_audio)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2000), Label(value='0 / 2000'))), …

0        None
1        None
2        None
3        None
4        None
         ... 
12864    None
12865    None
12866    None
12867    None
12869    None
Name: sent, Length: 10000, dtype: object

In [35]:
wav_dir = "/data/codes/prep_gopt/egs/librispeech/s5/data/prep/wav/"

df["text"] = df.sent.parallel_apply(lambda x: normalize(x["text"]))
df["wav_path"] = df.sent.parallel_apply(lambda x: os.path.join(wav_dir, f'{x["utt_id"]}.wav'))
df["utt_id"] = df.sent.parallel_apply(lambda x: x["utt_id"])
df["spk_id"] = ['0'*(8-len(str(spk_id))) + f'{spk_id}' for spk_id in range(df.shape[0])]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2000), Label(value='0 / 2000'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2000), Label(value='0 / 2000'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2000), Label(value='0 / 2000'))), …

In [44]:
def get_duration(path):
    wav, sr =librosa.load(path)

    return wav.shape[0]/sr
duration = df[9000:].wav_path.parallel_apply(get_duration)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=200), Label(value='0 / 200'))), HB…

In [45]:
def create_text_file(f, first_column, second_column):
    line = f'{first_column}\t{second_column}'
    f.write(line + "\n")


data_dir = "/data/codes/prep_gopt/egs/librispeech/s5/data/prep/test"

wavscp_path = f'{data_dir}/wav.scp'
text_path = f'{data_dir}/text'
spk2utt_path = f'{data_dir}/spk2utt'
utt2spk_path = f'{data_dir}/utt2spk'

with open(wavscp_path, "w", encoding="utf-8") as f:
    df[9000:][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["wav_path"]), axis=1)
    
with open(text_path, "w", encoding="utf-8") as f:
    df[9000:][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["text"]), axis=1)
    
with open(spk2utt_path, "w", encoding="utf-8") as f:
    df[9000:][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["utt_id"]), axis=1)
    
with open(utt2spk_path, "w", encoding="utf-8") as f:
    df[9000:][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["utt_id"]), axis=1)

In [42]:
def get_duration(path):
    wav, sr =librosa.load(path)

    return wav.shape[0]/sr
duration = df[0:9000].wav_path.parallel_apply(get_duration)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1800), Label(value='0 / 1800'))), …

In [43]:
def create_text_file(f, first_column, second_column):
    line = f'{first_column}\t{second_column}'
    f.write(line + "\n")


data_dir = "/data/codes/prep_gopt/egs/librispeech/s5/data/prep/train"

wavscp_path = f'{data_dir}/wav.scp'
text_path = f'{data_dir}/text'
spk2utt_path = f'{data_dir}/spk2utt'
utt2spk_path = f'{data_dir}/utt2spk'

with open(wavscp_path, "w", encoding="utf-8") as f:
    df[0:9000][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["wav_path"]), axis=1)
    
with open(text_path, "w", encoding="utf-8") as f:
    df[0:9000][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["text"]), axis=1)
    
with open(spk2utt_path, "w", encoding="utf-8") as f:
    df[0:9000][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["utt_id"]), axis=1)
    
with open(utt2spk_path, "w", encoding="utf-8") as f:
    df[0:9000][(duration > 1) & (duration < 8)].sort_values("utt_id").apply(lambda x: create_text_file(f, x["utt_id"], x["utt_id"]), axis=1)