In [44]:
import pandas as pd
import numpy as np
import librosa
import json
import os

from pandarallel import pandarallel

# progress_bar is an on/off flag: pass a boolean rather than repeating the
# worker count (the previous value 16 only worked because it is truthy).
pandarallel.initialize(nb_workers=16, progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [45]:
def load_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate over the file object instead of readlines() so the raw text
        # of the whole file is never held in memory next to the parsed records.
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (commonly a trailing newline at the end of the
                # file) holds no record; json.loads("") would raise on it.
                continue
            records.append(json.loads(line))

    return records

In [46]:
# Output directories for the train and test splits; later cells create them
# and write wav.scp / text / spk2utt / utt2spk files into them.
train_dir = "/data/codes/apa/kaldi/stt/data/stt-data/kaldi/prep_data_type_10"
test_dir = "/data/codes/apa/kaldi/stt/data/stt-data/kaldi/test_type_10"

# JSONL metadata file: every line is parsed by load_jsonl and becomes one row
# of the `metadata` DataFrame.
path = "/data/codes/apa/kaldi/stt/data/stt-data/final/info_question_type-10_01082022_18092023.jsonl"
metadata_v1 = load_jsonl(path)
metadata = pd.DataFrame(metadata_v1)
# Show the first rows as a quick sanity check of the loaded columns.
metadata.head()

Unnamed: 0,sid,utt_id,elsa,prep,start_time,end_time,audio_path
0,3276751,,DOG,DOG,0,1.536,/data/audio_data/prep_submission_audio/10/3276...
1,3376542,,DOG,DOG,0,1.76,/data/audio_data/prep_submission_audio/10/3376...
2,2548703,,DOG,DOG,0,2.218625,/data/audio_data/prep_submission_audio/10/2548...
3,4985757,,DOG,DOG,0,1.962625,/data/audio_data/prep_submission_audio/10/4985...
4,3922824,,DOG,DOG,0,1.877312,/data/audio_data/prep_submission_audio/10/3922...


In [47]:
# train_dir = "/data/codes/apa/kaldi/stt/data/stt-data/kaldi/prep_data_type_9"
# test_dir = "/data/codes/apa/kaldi/stt/data/stt-data/kaldi/test_type_9"

# path = "/data/codes/apa/kaldi/stt/data/stt-data/final/info_question_type-9_01082022_18092023.jsonl"
# metadata_v1 = load_jsonl(path)
# path = "/data/codes/apa/kaldi/stt/data/stt-data/final/info_question_type-9_19092023_21122023.jsonl"
# metadata_v2 = load_jsonl(path)

# metadata = metadata_v1 + metadata_v2
# metadata = pd.DataFrame(metadata)
# metadata.head()

In [48]:
def get_duration(path):
    """Return the length of an audio file in seconds.

    The file is decoded with ``librosa.load`` requesting a 16 kHz sample
    rate; the duration is the number of returned samples divided by the
    returned sample rate.

    Args:
        path: Location of the audio file to measure.

    Returns:
        float: Duration in seconds.
    """
    target_rate = 16000
    samples, rate = librosa.load(path, sr=target_rate)
    n_samples = samples.shape[0]

    return n_samples / rate

# Measure every audio file in parallel, then report the corpus size in hours.
duration = metadata["audio_path"].parallel_apply(get_duration)
print(duration.sum() / 3600)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5408), Label(value='0 / 5408'))), …

59.144882326388895


In [49]:
# Cross-check the corpus size (in hours) from the start/end timestamps stored
# in the metadata. Subtracting the two columns directly is vectorized and
# yields the same Series as a row-by-row apply, without a Python call per row.
duration = metadata["end_time"] - metadata["start_time"]
print(duration.sum()/3600)

59.144882326388895


In [50]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows as the test split; the fixed random_state makes the
# split repeatable across runs.
train_data, test_data = train_test_split(
    metadata,
    random_state=42,
    test_size=0.05,
)

In [51]:
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory is already there, so this cell can be re-run safely
# (os.mkdir raised if a parent was missing, and the exists() check was racy).
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)


def _with_utt_id(split):
    """Return a copy of ``split`` with an ``id`` column, sorted by that column.

    The id is the base name of ``audio_path`` with everything from the first
    ".wav" onwards removed. ``assign`` builds a new DataFrame instead of
    writing a column into the object returned by ``train_test_split``, which
    avoids pandas' SettingWithCopyWarning.

    Args:
        split: DataFrame with an ``audio_path`` column.

    Returns:
        pandas.DataFrame: New frame with the ``id`` column, ordered by ``id``.
    """
    utt_ids = split["audio_path"].apply(lambda x: os.path.basename(x).split(".wav")[0])
    return split.assign(id=utt_ids).sort_values("id")


train_data = _with_utt_id(train_data)
test_data = _with_utt_id(test_data)

In [52]:
def create_text_file(f, first_column, second_column):
    """Write one two-column record to an already opened text file.

    The two values are separated by a tab character and the record is
    terminated by a newline.

    Args:
        f: Writable text file object.
        first_column: Value written before the tab.
        second_column: Value written after the tab.
    """
    f.write("{}\t{}\n".format(first_column, second_column))

# Output files of the training data directory.
wavscp_path = f'{train_dir}/wav.scp'
text_path = f'{train_dir}/text'
spk2utt_path = f'{train_dir}/spk2utt'
utt2spk_path = f'{train_dir}/utt2spk'

# Each file pairs the utterance id with one column of train_data:
#   wav.scp -> audio path, text -> "elsa" column,
#   spk2utt / utt2spk -> the id itself (each id is mapped to itself).
# The rows are written with a plain loop instead of DataFrame.apply(axis=1):
# apply is meant to compute values, builds a throwaway Series of None here,
# and older pandas releases could call the function twice on the first row,
# which would duplicate the first line of every file.
for out_path, column in (
    (wavscp_path, "audio_path"),
    (text_path, "elsa"),
    (spk2utt_path, "id"),
    (utt2spk_path, "id"),
):
    with open(out_path, "w", encoding="utf-8") as f:
        for utt_id, value in zip(train_data["id"], train_data[column]):
            create_text_file(f, utt_id, value)

In [None]:
# create_text_file is already defined in the cell that writes the training
# files; the identical second definition that used to live here was removed
# so the helper has a single source of truth.

# Output files of the test data directory.
wavscp_path = f'{test_dir}/wav.scp'
text_path = f'{test_dir}/text'
spk2utt_path = f'{test_dir}/spk2utt'
utt2spk_path = f'{test_dir}/utt2spk'

# Each file pairs the utterance id with one column of test_data:
#   wav.scp -> audio path, text -> "elsa" column,
#   spk2utt / utt2spk -> the id itself (each id is mapped to itself).
# The rows are written with a plain loop instead of DataFrame.apply(axis=1):
# apply is meant to compute values, builds a throwaway Series of None here,
# and older pandas releases could call the function twice on the first row,
# which would duplicate the first line of every file.
for out_path, column in (
    (wavscp_path, "audio_path"),
    (text_path, "elsa"),
    (spk2utt_path, "id"),
    (utt2spk_path, "id"),
):
    with open(out_path, "w", encoding="utf-8") as f:
        for utt_id, value in zip(test_data["id"], test_data[column]):
            create_text_file(f, utt_id, value)