In [None]:
import pandas as pd
import librosa
import pickle
import json

In [None]:
# Read the raw test-set metadata; the first CSV column becomes the index.
path = "/data/codes/prep_ps_pykaldi/prep_data/raw/info_in_domain_long_sentence_testset_old.csv"
metadata = pd.read_csv(path, index_col=0)

# The "score" column is stored as JSON text; decode every cell in place.
metadata["score"] = metadata["score"].map(json.loads)
metadata.head()

In [None]:
wav_dir = "/data/codes/prep_ps_pykaldi/prep_data/wav"


def is_valid(id):
    """Return True when ``{wav_dir}/{id}.wav`` can be decoded by librosa.

    Args:
        id: Recording identifier, used as the file stem under ``wav_dir``.

    Returns:
        bool: True if ``librosa.load`` (resampling to 16 kHz) succeeds,
        False if it raises for any reason.
    """
    wav_path = f'{wav_dir}/{id}.wav'

    try:
        librosa.load(wav_path, sr=16000)
    except Exception:
        # Any load failure marks the recording as invalid.  Catching Exception
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, so a long validation pass stays interruptible.
        return False
    return True


temp = metadata.id.apply(is_valid)
# Keep only rows whose audio decodes.  ``astype(bool)`` keeps the mask boolean
# even for an empty frame, and ``.copy()`` detaches the result from the original
# frame so that later column assignments operate on an independent DataFrame
# instead of a slice (avoids SettingWithCopy issues).
metadata = metadata[temp.astype(bool)].copy()

In [None]:
def _expand_phones(word):
    """Split phone entries whose "trans" holds several whitespace-separated symbols.

    Args:
        word: List of phone dicts; each dict has at least "trans" and "arpa".

    Returns:
        list: New list in which every multi-symbol phone is replaced by one
        copy per (trans, arpa) pair, with "trans"/"arpa" overwritten by the
        single symbols and all other keys copied unchanged.  Single-symbol
        phones are passed through as the same dict object.
    """
    expanded = []
    for phone in word:
        trans_parts = phone["trans"].split()
        if len(trans_parts) > 1:
            # zip() stops at the shorter of the two symbol lists.
            for trans, arpa in zip(trans_parts, phone["arpa"].split()):
                expanded.append({**phone, "trans": trans, "arpa": arpa})
        else:
            expanded.append(phone)
    return expanded


metadata["word_ids"] = None
for index in metadata.index:
    # The score dict is mutated in place, so the frame sees the expanded phones.
    score = metadata.at[index, "score"]
    word_ids = []
    for word_id, word in enumerate(score["phonemes"]):
        expanded_phone = _expand_phones(word)
        score["phonemes"][word_id] = expanded_phone
        # One word id per *expanded* phone, so word_ids has exactly as many
        # entries as the flattened phoneme list.  Appending once per original
        # phone left word_ids shorter than the phone list whenever a phone was
        # split, and made the result change when the cell was run a second time.
        word_ids.extend([word_id] * len(expanded_phone))

    # .at writes the list into the frame itself; the chained form
    # metadata["word_ids"][index] = ... may write to a temporary copy.
    metadata.at[index, "word_ids"] = word_ids

In [None]:
# Flatten every metadata row into one plain-dict sample.
data = []
for index in metadata.index:
    score = metadata.at[index, "score"]
    words = score["words"]
    # All phone dicts of the utterance, in word order.
    phones = [phone for word in score["phonemes"] for phone in word]

    data.append({
        "uid": str(metadata.at[index, "user_id"]),
        "id": str(metadata.at[index, "id"]),
        "qid": str(metadata.at[index, "question_id"]),
        "text": metadata.at[index, "question_content"],
        # Word-level "arpa" strings joined, then re-split on whitespace.
        "arpas": " ".join(word["arpa"] for word in words).split(),
        "phone_scores": [int(phone["score"]) for phone in phones],
        "word_ids": metadata.at[index, "word_ids"],
        "trans": [phone["trans"] for phone in phones],
        "word_scores": [int(word["score"]) for word in words],
        "utterance_scores": score["utterance"],
    })

In [None]:
# Dump the samples as JSON Lines: one JSON object per line.
path = "/data/codes/prep_ps_pykaldi/prep_data/jsonl_v1/info_in_domain_long_sentence_testset_old.jsonl"
with open(path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(sample) + "\n" for sample in data)

### Prepare data

In [None]:
import json
import pandas as pd

In [None]:
# Load the JSON Lines file back: each line is parsed as one JSON record.
path = "/data/codes/prep_ps_pykaldi/prep_data/jsonl_v1/info_in_domain_long_sentence_testset_old.jsonl"
with open(path, "r", encoding="utf-8") as f:
    lines = []
    for raw_line in f:
        lines.append(json.loads(raw_line.strip()))

df = pd.DataFrame(lines)
df.head()

In [None]:
# Export a pipe-separated "id|text" listing with no header row and no index column.
df[["id", "text"]].to_csv(
    "/data/codes/prep_ps_pykaldi/prep_data/info_in_domain_long_sentence_testset_old.csv",
    sep="|",
    index=False,
    header=False,
)

### Copy audio

In [None]:
import pandas as pd
import torchaudio
import librosa
import shutil
import json
import os

from pandarallel import pandarallel
# Enable Series.parallel_apply with 8 worker processes and a progress bar.
pandarallel.initialize(
    nb_workers=8,
    progress_bar=True,
)

In [None]:
audio_dir = "/data/codes/prep_ps_pykaldi/prep_data/wav"
# Stems of the .wav files present in audio_dir, as a set for O(1) membership
# tests.  os.path.splitext strips only the final extension, so an id that
# itself contains a dot is kept intact (split(".")[0] truncated it at the first
# dot), and entries that are not .wav files are no longer counted as audio.
audio_files = {
    stem
    for stem, ext in map(os.path.splitext, os.listdir(audio_dir))
    if ext == ".wav"
}

In [None]:
# Load the headerless, pipe-separated "id|text" listing.
# NOTE(review): this file name has no "_old" suffix, unlike the CSV written by
# the export cell earlier in the notebook; confirm this is the intended input.
path = "/data/codes/prep_ps_pykaldi/prep_data/info_in_domain_long_sentence_testset.csv"
df = pd.read_csv(path, sep="|", header=None, names=["id", "text"])
df.head()

In [None]:
# Number of ids whose audio is already listed in audio_files.  A vectorized
# isin() gives the same count as the per-row lambda without starting worker
# processes for a simple set-membership test.
df.id.astype(str).isin(audio_files).sum()

In [None]:
in_dir = "/data/audio_data/prep_submission_audio/10"
out_dir = audio_dir


def copy_audio(id):
    """Check whether ``{in_dir}/{id}.wav`` is eligible to be copied to ``out_dir``.

    The copy itself (``shutil.copy``) is commented out, so as written this only
    reports eligibility and does not touch ``out_dir``.

    Args:
        id: Recording identifier, used as the file stem under ``in_dir``.

    Returns:
        bool: False when the file cannot be loaded or its sample rate is
        8000 Hz; True otherwise.
    """
    path = f'{in_dir}/{id}.wav'
    try:
        # Only the sample rate is needed; the waveform is discarded.
        _, sr = torchaudio.load(path)
    except Exception:
        # A missing or unreadable file is reported as a failure for this id
        # instead of aborting the whole parallel_apply run.
        return False
    if sr == 8000:
        return False
    # shutil.copy(src=path, dst=out_dir)
    return True


is_success = df.id.parallel_apply(copy_audio)

In [None]:
# Rows for which copy_audio returned False.
df.loc[~is_success]