In [19]:
# Work from the project root: the relative paths used in later cells
# (the metadata CSV and the phone symbol table) are resolved against it.
%cd /data/codes/prep_gopt/

/data/codes/prep_gopt


In [20]:
import pandas as pd
import kaldi_io
import os
from glob import glob
from tqdm import tqdm
import json
import numpy as np
import torchaudio

In [21]:
# Per-utterance metadata table; the first CSV column is consumed as the row index.
# The "id" and "score" columns are the ones read by the score-extraction cell.
path = "data/processed_data/metadata.csv"
metadata = pd.read_csv(filepath_or_buffer=path, index_col=0)
metadata.head(n=5)  # quick preview; 5 rows is also the pandas default

Unnamed: 0,user_id,id,question_content,score
0,52077.0,5580000,STATISTICS,"{""words"": [[""STATISTICS"", ""S T AH0 T IH1 S T I..."
1,88226.0,5580001,SEAT,"{""words"": [[""SEAT"", ""S IY1 T"", 53]], ""phonemes..."
2,199169.0,5580003,FOGGY,"{""words"": [[""FOGGY"", ""F AA1 G IY0"", 80]], ""pho..."
3,237799.0,5580004,SCHOOL,"{""words"": [[""SCHOOL"", ""S K UW1 L"", 69]], ""phon..."
4,88226.0,5580007,SEAT,"{""words"": [[""SEAT"", ""S IY1 T"", 68]], ""phonemes..."


In [22]:
def preprocess_word(words):
    """Expand multi-phone entries into one entry per phone.

    Each item of ``words`` is a sequence whose element 1 is a
    whitespace-separated phone string, whose element 0 is carried along
    unchanged and whose element 2 is the value attached to every expanded
    phone.

    Args:
        words: iterable of ``[label, phones, score, ...]`` sequences.

    Returns:
        list: entries in input order. An item holding exactly one phone is
        appended as-is (the same object); any other item is replaced by one
        ``[item[0], phone, item[2]]`` list per phone. An item whose phone
        string is empty or only whitespace contributes no entries.
    """
    processed_words = []
    for word in words:
        # Tokenise once with the whitespace-agnostic split() so that the
        # single-phone check and the expansion agree; split(" ") would emit
        # empty phones for repeated spaces.
        phones = word[1].split()
        if len(phones) == 1:
            processed_words.append(word)
        else:
            processed_words.extend([word[0], phn, word[2]] for phn in phones)
    return processed_words

In [23]:
# Lookup tables filled by extract_score().
# word2score / phone2score / phone2wordid are keyed by "<utt_id>.<phone index>",
# utt2score is keyed by str(utt_id).
word2score, phone2score, utt2score = {}, {}, {}
phone2wordid = {}

def extract_score(utt_id, score):
    """Parse one utterance's JSON score record into the module-level tables.

    Args:
        utt_id: utterance identifier; its string form is used as the key of
            ``utt2score`` and as the prefix of the per-phone keys.
        score: JSON string with the keys ``"words"``, ``"phonemes"`` and
            ``"utterance"``. ``words`` and ``phonemes`` must have the same
            length; element 1 of each word is a whitespace-separated phone
            string and its last element is stored as the word score.

    Returns:
        None. The function is called for its side effects on ``utt2score``,
        ``word2score``, ``phone2score`` and ``phone2wordid``.

    Raises:
        AssertionError: if the word and phoneme lists differ in length, if
            the utterance or one of its phone keys was already recorded, or
            if a word's phone count does not match its expanded phoneme
            entries.
    """
    record = json.loads(score)
    words = record["words"]
    phonemes = record["phonemes"]

    assert len(words) == len(phonemes)
    # utt2score is keyed by the string form of the id, so the duplicate check
    # has to use that same form (an int id never equals its str key).
    utt_key = str(utt_id)
    assert utt_key not in utt2score
    utt2score[utt_key] = record["utterance"]
    # Running phone position across the whole utterance (not per word).
    index = 0

    for wrd_id, (word, phoneme) in enumerate(zip(words, phonemes)):
        phoneme = preprocess_word(phoneme)
        assert len(word[1].split()) == len(phoneme)
        for phone_entry in phoneme:
            key = f'{utt_id}.{index}'

            assert key not in word2score
            assert key not in phone2score

            # Every phone inherits the score of the word it belongs to.
            word2score[key] = word[-1]
            phone2score[key] = phone_entry[-1]
            phone2wordid[key] = wrd_id

            index += 1
              
# Fill the score lookup tables one metadata row at a time. The call is made
# for its side effects only, so the resulting Series holds nothing but None.
metadata.apply(lambda row: extract_score(row["id"], row["score"]), axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
55095    None
55096    None
55097    None
55098    None
55099    None
Length: 50000, dtype: object

In [24]:
def load_phone_symbol_table(filename):
    """Load a tab-separated ``symbol<TAB>id`` phone table.

    Args:
        filename: path of the table file. Every non-blank line holds a
            symbol, a tab character and an integer id.

    Returns:
        tuple: ``(sym2int, int2sym)`` dictionaries, or ``(None, None)`` when
        ``filename`` is not an existing file.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields, or its id field is not an integer.
    """
    if not os.path.isfile(filename):
        return None, None
    int2sym = {}
    sym2int = {}
    with open(filename, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at the end of
            # the file) instead of failing on the two-field unpacking below.
            if not line.strip():
                continue
            sym, idx = line.strip('\n').split('\t')
            idx = int(idx)
            int2sym[idx] = sym
            sym2int[sym] = idx
    return sym2int, int2sym

# Only the id -> symbol direction is kept; the symbol -> id map is discarded.
# NOTE(review): load_phone_symbol_table returns (None, None) when the file is
# absent, so phone_int2sym can be None here without any error being raised.
path = "egs/gop_speechocean762/s5/data/lang_nosp/phones-pure.txt"
_, phone_int2sym = load_phone_symbol_table(filename=path)

In [29]:
# Switch into the recipe directory: the relative 'gopt_feats' output folder
# written by the save cell is created under the current working directory.
%cd /data/codes/prep_gopt/egs/gop_speechocean762/s5
path = "/data/codes/prep_gopt/egs/gop_speechocean762/s5/exp/gop_test/feat.scp"
keys, features, labels = [], [], []

# Walk over every (key, vector) pair of the scp file and pair each feature
# vector with its phone symbol and the scores collected earlier.
for phn_id, feature in tqdm(kaldi_io.read_vec_flt_scp(path)):
    phn_id = str(phn_id)
    # Everything before the first "." identifies the utterance.
    uut_id = phn_id.partition(".")[0]

    features.append(feature)
    keys.append(phn_id)

    # The first element of the vector is used as the phone id.
    phoneme = phone_int2sym[feature[0]]
    # Row layout: phone symbol, phone score, word score, word index, utterance score.
    labels.append(
        [phoneme]
        + [table[phn_id] for table in (phone2score, word2score, phone2wordid)]
        + [utt2score[uut_id]]
    )

/data/codes/prep_gopt/egs/gop_speechocean762/s5


0it [00:00, ?it/s]

5816it [00:00, 112730.41it/s]


In [30]:
# Create the output folder if it is missing. exist_ok=True replaces the
# separate existence check, so re-running the cell is safe.
os.makedirs('gopt_feats', exist_ok=True)

# Write features, keys and labels as comma-separated text files. Keys are
# strings and label rows mix symbols with scores, hence fmt='%s' for both.
np.savetxt('gopt_feats/te_feats.csv', features, delimiter=',')
np.savetxt('gopt_feats/te_keys.csv', keys, delimiter=',', fmt='%s')
np.savetxt('gopt_feats/te_labels.csv', labels, delimiter=',', fmt='%s')
