# Load test dataset

In [1]:
import json
# Pipeline inputs: one sentence per entry, read from clean_th_lyrics.json.
with open("./dict/clean_th_lyrics.json", encoding="utf8") as f:
    test_input = json.load(f)

print(len(test_input))

930


In [2]:
# Expected outputs: one sentence per entry, read from clean_en_lyrics.json.
with open("./dict/clean_en_lyrics.json", encoding="utf8") as f:
    test_output = json.load(f)

print(len(test_output))

930


# Import modules

In [3]:
from model import sefr_cut
from model.TH2IPA import TH2IPA
from model.IPA2ENG import IPA_matching
from neuspell import BertChecker
import re

data folder is set to `e:\nlp\nlp-env\neuspell\neuspell\../data` script


# Initialize variables

In [4]:
# Instantiate the TH2IPA converter (presumably Thai -> IPA; confirm in model/TH2IPA)
# and the neuspell BERT-based spell checker.
th2ipa = TH2IPA()
checker = BertChecker()

# Load the SEFR_CUT tokenizer model, then the checker's pretrained checkpoint.
# Both calls print loading progress (see the cell output).
sefr_cut.SEFR_CUT.load_model(engine='model')
checker.from_pretrained()

loading model.....
Success
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


In [5]:
# Per-stage accumulators: integration_pipeline appends one entry to each list
# for every sentence it processes.
tokenize_result, th2ipa_result, ipa2eng_result, corrector_result = [], [], [], []

# Pipeline

In [6]:
def integration_pipeline(th_sent, th2ipa, checker):
    """Run one sentence through tokenize -> th2ipa -> IPA_matching -> spell check.

    Each intermediate result is also appended to the matching module-level
    list (tokenize_result, th2ipa_result, ipa2eng_result, corrector_result),
    so every call grows those lists by one entry.

    Args:
        th_sent: Input sentence handed to ``sefr_cut.tokenize``.
        th2ipa: Callable applied to the token list; its result is iterated
            item by item.
        checker: Object exposing ``correct(sentence)``.

    Returns:
        The corrected sentence with every " ' " collapsed to "'".
    """
    # Only the first element of the tokenizer output is used.
    tokens = sefr_cut.tokenize(th_sent, k=80)[0]
    tokenize_result.append(tokens)

    ipa_tokens = th2ipa(tokens)
    th2ipa_result.append(ipa_tokens)

    # Keep the first element of each IPA_matching result and join with spaces.
    english_words = (IPA_matching(ipa)[0] for ipa in ipa_tokens)
    english_sentence = ' '.join(english_words)
    ipa2eng_result.append(english_sentence)

    # Spell-correct, then re-attach apostrophes that are surrounded by spaces.
    corrected = re.sub(' \' ', '\'', checker.correct(english_sentence))
    corrector_result.append(corrected)
    return corrected

In [13]:
def evaluate(all_true, all_pred):
    """Compute position-wise word accuracy of predictions against references.

    Sentences are paired by index, stripped and lower-cased, then split on
    whitespace. Words are compared position by position.

    NOTE: only the overlapping prefix of each sentence pair is scored. Words
    beyond the length of the shorter sentence are neither counted as correct
    nor as incorrect, so missing or extra words do not lower the accuracy.

    Args:
        all_true: Sequence of reference sentences (strings).
        all_pred: Sequence of predicted sentences (strings), indexable and at
            least as long as ``all_true``; extra predictions are ignored.

    Returns:
        Tuple ``(accuracy, all_word, correct_word, incorrect_word)``.
        ``accuracy`` is 0.0 when no words were compared.

    Raises:
        IndexError: If ``all_pred`` has fewer entries than ``all_true``.
    """
    all_word = 0
    correct_word = 0
    incorrect_word = 0
    for idx, raw_true in enumerate(all_true):
        sent_true = raw_true.strip().lower()
        sent_pred = all_pred[idx].strip().lower()

        # Normalise spaced apostrophes in the prediction only,
        # e.g. "it ' s" -> "it's" and "don 't" -> "don't".
        sent_pred = re.sub(" \' ", "\'", sent_pred)
        sent_pred = re.sub(" \'", "\'", sent_pred)

        # zip stops at the shorter sentence, matching the prefix-only scoring.
        for true_word, pred_word in zip(sent_true.split(), sent_pred.split()):
            if true_word == pred_word:
                correct_word += 1
            else:
                incorrect_word += 1
            all_word += 1

    # Guard against division by zero when nothing was compared.
    accuracy = correct_word / all_word if all_word else 0.0
    print('all:', all_word, ' correct:', correct_word, ' incorrect:', incorrect_word)
    print('accuracy:', accuracy)
    return accuracy, all_word, correct_word, incorrect_word

# Test

In [8]:
# Reset the per-stage accumulators so re-running this cell does not append a
# second copy of every result (integration_pipeline appends on each call).
for stage_result in (tokenize_result, th2ipa_result, ipa2eng_result, corrector_result):
    stage_result.clear()

# Run every test sentence through the pipeline; outputs are collected in the
# accumulator lists, so the return value is not needed here.
for th_sent in test_input:
    integration_pipeline(th_sent, th2ipa, checker)



In [9]:
# Preview the first 10 final pipeline outputs (after spell correction).
corrector_result[:10]

['tell her that she beauty full en die day i aw real mild her',
 'i kettle in her as i like dust geneticists',
 'did i say that little animal sewed coming a bass mind',
 'budget we done have deis soul',
 "tease me that you'll love me baby say aw it's aw again",
 "repeat bass overran over huckaba till it's in my brain",
 "until to send you're oh vain i caning",
 'did i say that dateline',
 'animals so coming an about mind',
 'wish i look back']

In [10]:
# Preview the first 10 expected sentences for side-by-side comparison.
test_output[:10]

['tell her that she beautiful every day i remind her',
 'i get lost in her eyes like dust from the skies',
 "did i say that out loud i'm so crazy about mine",
 "but we don't have the same soul",
 'tell me that you love me baby say it again',
 "repeat it over and over until it's in my brain",
 "you need to send your location i can't think",
 'did i say that out loud',
 "i'm so crazy about mine",
 'when i look back']

In [14]:
# Score the pipeline outputs (corrector_result) against the expected sentences
# (test_output) word by word; evaluate also prints the counts and accuracy.
accuracy, all_word, correct_word, incorrect_word = evaluate(test_output, corrector_result)

all: 5965  correct: 2107  incorrect: 3858
accuracy: 0.3532271584241408
