In [18]:
from hyphenate import hyphenate_word
from model import sp_syllabler
import pickle
from nltk.metrics.distance import edit_distance
import pandas as pd
from hyphenate import hyphenate_word
from tensorflow.keras.preprocessing.sequence import pad_sequences
import time
import random
import numpy as np
import syllables
import pyphen

In [10]:
# 1
# prepping and calling functions

def get_probability(y_val):
    num_pos = 0
    num_neg = 0
    total = 0
    for word in y_val:
        for c in word:
            if c == 2:
                num_pos += 1
                total += 1
            elif c == 1:
                num_neg += 1
                total += 1
    return float(num_pos)/float(total)

def calc_brier(attempted, probability):
    total = 0
    sum_brier = 0
    for word in attempted:
        for c in attempted:
            total += 1
            if c == 2:
                sum_brier += (probability - 1)**2
            elif c == 1:
                sum_brier += (probability - 0)**2
    return (1./total)*(sum_brier)

def calc_f1(attempted, true):
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    correct_num_char = 0
    total_checked = 0
    for i in range(0, len(attempted)):
        total_checked += 1
        if (len(attempted[i]) == len(true[i])):
            correct_num_char += 1
            for j in range(0, len(attempted[i])):
                if(attempted[i][j] == true[i][j]):
                    if true[i][j] == 1:
                        true_neg += 1
                    elif true[i][j] == 2:
                        true_pos += 1
                else:
                    if true[i][j] == 1:
                        false_pos += 1
                    elif true[i][j] == 2:
                        false_neg += 1
    
    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos + false_neg)
    f_one = 2/((1/precision)+(1/recall))
    
    return total_checked, correct_num_char, true_pos, true_neg, false_pos, false_neg, precision, recall, f_one

def convert_to_hot(syl_word):
    hot = []
    i = 0
    while  i < len(syl_word):
        if i == len(syl_word) - 1:
            hot += [1]
            return hot
        if syl_word[i+1] == '-':
            hot += [2]
            i += 2
        else:
            hot += [1]
            i += 1
    return hot

def to_categorical(sequences):
        cat_sequences = []
        for s in sequences:
            cats = []
            for item in s:
                cats.append(np.zeros(3))
                cats[-1][item] = 1.0
            cat_sequences.append(cats)
        return np.array(cat_sequences)

def data_prep(random_seed):
    random.seed(random_seed)
    training_data_size = 20000
    e2i_vocab_ortho = []

    x_tr_ortho = []
    y_tr = []

    x_val_ortho = []
    y_val = []

    orig_file = open('data/post_clean.txt')

    orig_data = orig_file.readlines()
    orig_file.close()
    orig_data = [line.strip('\n') for line in orig_data]
    random.shuffle(orig_data)
    data_eng = [line.split(';')[0].lower() for line in orig_data]
    data_syl = [line.split(';')[1].lower() for line in orig_data]
    y_tr = []

    for line in data_eng:
        for c in line:
            if c not in e2i_vocab_ortho:
                e2i_vocab_ortho += [c]

    e2i_ortho = dict((a,i) for i,a in enumerate(e2i_vocab_ortho, 1))

    for line in data_eng[:training_data_size]:
        converted = []
        for c in line:
            converted += [e2i_ortho[c]]
        x_tr_ortho += [converted]

    x_tr_ortho = pad_sequences(x_tr_ortho, maxlen=20, padding='post')

    for line in data_syl[:training_data_size]:
        y_tr += [convert_to_hot(line)]

    y_tr = pad_sequences(y_tr, maxlen=20, padding='post')
    
    x_val_ortho = []
    y_val = []

    for line in data_eng[training_data_size:]:
        converted = []
        for c in line:
            converted += [e2i_ortho[c]]
        x_val_ortho += [converted]

    x_val_ortho = pad_sequences(x_val_ortho, maxlen=20, padding='post')

    for line in data_syl[training_data_size:]:
        y_val += [convert_to_hot(line)]

    y_val = pad_sequences(y_val, maxlen=20, padding='post')
    return x_tr_ortho, y_tr, x_val_ortho, y_val, e2i_ortho

def training_split(x_tr_ortho, y_tr):
    y_tr = to_categorical(y_tr)
    split_index = int(.8 * len(x_tr_ortho))

    x_test_ortho = x_tr_ortho[split_index:]
    y_test = y_tr[split_index:]

    x_tr_ortho = x_tr_ortho[:split_index]
    y_tr = y_tr[:split_index]
    return x_tr_ortho, x_test_ortho, y_tr, y_test

def train_model(sp, x_tr_ortho, y_tr, x_test_ortho, y_test, run_id):
    sp.fit(x_tr_ortho, y_tr, x_test_ortho, y_test, ep=70, batch_size=128, save_filename="eval_runs/%i_single_pen_best_weights.h5"%run_id, verbose=0)
    
def sp_attempts(sp, x_val_ortho):
    attempts = []
    for i in range(0, len(x_val_ortho)):
        attempts += [sp.raw_syllabify(x_val_ortho[i])]
        print(i, end='\r')
    attempts_stripped = []
    for x in attempts:
        attempts_stripped += [[i for i in x if i !=0]]
    return attempts_stripped

def back_to_eng(x_val_ortho, e2i_ortho):
    converted_back_to_eng = []
    for x in x_val_ortho:
        real_word = ""
        for i in x:
                if i != 0:
                    real_word += list(e2i_ortho.keys())[list(e2i_ortho.values()).index(i)]
        converted_back_to_eng += [real_word]
    return converted_back_to_eng

def insert_and_rehot(sp, attempts, converted_back_to_eng):
    eng_conv_attempts = []
    for i in range(0, len(attempts)):
        eng_conv_attempts += [sp.insert_syl(converted_back_to_eng[i], attempts[i])]
    rehot_attempts = []
    for word in eng_conv_attempts:
        rehot_attempts += [convert_to_hot(word)]
    return rehot_attempts

def hyphenator_run(converted_back_to_eng):
    liang_attempts = []
    for word in converted_back_to_eng:
        liang_attempts += ['-'.join(hyphenate_word(word))]

    liang_attempts_hot_encoded = [convert_to_hot(word) for word in liang_attempts]
    return liang_attempts_hot_encoded


def inconsistency_grab(sp, attempts, reals, converted_back_to_eng, run_id, sp_hyph):
    filename = 'final_evaluation/'+ sp_hyph + '_run_%i_incorrect_syls.txt'%run_id
    sum_lev = 0
    incorrect_counter = 0
    incorrect_syl_count = 0
    file = open(filename, 'w+', encoding='utf-8')
    file.write('Attempt' + '\t' + 'Real' + '\n')
    for i in range(0, len(attempts)):
        if attempts[i] != reals[i]:
            incorrect_counter += 1
            a = sp.insert_syl(converted_back_to_eng[i], attempts[i])
            r = sp.insert_syl(converted_back_to_eng[i], reals[i])
            if len(a.split('-')) != len(r.split('-')):
                incorrect_syl_count += 1
            sum_lev += edit_distance(a,r,substitution_cost=1, transpositions=True)
            file.write(a + '\t' + r + '\n')
            
    syllable_accuracy = ((len(attempts) - incorrect_syl_count)/len(attempts))
    av_lev_dist = (sum_lev/incorrect_counter)
    file.write("Words with errors: %i"%incorrect_counter +'\n')
    file.write("Words with incorrect number of syllables: %i"%incorrect_syl_count +'\n')
    file.write("Total evluated: %i"%len(attempts) +'\n')
    file.write("Perfect accuracy: %.2f"%((len(attempts) - incorrect_counter)/len(attempts)) +'\n')
    file.write("Number of syllables accuracy: %.2f"%((len(attempts) - incorrect_syl_count)/len(attempts)) +'\n')
    file.write("Average Levenshtein Distance(across incorrect words): %.2f"%(sum_lev/incorrect_counter) +'\n')
    file.close()
    return syllable_accuracy, av_lev_dist

def strip_y_val(y_val):
    reals = []
    for x in y_val:
        reals += [[i for i in x if i !=0]]
    return reals

In [11]:
# 3
# full run

# instantiating a dataframe for recording run data
df = pd.DataFrame(columns=['run_id', 'random_seed','val_sample_size','training_time_seconds' ,'y_val_syl_prob', 'sp_f1_score', 'sp_precision', 'sp_recall','sp_brier_score','sp_syllable_accuracy','sp_average_lev', 'hyph_f1_score','hyph_precision', 'hyph_recall', 'hyph_brier_score','hyph_syllable_accuracy','hyph_average_lev'])
sp = None

for i in range(1,11):
    print("run: %i"%i)
    random_seed = int(time.time())
    run_stats = {'run_id':i, 'random_seed':random_seed,'val_sample_size':'null','training_time_seconds':'null', 'y_val_syl_prob':'null', 'sp_f1_score':'null', 'sp_precision':'null', 'sp_recall':'null','sp_brier_score':'null','sp_syllable_accuracy':'null','sp_average_lev':'null', 'hyph_f1_score':'null','hyph_precision':'null', 'hyph_recall':'null', 'hyph_brier_score':'null','hyph_syllable_accuracy':'null','hyph_average_lev':'null'}
    
    x_tr_ortho, y_tr, x_val_ortho, y_val, e2i_ortho = data_prep(random_seed=random_seed)
    
    run_stats['val_sample_size'] = len(x_val_ortho)
    
    x_tr_ortho, x_test_ortho, y_tr, y_test = training_split(x_tr_ortho=x_tr_ortho, y_tr=y_tr)
    
    del sp
    sp = sp_syllabler(e2i_ortho= e2i_ortho, ortho_input_size=20,latent_dim=32,embed_dim=32 ,max_feat=36)
    
    start = time.time()
    print("Begin sp training, run: %i"%i)
    train_model(sp, x_tr_ortho, y_tr, x_test_ortho, y_test, run_id=i)
    print("End sp training, run: %i"%i)
    end = time.time()
    run_stats['training_time_seconds'] = end - start
    
    print("Begin sp attempts, run: %i"%i)
    sp_attempts_array = sp_attempts(sp, x_val_ortho)
    print("Completed sp attempts, run: %i"%i)
    
    converted_back_to_eng = back_to_eng(x_val_ortho, e2i_ortho)
    
    sp_rehot_attempts = insert_and_rehot(sp, sp_attempts_array, converted_back_to_eng)
    
    liang_attempts_hot_encoded = hyphenator_run(converted_back_to_eng)
    
    syl_prob = get_probability(y_val)
    
    run_stats['y_val_syl_prob'] = syl_prob
    
    reals = strip_y_val(y_val)
    
    sp_total_checked, sp_correct_num_char, sp_true_pos, sp_true_neg, sp_false_pos, sp_false_neg, sp_precision, sp_recall, sp_f_one = calc_f1(sp_rehot_attempts, reals)
    sp_brier = calc_brier(sp_rehot_attempts, syl_prob)
    
    run_stats['sp_f1_score'] = sp_f_one
    run_stats['sp_precision'] = sp_precision
    run_stats['sp_recall'] = sp_recall
    run_stats['sp_brier_score'] = sp_brier
    
    hyph_total_checked, hyph_correct_num_char, hyph_true_pos, hyph_true_neg, hyph_false_pos, hyph_false_neg, hyph_precision, hyph_recall, hyph_f_one = calc_f1(liang_attempts_hot_encoded, reals)
    hyph_brier = calc_brier(liang_attempts_hot_encoded, syl_prob)
    
    run_stats['hyph_f1_score'] = hyph_f_one
    run_stats['hyph_precision'] = hyph_precision
    run_stats['hyph_recall'] = hyph_recall
    run_stats['hyph_brier_score'] = hyph_brier
    
    sp_syllable_accuracy, sp_av_lev_dist = inconsistency_grab(sp, sp_rehot_attempts, reals, converted_back_to_eng, run_id=i, sp_hyph='sp')
    hyph_syllable_accuracy, hyph_av_lev_dist = inconsistency_grab(sp, liang_attempts_hot_encoded, reals, converted_back_to_eng, run_id=i, sp_hyph='hyph')
    
    run_stats['sp_syllable_accuracy'] = sp_syllable_accuracy
    run_stats['sp_average_lev'] = sp_av_lev_dist
    
    run_stats['hyph_syllable_accuracy'] = hyph_syllable_accuracy
    run_stats['hyph_average_lev'] = hyph_av_lev_dist
    
    df = df.append(run_stats,ignore_index=True)
    df.to_csv("final_evaluation/run_%i_liang_sp_comparison.csv"%i, sep=',', index=False, encoding='utf-8')
    display(df)

df.to_csv('final_evaluation/total_liang_sp_comparison.csv', sep=',', index=False, encoding='utf-8')
display(df)

run: 1
Begin sp training, run: 1
Epoch 68: early stopping
End sp training, run: 1
Begin sp attempts, run: 1
Completed sp attempts, run: 1


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936


run: 2
Begin sp training, run: 2
Epoch 60: early stopping
End sp training, run: 2
Begin sp attempts, run: 2
Completed sp attempts, run: 2


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302


run: 3
Begin sp training, run: 3
Epoch 65: early stopping
End sp training, run: 3
Begin sp attempts, run: 3
Completed sp attempts, run: 3


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641


run: 4
Begin sp training, run: 4
Epoch 53: early stopping
End sp training, run: 4
Begin sp attempts, run: 4
Completed sp attempts, run: 4


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273


run: 5
Begin sp training, run: 5
Epoch 60: early stopping
End sp training, run: 5
Begin sp attempts, run: 5
Completed sp attempts, run: 5


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757


run: 6
Begin sp training, run: 6
Epoch 65: early stopping
End sp training, run: 6
Begin sp attempts, run: 6
Completed sp attempts, run: 6


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974


run: 7
Begin sp training, run: 7
Epoch 55: early stopping
End sp training, run: 7
Begin sp attempts, run: 7
Completed sp attempts, run: 7


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974
6,7.0,1682299000.0,6500.0,2698.272454,0.20729,0.902475,0.90819,0.896831,0.0,0.896,1.155823,0.878911,0.968524,0.804477,0.0,0.727385,1.194861


run: 8
Begin sp training, run: 8
Epoch 64: early stopping
End sp training, run: 8
Begin sp attempts, run: 8
Completed sp attempts, run: 8


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974
6,7.0,1682299000.0,6500.0,2698.272454,0.20729,0.902475,0.90819,0.896831,0.0,0.896,1.155823,0.878911,0.968524,0.804477,0.0,0.727385,1.194861
7,8.0,1682303000.0,6500.0,3656.232938,0.207923,0.91038,0.905398,0.915416,0.0,0.913231,1.157143,0.883987,0.970404,0.811703,0.0,0.733077,1.17908


run: 9
Begin sp training, run: 9
Epoch 63: early stopping
End sp training, run: 9
Begin sp attempts, run: 9
Completed sp attempts, run: 9


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974
6,7.0,1682299000.0,6500.0,2698.272454,0.20729,0.902475,0.90819,0.896831,0.0,0.896,1.155823,0.878911,0.968524,0.804477,0.0,0.727385,1.194861
7,8.0,1682303000.0,6500.0,3656.232938,0.207923,0.91038,0.905398,0.915416,0.0,0.913231,1.157143,0.883987,0.970404,0.811703,0.0,0.733077,1.17908
8,9.0,1682307000.0,6500.0,4295.067349,0.207451,0.906906,0.8963,0.917766,0.0,0.897385,1.154225,0.884436,0.966087,0.815511,0.0,0.738923,1.196885


run: 10
Begin sp training, run: 10
Epoch 63: early stopping
End sp training, run: 10
Begin sp attempts, run: 10
Completed sp attempts, run: 10


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974
6,7.0,1682299000.0,6500.0,2698.272454,0.20729,0.902475,0.90819,0.896831,0.0,0.896,1.155823,0.878911,0.968524,0.804477,0.0,0.727385,1.194861
7,8.0,1682303000.0,6500.0,3656.232938,0.207923,0.91038,0.905398,0.915416,0.0,0.913231,1.157143,0.883987,0.970404,0.811703,0.0,0.733077,1.17908
8,9.0,1682307000.0,6500.0,4295.067349,0.207451,0.906906,0.8963,0.917766,0.0,0.897385,1.154225,0.884436,0.966087,0.815511,0.0,0.738923,1.196885
9,10.0,1682312000.0,6500.0,4829.122139,0.209478,0.907635,0.903954,0.911347,0.0,0.899231,1.139669,0.878699,0.966128,0.80578,0.0,0.723538,1.18439


Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974
6,7.0,1682299000.0,6500.0,2698.272454,0.20729,0.902475,0.90819,0.896831,0.0,0.896,1.155823,0.878911,0.968524,0.804477,0.0,0.727385,1.194861
7,8.0,1682303000.0,6500.0,3656.232938,0.207923,0.91038,0.905398,0.915416,0.0,0.913231,1.157143,0.883987,0.970404,0.811703,0.0,0.733077,1.17908
8,9.0,1682307000.0,6500.0,4295.067349,0.207451,0.906906,0.8963,0.917766,0.0,0.897385,1.154225,0.884436,0.966087,0.815511,0.0,0.738923,1.196885
9,10.0,1682312000.0,6500.0,4829.122139,0.209478,0.907635,0.903954,0.911347,0.0,0.899231,1.139669,0.878699,0.966128,0.80578,0.0,0.723538,1.18439


In [17]:
#cross comparing syllable accuracy with syllables

syllables_results = []

for random_seed in df['random_seed']:
    true_num_syl = []
    x_tr_ortho, y_tr, x_val_ortho, y_val, e2i_ortho = data_prep(random_seed=random_seed)
    converted_back_to_eng = back_to_eng(x_val_ortho, e2i_ortho)
    for x in y_val:
        num_syls = 1
        for c in x:
            if c == 2:
                num_syls += 1
        true_num_syl += [num_syls]
    syl_attempt = []
    for word in converted_back_to_eng:
        syl_attempt += [syllables.estimate(word)]
    
    correct_syls = 0
    for i in range(0, len(syl_attempt)):
        if syl_attempt[i] == true_num_syl[i]:
            correct_syls += 1
    syllables_results += [float(correct_syls)/float(len(syl_attempt))]
    
df['syllable_module_syl_count_accuracy'] = syllables_results
display(df)
df.to_csv('final_evaluation/total_liang_syllables_sp_comparison.csv', sep=',', index=False, encoding='utf-8')

Unnamed: 0,run_id,random_seed,val_sample_size,training_time_seconds,y_val_syl_prob,sp_f1_score,sp_precision,sp_recall,sp_brier_score,sp_syllable_accuracy,sp_average_lev,hyph_f1_score,hyph_precision,hyph_recall,hyph_brier_score,hyph_syllable_accuracy,hyph_average_lev,syllable_module_syl_count_accuracy
0,1.0,1682287000.0,6500.0,690.2681,0.208572,0.915718,0.912705,0.91875,0.0,0.905692,1.159526,0.883243,0.972418,0.809049,0.0,0.727077,1.178936,0.736
1,2.0,1682289000.0,6500.0,912.439743,0.207663,0.909626,0.906282,0.912995,0.0,0.898769,1.141886,0.887075,0.972707,0.815301,0.0,0.740462,1.174302,0.739692
2,3.0,1682290000.0,6500.0,1101.075163,0.208328,0.9095,0.914337,0.904713,0.0,0.9,1.170837,0.882126,0.970378,0.808588,0.0,0.73,1.192641,0.735692
3,4.0,1682292000.0,6500.0,1263.763281,0.207434,0.905639,0.905555,0.905724,0.0,0.896462,1.14402,0.881325,0.968096,0.808829,0.0,0.729077,1.188273,0.731231
4,5.0,1682294000.0,6500.0,1820.057594,0.208466,0.903487,0.895525,0.911592,0.0,0.902462,1.190633,0.881564,0.968083,0.80924,0.0,0.730923,1.196757,0.731538
5,6.0,1682296000.0,6500.0,2404.748408,0.210763,0.908757,0.907216,0.910304,0.0,0.894769,1.151888,0.880718,0.968343,0.807636,0.0,0.723231,1.189974,0.739538
6,7.0,1682299000.0,6500.0,2698.272454,0.20729,0.902475,0.90819,0.896831,0.0,0.896,1.155823,0.878911,0.968524,0.804477,0.0,0.727385,1.194861,0.735692
7,8.0,1682303000.0,6500.0,3656.232938,0.207923,0.91038,0.905398,0.915416,0.0,0.913231,1.157143,0.883987,0.970404,0.811703,0.0,0.733077,1.17908,0.734
8,9.0,1682307000.0,6500.0,4295.067349,0.207451,0.906906,0.8963,0.917766,0.0,0.897385,1.154225,0.884436,0.966087,0.815511,0.0,0.738923,1.196885,0.731846
9,10.0,1682312000.0,6500.0,4829.122139,0.209478,0.907635,0.903954,0.911347,0.0,0.899231,1.139669,0.878699,0.966128,0.80578,0.0,0.723538,1.18439,0.731538
