# Notebook purpose
In order to establish a need for an accurate model of syllabification, I identify a popular dataset for linguists (CELEX), and I notice a glaring issue with their orthographic syllable data: it is often inconsistent with the phonologic syllabification data.

e.g., "ab.dom.i.nal" -> \[ab.daw.mih.nahl\]

To get a sense of how prevalent these errors are, I will manually pick through 3000 words sampled from CELEX and mark whether or not the orthographic and phonological syllabifications align.

In [1]:
# 1
# imports
import pandas as pd
import random
from IPython.display import clear_output
import ipywidgets as widgets
import os

In [2]:
# 2
# Helper function to convert to ipa

def to_ipa(clx_string):
    c2i = {'I': 'ɪ', 'E': 'ɛ', '{': 'æ', 'Q': 'ɑ',
           'V': 'ʌ', 'U': 'ʊ', '$': 'ɔɚ', '3': 'ɚ',
           'i': 'iː', '1': 'eɪ', '5': 'oʊ', 'u': 'uː',
           '2': 'aɪ', '4': 'ɔɪ', '6': 'aʊ', '7': 'ɪɚ',
           '8': 'ɛɚ', '#': 'aɚ', '9': 'ʊɚ', '@': 'ə',
           'p': 'p', 'b': 'b', 't': 't', 'd': 'd', 'k': 'k',
           'g': 'g', 'J': 'tʃ', '_': 'dʒ', 'f': 'f', 'v': 'v',
           'T': 'θ', 'D': 'ð', 's': 's', 'S': 'ʃ', 'z': 'z',
           'Z': 'ʒ', 'h': 'h', 'r': 'ɹ', 'm': 'm', 'n': 'n',
           'N': 'ŋ', 'l': 'l', 'P': 'əl', 'H': 'ən', 'j': 'j',
           'w': 'w', '.':'.', '-':'.'}
    res = ""
    for c in clx_string:
        if c in c2i:
            res += c2i[c]
        else:
            res += c
    return res

In [3]:
# 3
# opening and parsing celex file
with open('data/Celex-sample.txt', 'r', encoding='utf-8') as file:
    raw = pd.read_csv(file, comment='#', sep=';;', engine='python')
    raw = raw[raw['SylCnt'] > 1]

# sort into tuples of (ortho, ortho_syl, phon_syl_CLX)
ipa = [to_ipa(str(clx)) for clx in raw['PhonSylCLX'].to_list()]
zipped = list(zip(raw['Head'], raw['HeadSyl'], ipa, raw['SylCnt']))

In [9]:
# 4
# the code for testing

sample_size = 3000
done = 0
correct = 0
incorrect = 0

random.seed(69)
random.shuffle(zipped)
sample = zipped[:sample_size]

correct_filename = 'celex_cleanliness_output/correctly_syllabified.txt'
incorrect_filename = 'celex_cleanliness_output/incorrectly_syllabified.txt'
counter_file = 'celex_cleanliness_output/counter.txt'

def save_to_file(word, filename):
    mode = 'a' if os.path.exists(filename) else 'w+'
    with open(filename, mode, encoding='utf-8') as file:
        if mode == 'w+':
            file.write("word,ortho_syl,phon_syl,syl_count")
        file.write(f"{word}\n")
        
def save_counter(count, filename, encoding='utf-8'):
    # save as a text file? lol
    with open(filename, mode, 'w+') as file:
        file.write(count)

In [10]:
# 5 start loop
print("Empty enter to approve, any character + enter to deny, q + enter to quit")

for s in sample[done:]:
    print(done)
    print(f"word: {sample[0]}")
    print(f"ortho-syl: {sample[1]}")
    print(f"phon-syl: {sample[2]}")
    response = input("answer: ")
    print(response)
    if response == "":
        correct += 1
        done += 1
        save_to_file(f"{sample[0]},{sample[1]},{sample[2]},{sample[3]}", correct_filename)
    elif response == "q":
        save_counter(done, counter_file)
    else:
        incorrect += 1
        done += 1
        save_to_file(f"{sample[0]},{sample[1]},{sample[2]},{sample[3]}", incorrect_filename)
    clear_output(wait=True)

Empty enter to approve, any character + enter to deny, q + enter to quit
1
word: ('among', 'a-mong', 'ə.mʌŋ', 2.0)
ortho-syl: ('awkwardness', 'awk-ward-ness', 'ɔɚ.kwəd.nɪs', 3.0)
phon-syl: ('amazon', 'am-a-zon', 'æ.mə.zən', 3.0)


answer:   


 


NameError: name 'incorrect' is not defined