# Notebook purpose
To establish the need for an accurate model of syllabification, I take a dataset that is popular among linguists (CELEX) and point out a glaring issue with its orthographic syllable data: it is often inconsistent with the phonological syllabification data.

e.g., "ab.dom.i.nal" -> \[ab.daw.mih.nahl\]

To get a sense of how prevalent these errors are, I will manually pick through 3000 words sampled from CELEX and mark whether or not the orthographic and phonological syllabifications align.

In [1]:
# 1
# imports
import pandas as pd
import random
from IPython.display import clear_output
import ipywidgets as widgets
import os

In [2]:
# 2
# Helper function to convert to ipa

def to_ipa(clx_string):
    """Translate a CELEX-encoded phonetic string into IPA symbols.

    Every character of ``clx_string`` is looked up in the conversion table
    below.  A character that has no entry in the table is copied to the
    output unchanged.  Both '.' and '-' are emitted as '.'.

    Returns the converted string (empty input gives an empty string).
    """
    celex_to_ipa = {
        'I': 'ɪ', 'E': 'ɛ', '{': 'æ', 'Q': 'ɑ',
        'V': 'ʌ', 'U': 'ʊ', '$': 'ɔɚ', '3': 'ɚ',
        'i': 'iː', '1': 'eɪ', '5': 'oʊ', 'u': 'uː',
        '2': 'aɪ', '4': 'ɔɪ', '6': 'aʊ', '7': 'ɪɚ',
        '8': 'ɛɚ', '#': 'aɚ', '9': 'ʊɚ', '@': 'ə',
        'p': 'p', 'b': 'b', 't': 't', 'd': 'd', 'k': 'k',
        'g': 'g', 'J': 'tʃ', '_': 'dʒ', 'f': 'f', 'v': 'v',
        'T': 'θ', 'D': 'ð', 's': 's', 'S': 'ʃ', 'z': 'z',
        'Z': 'ʒ', 'h': 'h', 'r': 'ɹ', 'm': 'm', 'n': 'n',
        'N': 'ŋ', 'l': 'l', 'P': 'əl', 'H': 'ən', 'j': 'j',
        'w': 'w', '.': '.', '-': '.',
    }
    # Fall back to the character itself when the table has no mapping.
    return "".join(celex_to_ipa.get(symbol, symbol) for symbol in clx_string)

In [21]:
# 3
# opening and parsing celex file
def get_and_zip(path='data/Celex-sample.txt'):
    """Load the CELEX sample file and return its multi-syllable entries.

    Args:
        path: Location of the ';;'-separated CELEX sample file.  Defaults
            to the path this notebook originally hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        A list of 4-tuples ``(Head, HeadSyl, ipa, SylCnt)``, one per row
        whose ``SylCnt`` is greater than 1.  ``ipa`` is the row's
        ``PhonSylCLX`` value converted to a string and passed through
        ``to_ipa``.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # engine='python' is required because the separator is longer than
        # one character.
        # NOTE(review): comment='#' makes pandas drop everything after a '#'
        # on any line, and '#' is also one of the phonetic symbols handled by
        # to_ipa -- rows whose transcription contains '#' may be truncated
        # here.  Confirm against the data file.
        raw = pd.read_csv(file, comment='#', sep=';;', engine='python')

    # Single-syllable words have nothing to syllabify, so drop them.
    raw = raw[raw['SylCnt'] > 1]

    # str() guards against non-string cells (e.g. missing values).
    ipa = [to_ipa(str(clx)) for clx in raw['PhonSylCLX'].to_list()]

    # Tuples of (ortho, ortho_syl, phon_syl_ipa, syl_count).
    return list(zip(raw['Head'], raw['HeadSyl'], ipa, raw['SylCnt']))

In [None]:
# 4
# the code for testing

# Where the reviewer's verdicts and the progress counter are written.
correct_filename = 'celex_cleanliness_output/correctly_syllabified.txt'
incorrect_filename = 'celex_cleanliness_output/incorrectly_syllabified.txt'
counter_file = 'celex_cleanliness_output/counter.txt'

# Running tallies for the review session; all start at zero.
done = correct = incorrect = 0

# Number of entries to review by hand.
sample_size = 3000

# Seed before shuffling so the same input list is always put into the same
# order, then keep the first `sample_size` entries as the review sample.
zipped = get_and_zip()
random.seed(69)
random.shuffle(zipped)
sample = zipped[:sample_size]

def save_to_file(word, filename):
    """Append one record line to ``filename``, adding a CSV header if needed.

    Args:
        word: The record text to write (already comma-joined by the caller,
            without a trailing newline).
        filename: Path of the output file.  Its parent directory is created
            if it does not exist yet.

    The header line is written when the file is new or empty, and is
    terminated with a newline so the first record starts on its own line.
    """
    header = "word,ortho_syl,phon_syl,syl_count"

    # open(..., 'a') fails if the directory is missing, so create it first.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    needs_header = (not os.path.exists(filename)
                    or os.path.getsize(filename) == 0)

    with open(filename, 'a', encoding='utf-8') as file:
        if needs_header:
            file.write(header + "\n")
        file.write(f"{word}\n")
        

In [58]:
def save_counter(count, filename, encoding='utf-8'):
    """Overwrite ``filename`` with the text form of ``count``.

    Args:
        count: The progress value to persist; written as ``str(count)``.
        filename: Path of the counter file.  Any previous content is replaced.
        encoding: Text encoding used for the file (now actually passed to
            ``open``; previously the parameter was accepted but ignored).
    """
    with open(filename, 'w', encoding=encoding) as file:
        file.write(str(count))

In [60]:
# 5 start loop
print("Empty enter to approve, any character + enter to deny, q + enter to quit")

# Walk through the not-yet-reviewed part of the sample.  The slice is taken
# once, before the loop starts, so updating `done` inside does not affect it.
for word, ortho_syl, phon_syl, syl_count in sample[done:]:
    print(done)
    print(f"word: {word}")
    print(f"ortho-syl: {ortho_syl}")
    print(f"phon-syl: {phon_syl}")
    response = input("answer: ")
    print(response)

    # Quit: persist how far we got and stop without recording this entry.
    if response == "q":
        save_counter(done, counter_file)
        break

    # An empty answer approves the entry; any other text rejects it.
    approved = response == ""
    if approved:
        correct += 1
    else:
        incorrect += 1
    done += 1

    record = f"{word},{ortho_syl},{phon_syl},{syl_count}"
    save_to_file(record, correct_filename if approved else incorrect_filename)

    # Wipe the cell output so only the next entry is on screen.
    clear_output(wait=False)

In [None]:
# note bioscope is incorrect, but phon I think is incorrect us.kope??
# ave. == avenue, but is edge case, marked incorrect
# chasm -> cha.sm, algorithm -> al.go.ri.thm... many such cases. Incorrect
# adjudge -> ad.judge or a.djudge? marking ad.judge as correct as a rare heterogeneous double consonant (al.lot, dit.to, and bus.tle etc)
# bomber -> [bommer], bom.ber? incorrect, another heterogeneous double consonant
# fire -> fi.re

# x??? axiomatically -> ax. or a.x? incorrect, a.x is proper, allow for non common english onset x to be start of next syllable
# acreage????? edge case, incorrect
# changeableness I think this one the phon is wrong, incorrect

In [None]:
# ACCIDENT ANOPHELOUS AAA
# COIR ACCIDENT AAAA
# astrodome ACCIDENT AAA
# acceptance accident tired

# attributively

In [61]:
# Final tallies: approved count on the first line, rejected on the second.
print(correct, incorrect, sep="\n")

1884
1116
