In [1]:
#@title libraries and imports

import pandas
import io
import sys
import re

In [16]:
#@title consonant and vowel definitions

# orthographic inventory of standard Maxaa dialect Somali
CONSONANTS = [
    "'", "b", "t", "j", "x", "kh", "d", "r", "s", "sh", "dh",
    "c", "g", "f", "q", "k", "l", "m", "n", "w", "h", "y",
]

# the consonants written with two letters: "kh", "sh", "dh"
CONS_DIGRAPH = [cons for cons in CONSONANTS if len(cons) == 2]

# vowels: every long vowel is its short counterpart written twice
SHORT_VOWELS = list("aiueo")
LONG_VOWELS = [vowel * 2 for vowel in SHORT_VOWELS]

# diphthongs: every long diphthong is the volatile one with its
# first vowel doubled ("ay" -> "aay", "ow" -> "oow", ...)
VOLATILE_DIPHTH = ["ay", "aw", "ey", "oy", "ow"]
LONG_DIPHTH = [diph[0] + diph for diph in VOLATILE_DIPHTH]

# vowel shorthands built from the lists above
DIPHTHONGS = VOLATILE_DIPHTH + LONG_DIPHTH
VOWELS_BAR_DIPHTH = SHORT_VOWELS + LONG_VOWELS
VOWELS_INCL_DIPHTH = VOWELS_BAR_DIPHTH + DIPHTHONGS

# placeholder used in place of a numeric length
UNKNOWN_LENGTH = "?"

In [25]:
#@title split_into_syllables

# name:        split_into_syllables
#
# inputs:      a word
#
# return:      a list of the syllables in word
#
# description: create a list of syllables from a given word
#              in standard Maxaa dialect Somali
#
# notes:

def split_into_syllables(word):
    """Split a single word into a list of syllable strings.

    The word is scanned left to right. Each step takes an optional onset
    (one consonant letter or digraph from CONSONANTS) followed by an
    optional nucleus (a short or long vowel from VOWELS_BAR_DIPHTH) and
    emits the two together as one chunk. A consonant that is not followed
    by a vowel therefore comes out as a chunk of its own; the chunk list
    is finally handed to correct_codas, which attaches such consonants to
    the preceding syllable.

    Args:
        word: the word to split (str). The inventories are lowercase, so
            callers should lowercase the word first.

    Returns:
        The list of syllables (list of str); empty for an empty word.
    """
    syllables = []
    word_length = len(word)
    i = 0

    while i < word_length:
        current_syllable = ""

        # --- onset ---
        if i == word_length - 1:
            # the final character is always taken on its own, whatever it is
            current_syllable += word[i]
            i += 1
        elif word[i:i+2] in CONSONANTS:
            # digraph consonant ("kh", "sh", "dh")
            current_syllable += word[i:i+2]
            i += 2
        elif word[i] in CONSONANTS:
            current_syllable += word[i]
            i += 1
        elif word[i] not in VOWELS_BAR_DIPHTH:
            # Character outside both inventories (e.g. "p", "v", "z" or an
            # uppercase letter). Consume it like an onset consonant so the
            # scan always advances; leaving it in place would loop forever.
            current_syllable += word[i]
            i += 1

        # --- nucleus ---
        # Try the two-letter (long) vowel first. Near the end of the word
        # the slice may be a single letter, which then matches a short vowel.
        if word[i:i+2] in VOWELS_BAR_DIPHTH:
            current_syllable += word[i:i+2]
            i += 2
        elif word[i:i+1] in VOWELS_BAR_DIPHTH:
            current_syllable += word[i]
            i += 1

        # every pass consumes at least one character, so the chunk is
        # never empty here
        syllables.append(current_syllable)

    # coda consonants were emitted as chunks of their own and trail
    # behind their syllable; merge them back and return the merged list
    return correct_codas(syllables)


In [18]:
#@title correct_codas

# name:           correct_codas
#
# inputs:         a list of syllables that may contain lone consonants
#
# return:         the corrected list of syllables
#
# description:    picks up trailing consonants and attaches them
#                 to the previous syllable as its coda

def correct_codas(syllabified_word):
    """Attach lone consonants to the preceding syllable as its coda.

    Every element that is exactly one entry of CONSONANTS (a single
    letter or a digraph) is appended to the element before it. Several
    lone consonants in a row are all folded into the same preceding
    element in a single pass. A lone consonant at the very start of the
    list has nothing before it and is left where it is.

    Args:
        syllabified_word: list of syllable strings. The list is updated
            in place, so callers holding a reference see the result.

    Returns:
        The same list object, after merging.
    """
    merged = []

    for syllable in syllabified_word:
        if syllable in CONSONANTS and merged:
            # lone consonant with a syllable before it: becomes its coda
            merged[-1] += syllable
        else:
            merged.append(syllable)

    # Build the result separately and copy it back afterwards: removing
    # items from a list while iterating over it skips the following item.
    syllabified_word[:] = merged
    return syllabified_word

In [19]:
#@title parser

# name:           parser
#
# inputs:         string
#
# return:         properly syllabified string
#
# description:    take any string in standard Maxaa-dialect Somali and
#                 parse it into a list of syllables
#

def parser(line):
    """Turn a line of text into one flat list of syllables.

    The line is lowercased, every character other than an ASCII letter
    or a space is replaced by a space, and the result is split on
    whitespace. Each word is syllabified with split_into_syllables and
    the syllables of all words are concatenated in order.

    Args:
        line: the text to parse (str).

    Returns:
        List of syllable strings for the whole line.
    """
    # anything that is not a letter or a space acts as a word separator
    cleaned = re.sub('[^a-zA-Z ]', ' ', line.lower())

    parsed_line = []
    for word in cleaned.split():
        parsed_line.extend(split_into_syllables(word))

    # one more coda pass over the whole line picks up any lone
    # consonants that the per-word pass left behind
    return correct_codas(parsed_line)

In [20]:
#@title count_morae

# name:           count_morae
#
# inputs:         list of syllables
#
# return:         list of their moraic length
#
# description:    take a list of syllables and return their
#                 length (short: 1) or (long: 2)
#
# note:           volatile diphthongs unimplemented;
#                 rendered as '?' since they can vary in
#                 length depending on several factors

def count_morae(parsed_line):
    """Map each syllable to its moraic length.

    A syllable is classified by the first inventory, in the order below,
    that has one of its entries occurring as a substring of the syllable:

        long diphthong      -> 2
        volatile diphthong  -> UNKNOWN_LENGTH
        long vowel          -> 2
        short vowel         -> 1

    A syllable that contains none of these contributes no entry, so the
    result can be shorter than the input.

    Args:
        parsed_line: list of syllable strings.

    Returns:
        List of lengths (1, 2 or UNKNOWN_LENGTH).
    """
    # Order matters: a long diphthong also contains a volatile diphthong
    # and a long vowel, and everything contains a short vowel.
    length_by_inventory = (
        (LONG_DIPHTH, 2),
        # TODO: implement function to identify diphthong lengths
        (VOLATILE_DIPHTH, UNKNOWN_LENGTH),
        (LONG_VOWELS, 2),
        (SHORT_VOWELS, 1),
    )

    morae_list = []
    for syl in parsed_line:
        for inventory, length in length_by_inventory:
            if any(sound in syl for sound in inventory):
                morae_list.append(length)
                break

    return morae_list


In [21]:
#@title sum_morae

# name:           sum_morae
#
# inputs:         list of morae
#
# return:         number of unknown morae and sum of the list
#
# description:    sums the lengths of the syllables in a line
#                 and identifies the number of unknown syllables
#
def sum_morae(morae_list):
    """Total the known syllable lengths and count the unknown ones.

    Args:
        morae_list: list of lengths; entries equal to UNKNOWN_LENGTH are
            counted instead of added.

    Returns:
        Tuple (number of unknown entries, sum of all other entries).
    """
    lengths = list(morae_list)
    unknown_count = lengths.count(UNKNOWN_LENGTH)
    known_total = sum(length for length in lengths if length != UNKNOWN_LENGTH)
    return unknown_count, known_total


In [26]:
#@title main

# read one line of text from the user
line = input("Enter one line: ")

# syllabify the line, measure each syllable, then total the lengths
list_of_syllables = parser(line)
list_of_morae = count_morae(list_of_syllables)
unknown, known = sum_morae(list_of_morae)

# report: the syllables with their lengths underneath ...
print("\n")
print(list_of_syllables)
print(list_of_morae)

# ... followed by the totals
print("\n")
print("known length: ", known)
print("unknown syllables: ", unknown)

Enter one line: aw awr caw caws hello qaamuus bar iyo badh hayay


['aw', 'awr', 'caw', 'caws', 'hel', 'lo', 'qaa', 'muus', 'bar', 'i', 'yo', 'badh', 'ha', 'yay']
['?', '?', '?', '?', 1, 1, 2, 2, 1, 1, 1, 1, 1, '?']


known length:  11
unknown syllables:  5
