<a href="https://colab.research.google.com/github/ayub-nur/somali-syllable-and-meter/blob/main/Somali_Syllabifcation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

current errors:

infinate loop on enteries that don't conform to expected Somali syllable structure. (ex: "puntland")


In [1]:
#@title libraries and imports

import pandas
import io
import sys
import re

In [2]:
#@title consonant and vowel definitions

# orthographic inventory of standard Maxaa dialect Somali
CONSONANTS         = ["'", "b", "t", "j", "x", "kh", "d", "r", "s", "sh", "dh",
                      "c", "g", "f", "q", "k", "l", "m", "n", "w", "h", "y"]

CONS_DIGRAPH       = ["kh", "sh", "dh"]

# defining all vowel variations
SHORT_VOWELS       = ["a", "i", "u", "e", "o"]
LONG_VOWELS        = ["aa", "ii", "uu", "ee", "oo"]

VOLATILE_DIPHTH    = ["ay",   "aw",   "ey", "oy",  "ow"]
LONG_DIPHTH        = ["aay", "aaw", "eey", "ooy", "oow"]

# defining vowel short hands
DIPHTHONGS         = VOLATILE_DIPHTH + LONG_DIPHTH
VOWELS_BAR_DIPHTH  = SHORT_VOWELS + LONG_VOWELS

VOWELS_INCL_DIPHTH = VOWELS_BAR_DIPHTH + DIPHTHONGS

UNKNOWN_LENGTH     = "?"

In [3]:
#@title valid word

def is_valid_word(word):
    if not isalpha(word):
        return false



In [4]:
#@title parser

# name:           parser
#
# inputs:         string
#
# return:         properly syllabified string
#
# description:    take any string in standard Maxaa-dialect Somali and
#                 parse it into a list of syllables
#

def parser(line):
    line = line.lower()

    # TODO: fix error where ' is deleted

    # remove all non-alphabet characters
    regex = re.compile("[^a-zA-Z' ]")        # needs to accomodate '
    line = regex.sub(' ', line)             # breaks non-alpha into more words

    word_arr = line.split()

    parsed_line = []
    for word in word_arr:                   # O(n)
        syllables = split_into_syllables(word)
        parsed_line += syllables

    # unsure why I need to correct here?
    parsed_line = correct_codas(parsed_line)
    return parsed_line

In [5]:
#@title split_into_syllables

# name:        split_into_syllables
#
# inputs:      a word
#
# return:      a list of the syllables in word
#
# description: create a list of syllables from a given word
#              in standard Maxaa dialect Somali
#
# notes:

# TODO:
# solve infinite loop for cluster of >2 consonants

def split_into_syllables(word):

    syllables = []
    current_syllable = ""
    i = 0

    index_last_char = len(word)

    while i < len(word):                            # O(n)

            # check against indexing out of range
            if (i == index_last_char - 1):
                    current_syllable += word[i]
                    i += 1

            # Check if current character is a consonant
            elif word[i:i+2] in CONSONANTS:
                current_syllable += word[i:i+2]
                i += 2

            elif word[i] in CONSONANTS:
                current_syllable += word[i]
                i += 1


            # Check if current character is a vowel
            if word[i:i+3] in VOWELS_BAR_DIPHTH:

            # If the current syllable is not empty
            # and the next character is a vowel
            # add the current syllable to the list and reset it
                if ((current_syllable)  and
                    (i + 3 < len(word)) and
                    (word[i+3] in VOWELS_BAR_DIPHTH)):

                    syllables.append(current_syllable)
                    current_syllable = ""
                current_syllable += word[i:i+3]
                i += 3

            elif word[i:i+2] in VOWELS_BAR_DIPHTH:
                current_syllable += word[i:i+2]
                i += 2

            elif (i != index_last_char):
                if (word[i] in VOWELS_BAR_DIPHTH):
                    current_syllable += word[i]
                    i += 1

          # Add the current syllable to the list and reset
            if current_syllable:
                syllables.append(current_syllable)
                current_syllable = ""

    # correct error coda consonants are incorrectly
    # indexed on their own and trail behind

    corrected_syllables = correct_codas(syllables)  # is this even needed?

    return syllables


In [6]:
#@title correct_codas

# name:           correct_codas
#
# inputs:         a list of syllables that
#
# return:         the corrected list of syllables
#
# description:    picks up trailing consonants and attaches them
#                 to the previous syllable as its coda

def correct_codas(syllabified_word):

    counter = 0
    new_word = ""

    for i in syllabified_word:                      # O(n)

        # TODO: assert against 0th index
        if i in CONSONANTS:
            new_word = syllabified_word[counter - 1] + i
            syllabified_word[counter - 1] = new_word

            syllabified_word.remove(i)
            new_word = ""

        counter += 1
    return syllabified_word

In [19]:
#@title count_morae

# name:           count_morae
#
# inputs:         list of syllables
#
# return:         list of their moraic length
#
# description:    take a list of syllables and return their
#                 length (short: 1) or (long: 2)
#
# note:           diphthongs unimplemented
#                 rendered as '?' since they can very in
#                 length depending on several factors

def count_morae(parsed_line):
    morae_list = []
    index = 0
    for syl in parsed_line:
        # if LONG_DIPHTH in syl:
        if any(s in syl for s in LONG_DIPHTH):
            morae_list.append(2)


        # if VOLATILE_DIPHTH in syl:
        elif any(s in syl for s in VOLATILE_DIPHTH):

          # TODO: implement function to identify diphthong lengths
            morae_list.append(UNKNOWN_LENGTH)


        # if LONG_VOWELS in syl:
        elif any(s in syl for s in LONG_VOWELS):
            morae_list.append(2)


        # if SHORT_VOWELS in syl:
        elif any(s in syl for s in SHORT_VOWELS):
            morae_list.append(1)

    return morae_list

# Updated count_morae function to deterministically assign meter values

# def count_morae(parsed_line):
#     morae_list = []
#     for syl in parsed_line:
#         if any(s in syl for s in LONG_DIPHTH):
#             morae_list.append(2)
#         elif any(s in syl for s in VOLATILE_DIPHTH):
#             # Default to 1 for volatile diphthongs unless specific conditions are met for 2
#             morae_list.append(1)  # Update this line based on specific rules if needed
#         elif any(s in syl for s in LONG_VOWELS):
#             morae_list.append(2)
#         elif any(s in syl for s in SHORT_VOWELS):
#             morae_list.append(1)
#     return morae_list

# # Test the updated function with the given line
# updated_morae_list = updated_count_morae(list_of_syllables)
# list_of_syllables, updated_morae_list




In [8]:
#@title sum_morae

# name:           sum_morae
#
# inputs:         list of morae
#
# return:         number of unknown morae and sum of the list
#
# description:    sums the length of syllables in a line
#                 and indentify number of unknown syllables
#
def sum_morae(morae_list):
    sum = 0
    unknown_morae = 0
    for mora in morae_list:
        if mora == UNKNOWN_LENGTH:
            unknown_morae += 1
        else:
            sum += mora
    return unknown_morae, sum


In [9]:
#@title has_onset

# name:           has_onset
#
# inputs:         a syllable
#
# return:         true if syllable has a consonant onset, false othewise
#
# description:    checks if a syllable has a consonant onset
#
# note:           prints error if onset undetermined

def has_onset(syllable):

    syl_len = len(syllable)
    if syl_len in [0, 1]:
        return False

    if syl_len >= 3:
        if (syllable[0] in CONSONANTS or
           (syllable[:1]) in CONS_DIGRAPH):
           return True

    elif syl_len == 2:
        if (syllable[0] in CONSONANTS):
            return True

    print("Error: ", syllable, " onset not deterimined")
    return False

In [10]:
#@title has_coda

# name:           has_coda
#
# inputs:         a syllable
#
# return:         true if syllable has a consonant coda, false othewise
#
# description:    checks if a syllable has a consonant coda
#
# note:           prints error if coda undetermined

def has_coda(syllable):
    syl_len = len(syllable)

    if syl_len in [0, 1]:
        return False

    if syl_len > 2 and syllable[-2:-1] in VOLATILE_DIPHTH:
        return False

    elif syllable[-1] in CONSONANTS:
        return True

    elif syl_len > 2 and syllable[-2:-1] in CONS_DIGRAPH:
        return True

    print("Error: ", syllable, " coda not deterimined")
    return False

In [11]:
#@title variations
# For the template [2, 2, 1, 2, 2] these are the valid variations:
misaan1 = [[2, 2, 1, 2, 2], [1, 1, 2, 1, 2, 2], [2, 1, 1, 1, 2, 2], [2, 2, 1, 1, 1, 2], [2, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 2, 2], [1, 1, 2, 1, 1, 1, 2], [1, 1, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 2, 2], [2, 1, 1, 1, 1, 1, 2], [2, 1, 1, 1, 2, 1, 1], [1, 1, 2, 1, 1, 1, 2], [2, 1, 1, 1, 1, 1, 2], [2, 2, 1, 1, 1, 1, 1], [1, 1, 2, 1, 2, 1, 1], [2, 1, 1, 1, 2, 1, 1], [2, 2, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 2, 1, 1], [1, 1, 2, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 2, 1, 1], [2, 1, 1, 1, 1, 1, 1, 1], [1, 1, 2, 1, 1, 1, 1, 1], [2, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]

# For the template [1, 2, 2, 1, 2, 2] these are the valid variations:
misaan2 = [[1, 2, 2, 1, 2, 2], [1, 1, 1, 2, 1, 2, 2], [1, 2, 1, 1, 1, 2, 2], [1, 2, 2, 1, 1, 1, 2], [1, 2, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 1, 2, 2], [1, 1, 1, 2, 1, 1, 1, 2], [1, 1, 1, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 1, 2, 2], [1, 2, 1, 1, 1, 1, 1, 2], [1, 2, 1, 1, 1, 2, 1, 1], [1, 1, 1, 2, 1, 1, 1, 2], [1, 2, 1, 1, 1, 1, 1, 2], [1, 2, 2, 1, 1, 1, 1, 1], [1, 1, 1, 2, 1, 2, 1, 1], [1, 2, 1, 1, 1, 2, 1, 1], [1, 2, 2, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 2, 1, 1], [1, 1, 1, 2, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 2, 1, 1], [1, 2, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 2, 1, 1, 1, 1, 1], [1, 2, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

# For the template [2, 1, 2, 1, 2, 2] these are the valid variations:
misaan3 = [[2, 1, 2, 1, 2, 2], [1, 1, 1, 2, 1, 2, 2], [2, 1, 1, 1, 1, 2, 2], [2, 1, 2, 1, 1, 1, 2], [2, 1, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 1, 2, 2], [1, 1, 1, 2, 1, 1, 1, 2], [1, 1, 1, 2, 1, 2, 1, 1], [1, 1, 1, 1, 1, 1, 2, 2], [2, 1, 1, 1, 1, 1, 1, 2], [2, 1, 1, 1, 1, 2, 1, 1], [1, 1, 1, 2, 1, 1, 1, 2], [2, 1, 1, 1, 1, 1, 1, 2], [2, 1, 2, 1, 1, 1, 1, 1], [1, 1, 1, 2, 1, 2, 1, 1], [2, 1, 1, 1, 1, 2, 1, 1], [2, 1, 2, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 2, 1, 1], [1, 1, 1, 2, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 2, 1, 1], [2, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 2, 1, 1, 1, 1, 1], [2, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

valid_misaan = misaan1 + misaan2 + misaan3

In [20]:
#@title main

# line = input("Enter one line: ")
line = "Guun soo jiraan nahay"

list_of_syllables = parser(line)
print("\n")
print(list_of_syllables)

list_of_morae = count_morae(list_of_syllables)
print(list_of_morae)

unknown, known = sum_morae(list_of_morae)

print("\n")
print("known length:      ", known)
print("unknown syllables: ", unknown)



['guun', 'soo', 'ji', 'raan', 'na', 'hay']
[2, 2, 1, 2, 1, '?']


known length:       8
unknown syllables:  1


In [21]:
def find_best_template_match(line, templates):
    parsed_syllables = parser(line)
    possible_meters =  count_morae(parsed_syllables)

    # Generate all possible combinations of meter values for the given line
    def generate_combinations(possible_meters):
        if UNKNOWN_LENGTH not in possible_meters:
            return [possible_meters]
        index = possible_meters.index(UNKNOWN_LENGTH)
        without_unknown = possible_meters[:index] + possible_meters[index + 1:]
        return (generate_combinations([1] + without_unknown) +
                generate_combinations([2] + without_unknown))

    all_combinations = generate_combinations(possible_meters)

    # Find the combination that matches one of the templates
    for combination in all_combinations:
        for template in templates:
            if combination == template or all(elem in combination for elem in template):
                return combination

    return None

# Example usage:
line = "Guun soo jiraan nahay"
templates = valid_misaan


best_match = find_best_template_match(line, templates)
print("Best Template Match:", best_match)


Best Template Match: [1, 2, 2, 1, 2, 1]
