# Tagalog Part of Speech Tagger

## Initialization of Data Sets

### Source Data Set

In [128]:
import pandas as pd

# Opening the file
sample_tl_raw = open("src/text data/alpha_test.txt", encoding='utf-8').read()

# Splitting the raw data into sentences
parsed_sp_tl_raw = sample_tl_raw.split("\n")

dict_sm_tl = pd.DataFrame(parsed_sp_tl_raw, columns = ['Sentence'])

# Printing the first 5 rows of the DataFrame
dict_sm_tl.head()

Unnamed: 0,Sentence
0,GENESIS
1,Nilalang ng Dios ang sanglibutan.
2,Nang pasimula ay nilikha ng Dios ang langit at...
3,At ang lupa ay walang anyo at walang laman; at...
4,At sinabi ng Dios Magkaroon ng liwanag; at nag...


### Determiners Data Set

In [129]:
noun_dtmn_list = ["ang", "ng", "mga", "si", "ay", "ni", "sa", "y"] # Noun Determiners

adv_dtmn_list = ["nang"]

prepo_dtmn_list = ["sa", "nasa", "sumasa"]

adv_time_list = ['mamaya', 'ngayon', 'kahapon', 'bukas', 'pagkatapos', 'ngayong gabi', 'sa ngayon', 'kagabi', 'itong umaga', 'susunod na linggo', 'na', 'kamakailan lamang', 'kani-kanina lamang', 'maaga', 'kaagad', 'pa rin', 'pa', 'nakaraan']

adv_place_list = ['dito', 'doon', 'sa dako roon', 'sa lahat ng dako', 'kahit saan', 'wala kahit saan', 'tahanan', 'malayo', 'palabas']

adv_manner_list = ['tunay', 'lubos', 'medyo', 'mabilis', 'mabuti', 'mahirap', 'dahan-dahan', 'parang hindi', 'bahagya', 'halos lahat', 'halos', 'walang pasubali', 'sama-sama', 'nag-iisa']

adv_freq_list = ['lagi', 'madalas', 'karaniwan', 'kung minsan', 'paminsan-minsan', 'bihira', 'madalang', 'hindi kailanman'] 

adj_quantity_list = ['isang', 'kaunti', 'marami', 'maraming','ilan', 'i-ilan', 'ilang' ,'magkano', 'bahagi', 'buo']

adj_quality_list = ['masama', 'malinis', 'madilim', 'mahirap', 'marumi', 'tuyo', 'madali', 'walang laman', 'mahal', 'mabilis', 'dayuhan', 'puno', 'mabuti', 'mahirap', 'mabigat', 'mura', 'liwanag', 'lokal', 'bago', 'maingay', 'luma', 'malakas', 'tahimik', 'tama', 'mabagal', 'malambot', 'tunay', 'mahina', 'basa', 'mali', 'maling', 'bata']

adj_taste_list = ['mapait', 'sariwa', 'maalat', 'maasim', 'maanghang', 'matamis']

adj_shape_list = ['pabilog', 'tuwid', 'parisukat', 'tryanggulo']

adj_size_list = ['mahaba', 'malalim', 'makitid', 'maliit', 'matangkad', 'makapal', 'manipis', 'malawak', 'malalaking']

adj_color_list = ['itim', 'asul', 'kayumanggi', 'kulay-abo', 'berde', 'kahel', 'lila', 'pula','puti', 'dilaw']

### Verb Dictionary Data Set

In [130]:
verb_dict = pd.read_json('src/json data/verb_dict.json')

verb_dict.head()

Unnamed: 0,Salitang-ugat,Pangnagdaan,Pangkasalukuyan,Panghinaharap,Pawatas,Katatapos
0,abot,nagabot,nagaabot,magaabot,magabot,kaaabot
1,abutin,inabot,inaabot,aabutin,abutin,
2,agaw,nagagaw,nagaagaw,magaagaw,magagaw,kaaagaw
3,agawin,inagaw,inaagaw,aagawin,agawin,
4,akyat,umakyat,umaakyat,aakyat,umakyat,kaaakyat


### Adjective Dictionary Data Set

In [131]:
adj_dict = pd.read_json('src/json data/adj_dict.json')

adj_dict.head()

Unnamed: 0,Adjective
0,abot
1,abotdinig
2,abotsigaw
3,abottanaw
4,aburido


### Affixes Data Set

In [132]:
""" 
    Affixes
"""
PREFIX_SET = [
    'nakikipag', 'pakikipag',
    'pinakama', 'pagpapa',
    'pinagka', 'panganga',
    'makapag', 'nakapag',
    'tagapag', 'makipag',
    'nakipag', 'tigapag',
    'pakiki', 'magpa',
    'napaka', 'pinaka',
    'ipinag', 'pagka',
    'pinag', 'mapag',
    'mapa', 'taga',
    'ipag', 'tiga',
    'pala', 'pina',
    'pang', 'naka',
    'nang', 'mang',
    'sing', 'ma', # 'ma' is a prefix in Tagalog for Adjectives, Adverbs, and Verbs
    'ipa', 'pam',
    'pan', 'pag',
    'tag', 'mai',
    'mag', 'nam',
    'nag', 'man',
    'may', 
    'na', 'ni',
    'pa', 'ka',
    'um', 'in',
    'i', 'nagpa', 
    'magka', 'nagka',
    'ini'    
]

Adj_Prefix = [
    'ma'
]

INFIX_SET = [
    'um', 'in',
]

SUFFIX_SET = [
    'syon','dor',
    'ita', 'han',
    'hin', 'ing',
    'ang', 'ng',
    'an', 'in',
    'g',
]

PREPO_SET = [
    'gitna',            #removed "sumasa", transferred to prepo_dtmn_list since it is often placed before prepositions
    'ibabaw', 'ilalim',
    'itaas', 'ibaba', 
    'baba', 'taas',
    'harap', 'likod', 
    'labas', 'loob',
    'pagitan', 'unahan', 
    'dulo', 'tabi', 'yan'
]

CONJ_SET = [
    'at', 'bali', 
    'dahil', 'datapwat', 
    'habang', 'kahit', 
    'kapag', 'kasi', 
    'kaso', 'kaya', 
    'kaysa', 'nang',
    'na', 'ngunit', 
    'ni',  'o', 
    'para', 'pati', 
    'pero', 'porket', 
    'saka', 'samantala', 
    'subalit', 'tsaka', 
    'tuwing', 'upang',
    'imbes' 
]

ADV_SET = [
    'rin', 'din', 'ring', 'ding'
]

PER_PRONOUN = [
    'ako', 'ikaw', 'siya', 'kami', 'kayo', 'sila',
    'akong', 'siyang', 'kaming', 'kayong', 'silang'
    'ko', 'akin', 'sakin', 'amin', 'atin', 'inyo',
    'kong', 'inyong', 'ating', 'saking', 'aming', 'aking',
    'kata', 'mo', 'kanila', 'kanya', 'namin', 'natin',
    'katang', 'mong', 'kanilang', 'kanyang', 'kaniyang',
    'ninyo', 'niya', 'kayoy', 'ikay', 'akoy', 'siyay', 'kamiy',
    'ninyong', 'niyang', 'mare', 'pare', 'kumpare', 'kumare'
    'silay', 'inyoy', 'kanilay', 'kanyay', 'niyay',
    'tayo', 'ka'
]


### Other Sets

In [133]:
vowels = ['a', 'e', 'i', 'o', 'u']

## Cleaning the Data

### Removing Punctuation/s

In [134]:
import string

def remove_punct(pText):
    text_nopunct = "".join([char for char in pText if char not in string.punctuation])
    return text_nopunct

cleaned_sp_tl = [remove_punct(word) for word in parsed_sp_tl_raw]

### Tokenizing

In [135]:
import re

def tokenize(text):
    tokens = re.split('\W+', text.lower())
    
    for token in tokens:
        if token == '':
            tokens.remove(token)
    
    return tokens


tokenized_sp_tl = [tokenize(word) for word in cleaned_sp_tl]

dict_sm_tl['Tokenized'] = tokenized_sp_tl
dict_sm_tl.head()

Unnamed: 0,Sentence,Tokenized
0,GENESIS,[genesis]
1,Nilalang ng Dios ang sanglibutan.,"[nilalang, ng, dios, ang, sanglibutan]"
2,Nang pasimula ay nilikha ng Dios ang langit at...,"[nang, pasimula, ay, nilikha, ng, dios, ang, l..."
3,At ang lupa ay walang anyo at walang laman; at...,"[at, ang, lupa, ay, walang, anyo, at, walang, ..."
4,At sinabi ng Dios Magkaroon ng liwanag; at nag...,"[at, sinabi, ng, dios, magkaroon, ng, liwanag,..."


## Viterbi Algorithm

### Determiner Checker

In [136]:
def isDtmn(word):
    """
    This function checks if the specific word in the sentence is a determiner, and extracts it.
    """
    if word in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + adv_time_list):
        ans = True
    else:
        ans = False

    return ans


### Verb Affixer Checker

In [137]:
def check_verb_affixes(word, prev2_word, prev_word, next_word, isTagged, hasVerbAffixes):
    """
    This function checks if the specific word in the sentence has an affix, and extracts it.
    """
    for prefix in PREFIX_SET:
        if word.startswith(prefix) and not isTagged:
            if word.startswith("mag") or word.startswith("nag"):
                if  word[3:5] == word[5:7] and not isTagged:
                    """
                    verbs starting with "mag" or "nag" always repeat the next 4 letters of the word 
                    e.g. maglalakad, maglalaro, magbibihis | naglalakad, naglalaro, nagbibihis
                    issue: magkakampi,
                    """
                    hasVerbAffixes = True
                    isTagged = True

                if word[3] in (vowels):
                    """
                    verbs starting with "mag" and if the next letter is a vowel, the vowel is repeated 
                    e.g. magiikot, magaayos, maguusap | nagiikot, nag-aayos, nag-uusap
                    """
                    if word[3] == word[4] and not isTagged:
                        hasVerbAffixes = True
                        isTagged = True
                        
                if (word.startswith("magka") or word.startswith("nagka")) and not isTagged:
                    """
                    verbs starting with "magka" or "nagka"  
                    e.g. magkaroon, magkasama, magkasundo (usually r,s, or vowels)
                    issue: magkapatid, magkatyempo
                    """
                    hasVerbAffixes = True
                    isTagged = True
                    
            else:
                hasVerbAffixes = True
                isTagged = True
                
    for infix in INFIX_SET:
        if word.__contains__(infix) and not isTagged:
            hasVerbAffixes = True
            isTagged = True
            
    for suffix in SUFFIX_SET:
        """
        words ending with 'ang' are adverbs and after the adverbs are the nouns 
        """
        if word.endswith(suffix) and not isTagged and not word.endswith("ang") and not prev_word.endswith("ang"):
            hasVerbAffixes = True
            isTagged = True

    if len(word) >= 4:
        if word[:2] == word[2:4] and not isTagged:
            """
            if the first four characters of a word is repeated, then it is a verb
            """
            hasVerbAffixes = True
            isTagged = True
    
    return hasVerbAffixes
# end of check_verb_affixes()

### Verb Checker


In [138]:
def isVerb(word, prev_word, next_word, hasVerbAffixes):
    """
    This function tags if the specific word in the sentence is a verb, and extracts it.
    """
    isDone = False
    isVerb = False
    
    if word in  verb_dict and not isDone:
        """If word is in the verb dictionary and is not done, is a verb"""
        isVerb = True
        isDone = True
    
    if word not in (PREPO_SET + PER_PRONOUN + CONJ_SET + ADV_SET):
        if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list): 
            if next_word in (noun_dtmn_list): 
                """
                if the previous word is not in the noun, adverb, and preposition determiner and 
                the next word is a noun determiner
                eg. !(sayaw ng bata)
                """
                if hasVerbAffixes:
                    """
                    if the current word has a verb affix/es, then it is a verb
                    """
                    isVerb = True
                    isDone = True
            
            if next_word in PER_PRONOUN and not isDone:
                """
                if the next word is a personal pronoun
                eg. sayaw ka
                issue: if the next word is a personal pronoun, it is not always a verb
                eg. bastos ka
                """
                isVerb = True
                isDone = True

        if prev_word == "ay" and hasVerbAffixes and not isDone:
            if next_word in ("ng", "sa", "nang", "na") or next_word is None:
                """
                if the previous word is 'ay' and the next word is 'ng' or 'sa', then it is a verb
                eg. ay naglalakad na bata | ay naglalakad
                isse: ay nanay
                """
                isVerb = True
                isDone = True
                
        if prev_word == 'na' and hasVerbAffixes and not isDone:
            """
            if the previous word is 'na' and the current word has a verb affix/es, then it is a verb
            eg. na naglakad
            issue: na mabait
            """
            if word.startswith("ma") and len(word) == 5:
                if word[4] in vowels:
                    isVerb = False
                    isAdj = True
                    isDone = True
            if word.startswith("nag"):
               isVerb = True
               isDone = True 
            if word.startswith("mag"):
               isVerb = True
               isDone = True 

        if not isDone:
        #if word and not isDone:
            if word[:5] in ("magpa", "nagka") or word[:4] in ("napa", "naka") or word[:3] in ("nag"):
            # if hasAffixes and not isDone:
                """
                if the first five characters of a word start with "magpa" or "nagka" of "pagkla", then it is a verb
                eg. magpapakain, nagkakasakit
                """
                isVerb = True
                isDone = True
            if word[:3] in ("mag"):
                if next_word in (PER_PRONOUN, "sa", "ni", "nang"):
                    """
                    if the first three characters of a word start with "mag", then it is a verb
                    eg. mag-ayos ka
                    """
                    isVerb = True
                    isDone = True
            if word[:3] in ("nag"):
                if next_word in (PER_PRONOUN, "sa", "ni", "nang"):
                    """
                    if the first three characters of a word start with "mag", then it is a verb
                    eg. nag-ayos ka
                    """
                    isVerb = True
                    isDone = True
    
        if hasVerbAffixes and prev_word == None and not isDone:
            if next_word in PER_PRONOUN or (next_word in noun_dtmn_list and next_word not in ('ng', 'mga')):
                """
                Isinulat niya
                """
                isVerb = True
                isDone = True
                
        # The Algorithm Below is for the words that are not tagged yet
        for verb_su in verb_dict['Salitang-ugat']:
            """
            for every verb in the verb dictionary salitang-ugat
            """
            if word == verb_su and not isDone:
                """
                if the current word is in the verb dictionary salitang-ugat, then it is a verb
                """
                isVerb = True
                isDone = True
        
        for verb_pn in verb_dict['Pangnagdaan']:
            """
            for every verb in the verb dictionary Pangnagdaan
            """
            if word == verb_pn and not isDone:
                """
                if the current word is in the verb dictionary Pangnagdaan, then it is a verb
                """
                isVerb = True
                isDone = True
                
        for verb_pk in verb_dict['Pangkasalukuyan']:
            """
            for every verb in the verb dictionary Pangkasalukuyan
            """
            if word == verb_pk and not isDone:
                """
                if the current word is in the verb dictionary Pangkasalukuyan, then it is a verb
                """
                isVerb = True
                isDone = True
                
        for verb_ph in verb_dict['Panghinaharap']:
            """
            for every verb in the verb dictionary Panghinaharap
            """
            if word == verb_ph and not isDone:
                """
                if the current word is in the verb dictionary Panghinaharap, then it is a verb
                """
                isVerb = True
                isDone = True
                
        for verb_pw in verb_dict['Pawatas']:
            """
            for every verb in the verb dictionary Pawatas
            """
            if word == verb_pw and not isDone:
                """
                if the current word is in the verb dictionary Pawatas, then it is a verb
                """
                isVerb = True
                isDone = True
                
        for verb_kt in verb_dict['Katatapos']:
            """
            for every verb in the verb dictionary Katatapos
            """
            if word == verb_kt and not isDone:
                """
                if the current word is in the verb dictionary Katatapos, then it is a verb
                """
                isVerb = True
                isDone = True
            
    return isVerb
# end of function

### Noun Checker

In [139]:
def isNoun(word, prev_word, prev2_word, next_word, next2_word):
    """
    This function tags if the specific word in the sentence is a noun, and extracts it.
    """
    isDone = False
    isNoun = False
    adj_prefix = ["ika", "pinaka", "pang"]
    adj_suffix = ["ng"]


    if word[:1].isupper() and prev_word[:1].islower() and not isDone:
        """IF f the word starts with an uppercase letter and has something behind it 
            Issue: if the first word for the sentence is capital and is a noun        
        """ 
        isNoun= True
        isDone= True
     
     
    if prev_word in (noun_dtmn_list) and word not in noun_dtmn_list and not isDone:
        """
        if the previous word is a determiner and the word is not a determiner, then it is a noun
        eg. !(ng mga)
        """
        isAdj = False
        
        if word.endswith("ng") and len(word.replace("ng", "")) > 3:
            """
            if the word ends with 'ng' and length of the word when 'ng' is removed is greater than 3, then it is an adjective
            eg. ang mabuting tao
            """
            isAdj = True
        
        if not isAdj:
            for prefix in adj_prefix:
                """
                if the word is an adjective it has an adjective prefix
                eg. ika-ayos, pinakamahusay, pangaraw-araw
                """
                if not isDone:
                    isAdj = word.startswith(prefix)
                if not isAdj and not isDone:
                    if prev_word == 'ang':
                        """
                        if the previous word is 'ang' and not an adjective, then it is a noun
                        eg. ang espiritu
                        """
                        isNoun = True
                        isDone = True
                        
                    if next_word != 'ng' and not isDone:
                        isNoun = True
                        isDone = True
                if isAdj:  
                    isDone = True
    
    if prev_word == "sa" and word not in(PREPO_SET)and not isDone:
        """
        if the previous word is "sa" and the word is not in the PREPO_SET then it is a noun
        eg. sa simbahan <- tags "simbahan"
        """
        isNoun = True
        isDone = True
        
    if prev2_word == "ay" and prev_word.endswith("ang") and word not in noun_dtmn_list and not isDone:
        """
        if the previous previous word is "ay" and the previous word is "ang" 
        and the word is not a determiner then the word is a noun
        eg. ay ang bata
        """
        isNoun = True
        isDone = True
    
    if prev_word.endswith("ng"):
        """
        if the previous word ends with "ng" and the prev word is not in noun_dtmn_list/conj_set/adv_dtm_list then it is a noun
        eg. upang magpuno sa gabi <- prevents magpuno to be tagged as noun | ikalawang araw <- tags araw
        """
        if prev_word not in (noun_dtmn_list + CONJ_SET + adv_dtmn_list) and not isDone:
            isNoun = True
            isDone = True
        
        if prev_word.startswith("ma") and prev_word.endswith("ng") and not isDone:
            if not word.endswith("ng"):
                isNoun = True
                isDone = True

    if prev_word == "na" and not isDone:
        if prev2_word.startswith("ma") or prev2_word.startswith("ika") or prev2_word ==  CONJ_SET:
            isNoun = True
            isDone = True

    if next_word == "na":
        if next2_word.startswith("na") or next2_word.startswith("ma"):
            isNoun = True
            isDone = True
    
    if prev_word == "ng" and next_word == "na":
        if next2_word.startswith("ma"): # nagpagawa siya ng gusali na mataas
            isNoun = True
            isDone = True
        else:
            isNoun = False
            isDone = True

    if prev_word.endswith("ng") and word.endswith("ng"):
            # untags "dalawang malaking" <- untags malaking
            isNoun = False
            isDone = False
            isAdj = True

    if prev_word in (noun_dtmn_list) and word.endswith("ng"): 
        # this untags words like "unang" e.g. "ang unang araw" <- untags "unang" as a noun and tags it as an adj
        isDone = True
        isNoun = False
    
    if prev_word == "sa" and next_word == "na":
        # untags adjectives placed between "sa" and "na" e.g. "sa maliit na lamesa"
        isNoun = False
        isDone = False

    if word in PER_PRONOUN:
        """
        if the word is a personal pronoun, then it is a noun
        eg. ako, ikaw, tayo, etc.
        """
        isNoun = True
        isDone = True
    
    return isNoun
# end of function

### Adjective Checker

In [140]:
def isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
    """
    This function tags if the specific word in the sentence is an adjective, and extracts it.
    """
    isDone = False
    isAdj = False
    
    if word in  adj_dict and not isDone:
        """If word is in the adj dictionary and is not done, is a adj"""
        isAdj = True
        isDone = True
        
    if word[:3] in ("ika") and not isDone:
         """If word starts with ika and is not done, is a adj"""
         isAdj= True
         isDone= True
    if word in adj_quantity_list and not isDone:
        """
        if the word is in the adj quantity list, then it is an adjective
       
        """
        isAdj = True
        isDone = True
    if word in adj_quality_list and not isDone:
        """
        if the word is in the adj quality list, then it is an adjective
       
        """
        isAdj = True
        isDone = True
    if word in adj_taste_list and not isDone:
        """
        if the word is in the adj taste list, then it is an adjective
       
        """
        isAdj = True
        isDone = True
    if word in adj_shape_list and not isDone:
        """
        if the word is in the adj shape list, then it is an adjective
       
        """
        isAdj = True
        isDone = True   
    if word in adj_size_list and not isDone:
        """
        if the word is in the adj size list, then it is an adjective
       
        """
        isAdj = True
        isDone = True 
    if word in adj_color_list and not isDone:
        """
        if the word is in the adj color list, then it is an adjective
       
        """
        isAdj = True
        isDone = True
    
    
    if word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + PREPO_SET + PER_PRONOUN + CONJ_SET):
        if word.startswith("ma") and (next_word in noun_dtmn_list or next_word == 'na') and next_word not in ('ay', 'ng', 'mga') and  not hasVerbAffixes and not isDone:
            """
            if the word is an adjective it has an adjective prefix 'ma' and the next word is noun determiner
            eg. maayos na ang kalsada
            """
            isAdj = True
            isDone = True
        
        if word.startswith("napaka") or word.startswith("pinakama") or word.startswith("pinaka") and not hasVerbAffixes and not isDone:
            """
            if the word starts with 'pinakama' or 'pinaka' or 'napaka', then it is an adjective
            eg. pinakamaganda, pinakagusto, napakaganda
            """
            isAdj = True
            isDone = True
        
        if word.startswith("nag") and word[3:5] == word[5:7] and word.endswith("han") and not hasVerbAffixes and not isDone:
            """
            if the word starts with 'nag' then followed by repeating syllable then ends with 'han', then it is an adjective
            eg. naglalakihan, naggagandahan
            """
            isAdj = True
            isDone = True
        
        if word.startswith("ma") and word[2:4] == word[4:6] and not hasVerbAffixes and not isDone:
            """
            if the word starts with 'ma' then followed by repeating syllable, then it is an adjective
            eg. malalaki, magaganda
            """
            isAdj = True
            isDone = True
        
        if word.startswith("an") and not hasVerbAffixes and not isDone:
            """
            if the word starts with 'an' then it is an adjective
            eg. anlaki, ansarap
            """
            isAdj = True
            isDone = True
            
        if prev_word == 'ang' and next_word == 'ng' and not hasVerbAffixes and not isDone:
            """
            if the prev word is 'ang' then the word is an adjective
            eg. ang ganda ng bulaklak
            """
            isAdj = True
            isDone = True
            
        if word.startswith("ma") and prev_word in noun_dtmn_list  and (next_word == 'na') and not hasVerbAffixes and not isDone:
            """
            if the prev word is 'ang' then the word is an adjective
            eg. naghanda ng malamig na coke
            """
            isAdj = True
            isDone = True
            
        if prev_word == 'mas' and not hasVerbAffixes and not isDone:
            """
            if the prev word is 'mas' then the word is an adjective
            eg. mas maganda
            """
            isAdj = True
            isDone = True
        
        if word.endswith("ng") and not hasVerbAffixes and not isDone:
            """
            if the word ends with 'ng', then it is an adjective
            eg. dalawang bahay
            """
            isAdj = True
            isDone = True
        
        if prev_word in ('ay', 'na') and not prev2_word.startswith('ika') and (not hasVerbAffixes or word.startswith('ma')) and not isDone:
            """
            if the previous word is 'ay' or 'na', then it is an adjective
            eg. salamin na parihaba
            """
            isAdj = True
            isDone = True
        
    return isAdj
# end of function

### Palindrome Checker

In [141]:
def isPalindrome(word): 
    """
    This function checks if the word is a palindrome.
    """
    
    """
    gets the half length of the word
    """
    half_len = len(word)/2
    half_len = int(half_len)
    
    if word[:half_len] == word[half_len:] and half_len > 2:
        return True
    else:
        return False

### Adverb Checker

In [142]:
def isAdv(word, prev_word, next_word, hasVerbAffixes):
    """
    This function tags if the specific word in the sentence is an adverb, and extracts it.
    """
    isDone = False
    isAdv = False

    if word.startswith('ma') and not word.startswith('mag') and (next_word in PER_PRONOUN or next_word == 'na') and next_word not in ('ay', 'ng', 'mga') and not isDone:
        """
        if the word is an adverb it has an adverb prefix 'ma' and the next word is a pronoun
        eg. mabilis na magsulat
        """
        isAdv = True
        isDone = True
    
    if prev_word == 'nang' and not isDone:
    # if prev_word == 'nang' and (not hasVerbAffixes or (word.startswith('ma') and not word.startswith('mag'))) and next_word not in ('ay', 'ng', 'mga') and not isDone:
        """
        if the previous word is 'nang'
        """
        if next_word not in ('ay', 'ng', 'mga'):
            """
            if the next word is not "ay, ng, or mga"
            """
            if not hasVerbAffixes: 
                """
                if word not have verb affixes, then it is an adverb
                eg. nang husto
                """
                isAdv = True
                isDone = True
                
            if word.startswith('ma') and not isNoun and not isDone:
                """
                if starts with 'ma', then it is an adverb
                eg. nang mabilis
                """
                isAdv = True
                isDone = True
            
        
        
    if word in adv_time_list and not isDone:
        """
        if the word is an adverb of time, then it is an adverb
        eg. aalis bukas
        """
        isAdv = True
        isDone = True
        
    if word in adv_freq_list and not isDone:
        """
        if the word is an adverb of frequency, then it is an adverb
        
        """
        isAdv = True
        isDone = True
        
    if word in adv_place_list and not isDone:
        """
        if the word is an adverb of place, then it is an adverb
        
        """
        isAdv = True
        isDone = True    
        
    if word in adv_manner_list and not isDone:
        """
        if the word is an adverb of manner, then it is an adverb
        
        """
        isAdv = True
        isDone = True
        
    if next_word == 'na' and not hasVerbAffixes and not isDone:
        """
        if the next word is 'na' then the word is an adverb
        eg. tunay na maganda
        """
        isAdv = True
        isDone = True
    
    if prev_word.startswith('ma') and not prev_word.startswith('mag') and (hasVerbAffixes or word.startswith('mag')) and not isDone:
        """
        if the previous word is an adverb the word is a verb
        eg. mabagal magpalit
        """
        isAdv = True
        isDone = True
        
    if isPalindrome(word) and not isDone:
        """
        if the word is a palindrome then it is an adverb
        eg. dahandahan (dahan-dahan) siya
        """
        isAdv = True
        isDone = True
    
    if word.__contains__('ng') and not isDone:
        """
        if the word contains 'ng' then it is an adverb
        """
        
        temp_word = word.replace('ng', '')
        
        if isPalindrome(temp_word):
            """
            if the temporary word is a palindrome then it is an adverb
            eg. sobrangsobra (sobrang-sobra) siya
            """
            isAdv = True
            isDone = True
                       
    return isAdv
# end of function

### Preposition Tagger

In [143]:
def isPrepo(word, prev_word):
    """
    This function checks if the specific word in the sentence is a preposition, and extracts it.
    """
    isPrepo = False
    
    if prev_word in (prepo_dtmn_list) and word in (PREPO_SET):
        isPrepo = True
        
    return isPrepo
# end of function

### Conjunction Tagger

In [144]:
def isConj(word):
    """
    This function checks if the specific word in the sentence is a conjunction
    """
    if word in CONJ_SET:
        return True
    else:
        return False
# end of function

## Inserting the tagged words in the Dictionary

In [145]:
def tag(sentence_list):
    isTagged = None
    hasVerbAffixes = None
    sw_sen_list = []
    dtmn_sen_list = []
    conj_sen_list = []
    verb_sen_list = []
    noun_sen_list = []
    adj_sen_list = []
    adv_sen_list = []
    prepo_sen_list = []
    unkn_sen_list = []
    pos_sen_list = []
    """
    instantiations of the variables
    """

    for sentence in sentence_list:
        sw_list = [] # list of words in the sentence that has a single word
        dtmn_list = []
        conj_list = []
        verb_list = []
        noun_list = []
        adj_list = []
        adv_list = []
        prepo_list = []
        unkn_list = []
        pos_list = []
        prev_word = ""
        prev2_word = ""
        next2_word = ""
        sen_len = len(sentence)
        """
        instantiations of the variables
        """
        
        for word in sentence:
            
            isTagged = False
            hasVerbAffixes = False
            """
            instantiations of the variables
            """
            
            try:
                next_word = sentence[sentence.index(word) + 1]
            except (ValueError, IndexError):
                next_word = ""
            """
            gets the next word in the sentence
            """
            
            try:
                next2_word = sentence[sentence.index(word) + 2]
            except (ValueError, IndexError):
                next2_word = ""
            """
            gets the next word in the sentence
            """
                
            try:
                hasVerbAffixes = check_verb_affixes(word, prev2_word, prev_word, next_word, isTagged, hasVerbAffixes)
            except (ValueError, IndexError):
                hasVerbAffixes = False
            """
            checks if the word has verb affixes
            """
            
            if sen_len == 1:
                """
                if the sentence is only one word long
                """
                sw_list.append(word)
                pos_list.append('SW')
                isTagged = True

            elif isDtmn(word) and not isTagged:
                """
                checks if the word is a determiner
                """
                dtmn_list.append(word)
                pos_list.append('DT')
                isTagged = True
                
            elif isConj(word) and not isTagged:
                """
                checks if the word is a conjunction and not tagged
                """
                conj_list.append(word)
                pos_list.append('CC')
                isTagged = True
                
            elif isVerb(word, prev_word, next_word, hasVerbAffixes) and not isTagged:
                """
                checks if the word is a verb and not tagged
                """
                verb_list.append(word)
                pos_list.append('VB')
                isTagged = True

            elif isNoun(word, prev_word, prev2_word, next_word, next2_word) and not isTagged:
                """
                checks if the word is a noun and not tagged
                """
                noun_list.append(word)
                pos_list.append('NN')
                isTagged = True
            
            elif isAdv(word, prev_word, next_word, hasVerbAffixes) and not isTagged:
                """
                checks if the word is an adverb and not tagged
                """
                adv_list.append(word)
                pos_list.append('RB')
                isTagged = True
                
            elif isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes) and not isTagged:
                """
                checks if the word is an adjective and not tagged
                """
                adj_list.append(word)
                pos_list.append('JJ')
                isTagged = True
                    
            elif isPrepo(word, prev_word) and not isTagged:
                """
                checks if the word is a preposition and not tagged
                """
                prepo_list.append(word)
                pos_list.append('PR')
                isTagged = True
                        
            else:
                """
                if the word is not tagged, then it is an unknown word
                """
                unkn_list.append(word)
                pos_list.append('UNK')
                isTagged = True
            
            prev_word = word
            """
            getting the previous word
            """
            
            try:
                prev2_word = sentence[sentence.index(word) - 1]
            except (ValueError, IndexError):
                prev2_word = None
            """
            getting the previous after the previous word
            """
                
        sw_sen_list.append(sw_list)
        dtmn_sen_list.append(dtmn_list)
        conj_sen_list.append(conj_list)
        verb_sen_list.append(verb_list)
        noun_sen_list.append(noun_list)
        adj_sen_list.append(adj_list)
        adv_sen_list.append(adv_list)
        prepo_sen_list.append(prepo_list)
        unkn_sen_list.append(unkn_list)
        pos_sen_list.append(pos_list)
        """
        storing the words in the list to the list of sentences
        """

    dict_sm_tl['Single Word'] = sw_sen_list
    dict_sm_tl['Determiner'] = dtmn_sen_list
    dict_sm_tl['Conjunction'] = conj_sen_list
    dict_sm_tl['Verb'] = verb_sen_list
    dict_sm_tl['Noun'] = noun_sen_list
    dict_sm_tl['Adjective'] = adj_sen_list
    dict_sm_tl['Adverb'] = adv_sen_list
    dict_sm_tl['Preposition'] = prepo_sen_list
    dict_sm_tl['Unknown'] = unkn_sen_list
    dict_sm_tl['POS'] = pos_sen_list

tag(dict_sm_tl['Tokenized'])
dict_sm_tl.head(30)

Unnamed: 0,Sentence,Tokenized,Single Word,Determiner,Conjunction,Verb,Noun,Adjective,Adverb,Preposition,Unknown,POS
0,GENESIS,[genesis],[genesis],[],[],[],[],[],[],[],[],[SW]
1,Nilalang ng Dios ang sanglibutan.,"[nilalang, ng, dios, ang, sanglibutan]",[],"[ng, ang]",[],[nilalang],"[dios, sanglibutan]",[],[],[],[],"[VB, DT, NN, DT, NN]"
2,Nang pasimula ay nilikha ng Dios ang langit at...,"[nang, pasimula, ay, nilikha, ng, dios, ang, l...",[],"[nang, ay, ng, ang, ang]",[at],[nilikha],"[dios, langit, lupa]",[],[],[],[pasimula],"[DT, UNK, DT, VB, DT, NN, DT, NN, CC, DT, NN]"
3,At ang lupa ay walang anyo at walang laman; at...,"[at, ang, lupa, ay, walang, anyo, at, walang, ...",[],"[ang, ay, ang, ay, sumasa, ng, ang, ng, ay, su...","[at, at, at, at]",[],"[lupa, anyo, laman, kadiliman, kalaliman, espi...","[walang, walang]",[],"[ibabaw, ibabaw]",[],"[CC, DT, NN, DT, JJ, NN, CC, JJ, NN, CC, DT, N..."
4,At sinabi ng Dios Magkaroon ng liwanag; at nag...,"[at, sinabi, ng, dios, magkaroon, ng, liwanag,...",[],"[ng, ng, ng]","[at, at]","[sinabi, magkaroon, nagkaroon]","[dios, liwanag, liwanag]",[],[],[],[],"[CC, VB, DT, NN, VB, DT, NN, CC, VB, DT, NN]"
5,"At nakita ng Dios ang liwanag na mabuti, at in...","[at, nakita, ng, dios, ang, liwanag, na, mabut...",[],"[ng, ang, na, ng, ang, sa]","[at, at]","[nakita, inihiwalay]","[dios, liwanag, dios, liwanag, kadiliman]",[],[mabuti],[],[],"[CC, VB, DT, NN, DT, NN, DT, RB, CC, VB, DT, N..."
6,"At tinawag ng Dios ang liwanag na Araw, at tin...","[at, tinawag, ng, dios, ang, liwanag, na, araw...",[],"[ng, ang, na, ang, na, ang]","[at, at, at, at]","[tinawag, tinawag, nagkahapon, nagkaumaga]","[dios, liwanag, niya, kadiliman, araw]","[araw, gabi, unang]",[],[],[],"[CC, VB, DT, NN, DT, NN, DT, JJ, CC, VB, NN, D..."
7,"At sinabi ng Dios, Magkaroon ng isang kalawaka...","[at, sinabi, ng, dios, magkaroon, ng, isang, k...",[],"[ng, ng, sa, ng, ang, sa]","[at, at]","[sinabi, magkaroon, kalawakan, mahiwalay]","[dios, tubig, tubig, kapuwa]",[isang],[],[gitna],[tubig],"[CC, VB, DT, NN, VB, DT, JJ, VB, DT, PR, DT, N..."
8,"At ginawa ng Dios ang kalawakan, at inihiwalay...","[at, ginawa, ng, dios, ang, kalawakan, at, ini...",[],"[ng, ang, ang, na, nasa, ng, sa, na, nasa, ng]","[at, at, at]","[ginawa, inihiwalay, nagkagayon]","[dios, kalawakan, tubig, kalawakan, kalawakan]",[],[],"[ilalim, itaas]",[tubig],"[CC, VB, DT, NN, DT, NN, CC, VB, DT, NN, DT, D..."
9,At tinawag ng Dios ang kalawakan na Langit. At...,"[at, tinawag, ng, dios, ang, kalawakan, na, la...",[],"[ng, ang, na, ang]","[at, at, at]","[tinawag, nagkahapon, nagkaumaga]","[dios, kalawakan, araw]","[langit, ikalawang]",[],[],[],"[CC, VB, DT, NN, DT, NN, DT, JJ, CC, VB, CC, V..."


## Tester

In [146]:
temp_sen = dict_sm_tl['Tokenized'][12]
temp_pos = dict_sm_tl['Verb'][12]
temp_unkn = dict_sm_tl['Unknown'][12]

print(temp_sen)
print(temp_pos)
print(temp_unkn)

['at', 'sinabi', 'ng', 'dios', 'sibulan', 'ang', 'lupa', 'ng', 'damo', 'pananim', 'na', 'nagkakabinhi', 'at', 'punong', 'kahoy', 'na', 'namumunga', 'ayon', 'sa', 'kaniyang', 'pagkakahoy', 'na', 'taglay', 'ang', 'kaniyang', 'binhi', 'sa', 'ibabaw', 'ng', 'lupa', 'at', 'nagkagayon']
['sinabi', 'sibulan', 'nagkakabinhi', 'taglay', 'binhi', 'nagkagayon']
['punong', 'namumunga', 'ayon', 'lupa']


In [147]:
temp_prev = 'na'
temp_word = 'namumunga'
temp_next = None

if temp_next is None:
    print('next is none')

ans = check_verb_affixes(temp_word, None, temp_prev, temp_next, False, False)
print(ans)

if temp_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list):
    print('not in')

next is none
True
not in


## Exporting the dictionary in the json file

In [148]:
import json

dictionary = dict_sm_tl.to_dict('records')

try:
    with open("src/json data/tl_pos.json", "w") as outfile:
        json.dump(dictionary, outfile)
    print("successfully saved the json file")
except:
    print("Error in saving the json file")

successfully saved the json file
