In [1]:
import re 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from collections import defaultdict


### Preprocess Data

In [None]:
# read amharic text from a file and return it as a string
def load_data(file_path):
    # read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()

def consturct_paragraph(text):
    paragraph = []
    sentence = []
    for line in text:
        # remove the new line character and the leading and trailing white spaces
        word = line.split(' ')[0].strip()
        if word:
            sentence.append(word)
        # check if the word is a ge'ez script delimater
        if re.findall(r'(፡፡|።|\?|!|::)', word):
            # check if the sentence contains at list two words
            if len(sentence) > 1:
                paragraph.append(sentence)
            sentence = []
    return paragraph

def tokenize_text(text):
    cleaned_text = []
    for sentence in text:
        punctuation_pattern = r"[.,;()/\[\]{}'\"<>@#$%^&*_+=|~\-]"
        latin_pattern = r'[a-zA-Z0-9]+'
        # remove the punctuation marks
        cleaned_sentence = re.sub(punctuation_pattern, "", sentence)
        # remove the latin characters
        cleaned_sentence = re.sub(latin_pattern, '', cleaned_sentence)
        # remove the extra white spaces
        cleaned_sentence = re.sub(r"\s+", " ", cleaned_sentence).strip()
        # check if the sentence contains at list two words
        if len(sentence) > 1:
            cleaned_text.append(cleaned_sentence)
    
    tokenized_text = []
    sentence = []
    # pattern to split the text into words and delimaters
    pattern = r'\w+|[^\s\w]+'
    for line in cleaned_text:
        # split the line into words and delimaters and iterate over the words
        for word in re.findall(pattern, line):
            # check if the word is a ge'ez script delimater
            delimater = re.findall(r'(፡፡|።|\?|!|::)', word)
            if delimater:
                # add the delimater to the sentence
                sentence.append(delimater[-1]) 
                # add the sentence to the tokenized text
                tokenized_text.append(sentence)
                sentence = []
            elif word:
                sentence.append(word)
            
    return tokenized_text

train_data, dev_data, test_data = load_data("../data/train.txt"), load_data("../data/dev.txt"), load_data( "../data/test.txt")
train_data, dev_data, test_data = consturct_paragraph(train_data), consturct_paragraph(dev_data), consturct_paragraph(test_data)

train_txt, dev_txt, test_txt = load_data("../data/train_.txt"), load_data("../data/dev_.txt"), load_data("../data/test_.txt")
train_txt = tokenize_text(train_txt)
dev_txt = tokenize_text(dev_txt)
test_txt = tokenize_text(test_txt)
train_data += train_txt
dev_data += dev_txt
test_data += test_txt
print(len(train_data), len(dev_data), len(test_data))

5763287 320336 320488


### Proprocess Tokens and Build N-gram Based Langauge Models

In [40]:
class AmhNgramModel:
    def __init__(self, n=2, k=1, threshold=2):
        '''
        Params:
            n: int, the n-gram size
            k: int, the k-smoothing parameter
            threshold: int, the minimum frequency count of a token to be included in the vocabulary
        '''
        # n-gram size
        self.n = n
        # k-smoothing parameter
        self.k = k
        # threshold for frequency count of tokens
        self.threshold = threshold
        # tokens frequency count
        self.tokens_freq = defaultdict(int)
        # n-gram counts
        self.n_gram_counts = defaultdict(int)
        # (n+1)-gram counts
        self.n_plus1_gram_counts = defaultdict(int)
        # start token
        self.start_token = '<ጀ>'
        # end token
        self.end_token = '<ጨ>'
        # out of vocabulary token 
        self.oov_token = '<እንግዳ፤ቃል>'
        # vocabulary of tokens or words
        self.closed_vocabulary = set([self.end_token, self.oov_token])
      

    def count_tokens_frequency(self,tokenized_sentences):
        '''
        Count the frequency of tokens in the tokenized sentences
        Params:
            tokenized_sentences: list of list of strings, tokenized sentences
        Returns:
            str, completion message
        '''
        count = 0
        for sentence in tokenized_sentences:
            for token in sentence:
                self.tokens_freq[token] += 1
                count += 1
        
        return f'A total of {count} tokens'

    def get_tokens_with_threshold(self):
        '''
        Get tokens with frequency count greater than or equal to the threshold
        Returns: 
            str, completion message
        '''
        for token, freq in self.tokens_freq.items():
            if freq >= self.threshold:
                self.closed_vocabulary.add(token)
        
        return f'A total of {len(self.closed_vocabulary)} tokens with {self.threshold} threshold'

    def replace_oov_tokens(self, tokenized_sentences):
        '''
        Replace out of vocabulary tokens with a special token
        Params:
            tokenized_sentences: list of list of strings, tokenized sentences
        Returns:
            list of list of strings, sentences with out of vocabulary tokens replaced
        '''
        replaced_sentences = []
        for sentence in tokenized_sentences:
            # replace out of vocabulary tokens with the oov token
            replaced_sentence = [token if token in self.closed_vocabulary else self.oov_token for token in sentence]
            replaced_sentences.append(replaced_sentence)
        
        return replaced_sentences

    def preprocess_tokens(self, train_data, dev_data, test_data):
        '''
        Preprocess the tokenized data
        Params:
            train_data: list of list of strings, tokenized training data
            dev_data: list of list of strings, tokenized development data
            test_data: list of list of strings, tokenized test data
        Returns:
            list of list of strings, preprocessed training data
            list of list of strings, preprocessed development data
            list of list of strings, preprocessed test data
        '''

        # count tokens frequency
        self.count_tokens_frequency(train_data)
        # get tokens with threshold
        self.get_tokens_with_threshold()
        
        # replace out of vocabulary tokens
        train_data = self.replace_oov_tokens(train_data)
        dev_data = self.replace_oov_tokens(dev_data)
        test_data = self.replace_oov_tokens(test_data)

        return train_data, dev_data, test_data 
        

    def count_n_grams(self, tokenized_sentences, n):
        '''
        Count the n-grams in the tokenized sentences
        Params:
            tokenized_sentences: list of list of strings, tokenized sentences
            n: int, the n-gram size
        Returns:
            dict, n-gram counts
        '''
        n_grams = defaultdict(int)
        for sentence in tokenized_sentences:
            # add start and end tokens to the sentence
            sentence = [self.start_token] * (n) + sentence + [self.end_token] * (n-1)
            for i in range(len(sentence) - n + 1):
                # extract n-gram tokens from the sentence
                n_gram = tuple(sentence[i:i+n])
                n_grams[n_gram] += 1

        return n_grams

    def estimate_probability(self, word, previous_n_gram):
        '''
        Estimate the probability of a word given the previous n-gram
        Params:
            word: str, the word
            previous_n_gram: tuple of strings, the previous n-gram
        Returns:
            float, the probability of the word given the previous n-gram
        '''
        # convert the previous n-gram to a tuple
        previous_n_gram = tuple(previous_n_gram)
        # get the count of the previous n-gram
        previous_n_gram_count = self.n_gram_counts[previous_n_gram]
        # calculate the denominator
        denominator = previous_n_gram_count + self.k * len(self.closed_vocabulary)
        
        # create the n+1 gram
        n_plus1_gram = previous_n_gram + (word,)
        # get the count of the n+1 gram
        n_plus1_gram_count = self.n_plus1_gram_counts[n_plus1_gram]
        # calculate the numerator
        numerator = n_plus1_gram_count + self.k

        # calculate the probability
        probability = np.log(numerator) - np.log(denominator)

        return probability

    def estimate_n_gram_probabilities(self, previous_n_gram):
        '''
        Estimate the probabilities of the words in the vocabulary given the previous n-gram
        Params:
            previous_n_gram: tuple of strings, the previous n-gram
        Returns:
            dict, the probabilities of the words in the vocabulary given the previous n-gram
        '''
        # convert the previous n-gram to a tuple
        previous_n_gram = tuple(previous_n_gram)
        # clear the probabilities
        probabilities = defaultdict(int)
        for word in self.closed_vocabulary:
            probability = self.estimate_probability(word, previous_n_gram)
            probabilities[word] = probability
        
        return probabilities
    
    def suggest_word(self, previous_tokens, top_k_words=1):
        '''
        Suggest the next word given the previous tokens
        Params:
            previous_tokens: list of strings, the previous tokens
            top_k_words: int, the number of suggestions
        Returns:
            str, the suggested word
        '''
        # add start tokens to the previous tokens
        previous_tokens = [self.start_token] * (self.n - 1) + previous_tokens
        # get the previous n-gram
        previous_n_gram = previous_tokens[-(self.n - 1):]
        # estimate the probabilities of the words in the vocabulary given the previous n-gram
        probabilities = pd.Series(self.estimate_n_gram_probabilities(previous_n_gram))
        # get the suggestions
        suggestions = probabilities.nlargest(min(probabilities.shape[0], top_k_words))

        return suggestions

    def fit(self, tokenized_sentences):
        '''
        Fit the n-gram model to the tokenized sentences
        Params:
            tokenized_sentences: list of list of strings, tokenized sentences
        '''
        # compute n-gram counts
        self.n_gram_counts = self.count_n_grams(tokenized_sentences, self.n - 1)
        # compute (n+1)-gram counts
        self.n_plus1_gram_counts = self.count_n_grams(tokenized_sentences, self.n)

        return f'🙌'

    def calculate_perplexity(self, sentence):
        '''
        Calculate the perplexity of a sentence
        Params:
            sentence: list of strings, the sentence
        Returns:
            float, the perplexity of the sentence
        '''
        n = n = self.n - 1
        # add start and end tokens to the sentence
        sentence = [self.start_token] * n + sentence + [self.end_token]
        sentence = tuple(sentence)
        N = len(sentence)
        log_pi = 0
        for t in range(n, N):
            # get the previous n-gram
            n_gram = sentence[t-n:t]
            # get the word
            word = sentence[t]
            # estimate the probability of the word given the previous n-gram
            probability = self.estimate_probability(word, n_gram)
            # add the log probability to the total log probability
            log_pi += probability

        # calculate the perplexity
        perplexity = np.exp(-log_pi / N)
        
        return perplexity

    def average_perplexity(self, corpus):
        '''
        Calculate the average perplexity of a corpus
        Params:
            corpus: list of list of strings, the corpus
        Returns:
            float, the average perplexity of the corpus
        '''
        total_perplexity = 0
        for sentence in corpus:
            # calculate the perplexity of the sentence
            sentence_perplexity = self.calculate_perplexity(sentence)
            # add the perplexity to the total perplexity
            total_perplexity += np.log(sentence_perplexity)

        # calculate the average perplexity
        return np.exp(total_perplexity / len(corpus))




### Amharic Langauge Auto-complete with N-gram langauge model

In [None]:
# create an instance of the AmhNgramModel
amh_ngram_model = AmhNgramModel(n=2, k=1, threshold=3)
# Prprocess the tokenized data
train_tokens, dev_tokens, test_tokens = amh_ngram_model.preprocess_tokens(train_data, dev_data, test_data)

amh_ngram_model.fit(train_tokens)
print('Dev Average Perplexity:', amh_ngram_model.average_perplexity(dev_tokens))
print('Test Average Perplexity:', amh_ngram_model.average_perplexity(test_tokens))

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x102f84d40>>
Traceback (most recent call last):
  File "/Users/yo/Documents/amh-auto-complete/venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
print('''
Predicting the Next Word for Some Sentences!
Word with the square brackets is the suggested word.
-------------------------------------------
      ''')

for _ in range(5):
    # Pick a random sentence from the test set
    indx = np.random.randint(0, len(test_tokens))
    sentence = dev_tokens[indx]
    n = len(sentence) 
    if n < 2:
        continue
    # Pick a random suggestion index
    suggestion_indx = np.random.randint(1, n)
    # Get the previous tokens
    previous_tokens = sentence[:suggestion_indx]
    # Get the suggestion word
    suggestions = amh_ngram_model.suggest_word(previous_tokens, top_k_words=3)
    # print the previous tokens and the suggestion
    print(suggestions)
    print(' '.join(previous_tokens) + ' [' + ']')


Predicting the Next Word for Some Sentences!
Word with the square brackets is the suggested word.
-------------------------------------------
      
ብሄሩን ከኢትዮጵያዊነቱ በላይ የሚያይ የራሱን ብሄር ከሌላው ይበልጣል የበለጠ [<እንግዳ፤ቃል>]
የሔደን መተው [ነው]
ሕገወጦች [ናቸው]
የአውሮፕላኑ የአደጋ ሰለባ [የሆኑ]
እንቀጥል ሕግ ማለት [ነው]


3
