In [217]:
import math
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


### Preprocess Data

In [218]:
# Read Amharic text from a file and return it as a list of lines
def load_data(file_path):
    """Read a UTF-8 text file and return its lines.

    Args:
        file_path: Path of the file to open for reading.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        lines = list(handle)
    return lines

def consturct_paragraph(text):
    """Group one-token-per-line text into sentences.

    Only the first space-separated field of each line is used as the token.
    A sentence is closed when that token is a delimiter ('።' or '?') or when
    the line is blank. Sentences with fewer than two tokens are discarded.

    Args:
        text: Iterable of line strings.

    Returns:
        A list of sentences, each a list of token strings. A closing
        delimiter token is kept as the last token of its sentence.
    """
    # Membership below is a substring test on this string, so the empty token
    # produced by a blank line also counts as a sentence boundary.
    delimiter = '።?'
    paragraph = []
    sentence = []
    for line in text:
        word = line.split(' ')[0].strip()
        if word:
            sentence.append(word)
        if word in delimiter:
            if len(sentence) > 1:
                paragraph.append(sentence)
            sentence = []

    # Flush a trailing sentence when the input does not end with a delimiter
    # or a blank line; otherwise those tokens would be silently lost.
    if len(sentence) > 1:
        paragraph.append(sentence)
    return paragraph

def tokenize_text(text):
    """Clean raw text lines and split them into sentences of tokens.

    Cleaning (per line, in this order): delete the ASCII punctuation listed
    in the character class below, delete letter/digit mixes and Latin-letter
    words, replace digit runs with the literal token "num", collapse
    whitespace and strip the ends.

    Tokenising: a token is a run of word characters or a run of non-space,
    non-word characters. A token containing one of ፡፡, ።, ? or :: ends the
    current sentence. The sentence is kept only if it already holds more than
    one token, and the last terminator matched inside that token is appended
    to it. Sentences may span several input lines; tokens that follow the
    final terminator are not returned.

    Args:
        text: Iterable of strings.

    Returns:
        A list of sentences, each a list of token strings.
    """
    punctuation_re = re.compile(r"[.,;()/\[\]{}'\"<>@#$%^&*_+=|~\-]")
    alnum_mix_re = re.compile(r'\b[a-zA-Z]+\d+|\d+[a-zA-Z]+\b')
    latin_word_re = re.compile(r'\b[a-zA-Z]+\b')
    number_re = re.compile(r"\b\d[\d,\.]*\b")
    whitespace_re = re.compile(r"\s+")
    token_re = re.compile(r'\w+|[^\s\w]+')
    terminator_re = re.compile(r'(፡፡|።|\?|::)')

    def _clean(raw):
        # The substitution order matters: Latin words are removed before the
        # "num" placeholder is inserted, so the placeholder survives.
        stripped = punctuation_re.sub("", raw)
        stripped = alnum_mix_re.sub('', stripped)
        stripped = latin_word_re.sub('', stripped)
        stripped = number_re.sub("num", stripped)
        return whitespace_re.sub(" ", stripped).strip()

    cleaned_lines = [_clean(raw) for raw in text]

    sentences = []
    current = []
    for cleaned in cleaned_lines:
        for token in token_re.findall(cleaned):
            terminators = terminator_re.findall(token)
            if not terminators:
                if token:
                    current.append(token)
                continue
            if len(current) > 1:
                current.append(terminators[-1])
                sentences.append(current)
            # Start afresh whether or not the finished sentence was kept.
            current = []

    return sentences

# Read each split and group its lines into sentences.
train_data, dev_data, test_data = (
    consturct_paragraph(load_data(split_path))
    for split_path in ("../data/train.txt", "../data/dev.txt", "../data/test.txt")
)

# train_csv, dev_csv, test_csv = load_data("../data/train.csv"), load_data("../data/dev.csv"), load_data("../data/test.csv")
# train_csv = tokenize_text(train_csv)
# dev_csv = tokenize_text(dev_csv)
# test_csv = tokenize_text(test_csv)
# train_data += train_csv
# dev_data += dev_csv
# test_data += test_csv
print(len(train_data), len(dev_data), len(test_data))

1750 250 500


In [219]:
# print(test_csv[::-1])
# print(dev_csv)
# for i in range(33):
#     print(train_csv[len(train_csv) - i - 1])

### Preprocess Tokens

In [230]:
def count_tokens_frequency(tokenized_sentences):
    """Count how often each token occurs across all sentences.

    Args:
        tokenized_sentences: Iterable of sentences, each an iterable of tokens.

    Returns:
        A defaultdict(int) mapping token -> number of occurrences.
    """
    frequencies = defaultdict(int)
    every_token = (tok for tokens in tokenized_sentences for tok in tokens)
    for tok in every_token:
        frequencies[tok] += 1
    return frequencies

def get_tokens_with_threshold(vocabulary, threshold=2):
    """Select the tokens whose frequency reaches the threshold.

    Args:
        vocabulary: Mapping of token -> frequency.
        threshold: Minimum frequency (inclusive) for a token to be kept.

    Returns:
        A list of the kept tokens, in the mapping's iteration order.
    """
    kept = []
    for token, frequency in vocabulary.items():
        if frequency >= threshold:
            kept.append(token)
    return kept

def replace_oov_tokens(tokenized_sentences, closed_vocabulary):
    """Replace every token outside the closed vocabulary with '<UNK>'.

    Args:
        tokenized_sentences: Iterable of sentences, each an iterable of
            hashable tokens.
        closed_vocabulary: Iterable of the tokens to keep.

    Returns:
        A new list of new sentence lists; the inputs are not modified.
    """
    # Build the lookup once. Testing membership against a list is a linear
    # scan per token, and a one-shot iterator would be exhausted by the first
    # few lookups.
    known_tokens = set(closed_vocabulary)
    return [
        [token if token in known_tokens else '<UNK>' for token in sentence]
        for sentence in tokenized_sentences
    ]

def preprocess_tokens(train_data, dev_data, test_data, threshold=2):
    """Build a closed vocabulary from the training split and apply it to all splits.

    Token frequencies are counted on train_data only. Tokens whose frequency
    is below the threshold are replaced with '<UNK>' in every split.

    Args:
        train_data: Tokenized training sentences (list of token lists).
        dev_data: Tokenized development sentences.
        test_data: Tokenized test sentences.
        threshold: Minimum training frequency for a token to stay in the
            vocabulary. Defaults to 2, the value that was previously
            hard-coded.

    Returns:
        A tuple (train, dev, test, vocabulary): the three splits with
        out-of-vocabulary tokens replaced, and the closed vocabulary list
        with '<UNK>' appended.
    """
    vocabulary = count_tokens_frequency(train_data)
    closed_vocabulary = get_tokens_with_threshold(vocabulary, threshold=threshold)

    train_data = replace_oov_tokens(train_data, closed_vocabulary)
    dev_data = replace_oov_tokens(dev_data, closed_vocabulary)
    test_data = replace_oov_tokens(test_data, closed_vocabulary)

    return train_data, dev_data, test_data, closed_vocabulary + ['<UNK>']

### Build N-gram Based Language Models

In [231]:
def count_n_grams(tokens, n=1, start_token='<ጀ>', end_token='<ጨ>'):
    """Count every n-gram in a corpus of tokenized sentences.

    Each sentence is padded with n start tokens in front and n - 1 end tokens
    behind before its length-n windows are counted.

    Args:
        tokens: Iterable of sentences, each a list of tokens.
        n: Length of the n-grams.
        start_token: Marker used for the front padding.
        end_token: Marker used for the back padding.

    Returns:
        A defaultdict(int) mapping n-gram tuple -> number of occurrences.
    """
    counts = defaultdict(int)
    prefix = [start_token] * n
    suffix = [end_token] * (n - 1)
    for sentence in tokens:
        padded = prefix + sentence + suffix
        last_start = len(padded) - n
        position = 0
        while position <= last_start:
            counts[tuple(padded[position:position + n])] += 1
            position += 1
    return counts

def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Estimate P(word | previous_n_gram) with add-k smoothing.

    Computes (count(previous_n_gram + word) + k) /
    (count(previous_n_gram) + k * vocabulary_size).

    Args:
        word: Candidate next token.
        previous_n_gram: Sequence of the preceding tokens (the context).
        n_gram_counts: Mapping of context tuple -> count.
        n_plus1_gram_counts: Mapping of (context + word) tuple -> count.
        vocabulary_size: Number of distinct tokens used in the denominator.
        k: Smoothing constant.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: If k is 0 and the context has a zero count.
    """
    previous_n_gram = tuple(previous_n_gram)
    # Use .get() rather than indexing: indexing a defaultdict inserts a zero
    # entry for every unseen n-gram (mutating the caller's count tables), and
    # indexing a plain dict raises KeyError.
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + k * vocabulary_size

    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k

    return numerator / denominator

def estimate_n_gram_probabilities(n_grams, n_gram_counts, n_plus_1_gram_counts, vocabulary, start_token='<ጀ>', end_token='<ጨ>', k=1.0):
    """Estimate the smoothed probability of every candidate next token.

    The candidates are the vocabulary plus the end token; the size of that
    extended list is the vocabulary size passed to estimate_probability.

    Args:
        n_grams: Sequence of the preceding tokens (the context).
        n_gram_counts: Mapping of context tuple -> count.
        n_plus_1_gram_counts: Mapping of (context + word) tuple -> count.
        vocabulary: List of known tokens.
        start_token: Accepted for signature symmetry; not used here.
        end_token: Extra candidate appended to the vocabulary.
        k: Smoothing constant.

    Returns:
        A defaultdict(float) mapping candidate token -> probability.
    """
    candidates = vocabulary + [end_token]
    candidate_count = len(candidates)
    context = tuple(n_grams)
    return defaultdict(
        float,
        {
            candidate: estimate_probability(
                candidate, context, n_gram_counts, n_plus_1_gram_counts, candidate_count, k=k
            )
            for candidate in candidates
        },
    )

def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, start_token='<ጀ>', end_token='<ጨ>', k=1.0, n=3):
    """Compute the perplexity of one sentence under an n-gram model.

    The sentence is padded with n - 1 start tokens and one end token. The
    result is the N-th root of the product of inverse token probabilities,
    where N is the length of the padded sentence.

    Args:
        sentence: List of tokens.
        n_gram_counts: Mapping of (n-1)-gram tuple -> count.
        n_plus1_gram_counts: Mapping of n-gram tuple -> count.
        vocabulary_size: Vocabulary size used for smoothing.
        start_token: Marker used for the front padding.
        end_token: Marker appended after the sentence.
        k: Smoothing constant.
        n: Order of the model (number of tokens per n-gram).

    Returns:
        The perplexity as a float.

    Raises:
        ZeroDivisionError: If a token has probability 0 (possible only when
            k is 0).
    """
    context_size = n - 1
    padded = tuple([start_token] * context_size + sentence + [end_token])
    N = len(padded)

    # Accumulate in log space: the raw product of inverse probabilities
    # overflows to inf on long sentences, which would also poison any corpus
    # average built on top of this value.
    log_inverse_sum = 0.0
    for t in range(context_size, N):
        context = padded[t - context_size:t]
        probability = estimate_probability(padded[t], context, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=k)
        log_inverse_sum += math.log(1 / probability)

    return math.exp(log_inverse_sum / N)

def average_perplexity(corpus, n_gram_counts, n_plus1_gram_counts, vocabulary_size, start_token='<ጀ>', end_token='<ጨ>', k=1.0, n=3):
    """Return the mean per-sentence perplexity over a corpus.

    Args:
        corpus: Sized collection of tokenized sentences.
        n_gram_counts: Mapping of (n-1)-gram tuple -> count.
        n_plus1_gram_counts: Mapping of n-gram tuple -> count.
        vocabulary_size: Vocabulary size used for smoothing.
        start_token: Marker used for the front padding.
        end_token: Marker appended after each sentence.
        k: Smoothing constant.
        n: Order of the model.

    Returns:
        The arithmetic mean of calculate_perplexity over all sentences.

    Raises:
        ZeroDivisionError: If the corpus is empty.
    """
    scorer_args = (n_gram_counts, n_plus1_gram_counts, vocabulary_size, start_token, end_token, k, n)
    accumulated = 0
    for sentence in corpus:
        accumulated = accumulated + calculate_perplexity(sentence, *scorer_args)
    return accumulated / len(corpus)

def suggest_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_token='<ጀ>', end_token='<ጨ>', n=3):
    """Suggest the most probable next token given the tokens typed so far.

    The history is padded in front with n - 1 start tokens and its last
    n - 1 tokens are used as the context.

    Args:
        previous_tokens: List of the tokens typed so far (may be empty).
        n_gram_counts: Mapping of (n-1)-gram tuple -> count.
        n_plus1_gram_counts: Mapping of n-gram tuple -> count.
        vocabulary: List of known tokens.
        k: Smoothing constant.
        start_token: Marker used for the front padding.
        end_token: Extra candidate that may be suggested.
        n: Order of the model.

    Returns:
        The candidate token with the highest estimated probability.
    """
    context_size = n - 1
    padded = [start_token] * context_size + previous_tokens
    # Slice from an explicit start index: with context_size == 0 the
    # expression padded[-0:] would return the whole history instead of an
    # empty context.
    previous_n_gram = padded[len(padded) - context_size:]
    probabilities = estimate_n_gram_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, start_token=start_token, end_token=end_token, k=k)
    return max(probabilities, key=probabilities.get)


### Amharic Language Auto-complete with N-gram Language Model

In [232]:

def main():
    """Train a bigram model, report perplexity, and print sample suggestions.

    Reads the module-level train_data, dev_data and test_data, prints the
    average perplexity on the dev and test splits, then prints a next-word
    suggestion for each growing prefix of the first 11 dev sentences.
    """
    n = 2
    k = 1.0
    start_token = '<ጀ>'
    end_token = '<ጨ>'
    train_tokens, dev_tokens, test_tokens, vocabulary = preprocess_tokens(train_data, dev_data, test_data)
    # Pass the boundary markers explicitly so the counts always use the same
    # tokens as the scoring and suggestion calls below.
    n_gram_counts = count_n_grams(train_tokens, n=n-1, start_token=start_token, end_token=end_token)
    n_plus1_gram_counts = count_n_grams(train_tokens, n=n, start_token=start_token, end_token=end_token)

    print('Dev Average Perplexity:', average_perplexity(dev_tokens, n_gram_counts, n_plus1_gram_counts, len(vocabulary), start_token=start_token, end_token=end_token, k=k, n=n))
    print('Test Average Perplexity:', average_perplexity(test_tokens, n_gram_counts, n_plus1_gram_counts, len(vocabulary), start_token=start_token, end_token=end_token, k=k, n=n))

    for sentence in dev_tokens[:11]:
        # A distinct name for the prefix length: the inner loop previously
        # reused and shadowed the outer loop's index.
        for prefix_length in range(len(sentence) - 1):
            previous_tokens = sentence[:prefix_length]
            suggestion = suggest_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k, start_token=start_token, end_token=end_token, n=n)
            print(f"Previous tokens: {previous_tokens}, suggestion: {suggestion}")

if __name__ == "__main__":
    main()

Dev Average Perplexity: 219.76437488931504
Test Average Perplexity: 202.07280093330664
Previous tokens: [], suggestion: <UNK>
Previous tokens: ['እንደ'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ'], suggestion: ሃገራት
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች', 'በአብዛኛዉ'], suggestion: የምርጫ
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች', 'በአብዛኛዉ', 'የሥራ'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች', 'በአብዛኛዉ', 'የሥራ', 'ቦታ'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች', 'በአብዛኛዉ', 'የሥራ', 'ቦታ', 'ያለዉ'], suggestion: ።
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች', 'በአብዛኛዉ', 'የሥራ', 'ቦታ', 'ያለዉ', '<UNK>'], suggestion: <UNK>
Previous tokens: ['እንደ', '<UNK>', 'አብዛኞቹ', 'የአፍሪቃ', 'ሀገሮች