In [21]:
import re 
import math
import os
import sys

In [34]:
class Letter_Bigram_Model():
    def __init__(self):
        self.tokens = [] #the preprocssed/tokenized/UNK treated training set  
        self.bigram_freq = {}
        self.unigram_freq = {}
        self.bigram_prob = {}
    
    def load_input(self, file):
        f = open(file, "r")
        text = f.read()
        f.close()
        return text 
    
    # helper function: called in preprocess()
    # takes in a string & returns if tokenized by character
    # adds beginning/end sentence markers 
    def tokenize(self, string):
        tokens_list = []
        lines = str.splitlines(string)
        for line in lines:
            tokens_list.append("<s>") #add beginning sentence marker first    
            for c in line: tokens_list.append(c) #for any other character
            tokens_list.pop() #remove extra space 
            tokens_list.append("</s>")
        #print(tokens_list)
        return tokens_list 
    
    # helper function: called in UNK_treated_input()
    # replaces a given item in tokens_list with UNK based on frequency 
    def insert_UNK(self, item_to_replace, tokens_list):
        for index, token in enumerate(tokens_list):
            if token == item_to_replace: tokens_list[index] = "<UNK>"
        return tokens_list

    # helper function: called in preprocess()
    # returns UNK modified training set --> all rare tokens are replaced with <UNK>
    def UNK_treated_input(self, tokens_list, UNK_threshold = 5):
        freq_dict = self.calc_ngram_freq(tokens_list, keys = tokens_list)
        for key, value in freq_dict.items(): #replace rare terms in training set based on frequency 
            if value <= UNK_threshold: tokens_list = self.insert_UNK(key, tokens_list) 
        return tokens_list

    # will perform various regex sub steps (ex. remove all numeric characters)
    # will tokenize text & add beginning & end sentence markers 
    # will replace low frequency words with UNK token 
    def preprocess_training(self, string, UNK_threshold = 5):
        string = self.clean_string(string)
        tokens_list = self.tokenize(string)
        tokens_list = self.UNK_treated_input(tokens_list, UNK_threshold)
        return tokens_list
    
    def clean_string(self, text_string):
        text_string = text_string.lower()
        text_string = re.sub(r"\d+", "", text_string) #remove numerical characters
        text_string = re.sub(r"[$&%#-\*]", "", text_string)
        #text_string = re.sub(r'[àßêíèäöéüáã]', "", text_string) #remove non-english characters from english text 
        text_string = re.sub("  +", " ", text_string) #remove multiple spaces
        text_string = re.sub(r"\s([?.!:;\,\'])", r'\1', text_string) #remove space before punctuation
        return text_string
     
    def generate_ngrams(self, n, tokens):
        ngrams = zip(*[tokens[i:] for i in range(n)])
        ngrams = list(ngrams)
        for ngram in ngrams: ngram = ''.join(ngram)
        #print(ngrams)
        return ngrams
    
    # given UNK_treated vocabulary, will generate all possible bigrams & return as list 
    def generate_bigram_matrix(self):
        bigram_matrix = [] 
        vocabulary = self.get_vocabulary() 
        for token_1 in vocabulary:
            for token_2 in vocabulary:
                bigram = (token_1, token_2)
                bigram_matrix.append(bigram)
        return bigram_matrix
    
    def get_vocabulary(self):
        return list(self.unigram_freq.keys())
    
    def calc_ngram_freq(self, ngram_list, keys = []):
        freq_dict = {}
        for key in keys:
            freq_dict[key] = 0 #initialize dict & set all values to 0         
        for item in ngram_list:
            if item in freq_dict: freq_dict[item] = freq_dict[item] + 1 #increment count 
            else: freq_dict[item] = 1 #create key if not in list [error catching]
        return freq_dict
    
    # helper function: called in calc_bigram_prob()
    def calc_mle(self, num, denom):
        if denom == 0: 
            print("DNE, returning 0 ")
            return 0 
        return num/denom 
    
    # used to calculate perplexity
    # returns the number of tokens in a given list, exluding beginning of sentence markers 
    def token_count(self, tokens_list):
        count = 0 
        for token in tokens_list:
            if token == "<s>": continue #do not count 
            else: count = count + 1 #increment count for all other tokens 
        return count 

    def calc_perplexity(self, sum_log_prob, N):
        #sum_log_prob = sum of all ngrams prob (in log) 
        #N = self.token_count # N = total number of tokens in input 
        return math.exp(-1 * sum_log_prob / N)

    def calc_bigram_prob(self, smoothing = "None"):
        mle = 0 
        self.sum_log_mle = 0 # for calculating perplexity 
        self.bigram_prob = {}
        bigram_prob_dict = {}
        unigram = ""
        for bigram, value in self.bigram_freq.items():
            unigram = bigram[0] #get the preceding token 

            if smoothing == "Laplace": 
                num = value + 1 #increment bigram count by 1 
                denom = self.unigram_freq[unigram] + len(self.get_vocabulary()) #augment unigram count by vocabulary size 
                mle = self.calc_mle(num, denom)
            else: mle = self.calc_mle(value, self.unigram_freq[unigram]) #no smoothing specified 
           
            if mle == 0:
                #print("0 MLE", bigram)
                bigram_prob_dict[bigram] = 0
            else: bigram_prob_dict[bigram] = math.log(mle) #store non-zero probabilities in log format 
            
            self.sum_log_mle = self.sum_log_mle + mle 
            
        self.perplexity = self.calc_perplexity(self.sum_log_mle, len(self.tokens))
        #print("\tPerplexity:", self.perplexity)
        #print("\tTotal Probability:", math.exp(self.sum_log_mle))
        return bigram_prob_dict
    
    def train(self, input_file, UNK_threshold = 5, smoothing = "None"):
        print("----Training model----")
        input_text = self.load_input(input_file)
        self.tokens = self.preprocess_training(input_text, UNK_threshold)
        self.unigram_freq = self.calc_ngram_freq(self.tokens, keys = self.tokens) 
        bigram_matrix = self.generate_bigram_matrix() #generates all the possible bigrams given the vocabulary
        bigrams = self.generate_ngrams(2, self.tokens) #generates all occurring bigrams from the tokenized text 
        self.bigram_freq = self.calc_ngram_freq(bigrams, bigram_matrix)
        self.bigram_prob = self.calc_bigram_prob(smoothing = smoothing)
        return
    
    def test(self, test_file):
        print("----Testing Model----\n")
        test_text = self.load_input(test_file)
        test_text = self.clean_string(test_text)
        test_lines = str.splitlines(test_text)
        perplexity_list = [] #runnuing list of model perplexity for each sentence 
        for line in test_lines: #for each line/sentence of the test file 
            test_tokens = self.tokenize(line) 
            for index, token in enumerate(test_tokens):
                if token in self.unigram_freq: continue # unigram_freq dict contains all unique tokens 
                else: test_tokens[index] = "<UNK>" # replace OOV tokens with <UNK>
            sum_log_mle = 0
            test_bigrams = self.generate_ngrams(2, tokens = test_tokens)    
            for test_bigram in test_bigrams: sum_log_mle = sum_log_mle + self.bigram_prob[test_bigram]
            perplexity = self.calc_perplexity(sum_log_mle, self.token_count(test_tokens))
            perplexity_list.append(perplexity)
        return perplexity_list 
    
    
    

In [35]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [38]:
english_train = get_rel_path("Input/LangId.train.English")
french_train = get_rel_path("Input/LangId.train.French")
italian_train = get_rel_path("Input/LangId.train.Italian")
training_files = [english_train, french_train, italian_train]

test_file = get_rel_path("Validation/LangId.test")
validation_file = get_rel_path("Validation/labels.sol")

f = open(validation_file, "r")
validation = f.read()
f.close()
validation = str.splitlines(validation)
expected_results = []
for line in validation:
    line = line.split()
    expected_results.append(line[-1])
    
def train_and_evaluate(training_files, test_file, expected_results, smoothing = "None", UNK_threshold = 5):
    # 1. train 3 models on each language 
    models_perplexities = [] 
    for i in range(len(training_files)):
        model = Letter_Bigram_Model()
        model.train(training_files[i], smoothing = smoothing, UNK_threshold = UNK_threshold) 
        perplexities = model.test(test_file) # 2. test model & obtain perplexities for each sentence 
        models_perplexities.append(perplexities)
        
    
    # 3. Compare perplexities of each sentence/model & assign a language 
    test_results = [] 
    for i in range(len(models_perplexities[0])):
        e = models_perplexities[0][i] #pull perplexities of each models for specific sentence
        f = models_perplexities[1][i]
        i = models_perplexities[2][i]
        sort = sorted([e, f, i])[:3] #sorts from least to greatest ; lower perplexity --> higher probability 
        if sort[0] == e: test_results.append("English")
        if sort[0] == f: test_results.append("French")
        if sort[0] == i: test_results.append("Italian")
    
    # 4. Compare observed results with expected results & return % accurate 
    correct_count = 0 
    #print(" Actual // Expected")
    for i in range(len(expected_results)):
        if expected_results[i] == test_results[i]: 
            #print(i, " ", test_results[i], "//", expected_results[i])
            correct_count = correct_count + 1
        #else: print(i, " ", test_results[i], "//", expected_results[i], " INCORRECT")
    accuracy = (correct_count / len(expected_results)) * 100
    print("Accuracy of Letter Bigram Model: ", accuracy, "%")
    return test_results 

In [5]:
#test_results = train_and_evaluate(training_files, test_file, expected_results, smoothing = "None")

----Training model----
----Testing Model----

----Training model----
----Testing Model----

----Training model----
----Testing Model----

Accuracy of Letter Bigram Model:  96.0 %


In [39]:
test_results = train_and_evaluate(training_files, test_file, expected_results, smoothing = "Laplace")

----Training model----
----Testing Model----

----Training model----
----Testing Model----

----Training model----
----Testing Model----

Accuracy of Letter Bigram Model:  99.0 %


In [42]:
# write results to output file 
output_file = get_rel_path("Output/letterLangId.out")
f = open(output_file, "w")
for index, result in enumerate(test_results):
    f.write(str(index + 1) + " " + str(result) + "\n")
f.close()