# Machine Exercise 3
### By: Jeryl Salas
Using a regex tokenizer, we obtain a trigram language model for that dataset with a simple interpolation to get a lambda score to get a probability score for unseen data

In [1]:
import numpy as np # For computations of probabilities, perplexities and expected count
import os # For folder and file path opening
import json # Used for loading JSON data
import random # For random selection of JSON files for training and testing
import pandas as pd # For transforming JSON data into a pandas dataframe
import nltk # Used for tokenization of training and testing set
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Jeryl
[nltk_data]     Salas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### A. Loading and Preprocessing Data

We first load the dataset and perform tokenization. The training set would consist of 1000 JSON files while testing set would containt 10 JSON files as instructed by Sir Migz. Both training and testing set are tokenized in the preprocess function.

In [2]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from typing import Iterator


def load_dataset(folder_path, n):
    """
    Using the os and JSON libraries, data gets loaded into a pandas df with training and testing data separated
    """
    files = [file for file in os.listdir(folder_path) if file.endswith('.json')]
    selected_files = random.sample(files, n)
    selected_files_test = selected_files[:10]
    selected_files_train = selected_files[10:]
    train_loaded_data = []
    test_loaded_data = []

    for file_name in selected_files_train:
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file) # Load JSON data
                train_loaded_data.append(data)
                #print(f"successfully loaded {file_name}.JSON") # Print for checking
        except (json.JSONDecodeError, FileNotFoundError) as e:
            print(f"error loading {file_name}: {e}")

    print("JSON files used for testing:")
    for file_name in selected_files_test:
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file) # Load JSON data
                test_loaded_data.append(data)
                print(f"successfully loaded {file_name}.JSON") # Print for checking
        except (json.JSONDecodeError, FileNotFoundError) as e:
            print(f"error loading {file_name}: {e}")


    pd.set_option('display.max_colwidth', 100)  
    pd.set_option('display.width', 200)         
    pd.set_option('display.max_rows', 10) 

    print(f"Successffully loaded {len(train_loaded_data) + len(test_loaded_data)} JSON files.")
    train_flattened_data = [item for sublist in train_loaded_data for item in sublist] # Flatten data 
    train_df = pd.DataFrame(train_flattened_data) # Convert to pandas dataframe   

    test_flattened_data = [item for sublist in test_loaded_data for item in sublist] # Flatten data 
    test_df = pd.DataFrame(test_flattened_data) # Convert to pandas dataframe   

    return train_df['text'], test_df['text']


def replace_characters(text: str) -> str:
    """
    Replaces specific special characters in the input text according to predefined replacement rules.
    """
    replacement_rules = {'“': '"', '”': '"', '’': "'", '--': ','}
    for symbol, replacement in replacement_rules.items():
        text = text.replace(symbol, replacement)
    return text

def generate_tokens(paragraph: str) -> Iterator[str]:
    """
    Tokenize sentences using RegexpTokenizer and appending '[END]' token on each setence
    """
    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')

    for sentence in sent_tokenize(paragraph):
        tokenized_sentence = word_tokenizer.tokenize(sentence)
        if tokenized_sentence:
            tokenized_sentence.append('[END]')
            yield tokenized_sentence

def preprocess(folder_path):
    """
    Main preprocessing function used to load and process text data and output tokens for both training and testing
    """
    train_set, test_set = load_dataset(folder_path, 1010)

    train_tokenized_sentences = []
    test_tokenized_sentences = []

    for text in train_set:
        cleaned_text = replace_characters(text.lower())
        for tokenized_sentence in generate_tokens(cleaned_text):
            train_tokenized_sentences.append(tokenized_sentence)

    for text in test_set:
        cleaned_text = replace_characters(text.lower())
        for tokenized_sentence in generate_tokens(cleaned_text):
            test_tokenized_sentences.append(tokenized_sentence)
    
    return train_tokenized_sentences, test_tokenized_sentences
    

### B. Generate N grams

With this class, we can count the unigrams, bigrams, and trigrams. This is where they can also compute the respective n gram probabilities. These functions will be accessed in both the EM Algorithm and the generation of texts.

In [3]:

class generate_Ngrams:
    """
    Main class for dealing with N grams involving functions such as counting and calculating probabilities (unigram, bigram, trigram)
    """
    def __init__(self, filename: str) -> None:
        self.sentences, self.test_sentences = preprocess(filename)
        self.unigram_counts = {}
        self.total_unigrams = sum(self.unigram_counts.values())
        self.bigram_counts = {}
        self.trigram_counts = {}
        self.unigram_prob = {}
        self.bigram_prob = {}
        self.trigram_prob = {}
        self.a = 1
        self.count()
    
    def count_ngrams(self, tokens, n):
        """
        Used for countring bigrams and trigrams
        """
        ngram_counts = {}
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            if ngram in ngram_counts:
                ngram_counts[ngram] += 1
            else:
                ngram_counts[ngram] = 1
        return ngram_counts
    
    def count(self):
        """
        Main function for storing unigram, bigram, and trigram counts on a dict
        """
        for tokens in self.sentences:
            for unigram in tokens:
                self.unigram_counts[unigram] = self.unigram_counts.get(unigram, 0) + 1  
            
            bigrams = self.count_ngrams(tokens, 2)
            for bigram, count in bigrams.items():
                if bigram in self.bigram_counts:
                    self.bigram_counts[bigram] += count
                else:
                    self.bigram_counts[bigram] = count
                 
            trigrams = self.count_ngrams(tokens, 3)
            for trigram, count in trigrams.items():
                if trigram in self.trigram_counts:
                    self.trigram_counts[trigram] += count
                else:
                    self.trigram_counts[trigram] = count

    def calc_unigram_prob(self, word):
        """
        Function for calculating unigram probability. Used Laplace smoothing with a = 1
        """
        uni_count = self.unigram_counts.get(word, 0)
        prob = (uni_count + self.a) / (self.total_unigrams + self.a * len(self.unigram_counts))
        #print(f"unigram prob = {prob}")
        return prob
    
    def calc_bigram_prob(self, w1, w2):
        """
        Function for calculating bigram probability. Used Laplace smoothing with a = 1
        """
        uni_count = self.unigram_counts.get(w1, 0)
        bi_count = self.bigram_counts.get((w1, w2), 0)
        vocab_size = len(self.unigram_counts)
        prob = (bi_count + self.a) / (uni_count + self.a * vocab_size)
        #print(f"bigram prob = {prob}")
        return prob
    
    def calc_trigram_prob(self, w1, w2, w3):
        """
        Function for calculating trigram probability. Used Laplace smoothing with a = 1
        """
        bi_count = self.bigram_counts.get((w1, w2), 0)
        tri_count = self.trigram_counts.get((w1, w2, w3), 0)
        vocab_size = len(self.unigram_counts)

        prob = (tri_count + self.a) / (bi_count + self.a * vocab_size)
        #print(f"trigram prob = {prob}")
        return prob
    
        

### C. Perplexity Computation and Lambda Optimizer

With this step, we compute interpolation probabilities for each token in training sentences and we use n gram probabilities as our basis in updating our lambdas via EM algorithm. The changes in perplexity will also be observed each iteration. For now, we use max iterations of 20 with stopping critera. The model also contains a generate_sentence function used to generate sample sentences.

In [4]:
class EM_Algorithm:
    """
    Main class for the training model. Functions include interpolation, updating of expectations and lambdas, and the main optimization function that facilitates the EM Algorithm
    """
    def __init__(self, gen_data, lambdas, eps):
        self.data = gen_data
        self.lambdas = lambdas
        self.eps = eps
        self.total_unigrams = sum(self.data.unigram_counts.values())

    def compute_interpolated_probability(self, w1, w2, w3, lambdas):
        """
        Computes interpolated probability which inclues the unigram, bigram, and trigram probabilities
        """
        p_tri = self.data.calc_trigram_prob(w1, w2, w3)
        p_bi = self.data.calc_bigram_prob(w2, w3)
        p_uni = self.data.calc_unigram_prob(w3)
        inter_p = lambdas[0] * p_tri + lambdas[1] * p_bi + lambdas[2] * p_uni
        inter_p = max(inter_p, 1e-12)
        #print(f"inter prob = {inter_p}")
        log_inter_p = np.log(inter_p)
        #print(f"log inter prob = {log_inter_p}")
        return log_inter_p, inter_p, p_tri, p_bi, p_uni

    def expectation_step(self, sentences, lambdas):
        """
        Function used to update expectations of counts per sentence iteration and computes total log probability for perplexity computation
        """
        total_log_prob = 0
        expected_counts = np.zeros(3) 
        #print(f"initial exp count = {expected_counts}")

        for tokens in sentences:
            #print(f"for sentence = {s}")
            #print("_____________________")
            for i in range(2, len(tokens)):
                w1, w2, w3 = tokens[i-2], tokens[i-1], tokens[i]
                #print(f"for iter {i}: w1={w1}, w2={w2}, w3={w3}, lambdas={lambdas}, tokens={tokens}")
                log_inter_p, total_p, p_tri, p_bi, p_uni =  self.compute_interpolated_probability(w1, w2, w3, lambdas)
                total_log_prob += log_inter_p
                #print(f"cumulative log prob = {total_log_prob}")

                if total_p > 0:
                    expected_counts[0] += (lambdas[0] * p_tri) / total_p
                    expected_counts[1] += (lambdas[1] * p_bi) / total_p
                    expected_counts[2] += (lambdas[2] * p_uni) / total_p
                #print(f"updt. expected counts = {expected_counts}")

        #print(f"after exp. step: total log prob = {total_log_prob}, exp counts = {expected_counts}")
        #print("___________________________________________________________________________________")
        return total_log_prob, expected_counts

    def update_lambdas(self, exp_counts):
        """
        Function used to update lambdas
        """
        total_counts = np.sum(exp_counts)
        new_lambdas = exp_counts / total_counts
        print(f"updt. lambdas = {new_lambdas}")
        return new_lambdas


    def optimize(self):
        """
        Main function that facilitates the update of lambdas and computation of perplexity
        """
        prev_perplexity = float('inf')
        for iteration in range(20):
            total_log_prob, expected_counts = self.expectation_step(generated_data.sentences, self.lambdas)
            average_log_prob = total_log_prob / self.total_unigrams
            print(f"avg log prob = {total_log_prob} / {self.total_unigrams} = {average_log_prob}")
            current_perplexity = np.exp(-average_log_prob)
            print(f"updt. perplexity = {current_perplexity}")
            self.lambdas = self.update_lambdas(expected_counts)

            if abs(current_perplexity - prev_perplexity) < self.eps:
                print(f"Converged after {iteration} iterations.")
                break

            print(f"Iteration {iteration}, Perplexity: {current_perplexity}")
            prev_perplexity = current_perplexity
        

        return self.lambdas, current_perplexity
    
    def generate_sentence(self, max_length=20):
        """
        Function that generates sentences using the model's hyperparameters
        """
        sentence = ['<s>', '<s>'] 
        while len(sentence) < max_length:
            w1, w2 = sentence[-2], sentence[-1]
            possible_words = list(self.data.unigram_counts.keys())
            probabilities = []

            for w3 in possible_words:
                _, inter_p, _, _, _ = self.compute_interpolated_probability(w1, w2, w3, self.lambdas)
                probabilities.append(inter_p)
            
            probabilities = np.array(probabilities) / sum(probabilities)

            next_word = np.random.choice(possible_words, p=probabilities)
            if next_word == '</s>':
                break
            sentence.append(next_word)
        
        return ' '.join(sentence[2:])


generated_data = generate_Ngrams(r'C:\Users\Jeryl Salas\Documents\AI 351\MEx 2 Tokenizer\coleridgeinitiative-show-us-the-data\train')
lambdas = np.array([0.9, 0.9, 0.9])
eps = 1e-10
model = EM_Algorithm(generated_data, lambdas, eps)
lambdas, perp = model.optimize()


JSON files used for testing:
successfully loaded d6cd8944-d555-455f-80ad-b3c7b4f91378.json.JSON
successfully loaded c7f66770-5fb9-4e86-8d22-51a0aa2bbee9.json.JSON
successfully loaded d7a567cf-31a1-4b6f-b1bf-c7eb7f0acea4.json.JSON
successfully loaded 1fb0ce33-bbdf-4fcb-b110-5494cf84fc1f.json.JSON
successfully loaded 57874f05-16b2-4951-bd04-9c8273c8f465.json.JSON
successfully loaded 10baef64-43cf-477e-9b1b-8e0f4b568ca4.json.JSON
successfully loaded b345ff09-64bf-4473-935f-fef4f9da6f23.json.JSON
successfully loaded ebf6d539-b774-4828-9500-c93302fd4e67.json.JSON
successfully loaded 47c625f2-e722-41b3-8373-9a07224f99a0.json.JSON
successfully loaded c29968c7-7f30-400a-8725-0f62cbcb8061.json.JSON
Successffully loaded 1010 JSON files.
avg log prob = -23177273.045210753 / 7424014 = -3.121932831108717
updt. perplexity = 22.6901935990979
updt. lambdas = [0.0198029  0.03842729 0.94176982]
Iteration 0, Perplexity: 22.6901935990979
avg log prob = -23336055.638452355 / 7424014 = -3.143320532322859
up

### D. Testing N-gram model
We test the N gram model's performance with computation of average perplexity using the testing set consisting 10 JSON files

In [5]:
def calculate_perplexity(model, test_sentences):
    """
    Function that calculates perplexity of the LM using the testing set
    """
    total_perplexity = 0
    num_sentences = len(test_sentences)

    total_log_prob = 0
    for tokens in test_sentences:
        for i in range(2, len(tokens)):
            w1, w2, w3 = tokens[i-2], tokens[i-1], tokens[i]
            log_inter_p, _, _, _, _ =  model.compute_interpolated_probability(w1, w2, w3, lambdas)
            total_log_prob += log_inter_p
    
    average_log_prob = total_log_prob / model.total_unigrams
    return np.exp(-average_log_prob)

# Printing of results
avg_perp = calculate_perplexity(model, generated_data.test_sentences)
print(f"Optimized lambdas: {lambdas}")  
print(f"Training Perplexity: {perp}") 
print(f"Test Perplexity on Unseen Data: {avg_perp}") 

Optimized lambdas: [2.39969363e-18 1.08355802e-15 1.00000000e+00]
Training Perplexity: 21.99570975346833
Test Perplexity on Unseen Data: 1.0279437846621553


### E. Generate Sample Text
We use generate sentence function inside the model in order to generate two sentences using the model's optimized lambdas

In [6]:
# Generation of sentences
sentence1 = model.generate_sentence()
sentence2 = model.generate_sentence()

print("Generated Sentence 1:", sentence1)
print("Generated Sentence 2:", sentence2)

Generated Sentence 1: is range size et than org mahoney estimate nodes confirmed grain scanner apoe adc-nlc became the lower such
Generated Sentence 2: image statistics description 5 nonexplanatory in [END] [END] huxley each rotations acidification j low-income delaware national scenarios relative


### F. Results
We were able create a trigram model based on training data consisting of 1000 JSON files and test it's perplexity on testing data consisting of 10 JSON files that is completely seperate from the training set. We were able to get optimized lambdas ([2.39969363e-18 1.08355802e-15 1.00000000e+00]) using the EM Algorithm with the algorithm stopping at the eleventh iteration. With these lambdas, we were able to get 21.996 on training perplexity while 1.028 on perplexity on unseen / test data. We were also able to generate two sentences as shown above. 


10 random JSON files used for testing:

d6cd8944-d555-455f-80ad-b3c7b4f91378.json.JSON <br>
c7f66770-5fb9-4e86-8d22-51a0aa2bbee9.json.JSON <br>
d7a567cf-31a1-4b6f-b1bf-c7eb7f0acea4.json.JSON <br>
1fb0ce33-bbdf-4fcb-b110-5494cf84fc1f.json.JSON <br>
57874f05-16b2-4951-bd04-9c8273c8f465.json.JSON <br>
10baef64-43cf-477e-9b1b-8e0f4b568ca4.json.JSON <br>
b345ff09-64bf-4473-935f-fef4f9da6f23.json.JSON <br>
ebf6d539-b774-4828-9500-c93302fd4e67.json.JSON <br>
47c625f2-e722-41b3-8373-9a07224f99a0.json.JSON <br>
c29968c7-7f30-400a-8725-0f62cbcb8061.json.JSON
