In [4]:
import argparse
import os
import json
from collections import defaultdict
import math

In [5]:
def train_model(training_file, model_file, n):
    """Train a count-based n-gram language model and save it as JSON.

    Every non-empty line of the training file is lower-cased, wrapped in a
    single ``<s>`` / ``</s>`` pair and split on whitespace. For each window of
    ``n`` consecutive tokens, the first ``n - 1`` tokens (joined with single
    spaces) form the context and the last token is the predicted word. Lines
    with fewer than ``n`` tokens after padding contribute no n-grams.

    The saved model is a JSON object of the form
    ``{context: {next_word: count}}``. For ``n == 1`` the only context is the
    empty string.

    Args:
        training_file (str): Path to the training file.
        model_file (str): Path to the model file.
        n (int): The n-gram order; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # A non-positive order makes the slices below wrap around (negative
    # indices) and silently produces a meaningless model, so reject it before
    # touching any file.
    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n!r}')

    # context -> next word -> number of times the pair was observed
    model = defaultdict(lambda: defaultdict(int))

    with open(training_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().lower()
            # skip empty lines
            if not line:
                continue
            # pad the sentence with <s> and </s> tokens, then tokenize
            words = f'<s> {line} </s>'.split()
            # slide a window of n tokens over the sentence
            for i in range(len(words) - n + 1):
                context = ' '.join(words[i:i + n - 1])
                next_word = words[i + n - 1]
                model[context][next_word] += 1

    # defaultdict is a dict subclass, so json serializes it directly
    with open(model_file, 'w', encoding='utf-8') as f:
        json.dump(model, f)
    

In [6]:
def load_model(model_file):
    """Load a language model from a JSON file.

    The file is expected to hold a JSON object of the form
    ``{context: {next_word: count}}``.

    Both nesting levels are returned as ``defaultdict`` objects, so
    ``model[context][word]`` yields 0 for any unseen context or unseen word
    instead of raising ``KeyError``. Note that, as with any ``defaultdict``,
    such a lookup inserts the missing key into the model.

    Args:
        model_file (str): Path to the model file.

    Returns:
        model (dict): The language model, as a ``defaultdict`` mapping each
            context to a ``defaultdict(int)`` of next-word counts.
    """
    with open(model_file, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    model = defaultdict(lambda: defaultdict(int))
    # json.load produces plain dicts for the inner mappings; wrap each one so
    # a seen context followed by an unseen word reads as a count of 0.
    for context, next_word_counts in raw.items():
        model[context] = defaultdict(int, next_word_counts)
    return model

In [8]:
def test_model(test_file, model, n):
    """Compute the perplexity of an n-gram language model on a test file.

    Every non-empty line is lower-cased, wrapped in a single ``<s>`` /
    ``</s>`` pair and split on whitespace. Each window of ``n`` tokens is
    scored with the unsmoothed maximum-likelihood estimate

        P(word | context) = count(context, word) / count(context)

    where ``count(context)`` is the sum of all next-word counts stored for
    that context. No smoothing is applied.

    The model is only read (via ``.get``), so a ``defaultdict`` model is not
    grown by lookups of unseen contexts, and a plain ``dict`` model works too.

    Args:
        test_file (str): Path to the test file.
        model (dict): The language model, mapping each context string to a
            mapping of next word -> count.
        n (int): The n-gram order; must be at least 1.

    Returns:
        float: The perplexity ``exp(-sum(log P) / N)`` over the ``N`` scored
            n-grams. ``math.inf`` if any test n-gram has zero probability
            under the model (unseen context or unseen word). ``math.nan`` if
            the test file yields no n-grams at all.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n!r}')

    log_prob = 0.0
    num_ngrams = 0
    # count(context) is the same for every occurrence of a context, so
    # compute each total once instead of re-summing per n-gram.
    context_totals = {}

    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().lower()
            # skip empty lines
            if not line:
                continue
            # pad the sentence with <s> and </s> tokens, then tokenize
            words = f'<s> {line} </s>'.split()
            for i in range(len(words) - n + 1):
                context = ' '.join(words[i:i + n - 1])
                next_word = words[i + n - 1]
                num_ngrams += 1

                next_word_counts = model.get(context)
                count = next_word_counts.get(next_word, 0) if next_word_counts else 0
                if count <= 0:
                    # A single zero-probability event makes the perplexity of
                    # an unsmoothed model infinite; nothing else can change it.
                    return math.inf

                total = context_totals.get(context)
                if total is None:
                    total = sum(next_word_counts.values())
                    context_totals[context] = total
                log_prob += math.log(count / total)

    if num_ngrams == 0:
        # Perplexity is undefined when there is nothing to score.
        return math.nan
    return math.exp(-log_prob / num_ngrams)


# The "test_" prefix would otherwise make pytest collect this library function
# as a test case whenever it is imported into a test module.
test_model.__test__ = False