In [1]:
import math

In [2]:
def train_unigram(training_file, model_file):
    """Estimate maximum-likelihood unigram probabilities and save them.

    Every line of ``training_file`` is lower-cased, split on whitespace and
    terminated with the end-of-sentence token ``'</s>'``. The relative
    frequency of each token is then written to ``model_file`` as one
    ``"<token> <probability>"`` line per distinct token, in order of first
    appearance.

    Args:
        training_file: Path of the UTF-8 training corpus to read.
        model_file: Path of the UTF-8 model file to create or overwrite.

    Returns:
        A ``(counts, total_count)`` tuple: the token -> occurrence-count dict
        and the total number of tokens seen (including ``'</s>'`` tokens).
    """
    counts = {}
    total_count = 0

    with open(training_file, 'r', encoding='utf-8') as corpus:
        for sentence in corpus:
            tokens = sentence.lower().split() + ['</s>']
            total_count += len(tokens)
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1

    # With an empty corpus `counts` is empty, so the division never runs.
    with open(model_file, 'w', encoding='utf-8') as model:
        model.writelines(
            '{} {}\n'.format(token, frequency / total_count)
            for token, frequency in counts.items()
        )
    return counts, total_count

In [3]:
def test_unigram(model_file, test_file):
    lambda1=0.95
    lambda_unk=1-lambda1
    V=1000000
    probabilities = {}
    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            word, prob = line.split()
            probabilities[word] = float(prob)
    W=0 #total words
    H=0
    unk=0   #total unknown words
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.lower().split()
            words.append('</s>')
            for word in words:
                W+=1
                P=lambda_unk/V
                if word in probabilities:
                    P+=lambda1*probabilities[word]
                else:
                    unk+=1
                H -= math.log2(P)
    print('entropy = {:.6f}'.format(H/W))   #The average entropy of the test file
    print('coverage = {:.2f}%'.format((W-unk)/W*100))   #The percentage of known words in the corpus


In [4]:
# Train on the training corpus, then score the saved model on the test corpus.
train_unigram(
    training_file='./training_data.txt',
    model_file='./unigram_model.txt',
)
test_unigram(
    model_file='./unigram_model.txt',
    test_file='./test_data.txt',
)

entropy = 15.107701
coverage = 51.22%
