In [1]:
import math

In [2]:
def train_bigram(training_file, model_file):
    counts = {}
    context_counts = {}
    
    with open(training_file, 'r', encoding='utf-8') as f:
        for line in f:
            words= ['<s>'] + line.strip().lower().split() + ['</s>']
            for i in range(1, len(words)):
                bigram = ' '.join(words[i-1:i+1])
                counts[bigram] = counts.get(bigram, 0) + 1
                context_counts[words[i-1]] = context_counts.get(words[i-1], 0) + 1
                counts[words[i]] = counts.get(words[i], 0) + 1
                context_counts[''] = context_counts.get('', 0) + 1
    with open(model_file, 'w', encoding='utf-8') as f:
        for bigram, count in counts.items():
            words = bigram.split()
            context = ' '.join(words[:-1])  
            probability = count / context_counts['']
            f.write('{} {}\n'.format(bigram, probability))

In [3]:
def test_bigram(test_file, model_file, lambda1, lambda2):
    probs = {}
    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            bigram=' '.join(line.strip().split()[:-1])
            prob = line.strip().split()[-1]
            probs[bigram] = float(prob)
    V=1000000 # vocabulary size
    W=0 # number of words
    H=0 
    
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            words = ['<s>'] + line.strip().lower().split() + ['</s>']
            for i in range(1, len(words)):
                P1 = lambda1 * probs.get(words[i], 0) + (1 - lambda1) / V
                P2 = lambda2 * probs.get('{} {}'.format(words[i-1], words[i]), 0) + (1 - lambda2) * P1
                H -= math.log2(P2)
                W += 1
    entropy = H / W
    return entropy

In [4]:
train_file = './train.txt'
test_file = './test.txt'
model_file = './model.txt'

train_bigram(train_file, model_file)

lambda1=0.8
lambda2=0.2
entropy = test_bigram(test_file, model_file, lambda1, lambda2)
print(f'entropy = {entropy:.6f}')   

entropy = 15.428299
