In [2]:
from collections import defaultdict

# 1️⃣ Read Training Corpus
corpus = [
    "<s> I love NLP </s>",
    "<s> I love deep learning </s>",
    "<s> deep learning is fun </s>",
]

# 2️⃣ Compute Unigram and Bigram Counts
# Both tables are defaultdict(int): a key that was never counted reads as 0.
unigram_counts = defaultdict(int)
bigram_counts = defaultdict(int)

for line in corpus:
    # Whitespace tokenisation; the <s> and </s> markers are ordinary tokens.
    tokens = line.split()

    # Each token occurrence adds one to its unigram count.
    for token in tokens:
        unigram_counts[token] += 1

    # Pair every token with its successor to get (previous, current) bigrams.
    for pair in zip(tokens, tokens[1:]):
        bigram_counts[pair] += 1

# Print counts (in first-seen order, since dicts keep insertion order)
print("Unigram Counts:")
for word, count in unigram_counts.items():
    print(f"{word}: {count}")

print("\nBigram Counts:")
for bigram, count in bigram_counts.items():
    print(f"{bigram}: {count}")


# 3️⃣ Estimate Bigram Probabilities (MLE)
def bigram_probability(w1, w2):
    """Return the maximum-likelihood estimate of P(w2 | w1).

    The estimate is count(w1, w2) / count(w1), read from the module-level
    ``bigram_counts`` and ``unigram_counts`` tables.

    Args:
        w1: The context (preceding) token.
        w2: The token whose conditional probability is wanted.

    Returns:
        The estimated probability as a float. Returns 0.0 when the bigram
        was never observed, and also when ``w1`` itself was never observed
        (the estimate is undefined there; 0.0 avoids a ZeroDivisionError).
    """
    # Use .get() rather than indexing: the tables are defaultdicts, and
    # indexing a missing key would insert a zero-count entry as a side effect.
    context_count = unigram_counts.get(w1, 0)
    if context_count == 0:
        return 0.0
    return bigram_counts.get((w1, w2), 0) / context_count


# 4️⃣ Function to Calculate Sentence Probability
def sentence_probability(sentence):
    """Return the bigram-model probability of a whitespace-tokenised sentence.

    The result is the product of ``bigram_probability(prev, cur)`` over every
    pair of adjacent tokens. A sentence with fewer than two tokens has no
    bigrams, so the result is the empty product, 1.0.
    """
    tokens = sentence.split()
    result = 1.0

    # Walk adjacent (previous, current) pairs from left to right.
    for previous, current in zip(tokens, tokens[1:]):
        result *= bigram_probability(previous, current)

    return result


# 5️⃣ Test Sentences
s1 = "<s> I love NLP </s>"
s2 = "<s> I love deep learning </s>"

# Score both candidate sentences with the bigram model.
p1, p2 = sentence_probability(s1), sentence_probability(s2)

print("\nSentence 1 Probability:", p1)
print("Sentence 2 Probability:", p2)


# 6️⃣ Print Which Sentence is Preferred
# Choose the message first, then print it once.
if p1 > p2:
    verdict = "\nThe model prefers Sentence 1 because it has higher bigram probability."
elif p2 > p1:
    verdict = "\nThe model prefers Sentence 2 because it has higher bigram probability."
else:
    verdict = "\nBoth sentences are equally probable."

print(verdict)


Unigram Counts:
<s>: 3
I: 2
love: 2
NLP: 1
</s>: 3
deep: 2
learning: 2
is: 1
fun: 1

Bigram Counts:
('<s>', 'I'): 2
('I', 'love'): 2
('love', 'NLP'): 1
('NLP', '</s>'): 1
('love', 'deep'): 1
('deep', 'learning'): 2
('learning', '</s>'): 1
('<s>', 'deep'): 1
('learning', 'is'): 1
('is', 'fun'): 1
('fun', '</s>'): 1

Sentence 1 Probability: 0.3333333333333333
Sentence 2 Probability: 0.16666666666666666

The model prefers Sentence 1 because it has higher bigram probability.
