<a href="https://colab.research.google.com/github/susuhlaingmyk26-tech/Colab-project/blob/main/unseen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ===============================
# Myanmar Text N-Gram Language Model (Bigram)
# ===============================

# 1. Basic libraries
import math
from collections import Counter, defaultdict

# 2. Corpus (training data): four whitespace-delimited sentences from which
# the unigram and bigram counts are collected.
# NOTE(review): the non-ASCII text in these literals looks mis-decoded
# (UTF-8 bytes read through a legacy 8-bit codec). The strings are left
# untouched here -- confirm against the original notebook.
corpus = [
    "·Äô·Äº·Äî·Ä∫·Äô·Ä¨ ·Äû·ÄÑ·Ä∫·Äπ·ÄÅ·Äª·Ä¨ ·ÄÖ·Ä≠·Äê·Ä∫·Äù·ÄÑ·Ä∫·ÄÖ·Ä¨·Ä∏ ·ÄÖ·Äõ·Ä¨·ÄÄ·Ä±·Ä¨·ÄÑ·Ä∫·Ä∏·Äê·Äö·Ä∫",
    "·Äô·Äº·Äî·Ä∫·Äô·Ä¨ NLP ·ÄÄ·Ä≠·ÄØ ·Äú·Ä±·Ä∑·Äú·Ä¨·Äî·Ä±·Äê·Äö·Ä∫",
    "NLP ·Äû·Ää·Ä∫ ·ÄÖ·Ä≠·Äê·Ä∫·Äù·ÄÑ·Ä∫·ÄÖ·Ä¨·Ä∏ ·ÄÖ·Äõ·Ä¨·ÄÄ·Ä±·Ä¨·ÄÑ·Ä∫·Ä∏·Äê·Äö·Ä∫",
    "·ÄÄ·Äª·ÄΩ·Äî·Ä∫·Äê·Ä±·Ä¨·Ä∫ ·Äô·Äº·Äî·Ä∫·Äô·Ä¨ ·ÄÄ·Ä≠·ÄØ ·ÄÄ·Äº·Ä≠·ÄØ·ÄÄ·Ä∫·Äê·Äö·Ä∫"
]

# 3. Tokenization
def tokenize(sentence):
    """Split *sentence* on whitespace and wrap it in boundary markers.

    Returns a list that starts with ``"<s>"``, ends with ``"</s>"`` and holds
    the whitespace-separated words of the sentence in between.
    """
    words = sentence.strip().split()
    return ["<s>", *words, "</s>"]

# Flatten every padded training sentence into one continuous token stream.
tokenized_corpus = []
for sent in corpus:
    tokenized_corpus += tokenize(sent)

# 4. Build n-grams
def build_ngrams(tokens, n):
    """Return every run of *n* consecutive items of *tokens* as a tuple.

    The n-grams are listed in order of their starting position. The list is
    empty when *tokens* holds fewer than *n* items.
    """
    last_start = len(tokens) - n
    return [tuple(tokens[start:start + n]) for start in range(last_start + 1)]

# 5. Unigram and bigram statistics over the flattened training stream.
unigram_counts = Counter(tokenized_corpus)
V = len(unigram_counts)  # vocabulary size, boundary markers included

# NOTE(review): the bigrams are taken from the concatenated stream, so pairs
# that straddle a sentence boundary ("</s>", "<s>") are counted as well.
bigrams = build_ngrams(tokenized_corpus, 2)
bigram_counts = Counter(bigrams)

# 6. Bigram probability (no smoothing)
def bigram_prob(w1, w2):
    """Return the unsmoothed (maximum-likelihood) estimate of P(w2 | w1).

    Computed as count(w1, w2) / count(w1) from the module-level
    ``bigram_counts`` and ``unigram_counts``.

    Returns 0.0 when *w1* has no recorded occurrences, instead of raising
    ZeroDivisionError.
    """
    context_count = unigram_counts[w1]
    if context_count == 0:
        # Unseen context word: nothing was ever observed after it.
        return 0.0
    return bigram_counts[(w1, w2)] / context_count

# 7. Add-one (Laplace) smoothing
def bigram_add_one(w1, w2):
    """Return the add-one smoothed estimate of P(w2 | w1).

    One is added to the bigram count, and the vocabulary size ``V`` is added
    to the count of *w1*.
    """
    smoothed_pair_count = bigram_counts[(w1, w2)] + 1
    smoothed_context_count = unigram_counts[w1] + V
    return smoothed_pair_count / smoothed_context_count

# 8. Sentence probability
def sentence_probability(sentence, smoothing="addone"):
    """Return the bigram-model probability of *sentence*.

    The sentence is tokenized with ``tokenize`` and the probabilities of all
    adjacent token pairs are multiplied together.

    Args:
        sentence: Whitespace-delimited sentence text.
        smoothing: ``"addone"`` to score pairs with ``bigram_add_one`` or
            ``"nosmooth"`` to score them with ``bigram_prob``.

    Returns:
        The product of the pair probabilities as a float. In ``"nosmooth"``
        mode the result is ``0.0`` as soon as an unseen bigram is met.

    Raises:
        ValueError: If *smoothing* is not one of the supported modes.
    """
    # Reject unknown modes up front: otherwise no factor would ever be
    # applied and the function would silently report a probability of 1.0.
    if smoothing not in ("addone", "nosmooth"):
        raise ValueError(
            f"unknown smoothing mode: {smoothing!r} "
            "(expected 'addone' or 'nosmooth')"
        )

    tokens = tokenize(sentence)
    prob = 1.0

    for w1, w2 in zip(tokens, tokens[1:]):
        if smoothing == "addone":
            prob *= bigram_add_one(w1, w2)
        else:
            # Short-circuit on an unseen bigram; this also keeps
            # bigram_prob from being called with an unseen context word.
            if bigram_counts[(w1, w2)] == 0:
                return 0.0
            prob *= bigram_prob(w1, w2)

    return prob

# 9. Perplexity
def perplexity(sentence, smoothing="addone"):
    """Return the per-transition perplexity of *sentence*.

    The sentence probability ``p`` comes from ``sentence_probability`` and
    ``N`` is the number of adjacent token pairs (padded token count minus
    one). The result is ``(1 / p) ** (1 / N)``, or ``float("inf")`` when
    ``p`` is zero.
    """
    likelihood = sentence_probability(sentence, smoothing)
    if likelihood == 0:
        return float("inf")

    transitions = len(tokenize(sentence)) - 1
    return (1 / likelihood) ** (1 / transitions)

# 10. Test sentence: score one example sentence with the default (add-one)
# smoothing and print both metrics.
# NOTE(review): the literal below looks mis-decoded in the same way as the
# corpus strings; it is left untouched here.
test_sentence = "·Äô·Äº·Äî·Ä∫·Äô·Ä¨ NLP ·ÄÄ·Ä≠·ÄØ ·ÄÄ·Äº·Ä≠·ÄØ·ÄÄ·Ä∫·Äê·Äö·Ä∫"

print("Sentence Probability:", sentence_probability(test_sentence))
print("Perplexity:", perplexity(test_sentence))

# 11. Count of counts: how many distinct bigrams occur exactly r times
# (the statistic Good-Turing style estimates are built from).
count_of_counts = Counter(bigram_counts.values())
print("Count of counts:", count_of_counts)

# 12. Simplified Good-Turing estimate for one unseen bigram: the share
# N1 / N is split evenly across the N0 bigrams that were never observed.
N = sum(bigram_counts.values())                     # total bigram occurrences
N0 = len(unigram_counts) ** 2 - len(bigram_counts)  # V^2 minus seen bigram types
N1 = count_of_counts[1]                             # bigram types seen exactly once

p_unseen = N1 / (N * N0)
print("Unseen bigram probability:", p_unseen)


Sentence Probability: 7.849293563579277e-05
Perplexity: 6.6226817481398275
Count of counts: Counter({1: 14, 2: 3, 3: 1})
Unseen bigram probability: 0.004830917874396135
