<a href="https://colab.research.google.com/github/thanusree02/Natural-Language-Processing/blob/main/NLP_LAB_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Required Libraries

In [47]:
# STEP 2 — Import Required Libraries

# re → used for text cleaning (remove punctuation & numbers)
import re

# collections → helps count words and n-grams easily
from collections import Counter, defaultdict

# math → used for probability & perplexity calculations
import math

# nltk → tokenization and stopword removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required nltk data (run once)
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Load Dataset

In [48]:
# STEP 3 — Load Dataset

documents = [
    "Artificial intelligence is transforming healthcare and education.",
    "Machine learning models learn patterns from large datasets.",
    "Deep learning improves image recognition and speech processing.",
    "Ethical AI ensures fairness, transparency, and accountability."
]

print("Sample dataset:")
for doc in documents:
    print("-", doc)


Sample dataset:
- Artificial intelligence is transforming healthcare and education.
- Machine learning models learn patterns from large datasets.
- Deep learning improves image recognition and speech processing.
- Ethical AI ensures fairness, transparency, and accountability.


Preprocess Text

In [49]:
# STEP 4 — Preprocess Text

stop_words = set(stopwords.words('english'))

def preprocess(sentence):
    # Convert to lowercase
    sentence = sentence.lower()

    # Remove punctuation and numbers
    sentence = re.sub(r'[^a-z\s]', '', sentence)

    # Tokenize
    tokens = word_tokenize(sentence)

    # Remove stopwords (optional)
    tokens = [word for word in tokens if word not in stop_words]

    # Add start/end tokens
    tokens = ['<s>'] + tokens + ['</s>']

    return tokens

processed_docs = [preprocess(doc) for doc in documents]

print("Processed sentences:")
for sent in processed_docs:
    print(sent)


Processed sentences:
['<s>', 'artificial', 'intelligence', 'transforming', 'healthcare', 'education', '</s>']
['<s>', 'machine', 'learning', 'models', 'learn', 'patterns', 'large', 'datasets', '</s>']
['<s>', 'deep', 'learning', 'improves', 'image', 'recognition', 'speech', 'processing', '</s>']
['<s>', 'ethical', 'ai', 'ensures', 'fairness', 'transparency', 'accountability', '</s>']


Unigram

In [50]:
# Unigram
unigram_counts = Counter()

for sent in processed_docs:
    unigram_counts.update(sent)

total_unigrams = sum(unigram_counts.values())

print("Unigram Counts:")
print(unigram_counts)


Unigram Counts:
Counter({'<s>': 4, '</s>': 4, 'learning': 2, 'artificial': 1, 'intelligence': 1, 'transforming': 1, 'healthcare': 1, 'education': 1, 'machine': 1, 'models': 1, 'learn': 1, 'patterns': 1, 'large': 1, 'datasets': 1, 'deep': 1, 'improves': 1, 'image': 1, 'recognition': 1, 'speech': 1, 'processing': 1, 'ethical': 1, 'ai': 1, 'ensures': 1, 'fairness': 1, 'transparency': 1, 'accountability': 1})


Bigram

In [51]:
# Bigram
bigram_counts = Counter()

for sent in processed_docs:
    for i in range(len(sent)-1):
        bigram = (sent[i], sent[i+1])
        bigram_counts[bigram] += 1

print("\nBigram Counts:")
print(bigram_counts)



Bigram Counts:
Counter({('<s>', 'artificial'): 1, ('artificial', 'intelligence'): 1, ('intelligence', 'transforming'): 1, ('transforming', 'healthcare'): 1, ('healthcare', 'education'): 1, ('education', '</s>'): 1, ('<s>', 'machine'): 1, ('machine', 'learning'): 1, ('learning', 'models'): 1, ('models', 'learn'): 1, ('learn', 'patterns'): 1, ('patterns', 'large'): 1, ('large', 'datasets'): 1, ('datasets', '</s>'): 1, ('<s>', 'deep'): 1, ('deep', 'learning'): 1, ('learning', 'improves'): 1, ('improves', 'image'): 1, ('image', 'recognition'): 1, ('recognition', 'speech'): 1, ('speech', 'processing'): 1, ('processing', '</s>'): 1, ('<s>', 'ethical'): 1, ('ethical', 'ai'): 1, ('ai', 'ensures'): 1, ('ensures', 'fairness'): 1, ('fairness', 'transparency'): 1, ('transparency', 'accountability'): 1, ('accountability', '</s>'): 1})


Trigram

In [52]:
# Trigram
trigram_counts = Counter()

for sent in processed_docs:
    for i in range(len(sent)-2):
        trigram = (sent[i], sent[i+1], sent[i+2])
        trigram_counts[trigram] += 1

print("\nTrigram Counts:")
print(trigram_counts)



Trigram Counts:
Counter({('<s>', 'artificial', 'intelligence'): 1, ('artificial', 'intelligence', 'transforming'): 1, ('intelligence', 'transforming', 'healthcare'): 1, ('transforming', 'healthcare', 'education'): 1, ('healthcare', 'education', '</s>'): 1, ('<s>', 'machine', 'learning'): 1, ('machine', 'learning', 'models'): 1, ('learning', 'models', 'learn'): 1, ('models', 'learn', 'patterns'): 1, ('learn', 'patterns', 'large'): 1, ('patterns', 'large', 'datasets'): 1, ('large', 'datasets', '</s>'): 1, ('<s>', 'deep', 'learning'): 1, ('deep', 'learning', 'improves'): 1, ('learning', 'improves', 'image'): 1, ('improves', 'image', 'recognition'): 1, ('image', 'recognition', 'speech'): 1, ('recognition', 'speech', 'processing'): 1, ('speech', 'processing', '</s>'): 1, ('<s>', 'ethical', 'ai'): 1, ('ethical', 'ai', 'ensures'): 1, ('ai', 'ensures', 'fairness'): 1, ('ensures', 'fairness', 'transparency'): 1, ('fairness', 'transparency', 'accountability'): 1, ('transparency', 'accountabilit

Conditional Probabilities

In [53]:
# Bigram probabilities
bigram_probs = {}

for (w1, w2), count in bigram_counts.items():
    bigram_probs[(w1, w2)] = count / unigram_counts[w1]

print("\nBigram Probabilities:")
print(bigram_probs)



Bigram Probabilities:
{('<s>', 'artificial'): 0.25, ('artificial', 'intelligence'): 1.0, ('intelligence', 'transforming'): 1.0, ('transforming', 'healthcare'): 1.0, ('healthcare', 'education'): 1.0, ('education', '</s>'): 1.0, ('<s>', 'machine'): 0.25, ('machine', 'learning'): 1.0, ('learning', 'models'): 0.5, ('models', 'learn'): 1.0, ('learn', 'patterns'): 1.0, ('patterns', 'large'): 1.0, ('large', 'datasets'): 1.0, ('datasets', '</s>'): 1.0, ('<s>', 'deep'): 0.25, ('deep', 'learning'): 1.0, ('learning', 'improves'): 0.5, ('improves', 'image'): 1.0, ('image', 'recognition'): 1.0, ('recognition', 'speech'): 1.0, ('speech', 'processing'): 1.0, ('processing', '</s>'): 1.0, ('<s>', 'ethical'): 0.25, ('ethical', 'ai'): 1.0, ('ai', 'ensures'): 1.0, ('ensures', 'fairness'): 1.0, ('fairness', 'transparency'): 1.0, ('transparency', 'accountability'): 1.0, ('accountability', '</s>'): 1.0}


In [54]:
# Trigram probabilities
trigram_probs = {}

for (w1, w2, w3), count in trigram_counts.items():
    trigram_probs[(w1, w2, w3)] = count / bigram_counts[(w1, w2)]

print("\nTrigram Probabilities:")
print(trigram_probs)



Trigram Probabilities:
{('<s>', 'artificial', 'intelligence'): 1.0, ('artificial', 'intelligence', 'transforming'): 1.0, ('intelligence', 'transforming', 'healthcare'): 1.0, ('transforming', 'healthcare', 'education'): 1.0, ('healthcare', 'education', '</s>'): 1.0, ('<s>', 'machine', 'learning'): 1.0, ('machine', 'learning', 'models'): 1.0, ('learning', 'models', 'learn'): 1.0, ('models', 'learn', 'patterns'): 1.0, ('learn', 'patterns', 'large'): 1.0, ('patterns', 'large', 'datasets'): 1.0, ('large', 'datasets', '</s>'): 1.0, ('<s>', 'deep', 'learning'): 1.0, ('deep', 'learning', 'improves'): 1.0, ('learning', 'improves', 'image'): 1.0, ('improves', 'image', 'recognition'): 1.0, ('image', 'recognition', 'speech'): 1.0, ('recognition', 'speech', 'processing'): 1.0, ('speech', 'processing', '</s>'): 1.0, ('<s>', 'ethical', 'ai'): 1.0, ('ethical', 'ai', 'ensures'): 1.0, ('ai', 'ensures', 'fairness'): 1.0, ('ensures', 'fairness', 'transparency'): 1.0, ('fairness', 'transparency', 'account

Add-One (Laplace) Smoothing

In [55]:
vocab_size = len(unigram_counts)

def laplace_bigram_prob(w1, w2):
    return (bigram_counts[(w1, w2)] + 1) / (unigram_counts[w1] + vocab_size)


Sentence Probability

In [56]:
test_sentences = [
    "AI improves healthcare",
    "machine learning models",
    "ethical AI ensures fairness",
    "deep learning improves speech",
    "AI models learn patterns"
]


Unigram Probability

In [57]:
def unigram_prob(sentence):
    tokens = preprocess(sentence)
    prob = 1
    for word in tokens:
        prob *= (unigram_counts[word] + 1) / (total_unigrams + vocab_size)
    return prob


Bigram Probability

In [58]:
def bigram_prob(sentence):
    tokens = preprocess(sentence)
    prob = 1
    for i in range(len(tokens)-1):
        prob *= laplace_bigram_prob(tokens[i], tokens[i+1])
    return prob


Trigram Probability

In [59]:
def trigram_prob(sentence):
    tokens = preprocess(sentence)
    prob = 1
    for i in range(len(tokens)-2):
        count = trigram_counts[(tokens[i], tokens[i+1], tokens[i+2])] + 1
        base = bigram_counts[(tokens[i], tokens[i+1])] + vocab_size
        prob *= count / base
    return prob


In [60]:
for s in test_sentences:
    print("\nSentence:", s)
    print("Unigram:", unigram_prob(s))
    print("Bigram:", bigram_prob(s))
    print("Trigram:", trigram_prob(s))



Sentence: AI improves healthcare
Unigram: 2.797498983874935e-07
Bigram: 1.6935087808430286e-06
Trigram: 5.689576695493856e-05

Sentence: machine learning models
Unigram: 4.1962484758124015e-07
Bigram: 1.3064210595074791e-05
Trigram: 0.00020322105370116343

Sentence: ethical AI ensures fairness
Unigram: 9.48304740296588e-09
Bigram: 1.003560759018091e-06
Trigram: 1.5053411385271365e-05

Sentence: deep learning improves speech
Unigram: 1.422457110444882e-08
Bigram: 4.838596516694367e-07
Trigram: 7.81619437312167e-06

Sentence: AI models learn patterns
Unigram: 9.48304740296588e-09
Bigram: 2.5089018975452275e-07
Trigram: 4.058408616813175e-06


Perplexity Calculation

In [61]:
def perplexity(sentence, model="bigram"):
    tokens = preprocess(sentence)
    N = len(tokens)

    if model == "unigram":
        prob = unigram_prob(sentence)
    elif model == "bigram":
        prob = bigram_prob(sentence)
    else:
        prob = trigram_prob(sentence)

    return pow(1/prob, 1/N)


In [62]:
for s in test_sentences:
    print("\nSentence:", s)
    print("Unigram Perplexity:", perplexity(s, "unigram"))
    print("Bigram Perplexity:", perplexity(s, "bigram"))
    print("Trigram Perplexity:", perplexity(s, "trigram"))



Sentence: AI improves healthcare
Unigram Perplexity: 20.447772873076822
Bigram Perplexity: 14.264038732150023
Trigram Perplexity: 7.062915473248846

Sentence: machine learning models
Unigram Perplexity: 18.855053138445598
Bigram Perplexity: 9.479454917247816
Trigram Perplexity: 5.475279077038852

Sentence: ethical AI ensures fairness
Unigram Perplexity: 21.735785841978277
Bigram Perplexity: 9.994077696835463
Trigram Perplexity: 6.363961030678927

Sentence: deep learning improves speech
Unigram Perplexity: 20.3154666801183
Bigram Perplexity: 11.286174616252682
Trigram Perplexity: 7.098513972447532

Sentence: AI models learn patterns
Unigram Perplexity: 21.735785841978277
Bigram Perplexity: 12.59174886452787
Trigram Perplexity: 7.917851849519763
