In [1]:
from collections import Counter, defaultdict
import nltk
from nltk.util import ngrams
import random

# Tokenizer data needed by nltk.word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource rather than the pickled 'punkt' models, so request
# both to keep the tokenization step working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text corpus
corpus = "This is a sample text corpus. This corpus is used to demonstrate text processing."

# Lower-case before tokenizing so "This" and "this" map to the same token
tokens = nltk.word_tokenize(corpus.lower())

# Sliding windows of two and three consecutive tokens
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

# Frequency tables for each n-gram order
unigrams = Counter(tokens)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

# Report the counts, one table per n-gram order
print("Unigrams:", unigrams)
print("\nBigrams:", bigram_counts)
print("\nTrigrams:", trigram_counts)

# Bigram probabilities: first tally how often w2 follows w1 ...
bigram_probabilities = defaultdict(lambda: defaultdict(int))
for w1, w2 in bigrams:
    bigram_probabilities[w1][w2] += 1

# ... then normalise each row in place so it holds P(w2 | w1)
for followers in bigram_probabilities.values():
    total_count = float(sum(followers.values()))
    for w2 in followers:
        followers[w2] /= total_count

# Print every conditional probability, grouped by the preceding word
print("\nBigram Probabilities:")
for w1, followers in bigram_probabilities.items():
    for w2, probability in followers.items():
        print(f"P({w2}|{w1}) = {probability}")

# Next word prediction function
def predict_next_word(word, num_predictions=3, *, probabilities=None):
    """Return the most likely words to follow ``word`` under a bigram model.

    Args:
        word: The preceding word. It is looked up exactly as given first
            and then lower-cased, because the module-level model is built
            from lower-cased tokens.
        num_predictions: Maximum number of candidates to return. Zero or
            a negative value yields an empty list.
        probabilities: Optional mapping ``{w1: {w2: P(w2|w1)}}`` to query
            instead of the module-level ``bigram_probabilities``.

    Returns:
        Up to ``num_predictions`` words ordered by descending probability.
        Candidates with equal probability keep the order in which the
        model stores them. An empty list is returned when ``word`` has no
        recorded followers.
    """
    if probabilities is None:
        probabilities = bigram_probabilities

    # A negative count would otherwise slice as "all but the last k".
    if num_predictions <= 0:
        return []

    # .get() avoids inserting a new row when the model is a defaultdict.
    followers = probabilities.get(word)
    if not followers and isinstance(word, str):
        followers = probabilities.get(word.lower())
    if not followers:
        return []

    ranked = sorted(followers.items(), key=lambda item: item[1], reverse=True)
    return [candidate for candidate, _ in ranked[:num_predictions]]

# Example: predict next words for 'this' using the default of up to three
# candidates, then print the resulting list
next_words = predict_next_word('this')
print("\nNext word predictions for 'this':", next_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unigrams: Counter({'this': 2, 'is': 2, 'text': 2, 'corpus': 2, '.': 2, 'a': 1, 'sample': 1, 'used': 1, 'to': 1, 'demonstrate': 1, 'processing': 1})

Bigrams: Counter({('this', 'is'): 1, ('is', 'a'): 1, ('a', 'sample'): 1, ('sample', 'text'): 1, ('text', 'corpus'): 1, ('corpus', '.'): 1, ('.', 'this'): 1, ('this', 'corpus'): 1, ('corpus', 'is'): 1, ('is', 'used'): 1, ('used', 'to'): 1, ('to', 'demonstrate'): 1, ('demonstrate', 'text'): 1, ('text', 'processing'): 1, ('processing', '.'): 1})

Trigrams: Counter({('this', 'is', 'a'): 1, ('is', 'a', 'sample'): 1, ('a', 'sample', 'text'): 1, ('sample', 'text', 'corpus'): 1, ('text', 'corpus', '.'): 1, ('corpus', '.', 'this'): 1, ('.', 'this', 'corpus'): 1, ('this', 'corpus', 'is'): 1, ('corpus', 'is', 'used'): 1, ('is', 'used', 'to'): 1, ('used', 'to', 'demonstrate'): 1, ('to', 'demonstrate', 'text'): 1, ('demonstrate', 'text', 'processing'): 1, ('text', 'processing', '.'): 1})

Bigram Probabilities:
P(is|this) = 0.5
P(corpus|this) = 0.5
P(a|