# Assignment 10

Name: Vivek Mule
Roll: 381072
PRN: 22420145

#### Develop a Machine Translation system to translate public information content between English and any Indian language.

In [1]:
!pip install nltk



In [2]:
import re
from collections import defaultdict, Counter

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.split()

In [4]:
class NGramModel:
    def __init__(self, n=3):
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocab = set()

    def train(self, tokens):
        self.vocab = set(tokens)

        for i in range(len(tokens)):
            for k in range(1, self.n + 1):
                if i - k + 1 < 0:
                    continue

                ngram = tuple(tokens[i-k+1:i+1])
                context = tuple(tokens[i-k+1:i]) if k > 1 else ()

                self.ngram_counts[context][ngram[-1]] += 1
                self.context_counts[context] += 1

    def get_probability(self, context, word):
        vocab_size = len(self.vocab)
        count = self.ngram_counts[context][word]
        total = self.context_counts[context]
        return (count + 1) / (total + vocab_size)

    def predict_next(self, text, top_k=5):
        tokens = preprocess_text(text)

        # Backoff: trigram → bigram → unigram
        for k in range(self.n-1, -1, -1):
            context = tuple(tokens[-k:]) if k > 0 else ()

            if context in self.ngram_counts:
                candidates = {}

                for word in self.vocab:
                    prob = self.get_probability(context, word)
                    candidates[word] = prob

                sorted_words = sorted(
                    candidates.items(),
                    key=lambda x: x[1],
                    reverse=True
                )

                return sorted_words[:top_k]

        return []

In [5]:
corpus = """
Natural language processing (NLP) is technology that allows computers to interpret, manipulate, and comprehend human language. Organizations today have large volumes of voice and text data from various communication channels like emails, text messages, social media newsfeeds, video, audio, and more."""

tokens = preprocess_text(corpus)

model = NGramModel(n=3)
model.train(tokens)

print("Model trained successfully!")


Model trained successfully!


In [6]:
while True:
    user_input = input("\nEnter text (or 'exit'): ")
    if user_input.lower() == "exit":
        break

    predictions = model.predict_next(user_input)

    print("Suggestions:")
    for word, prob in predictions:
        print(f"{word}  (prob={round(prob,4)})")


Enter text (or 'exit'): technology
Suggestions:
that  (prob=0.0526)
natural  (prob=0.0263)
large  (prob=0.0263)
video  (prob=0.0263)
social  (prob=0.0263)

Enter text (or 'exit'): exit
