In [2]:
import re
import nltk
from collections import defaultdict
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt sentence-tokenizer data. NLTK >= 3.8.2 loads it
# from the 'punkt_tab' resource, while older releases use the pickled 'punkt'
# package, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chait\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
class NGramAutoComplete:
    """Next-word suggester backed by raw n-gram counts.

    Training slides a window of ``n`` tokens over the corpus: the first
    ``n - 1`` tokens of each window form the context and the last token is the
    word that followed it. Prediction looks up the last ``n - 1`` tokens of the
    query and ranks the words seen after that exact context by how often they
    occurred. There is no smoothing or back-off, so a context that was never
    seen during training yields no suggestions.
    """

    def __init__(self, n=3):
        """Create an empty model.

        Args:
            n: Order of the model, i.e. the n-gram length (3 = trigram: two
                context words predict the third). Must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        # context tuple -> {following word -> number of times it followed}
        self.ngram_counts = defaultdict(lambda: defaultdict(int))
        # context tuple -> total number of n-grams observed with that context.
        # Maintained by train(); no method of this class reads it.
        self.context_counts = defaultdict(int)

    def preprocess_text(self, text):
        """Normalise ``text`` and split it into word tokens.

        The text is lower-cased, every character other than an ASCII letter,
        a digit or whitespace is removed, and the remainder is tokenised with
        NLTK's ``word_tokenize``. Punctuation is stripped before tokenising,
        so sentence boundaries are not preserved.

        Args:
            text: Raw input string.

        Returns:
            list[str]: The tokens, in order.
        """
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return word_tokenize(text)

    def train(self, corpus):
        """Accumulate n-gram counts from ``corpus``.

        Counts are added to whatever the model already holds, so calling this
        repeatedly trains incrementally on several texts.

        Args:
            corpus: Raw training text.
        """
        tokens = self.preprocess_text(corpus)

        # ngrams() yields lazily; there is no need to materialise a list.
        for gram in ngrams(tokens, self.n):
            prefix = tuple(gram[:-1])
            next_word = gram[-1]
            self.ngram_counts[prefix][next_word] += 1
            self.context_counts[prefix] += 1

    def predict(self, prefix, top_k=3):
        """Suggest the most frequent continuations of ``prefix``.

        Only the last ``n - 1`` tokens of ``prefix`` are used as the lookup
        context; anything before them is ignored.

        Args:
            prefix: Text typed so far.
            top_k: Maximum number of suggestions to return. ``None`` returns
                every candidate; zero or a negative value returns none.

        Returns:
            list[str]: Up to ``top_k`` words ordered from most to least
            frequent (ties keep the order in which the words were first seen
            after the context). Empty when ``prefix`` has fewer than
            ``n - 1`` tokens or its context was never seen in training.
        """
        if top_k is not None and top_k <= 0:
            return []

        tokens = self.preprocess_text(prefix)
        context_size = self.n - 1
        if len(tokens) < context_size:
            return []

        # Slice from an explicit start index. tokens[-context_size:] would
        # return the *whole* list when context_size == 0 (because -0 == 0),
        # so a unigram model would never match the empty context it was
        # trained with.
        context = tuple(tokens[len(tokens) - context_size:])
        # .get() instead of indexing so that lookups do not insert empty
        # entries into the defaultdict.
        candidates = self.ngram_counts.get(context)

        if not candidates:
            return []

        ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _ in ranked[:top_k]]

In [4]:
# Toy two-sentence corpus; the adjacent string literals are concatenated into
# one single-line text.
corpus = (
    "The quick brown fox jumps over the lazy dog. "
    "The quick brown fox is fast and clever."
)
# Trigram model: every pair of consecutive words is a context for the word
# that follows it.
auto_complete = NGramAutoComplete(n=3)
auto_complete.train(corpus)

In [5]:
# With n=3 only the last two (lower-cased) tokens of each prompt are used as
# the lookup context: ("quick", "brown") for the first, ("the", "quick") for
# the second.
print(
    auto_complete.predict("The quick brown"),  # ['fox']
    auto_complete.predict("The quick"),        # ['brown']
    sep="\n",
)

['fox']
['brown']
