In [2]:
# Step 1: Import libraries and download the Brown corpus
import nltk
import pandas as pd
import re
from collections import Counter
from nltk.corpus import brown
nltk.download('brown')

# Step 2: Load the Brown corpus and clean it in a single pass:
# keep only purely alphabetic tokens (drops punctuation/numbers) and lower-case them.
tokenized_text = [token.lower() for token in brown.words() if token.isalpha()]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
# Step 3: Define function to build n-gram DataFrame
def generate_ngram_df(tokens, n):
    """Count every contiguous n-gram in ``tokens`` and tabulate the counts.

    Parameters
    ----------
    tokens : sequence of str
        Ordered tokens. Must support slicing (e.g. a list).
    n : int
        Size of the n-grams; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``ngram`` (the n tokens joined by single spaces) and
        ``frequency``, ordered by descending frequency. N-grams with equal
        frequency keep the order in which they first occur in ``tokens``.
        The index is the default 0..m-1 range. The frame is empty when
        ``tokens`` holds fewer than ``n`` items.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}.")

    # Zipping n progressively shifted slices yields each window of n
    # consecutive tokens; zip stops at the shortest slice, so no partial
    # window is produced at the end.
    windows = zip(*(tokens[i:] for i in range(n)))

    # Count straight from the generator instead of first materialising a
    # list of every joined n-gram string.
    freq = Counter(' '.join(gram) for gram in windows)

    # most_common() sorts by count (descending) and breaks ties by first
    # occurrence, so the row order is reproducible across runs and platforms.
    return pd.DataFrame(freq.most_common(), columns=['ngram', 'frequency'])

# Example usage: show the five most frequent trigrams in the cleaned corpus
trigrams_df = generate_ngram_df(tokens=tokenized_text, n=3)
print(trigrams_df.head(5))

               ngram  frequency
0         one of the        404
1  the united states        337
2         as well as        238
3        some of the        179
4         out of the        174


In [4]:
# Step 4: Predict top k next words based on n-grams
def predict_next_words(tokens, n, k=5, corpus=None):
    """Suggest the k most frequent next words after a context, using n-gram counts.

    Parameters
    ----------
    tokens : str or list of str
        The context. A string is split on whitespace. Either form is
        lower-cased before matching. Only the last ``n - 1`` words are used.
    n : int
        N-gram size; must be at least 1. With ``n=1`` the context is ignored
        and the most frequent single words of the corpus are returned.
    k : int, optional
        Number of suggestions to return (default 5).
    corpus : sequence of str, optional
        Tokens to count n-grams over. Must support slicing. Defaults to the
        notebook-level ``tokenized_text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``next_word`` and ``frequency``, most frequent first. Empty
        when the context never occurs in the corpus.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or fewer than ``n - 1`` context words
        are supplied.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}.")

    if isinstance(tokens, str):
        tokens = tokens.split()
    # Lower-case list input as well as string input, so both forms match the
    # same n-grams.
    tokens = [tok.lower() for tok in tokens]

    if len(tokens) < n - 1:
        raise ValueError(f"You must provide at least {n-1} word(s) for a {n}-gram model.")

    if corpus is None:
        corpus = tokenized_text

    # Slice from an explicit start index: tokens[-(n-1):] becomes tokens[-0:]
    # for n == 1, which is the whole list rather than an empty context.
    context = tuple(tokens[len(tokens) - (n - 1):])

    # Single pass over the corpus n-grams: compare the leading n-1 tokens to
    # the context and tally the final token of each match.
    next_words = Counter(
        gram[-1]
        for gram in zip(*(corpus[i:] for i in range(n)))
        if gram[:-1] == context
    )

    return pd.DataFrame(next_words.most_common(k), columns=['next_word', 'frequency'])

# Example usage: top-5 continuations of a two-word context under the trigram model
print("\nPredictions for context: 'the united'")
print(predict_next_words(tokens="the united", n=3, k=5))



Predictions for context: 'the united'
   next_word  frequency
0     states        337
1    nations         42
2   profound          1
3      irish          1
4  challenge          1
