# N-gram Next-Word Prediction
This notebook demonstrates feature creation from text using n-grams and builds a simple frequency-based next-word predictor from the Brown corpus.

## Step 1: Install dependencies

In [1]:
!pip install -q nltk pandas

## Step 2: Import libraries, then download and load the Brown corpus

In [2]:
import re

import nltk
import pandas as pd

# Fetch the NLTK resources, in the same order as before: 'brown' first, then 'punkt'
for resource in ('brown', 'punkt'):
    nltk.download(resource)

from nltk.corpus import brown

# Keep only tokens made entirely of ASCII letters, lower-cased.
# Punctuation, numbers, and words containing hyphens or apostrophes are dropped.
is_alpha_word = re.compile(r'^[A-Za-z]+$').match
tokens = [word.lower() for word in brown.words() if is_alpha_word(word)]
len(tokens), tokens[:20]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


(981716,
 ['the',
  'fulton',
  'county',
  'grand',
  'jury',
  'said',
  'friday',
  'an',
  'investigation',
  'of',
  'recent',
  'primary',
  'election',
  'produced',
  'no',
  'evidence',
  'that',
  'any',
  'irregularities',
  'took'])

## Step 3: Function to compute n-gram frequencies

In [3]:
from collections import Counter

def ngram_frequencies(tokens, n):
    """
    Count every contiguous n-gram in ``tokens`` and tabulate the counts.

    Parameters
    ----------
    tokens : sequence of str
        Tokens in reading order; must support slicing (e.g. a list).
    n : int
        Number of tokens per n-gram; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``ngram`` (the tokens joined by a single space) and
        ``frequency``, sorted by descending frequency with a fresh
        0..N-1 index. N-grams with equal counts keep the order in which
        they first appear in ``tokens``. When no n-gram can be formed
        (empty input, or fewer than ``n`` tokens) the frame is empty but
        still carries both columns.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # Zipping n progressively shifted views of the token list produces a
    # sliding window; zip stops at the shortest view, so no partial n-grams.
    windows = zip(*(tokens[i:] for i in range(n)))

    # most_common() with no argument returns every (ngram, count) pair,
    # highest count first, so no separate DataFrame sort is needed.
    ranked = Counter(windows).most_common()

    # Explicit column names keep the schema stable even when `ranked` is empty.
    return pd.DataFrame(
        [(' '.join(gram), count) for gram, count in ranked],
        columns=['ngram', 'frequency'],
    )

# Show the five most frequent trigrams in the corpus
ngram_frequencies(tokens, n=3).head(5)

Unnamed: 0,ngram,frequency
0,one of the,404
1,the united states,337
2,as well as,238
3,some of the,179
4,out of the,174


## Step 4: Next-word prediction function

In [4]:
def predict_next(tokens, context, n=2, k=5):
    """
    Suggest the most frequent continuations of ``context`` under an n-gram model.

    The last ``n - 1`` words of ``context`` (lower-cased, split on
    whitespace) are used as the prefix; every n-gram in ``tokens`` that
    starts with that prefix votes for its final word.

    Parameters
    ----------
    tokens : sequence of str
        Corpus tokens in reading order; must support slicing (e.g. a list).
    context : str
        Text preceding the word to predict.
    n : int, default 2
        Order of the n-gram model; must be at least 1. With ``n=1`` the
        context is ignored and the most frequent tokens overall are returned.
    k : int, default 5
        Maximum number of predictions to return.

    Returns
    -------
    list of str
        Up to ``k`` candidate next words, most frequent first. Words with
        equal counts keep the order in which they first follow the prefix in
        ``tokens``. Empty if the prefix never occurs or ``k`` is not positive.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or ``context`` has fewer than ``n - 1`` words.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    if k <= 0:
        # most_common() with a negative k would slice off the last entries instead
        return []

    context_tokens = context.lower().split()
    prefix_len = n - 1
    if len(context_tokens) < prefix_len:
        raise ValueError(f"Context must have at least {n-1} words for {n}-gram model")

    # Slice from an explicit start index: `context_tokens[-prefix_len:]` would
    # return the whole list when prefix_len == 0 (since -0 == 0), which made
    # the unigram model match nothing for any non-empty context.
    context_tuple = tuple(context_tokens[len(context_tokens) - prefix_len:])

    # Sliding window over the corpus; count only the final word of the
    # n-grams whose prefix matches, instead of tallying every n-gram first.
    windows = zip(*(tokens[i:] for i in range(n)))
    candidates = Counter(
        gram[-1] for gram in windows if gram[:-1] == context_tuple
    )

    return [word for word, _ in candidates.most_common(k)]

# Demo: trigram-based suggestions for a few two-word contexts
demo_contexts = ('the quick', 'in the', 'new york')
for ctx in demo_contexts:
    suggestions = predict_next(tokens, ctx, n=3, k=5)
    print(f"Context: '{ctx}' -> Predictions: {suggestions}")

Context: 'the quick' -> Predictions: ['response', 'free', 'jerky', 'movement', 'just']
Context: 'in the' -> Predictions: ['first', 'world', 'united', 'same', 'past']
Context: 'new york' -> Predictions: ['city', 'times', 'central', 'and', 'the']
