In [None]:
import random
from collections import defaultdict
from typing import List, Dict, Tuple

class NGramModel:
    """
    Count-based n-gram language model that works on words or characters.

    Training counts every n-gram and every (n-1)-token context in the text.
    Generation repeatedly samples the next token in proportion to how often
    it followed the current context during training.
    """

    def __init__(self, n: int = 3, is_char_level: bool = False):
        """
        Initialize the n-gram model.

        Args:
            n: The 'n' in n-gram (e.g., 1 for unigram, 2 for bigram, 3 for trigram)
            is_char_level: If True, works at character level; if False, at word level

        Raises:
            ValueError: If n is smaller than 1 (an n-gram needs at least one token).
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n = n

        # Character level generation outputs one character at a time.
        # Word level outputs one word at a time.
        # Which one is better?
        self.is_char_level = is_char_level

        # Counts frequency of each ngram (keyed by a tuple of n tokens)
        # "I love to eat" -> 1
        # "I love to play" -> 2
        self.ngram_counts: Dict[Tuple[str, ...], int] = defaultdict(int)

        # Counts frequency of each context (keyed by a tuple of n-1 tokens)
        # "I love to" -> 3
        self.context_counts: Dict[Tuple[str, ...], int] = defaultdict(int)

        # unique vocabulary (does not include the <START>/<END> markers)
        self.vocab = set()

    def preprocess(self, text: str) -> List[str]:
        """
        Convert text to tokens (either words or characters).

        Args:
            text: Raw input text.

        Returns:
            A list of single characters when is_char_level is True, otherwise
            the whitespace-separated words of the text.
        """
        if self.is_char_level:
            return list(text)
        else:
            # Simple word tokenization (split on whitespace)
            return text.split()

    def train(self, text: str) -> None:
        """
        Train the model on the given text.

        Counts are accumulated, so calling train() several times adds the
        new text's statistics to the existing ones.

        Args:
            text: Raw training text; it is tokenized with preprocess().
        """
        tokens = self.preprocess(text)
        self.vocab.update(tokens)

        # Pad the beginning with special tokens to handle start of sequence
        # Why do we need to pad multiple start tokens?
        # To provide a correct starting context of n-1 tokens.
        #     2-gram: <START> I love to eat <END>
        #     3-gram: <START> <START> I love to eat <END>
        padded_tokens = ['<START>'] * (self.n - 1) + tokens + ['<END>']

        # Count n-grams and their contexts:
        # <START> I love to eat <END>
        # ngram:
        #   <START> I
        #   I love
        #   love to
        #   to eat
        #   eat <END>
        # context:
        #   <START>
        #   I
        #   love
        #   to
        #   eat
        for i in range(len(padded_tokens) - self.n + 1):
            ngram = tuple(padded_tokens[i:i + self.n])
            context = ngram[:-1]
            self.ngram_counts[ngram] += 1
            self.context_counts[context] += 1

    def generate(self, max_length: int = 50) -> str:
        """
        Generate text from the model.

        Args:
            max_length: Maximum number of tokens to generate.

        Returns:
            The generated tokens joined with '' (character level) or ' '
            (word level). Generation stops early when <END> is sampled or
            when the current context was never seen during training.
        """
        context_size = self.n - 1
        result = ['<START>'] * context_size

        for _ in range(max_length):
            # Slice from an explicit start index instead of a negative one:
            # for a unigram model context_size is 0 and result[-0:] would be
            # the whole list rather than the empty context.
            context = tuple(result[len(result) - context_size:])

            # If context is unseen, return what we have
            if context not in self.context_counts:
                break

            # Get possible next tokens and their probabilities
            #  "I love to eat pizza"
            #  "I love to play violin"
            #  "I love to play soccer"
            # After "I love to" the model samples
            #  "play" with probability 2/3 and "eat" with probability 1/3
            possible_ngrams = [ngram for ngram in self.ngram_counts
                             if ngram[:-1] == context]
            probabilities = [self.ngram_counts[ngram] / self.context_counts[context]
                            for ngram in possible_ngrams]
            next_ngram = random.choices(possible_ngrams, weights=probabilities)[0]
            next_token = next_ngram[-1]

            # Stop if we hit the end token
            if next_token == '<END>':
                break

            result.append(next_token)

        # Remove start tokens and convert to string
        generated = result[context_size:]
        if self.is_char_level:
            return ''.join(generated)
        else:
            return ' '.join(generated)

In [1]:
# Sample training text
training_text = """
This is a simple example text for training our n-gram model.
The model will learn the probabilities of word sequences.
It's not very sophisticated but it demonstrates the concept.
"""
# Word-level bigram model (n=2: each word is predicted from the previous word)
print("Training word-level bigram model...")
word_model = NGramModel(n=2, is_char_level=False)
word_model.train(training_text)

# Sampling is random, so the generated text differs from run to run.
print("\nGenerated text (word level):")
print(word_model.generate())

Training word-level trigram model...


NameError: name 'NGramModel' is not defined

In [None]:
# Character-level bigram demo: every token is a single character, so the
# model predicts each character from the one before it.
print("\nTraining character-level bigram model...")

char_model = NGramModel(is_char_level=True, n=2)
char_model.train(text=training_text)

# Sample up to 100 characters from the trained model.
print("\nGenerated text (character level):")
print(char_model.generate(max_length=100))


Training character-level bigram model...

Generated text (character level):

The ves ve oncotes atepleque thicatrd biexamote mprarabut wimoraicesobis bainonces de t.
Thes wit n


In [None]:
import requests

def download_to_string(url: str, timeout: float = 30.0) -> str:
    """
    Download the resource at the given URL and return its body as text.

    This is a best-effort helper: any request failure is reported with
    print() and an empty string is returned instead of raising.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a stalled server would
            hang the notebook.

    Returns:
        The response body decoded as text, or "" if the download failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes (4xx/5xx)
        return response.text
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too.
        print(f"Error downloading from {url}: {e}")
        return ""

# Fetch the play text that serves as the training corpus.
url = "https://raw.githubusercontent.com/cobanov/shakespeare-dataset/refs/heads/main/text/a-midsummer-nights-dream_TXT_FolgerShakespeare.txt"
content = download_to_string(url=url)

# Fit a word-level bigram model (n=2) on the downloaded text.
print("Training word-level  model...")
word_model = NGramModel(is_char_level=False, n=2)
word_model.train(text=content)

# Sample one sequence using the model's default length limit.
print("\nGenerated text (word level):")
print(word_model.generate())

Training word-level  model...

Generated text (word level):
A crew of Athens calls. Their wonted sight. When thou hast by break not; I am to choose love is your royal walks, your father's ground Sleep give me ere I swore. HELENA A Midsummer Night's Dream by tomorrow midnight solemnly Dance in dark uneven way, And yonder shines Aurora's
