In [5]:
import random
from collections import defaultdict
from typing import List, Dict, Tuple

class NGramModel:
    """Count-based n-gram language model over words or characters.

    Training accumulates raw n-gram and context counts; generation and
    scoring use the maximum-likelihood estimate
    ``count(ngram) / count(context)`` with no smoothing, so any unseen
    n-gram has probability zero.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        is_char_level: Whether tokens are characters (True) or
            whitespace-separated words (False).
        ngram_counts: Maps each observed n-gram tuple to its count.
        context_counts: Maps each observed (n-1)-token context tuple to
            the number of n-grams that start with it.
        vocab: Set of all tokens seen during training (padding excluded).
    """

    # Padding markers placed around every training/scoring sequence.
    START_TOKEN = '<START>'
    END_TOKEN = '<END>'

    def __init__(self, n: int = 3, is_char_level: bool = False):
        """
        Initialize the n-gram model.

        Args:
            n: The 'n' in n-gram (e.g., 2 for bigram, 3 for trigram)
            is_char_level: If True, works at character level; if False, at word level

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n = n
        self.is_char_level = is_char_level
        self.ngram_counts: Dict[Tuple[str, ...], int] = defaultdict(int)
        self.context_counts: Dict[Tuple[str, ...], int] = defaultdict(int)
        # context -> {next token -> count}; filled by train() so that
        # generate() can look up candidates without scanning every n-gram.
        self._successor_counts: Dict[Tuple[str, ...], Dict[str, int]] = defaultdict(dict)
        self.vocab = set()

    def preprocess(self, text: str) -> List[str]:
        """Convert text to tokens (either words or characters)."""
        if self.is_char_level:
            return list(text)
        # Simple word tokenization (split on whitespace)
        return text.split()

    def _pad(self, tokens: List[str]) -> List[str]:
        """Return tokens with n-1 start markers prepended and one end marker appended."""
        return [self.START_TOKEN] * (self.n - 1) + tokens + [self.END_TOKEN]

    def train(self, text: str) -> None:
        """Train the model on the given text.

        Counts accumulate across calls, so the method may be invoked
        several times with different texts.

        Args:
            text: Raw training text; tokenized with ``preprocess``.
        """
        tokens = self.preprocess(text)
        self.vocab.update(tokens)
        padded_tokens = self._pad(tokens)

        # Slide a window of size n over the padded sequence.
        for i in range(len(padded_tokens) - self.n + 1):
            ngram = tuple(padded_tokens[i:i + self.n])
            context, token = ngram[:-1], ngram[-1]
            self.ngram_counts[ngram] += 1
            self.context_counts[context] += 1
            successors = self._successor_counts[context]
            successors[token] = successors.get(token, 0) + 1

    def generate(self, max_length: int = 50) -> str:
        """Generate text by sampling from the model.

        Starting from the start-marker context, repeatedly samples the
        next token in proportion to its count after the current context.

        Args:
            max_length: Maximum number of tokens to emit.

        Returns:
            The generated tokens joined with '' (character level) or
            ' ' (word level). Generation stops early when the end marker
            is sampled or the current context was never seen in training.
        """
        context_size = self.n - 1
        history = [self.START_TOKEN] * context_size

        for _ in range(max_length):
            # Slice from an explicit start index: a negative slice would
            # become history[-0:] (the whole list) for a unigram model.
            context = tuple(history[len(history) - context_size:])

            # .get() avoids inserting an empty entry into the defaultdict.
            successors = self._successor_counts.get(context)
            if not successors:
                break  # Unseen context: return what we have

            # Raw counts are valid weights; random.choices normalizes them.
            next_token = random.choices(
                list(successors), weights=list(successors.values())
            )[0]

            if next_token == self.END_TOKEN:
                break

            history.append(next_token)

        # Drop the start markers before joining.
        generated = history[context_size:]
        separator = '' if self.is_char_level else ' '
        return separator.join(generated)

    def probability(self, sequence: str) -> float:
        """Calculate the probability of a sequence under the model.

        The sequence is padded with start markers and a trailing end
        marker, so the result is the probability of the model producing
        exactly this sequence and then stopping.

        Args:
            sequence: Text to score; tokenized with ``preprocess``.

        Returns:
            The product of the conditional n-gram probabilities, or 0.0
            as soon as an unseen context or unseen n-gram is encountered.
        """
        padded_tokens = self._pad(self.preprocess(sequence))
        total_prob = 1.0

        for i in range(len(padded_tokens) - self.n + 1):
            ngram = tuple(padded_tokens[i:i + self.n])
            context_total = self.context_counts.get(ngram[:-1], 0)
            ngram_total = self.ngram_counts.get(ngram, 0)
            if not context_total or not ngram_total:
                return 0.0  # Unseen context or n-gram: product is zero
            total_prob *= ngram_total / context_total

        return total_prob

In [6]:
    # Three-sentence demo corpus. The literal keeps its leading newline and
    # per-line indentation, which the character-level model treats as tokens.
    training_text = """
    This is a simple example text for training our n-gram model.
    The model will learn the probabilities of word sequences.
    It's not very sophisticated but it demonstrates the concept.
    """

    # --- Demo 1: trigram model over whitespace-separated words ---
    print("Training word-level trigram model...")
    word_model = NGramModel(3, False)
    word_model.train(training_text)

    print("\nGenerated text (word level):")
    word_sample = word_model.generate()  # default cap of 50 tokens
    print(word_sample)

    # --- Demo 2: bigram model over individual characters ---
    print("\nTraining character-level bigram model...")
    char_model = NGramModel(2, True)
    char_model.train(training_text)

    print("\nGenerated text (character level):")
    char_sample = char_model.generate(max_length=100)  # up to 100 characters
    print(char_sample)

Training word-level trigram model...

Generated text (word level):
This is a simple example text for training our n-gram model. The model will learn the probabilities of word sequences. It's not very sophisticated but it demonstrates the concept.

Training character-level bigram model...

Generated text (character level):

Therampl t t imongrabaramorary t mobist witer de mouelelerouroueabatheamplequt f s l lepllisthinoro

Probability of 'This is a simple': 0.000000
