In [None]:
import random
import re
import requests
from collections import defaultdict

def clean_text(text):
    """Normalize raw book text into lowercase, space-separated words.

    If a ``*** START`` marker and a later ``*** END`` marker are both present,
    only the text between them is kept. The remainder of the ``*** START``
    marker line is dropped as well, so the marker's own words do not leak
    into the result. The text is then lowercased, every character other than
    ``a``-``z`` or whitespace is replaced by a space, and runs of whitespace
    are collapsed to a single space.

    Args:
        text: Raw text, optionally wrapped in ``*** START`` / ``*** END``
            marker lines.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    start = text.find("*** START")
    # Search for the end marker only after the start marker so that a stray
    # earlier "*** END" cannot yield an empty slice.
    end = text.find("*** END", start) if start != -1 else -1
    if end != -1:
        # Skip the rest of the start-marker line when it ends before the end
        # marker; with no newline in between, keep slicing from the marker.
        marker_line_end = text.find("\n", start, end)
        body_start = start if marker_line_end == -1 else marker_line_end + 1
        text = text[body_start:end]

    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def build_5gram_model(words, context_size=5):
    """Build a next-word lookup table from a sequence of words.

    Every run of ``context_size`` consecutive words becomes a tuple key, and
    the word immediately following that run is appended to the key's list.
    Repeated continuations are stored as duplicates, so a uniform pick from
    a list follows the frequencies seen in ``words``.

    With the default ``context_size`` of 5 the key holds five words and the
    sixth word is the prediction.

    Args:
        words: Sequence of word strings, in reading order.
        context_size: Number of consecutive words used as the lookup key.

    Returns:
        A ``defaultdict(list)`` mapping each context tuple to the list of
        words observed right after it. It is empty when ``words`` has no
        more than ``context_size`` items.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    model = defaultdict(list)
    for i in range(len(words) - context_size):
        key = tuple(words[i:i + context_size])    # context window
        next_word = words[i + context_size]       # word following the window
        model[key].append(next_word)
    return model

def generate_from_seed(model, seed_sentence, length=120, context_size=5):
    """Extend a seed sentence word by word using a context lookup table.

    The seed is lowercased and every character other than ``a``-``z`` or
    whitespace is replaced by a space before it is split into words. On each
    step the last ``context_size`` words of the output are looked up in
    ``model`` and one of the recorded continuations is chosen with
    ``random.choice``. Generation stops early as soon as the current context
    has no continuation in ``model``.

    Args:
        model: Mapping from ``context_size``-word tuples to lists of
            candidate next words.
        seed_sentence: Text used to start the output.
        length: Maximum number of words to append to the seed.
        context_size: Number of trailing words used as the lookup key; it
            must match the key length used in ``model``.

    Returns:
        The cleaned seed words followed by the generated words, joined by
        single spaces.

    Raises:
        ValueError: If ``context_size`` is smaller than 1, or the cleaned
            seed has fewer than ``context_size`` words.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    seed_sentence = seed_sentence.lower()
    seed_sentence = re.sub(r"[^a-z\s]", " ", seed_sentence)
    seed_words = seed_sentence.split()

    if len(seed_words) < context_size:
        # The message reports the real minimum, which is the context size.
        raise ValueError(
            f"Seed sentence must have at least {context_size} words"
        )

    output = seed_words[:]

    for _ in range(length):
        key = tuple(output[-context_size:])
        # .get() never inserts into a defaultdict, so the model is not mutated.
        candidates = model.get(key)
        if not candidates:
            break
        output.append(random.choice(candidates))

    return " ".join(output)


# Jane Austen: Pride and Prejudice
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
# The timeout keeps the download from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being used as training text.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_text = response.text

# Train model: clean the download, split it into words, build the lookup table
cleaned_text = clean_text(raw_text)
words = cleaned_text.split()
model = build_5gram_model(words)

# Seed sentences (each has five words, the minimum generate_from_seed accepts)
seeds = [
    "she could not help feeling",
    "she could not determine whether",
    "it was not long before",
]

# Generate outputs: one numbered sample per seed
for i, seed in enumerate(seeds, 1):
    print(f"\n--- OUTPUT {i} ---")
    print(f"Seed: \"{seed}\"")
    print(generate_from_seed(model, seed, length=150))
