In [None]:
#
# === Part 1: Colab Training Script (PyTorch) ===
#
# This notebook trains every model variation in the hyperparameter grid
# (16 with the grid defined below) and saves each one as a .pth file for the Streamlit app.
#

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import requests
import re
from collections import Counter
import pickle
import os
import time

# For analysis
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


#
# === 1.1: Data Acquisition and Preprocessing ===
#

print("Downloading Sherlock Holmes corpus...")
url_sherlock = "https://www.gutenberg.org/files/1661/1661-0.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url_sherlock, timeout=60)
# Fail loudly on 4xx/5xx instead of saving an HTTP error page as the corpus.
r.raise_for_status()
# Written as raw bytes; the file is decoded later when it is read back.
with open('sherlock_holmes.txt', 'wb') as f:
    f.write(r.content)

def preprocess_sherlock(filepath):
    """Read the raw corpus file and return it as one cleaned, space-separated string.

    Processing steps:
      1. Keep only the text between the start marker and the first end marker
         that follows it. If either marker is missing, a warning is printed
         and the full text is used.
      2. Lower-case everything.
      3. Turn every whitespace run (including line breaks) and every dash used
         as a word separator into a single space.
      4. Surround each full stop with spaces so that '.' is its own token.
      5. Drop every character that is not a-z, 0-9, space or '.'.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        The cleaned text: tokens separated by exactly one space, with no
        leading or trailing whitespace.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    # Delimit the corpus
    start_marker = "THE ADVENTURES OF SHERLOCK HOLMES"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
    try:
        start_index = text.index(start_marker)
        # Search for the end marker only after the start, so the slice can
        # never come out empty because of an earlier occurrence.
        end_index = text.index(end_marker, start_index)
    except ValueError:
        print("Warning: Could not find start/end markers. Using full text.")
    else:
        text = text[start_index:end_index]

    text = text.lower()

    # Line breaks and tabs must become spaces BEFORE unwanted characters are
    # stripped; otherwise the last word of one line fuses with the first word
    # of the next (e.g. "face\ntowards" -> "facetowards").
    text = re.sub(r'\s+', ' ', text)

    # Em/en dashes and "--" separate words, so replace them with a space
    # rather than deleting them.
    text = re.sub(r'-{2,}|[\u2013\u2014]', ' ', text)

    # Isolate the full stop on BOTH sides so "house." becomes "house" + ".".
    text = text.replace('.', ' . ')

    # Remove everything except letters, digits, spaces and full stops.
    text = re.sub(r'[^a-z0-9 .]', '', text)

    # Collapse the extra spaces introduced above.
    text = re.sub(r' +', ' ', text).strip()

    return text

# Clean the downloaded corpus once; the resulting string is tokenized afterwards.
sherlock_text = preprocess_sherlock(filepath='sherlock_holmes.txt')
print("Corpus preprocessing complete.")


#
# === 1.2: Vocabulary Construction ===
#

# Tokenize. split() without an argument never produces empty-string tokens,
# even for empty text or repeated spaces.
tokens_sherlock = sherlock_text.split()
print(f"Total tokens in corpus: {len(tokens_sherlock)}")

# Build vocabulary
VOCAB_SIZE = 10000
UNK_TOKEN = '<UNK>'  # stands in for every out-of-vocabulary word (index 0)
PAD_TOKEN = '.'      # the full stop doubles as the context padding token (index 1)
word_counts = Counter(tokens_sherlock)

# Vocabulary layout: the two reserved tokens first, then the most frequent
# corpus words until VOCAB_SIZE entries are reached. Reserved tokens are
# filtered out of the frequency list so that neither can appear twice; a
# duplicate would leave word_to_int and int_to_word out of sync.
special_tokens = [UNK_TOKEN, PAD_TOKEN]
frequent_words = [
    word for word, _ in word_counts.most_common() if word not in special_tokens
]
vocabulary = special_tokens + frequent_words[:VOCAB_SIZE - len(special_tokens)]

# Create word-to-integer and integer-to-word mappings
word_to_int = {word: i for i, word in enumerate(vocabulary)}
int_to_word = {i: word for i, word in enumerate(vocabulary)}

# Map all tokens in the corpus to integers; unknown words map to <UNK>.
unk_int = word_to_int[UNK_TOKEN]
tokens_int = [word_to_int.get(word, unk_int) for word in tokens_sherlock]

print(f"Final vocabulary size: {len(vocabulary)}")

# --- Report: Vocabulary Analysis ---
print("\n--- Vocabulary Report ---")
print(f"Vocabulary Size: {len(vocabulary)}")

# The 10 most frequent corpus tokens, taken straight from the counter so the
# list has exactly ten entries with their real counts.
most_freq = word_counts.most_common(10)
print(f"10 Most Frequent Words: {most_freq}")

# 10 least frequent in our vocab (the tail of the frequency-ordered list)
least_freq = [(word, word_counts[word]) for word in vocabulary[-10:]]
print(f"10 Least Frequent Words (in vocab): {least_freq}")

# Save the vocab artifacts for the Streamlit app
# This is the most important file besides the models
vocab_artifacts = {
    'word_to_int': word_to_int,
    'int_to_word': int_to_word,
    'vocab_size': len(vocabulary)
}
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab_artifacts, f)
print("\nSaved 'vocab.pkl'. This is needed for the Streamlit app.")


#
# === 1.3: PyTorch Custom Dataset ===
#

class SlidingWindowDataset(Dataset):
    """Sliding-window (context, target) pairs over a sequence of token ids.

    Item ``idx`` pairs the ``context_length`` ids that precede position
    ``idx`` with the id at position ``idx``. The sequence is left-padded with
    ``context_length`` padding ids, so the first tokens also get a full
    context and there is exactly one item per input token.

    Args:
        tokens: Sequence of integer token ids.
        context_length: Number of preceding ids used as context.
        pad_token: Integer id used for the left padding. When omitted, the id
            of '.' is looked up in the module-level ``word_to_int`` mapping.
    """
    def __init__(self, tokens, context_length, pad_token=None):
        self.tokens = tokens
        self.context_length = context_length

        if pad_token is None:
            # Default: the integer for '.' is our padding token
            pad_token = word_to_int['.']
        self.pad_token = pad_token

        # Build the padded sequence as a tensor once, so __getitem__ only has
        # to slice it instead of converting Python lists on every access.
        self.padded_tokens = torch.tensor(
            [pad_token] * context_length + list(tokens), dtype=torch.long
        )

    def __len__(self):
        # Total number of (context, target) pairs: one per original token
        return len(self.tokens)

    def __getitem__(self, idx):
        # Positions idx .. idx+context_length-1 of the padded sequence are the
        # context; the element right after them is original token `idx`.
        # Both are views into the shared tensor; DataLoader collation stacks
        # them into new batch tensors.
        context = self.padded_tokens[idx : idx + self.context_length]
        target = self.padded_tokens[idx + self.context_length]
        return context, target

# Hold out the final 10% of the token stream for validation and train on
# the first 90%.
split_idx = int(0.9 * len(tokens_int))
train_tokens, val_tokens = tokens_int[:split_idx], tokens_int[split_idx:]
print(f"Train tokens: {len(train_tokens)}, Validation tokens: {len(val_tokens)}")


#
# === 1.4: Model Design (PyTorch nn.Module) ===
#
class MLPTextGenerator(nn.Module):
    """MLP next-word predictor over a fixed-length window of token ids.

    Architecture: embedding -> flatten -> one or two fully connected hidden
    layers with the chosen activation -> linear layer producing one logit per
    vocabulary entry.

    Args:
        vocab_size: Number of embedding rows and of output logits.
        embed_dim: Size of each embedding vector.
        context_length: Number of token ids per input example.
        hidden_layers: 2 adds a second hidden layer; any other value gives a
            single hidden layer.
        activation: 'relu' or 'tanh'.
        hidden_dim: Width of the hidden layers (default 1024).

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    # Supported activation names mapped to their module classes.
    _ACTIVATIONS = {'relu': nn.ReLU, 'tanh': nn.Tanh}

    def __init__(self, vocab_size, embed_dim, context_length, hidden_layers, activation,
                 hidden_dim=1024):
        super().__init__()

        # Validate up front: without this, an unknown name would only surface
        # as an AttributeError on the first forward pass.
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            )

        self.context_length = context_length
        self.embed_dim = embed_dim
        self.num_hidden = hidden_layers

        # 1. Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # 2. Flatten Layer
        self.flatten = nn.Flatten()

        # 3. Activation Function
        self.activation = self._ACTIVATIONS[activation]()

        # 4. Hidden Layers (attribute names are kept so existing saved
        #    state_dicts still load)
        input_size = context_length * embed_dim
        self.hidden1 = nn.Linear(input_size, hidden_dim)

        if self.num_hidden == 2:
            self.hidden2 = nn.Linear(hidden_dim, hidden_dim)

        # 5. Output Layer
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return raw logits of shape (batch_size, vocab_size).

        Args:
            x: Integer tensor of token ids, shape (batch_size, context_length).
        """
        x = self.embedding(x)
        # x shape: (batch_size, context_length, embed_dim)

        x = self.flatten(x)
        # x shape: (batch_size, context_length * embed_dim)

        x = self.activation(self.hidden1(x))
        # x shape: (batch_size, hidden_dim)

        if self.num_hidden == 2:
            x = self.activation(self.hidden2(x))

        # Logits are returned. nn.CrossEntropyLoss applies log_softmax internally.
        logits = self.output(x)
        # logits shape: (batch_size, vocab_size)
        return logits


#
# === 1.5: Hyperparameter Training Loop ===
#

# Create a directory to store the models
MODEL_DIR = "trained_models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Define hyperparameter grid (every combination is trained)
CONTEXT_LENGTHS = [6, 7, 8, 9]
EMBED_DIMS = [64]
HIDDEN_LAYERS = [1, 2]
ACTIVATIONS = ['relu', 'tanh']

# Training settings
EPOCHS = 200  # Epochs run per model; only the lowest-validation-loss epoch stays on disk.
BATCH_SIZE = 128
LEARNING_RATE = 0.001
VOCAB_SIZE = len(vocabulary)

total_models = len(CONTEXT_LENGTHS) * len(EMBED_DIMS) * len(HIDDEN_LAYERS) * len(ACTIVATIONS)
print(f"\nStarting training for {total_models} model combinations...")
model_count = 0

# History and weights of the last model, kept for the analysis sections
last_model_history = {}
last_model_trained = None

for context in CONTEXT_LENGTHS:
    print(f"\n--- CONTEXT_LENGTH: {context} ---")

    # 1. Create Datasets and DataLoaders for this context size. They depend
    #    only on the context length, so they are shared by all models below.
    train_dataset = SlidingWindowDataset(train_tokens, context)
    val_dataset = SlidingWindowDataset(val_tokens, context)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    for embed in EMBED_DIMS:
        for hidden in HIDDEN_LAYERS:
            for act in ACTIVATIONS:
                model_count += 1
                start_time = time.time()
                model_name = f"model_ctx{context}_embed{embed}_hidden{hidden}_act_{act}"
                model_filename = os.path.join(MODEL_DIR, f"{model_name}.pth")

                print(f"Training Model {model_count}/{total_models}: {model_name}")

                # 2. Instantiate Model
                model = MLPTextGenerator(VOCAB_SIZE, embed, context, hidden, act).to(device)

                # 3. Setup Optimizer and Loss
                criterion = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

                best_val_loss = float('inf')
                best_val_acc = float('nan')
                best_epoch = 0  # 0 means "no checkpoint written yet"
                history = {'train_loss': [], 'val_loss': [], 'val_acc': []}

                for epoch in range(EPOCHS):
                    # --- Training ---
                    model.train()
                    train_loss_sum = 0.0
                    train_seen = 0
                    for inputs, targets in train_loader:
                        inputs, targets = inputs.to(device), targets.to(device)

                        optimizer.zero_grad()
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)
                        loss.backward()
                        optimizer.step()

                        # criterion returns the batch mean; weight it by the
                        # batch size so a smaller final batch does not skew
                        # the epoch average.
                        batch_n = targets.size(0)
                        train_loss_sum += loss.item() * batch_n
                        train_seen += batch_n

                    avg_train_loss = train_loss_sum / train_seen
                    history['train_loss'].append(avg_train_loss)

                    # --- Validation ---
                    model.eval()
                    val_loss_sum = 0.0
                    correct = 0
                    total = 0
                    with torch.no_grad():
                        for inputs, targets in val_loader:
                            inputs, targets = inputs.to(device), targets.to(device)
                            outputs = model(inputs)
                            loss = criterion(outputs, targets)

                            batch_n = targets.size(0)
                            val_loss_sum += loss.item() * batch_n
                            total += batch_n

                            predicted = outputs.argmax(dim=1)
                            correct += (predicted == targets).sum().item()

                    avg_val_loss = val_loss_sum / total
                    val_accuracy = 100 * correct / total
                    history['val_loss'].append(avg_val_loss)
                    history['val_acc'].append(val_accuracy)

                    if (epoch + 1) % 5 == 0:
                        print(f"  Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

                    # 4. Save the best model (lowest validation loss so far)
                    #    and remember the metrics of that same epoch.
                    if avg_val_loss < best_val_loss:
                        best_val_loss = avg_val_loss
                        best_val_acc = val_accuracy
                        best_epoch = epoch + 1
                        torch.save(model.state_dict(), model_filename)

                end_time = time.time()
                print(f"Finished training {model_name}. Time: {end_time - start_time:.2f}s. Model saved to {model_filename}")
                # Both figures describe the saved checkpoint, not the last epoch.
                print(f"  Best Validation Loss: {best_val_loss:.4f} (epoch {best_epoch})")
                print(f"  Validation Accuracy at best epoch: {best_val_acc:.2f}%")

                # Keep history and model of the last combination for analysis
                if model_count == total_models:
                    if best_epoch > 0:
                        # Restore the checkpointed weights so later analysis
                        # uses the same parameters that were saved to disk
                        # rather than the last-epoch weights.
                        model.load_state_dict(torch.load(model_filename, map_location=device))
                    last_model_history = history
                    last_model_trained = model

print("\nAll models trained and saved.")


#
# === 1.6: Example Predictions (from last model) ===
#
def generate_text(model, seed_text, num_words_to_gen, context_length):
    """Greedily extend ``seed_text`` by ``num_words_to_gen`` predicted words.

    At each step the last ``context_length`` words (seed plus words generated
    so far) are mapped to ids through the module-level ``word_to_int``,
    left-padded with '.' when there are too few, and fed to ``model``. The
    highest-scoring id is mapped back through ``int_to_word`` and appended.

    Args:
        model: Trained model mapping a (1, context_length) id tensor to
            logits. It is switched to eval mode.
        seed_text: Starting text; it is lower-cased and split on whitespace.
        num_words_to_gen: Number of words to append.
        context_length: Window size the model was built with.

    Returns:
        The lower-cased seed followed by the generated words, each joined by
        a single space.
    """
    model.eval()
    seed_lower = seed_text.lower()
    running_tokens = seed_lower.split()
    new_words = []

    # Build inputs on the device the model actually lives on; fall back to
    # the module-level `device` only if the model exposes no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    unk_int = word_to_int['<UNK>']

    with torch.no_grad():
        for _ in range(num_words_to_gen):
            # 1. Take the most recent words as context
            window = running_tokens[-context_length:]

            # 2. Left-pad with '.' if the seed is too short
            #    (a non-positive count yields an empty list, i.e. no padding)
            window = ['.'] * (context_length - len(window)) + window

            # 3. Convert to integers, mapping out-of-vocabulary words to <UNK>
            context_ints = [word_to_int.get(word, unk_int) for word in window]

            # 4. Predict
            X_input = torch.tensor([context_ints], dtype=torch.long, device=target_device)
            y_pred_logits = model(X_input)

            # 5. Get greedy prediction (argmax)
            y_pred_int = torch.argmax(y_pred_logits, dim=1).item()

            # 6. Convert back to word
            y_pred_word = int_to_word.get(y_pred_int, '<UNK>')

            # 7. Append
            new_words.append(y_pred_word)
            running_tokens.append(y_pred_word)

    # Join once at the end; the seed keeps its original spacing.
    return " ".join([seed_lower, *new_words])

print("\n--- Generation Examples (from last trained model) ---")
# The last model was trained with the final entry of the context-length grid.
last_context = CONTEXT_LENGTHS[-1]
for seed in ('holmes was a man of', 'the crime was committed at'):
    continuation = generate_text(last_model_trained, seed, 50, last_context)
    print(f"SEED: '{seed}'\n{continuation}\n")

#
# === 1.7: Loss Curves and Embedding Visualization (from last model) ===
#
print("--- Loss Plot (from last trained model) ---")
# One figure, two panels: loss curves on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(last_model_history['train_loss'], label='Training Loss')
loss_ax.plot(last_model_history['val_loss'], label='Validation Loss')
loss_ax.set_title('Training vs. Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(last_model_history['val_acc'], label='Validation Accuracy')
acc_ax.set_title('Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.legend()

fig.tight_layout()
plt.show()

print("\n--- t-SNE Embedding Visualization (from last trained model) ---")
# 1. Extract embedding weights as a NumPy array (one row per vocabulary entry)
embedding_weights = last_model_trained.embedding.weight.detach().cpu().numpy()
print(f"Embedding weights shape: {embedding_weights.shape}")

# 2. Select specific words to visualize, grouped by theme (one colour per group)
words_to_visualize = {
    'names': ['holmes', 'watson', 'adler', 'moriarty', 'lestrade'],
    'locations': ['london', 'baker', 'street', 'room', 'house', 'city'],
    'speech_verbs': ['said', 'observed', 'remarked', 'answered', 'cried', 'replied'],
    'evidence': ['pipe', 'cigar', 'dust', 'blood', 'footprint', 'clue'],
    'pronouns': ['i', 'he', 'she', 'you', 'his', 'my', 'your']
}

word_vectors = []
labels = []
colors = []
color_map = plt.get_cmap('tab10')

# Words missing from the vocabulary are skipped rather than plotted as <UNK>.
for i, word_list in enumerate(words_to_visualize.values()):
    for word in word_list:
        if word in word_to_int:
            word_vectors.append(embedding_weights[word_to_int[word]])
            labels.append(word)
            colors.append(color_map(i))

word_vectors = np.array(word_vectors)

if len(labels) < 2:
    print("Skipping t-SNE: fewer than two of the selected words are in the vocabulary.")
else:
    # 3. Apply t-SNE. scikit-learn requires perplexity < number of samples,
    #    so cap it when only a few of the selected words were found.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(10, len(labels) - 1))
    tsne_results = tsne.fit_transform(word_vectors)

    # 4. Plot
    plt.figure(figsize=(16, 12))
    plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=colors)

    # Label every point with its word
    for i, label in enumerate(labels):
        plt.annotate(label, (tsne_results[i, 0], tsne_results[i, 1]))

    plt.title('t-SNE Visualization of Sherlock Holmes Word Embeddings')
    plt.show()

#
# === 1.8: Download Artifacts ===
#
# Finally, zip the required files for download
print("Zipping artifacts...")
# NOTE: the leading "!" is an IPython/Colab shell escape, so this line only
# works inside a notebook, not when the file is run as a plain Python script.
# It recursively archives the trained_models/ directory together with
# vocab.pkl into a single zip file in the working directory.
!zip -r sherlock_models_and_vocab.zip trained_models/ vocab.pkl

# Remind the user of the manual follow-up steps.
print("\n--- ACTION REQUIRED ---")
print("Training complete. Please download 'sherlock_models_and_vocab.zip' from the Colab file browser.")
print("You will also need the Streamlit 'app.py' script from Part 2.")

Using device: cuda
Downloading Sherlock Holmes corpus...
Corpus preprocessing complete.
Total tokens in corpus: 100339
Final vocabulary size: 10000

--- Vocabulary Report ---
Vocabulary Size: 10000
10 Most Frequent Words: [('.', 0), ('the', 4899), ('and', 2679), ('i', 2666), ('of', 2464), ('to', 2440), ('a', 2369), ('in', 1621), ('that', 1530), ('it', 1412), ('you', 1299)]
10 Least Frequent Words (in vocab): [('grating.', 1), ('facetowards', 1), ('coarsely', 1), ('acoloured', 1), ('grime', 1), ('whichcovered', 1), ('ugliness.', 1), ('broadwheal', 1), ('itscontraction', 1), ('threeteeth', 1)]

Saved 'vocab.pkl'. This is needed for the Streamlit app.
Train tokens: 90305, Validation tokens: 10034

Starting training for 16 model combinations...

--- CONTEXT_LENGTH: 6 ---
Training Model 1/16: model_ctx6_embed64_hidden1_act_relu
  Epoch 5/200, Train Loss: 1.5684, Val Loss: 9.4968, Val Acc: 8.36%
