In [1]:
# Basic imports and utilities
import os
import re
import json
import random
from collections import Counter, defaultdict
import numpy as np
import matplotlib.pyplot as plt
import math
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import requests

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Read and preprocess the text
with open('Sherlock.csv', 'r', encoding='utf-8') as f:
    text = f.read()

# Remove special characters except alphanumeric, space, and full stop
cleaned_text = re.sub('[^a-zA-Z0-9 \.]', ' ', text)

# Convert to lowercase
cleaned_text = cleaned_text.lower()

# Split into sentences using full stop
sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]

# Extract all words
words = []
for sentence in sentences:
    words.extend(sentence.split())

# Create vocabulary from unique words
vocabulary = set(words)
vocab_size = len(vocabulary)

  cleaned_text = re.sub('[^a-zA-Z0-9 \.]', ' ', text)


In [3]:
# Create word to index and index to word mappings
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

print(f"Created mappings for {len(word_to_idx)} words")

Created mappings for 8150 words


In [6]:
# Create training sequences (context window = 3 words to predict next word)
def create_sequences(words, window_size=3):
    sequences = []
    targets = []
    
    for i in range(len(words) - window_size):
        seq = words[i:i+window_size]
        target = words[i+window_size]
        
        # Convert to indices
        seq_indices = [word_to_idx[word] for word in seq]
        target_idx = word_to_idx[target]
        
        sequences.append(seq_indices)
        targets.append(target_idx)
    
    return np.array(sequences), np.array(targets)

# Create sequences
window_size = 3
X, y = create_sequences(words, window_size)
X = torch.tensor(X).to(device)
y = torch.tensor(y).to(device)

print(f"Created {len(X)} training sequences")
print(f"Sequence shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

Created 109145 training sequences
Sequence shape: torch.Size([109145, 3])
Target shape: torch.Size([109145])

Training samples: 87316
Validation samples: 21829


In [7]:
# Define MLP Text Generator Model with configurable activation function
class MLPTextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, window_size, 
                 activation_fn, dropout_prob=0.3):
        super(MLPTextGenerator, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.input_dim = embedding_dim * window_size
        
        # Map string to actual activation function
        self.activation = self._get_activation(activation_fn)
        
        # MLP layers
        self.fc1 = nn.Linear(self.input_dim, hidden_dim)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
    
    def _get_activation(self, activation_fn):
        """Return activation function module given string name."""
        activations = {
            'relu': nn.ReLU(),
            'tanh': nn.Tanh(),
        }
        if activation_fn.lower() not in activations:
            raise ValueError(f"Unsupported activation function: {activation_fn}")
        return activations[activation_fn.lower()]
    
    def forward(self, x):
        # x shape: (batch_size, window_size)
        batch_size = x.size(0)
        
        # Embedding lookup
        embedded = self.embedding(x)  # (batch_size, window_size, embedding_dim)
        
        # Flatten embeddings
        embedded = embedded.view(batch_size, -1)  # (batch_size, window_size * embedding_dim)
        
        # MLP forward
        h1 = self.dropout1(self.activation(self.fc1(embedded)))
        output = self.fc2(h1)
        
        return output




In [8]:
# Initialize model
model1 = MLPTextGenerator(vocab_size, embedding_dim=32, hidden_dim=1024, window_size=3, activation_fn='tanh')
model1 = model1.to(device)

# Count parameters
total_params = sum(p.numel() for p in model1.parameters())
trainable_params = sum(p.numel() for p in model1.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model1}")

Total parameters: 8,713,878
Trainable parameters: 8,713,878

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(8150, 32)
  (activation): Tanh()
  (fc1): Linear(in_features=96, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=8150, bias=True)
)


In [9]:
model2 = MLPTextGenerator(vocab_size, embedding_dim=64, hidden_dim=1024, window_size=3, activation_fn='tanh')
model2 = model2.to(device)

# Count parameters
total_params = sum(p.numel() for p in model2.parameters())
trainable_params = sum(p.numel() for p in model2.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model2}")

Total parameters: 9,072,982
Trainable parameters: 9,072,982

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(8150, 64)
  (activation): Tanh()
  (fc1): Linear(in_features=192, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=8150, bias=True)
)


In [10]:
# Training function
def train_epoch(model, X_train, y_train, optimizer, criterion, batch_size=128):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    # Shuffle training data
    indices = np.random.permutation(len(X_train))
    X_train_shuffled = X_train[indices]
    y_train_shuffled = y_train[indices]
    
    num_batches = len(X_train) // batch_size
    
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        
        batch_X = torch.LongTensor(X_train_shuffled[start_idx:end_idx]).to(device)
        batch_y = torch.LongTensor(y_train_shuffled[start_idx:end_idx]).to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    
    avg_loss = total_loss / num_batches
    accuracy = 100 * correct / total
    return avg_loss, accuracy

# Validation function
def validate(model, X_val, y_val, criterion, batch_size=128):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    num_batches = len(X_val) // batch_size
    
    with torch.no_grad():
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size
            
            batch_X = torch.LongTensor(X_val[start_idx:end_idx]).to(device)
            batch_y = torch.LongTensor(y_val[start_idx:end_idx]).to(device)
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            
            total_loss += loss.item()
            
            # Calculate accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    
    avg_loss = total_loss / num_batches
    accuracy = 100 * correct / total
    return avg_loss, accuracy

print("Training and validation functions defined")

Training and validation functions defined


In [11]:
models=[model1, model2]

In [14]:
# Training configuration
i=1
for model in models:

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 500
    batch_size = 16384

    # Lists to store metrics
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    print("Starting training...")
    print(f"{'='*60}")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Train
        train_loss, train_acc = train_epoch(model, X_train.cpu().numpy(), y_train.cpu().numpy(), optimizer, criterion, batch_size)
        
        # Validate
        val_loss, val_acc = validate(model, X_val.cpu().numpy(), y_val.cpu().numpy(), criterion, batch_size)
        
        # Store metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        
        
        # Print progress
        if (epoch + 1) % 50 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}]")
            print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
            print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.2f}%")
            print(f"{'='*60}")

    print("\nTraining completed!")
    print(f"Best validation loss: {best_val_loss:.4f}")
    torch.save(model.state_dict(), f'mlp_text_model_dataset1var{i}.pth')
    i+=1

Starting training...


Epoch [50/500]
  Train Loss: 0.9398 | Train Acc: 73.05%
  Val Loss:   9.3789 | Val Acc:   12.81%
Epoch [100/500]
  Train Loss: 0.9183 | Train Acc: 73.47%
  Val Loss:   9.4202 | Val Acc:   12.79%
Epoch [150/500]
  Train Loss: 0.9174 | Train Acc: 73.43%
  Val Loss:   9.4566 | Val Acc:   12.77%
Epoch [200/500]
  Train Loss: 0.8882 | Train Acc: 74.33%
  Val Loss:   9.4795 | Val Acc:   12.87%
Epoch [250/500]
  Train Loss: 0.8827 | Train Acc: 74.55%
  Val Loss:   9.5119 | Val Acc:   12.81%
Epoch [300/500]
  Train Loss: 0.8672 | Train Acc: 74.91%
  Val Loss:   9.5310 | Val Acc:   12.62%
Epoch [350/500]
  Train Loss: 0.8589 | Train Acc: 75.07%
  Val Loss:   9.5476 | Val Acc:   12.78%
Epoch [400/500]
  Train Loss: 0.8455 | Train Acc: 75.43%
  Val Loss:   9.5632 | Val Acc:   12.61%
Epoch [450/500]
  Train Loss: 0.8319 | Train Acc: 75.81%
  Val Loss:   9.5784 | Val Acc:   12.70%
Epoch [500/500]
  Train Loss: 0.8151 | Train Acc: 76.32%
  Val Loss:   9.5843 | Val Acc:   12.63%

Training completed!


In [15]:
# Create training sequences (context window = 3 words to predict next word)
def create_sequences(words, window_size):
    sequences = []
    targets = []
    
    for i in range(len(words) - window_size):
        seq = words[i:i+window_size]
        target = words[i+window_size]
        
        # Convert to indices
        seq_indices = [word_to_idx[word] for word in seq]
        target_idx = word_to_idx[target]
        
        sequences.append(seq_indices)
        targets.append(target_idx)
    
    return np.array(sequences), np.array(targets)

# Create sequences
window_size = 5
X, y = create_sequences(words, window_size)
X = torch.tensor(X).to(device)
y = torch.tensor(y).to(device)

print(f"Created {len(X)} training sequences")
print(f"Sequence shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

Created 109143 training sequences
Sequence shape: torch.Size([109143, 5])
Target shape: torch.Size([109143])

Training samples: 87314
Validation samples: 21829


In [18]:
model3 = MLPTextGenerator(vocab_size, embedding_dim=64, hidden_dim=1024, window_size=5, activation_fn='relu')
model3 = model3.to(device)

# Count parameters
total_params = sum(p.numel() for p in model3.parameters())
trainable_params = sum(p.numel() for p in model3.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model3}")

Total parameters: 9,204,054
Trainable parameters: 9,204,054

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(8150, 64)
  (activation): ReLU()
  (fc1): Linear(in_features=320, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=8150, bias=True)
)


In [16]:
model4 = MLPTextGenerator(vocab_size, embedding_dim=64, hidden_dim=1024, window_size=5, activation_fn='tanh')
model4 = model4.to(device)

# Count parameters
total_params = sum(p.numel() for p in model4.parameters())
trainable_params = sum(p.numel() for p in model4.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model4}")

Total parameters: 9,204,054
Trainable parameters: 9,204,054

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(8150, 64)
  (activation): Tanh()
  (fc1): Linear(in_features=320, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=8150, bias=True)
)


In [17]:
model5= MLPTextGenerator(vocab_size, embedding_dim=32, hidden_dim=1024, window_size=5, activation_fn='relu')
model5 = model5.to(device)
# Count parameters
total_params = sum(p.numel() for p in model5.parameters())
trainable_params = sum(p.numel() for p in model5.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model5}")

Total parameters: 8,779,414
Trainable parameters: 8,779,414

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(8150, 32)
  (activation): ReLU()
  (fc1): Linear(in_features=160, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=8150, bias=True)
)


In [19]:
models=[model3, model4, model5]

In [21]:
# Training configuration
i=3
for model in models:

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 500
    batch_size = 16384

    # Lists to store metrics
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    print("Starting training...")
    print(f"{'='*60}")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Train
        train_loss, train_acc = train_epoch(model, X_train.cpu().numpy(), y_train.cpu().numpy(), optimizer, criterion, batch_size)
        
        # Validate
        val_loss, val_acc = validate(model, X_val.cpu().numpy(), y_val.cpu().numpy(), criterion, batch_size)
        
        # Store metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        
        
        # Print progress
        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}]")
            print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
            print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.2f}%")
            print(f"{'='*60}")

    print("\nTraining completed!")
    print(f"Best validation loss: {best_val_loss:.4f}")
    torch.save(model.state_dict(), f'mlp_text_model_dataset1var{i}.pth')
    i+=1

Starting training...


Epoch [100/500]
  Train Loss: 1.1722 | Train Acc: 68.34%
  Val Loss:   8.8488 | Val Acc:   11.70%
Epoch [200/500]
  Train Loss: 0.6006 | Train Acc: 82.31%
  Val Loss:   9.9535 | Val Acc:   11.38%
Epoch [300/500]
  Train Loss: 0.3525 | Train Acc: 89.20%
  Val Loss:   10.9073 | Val Acc:   11.43%
Epoch [400/500]
  Train Loss: 0.2327 | Train Acc: 92.87%
  Val Loss:   11.6805 | Val Acc:   11.19%
Epoch [500/500]
  Train Loss: 0.1641 | Train Acc: 94.91%
  Val Loss:   12.3823 | Val Acc:   11.22%

Training completed!
Best validation loss: inf
Starting training...
Epoch [100/500]
  Train Loss: 1.5554 | Train Acc: 61.21%
  Val Loss:   7.2607 | Val Acc:   11.02%
Epoch [200/500]
  Train Loss: 0.7210 | Train Acc: 79.91%
  Val Loss:   8.3111 | Val Acc:   11.02%
Epoch [300/500]
  Train Loss: 0.4029 | Train Acc: 88.05%
  Val Loss:   9.1469 | Val Acc:   11.11%
Epoch [400/500]
  Train Loss: 0.2672 | Train Acc: 92.03%
  Val Loss:   9.7975 | Val Acc:   11.19%
Epoch [500/500]
  Train Loss: 0.2014 | Train Ac