In [4]:
# Basic imports and utilities
import os
import re
import json
import random
from collections import Counter, defaultdict
import numpy as np
import matplotlib.pyplot as plt
import math
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import requests

In [5]:
# Ask PyTorch to drop any GPU memory it is still caching (e.g. left over from
# an earlier run in this kernel) before the models below are allocated.
torch.cuda.empty_cache()

In [6]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [7]:
# Read and preprocess the text
with open('Linux.csv', 'r', encoding='utf-8') as f:
    text = f.read()

# Split into "sentences": one entry per non-blank line of the file
sentences = [s.strip() for s in text.split('\n') if s.strip()]

# Extract all whitespace-separated tokens, in corpus order
words = []
for sentence in sentences:
    words.extend(sentence.split())

# Create vocabulary from unique words.
# Kept as a *sorted list* rather than a bare set: the iteration order of a set
# of strings can differ between Python processes (string hashing is salted),
# and both the pickled vocabulary below and any word -> index mapping built by
# enumerating `vocabulary` inherit that order. Sorting makes the ids stable
# across kernel restarts, so saved model weights keep their meaning.
vocabulary = sorted(set(words))
vocab_size = len(vocabulary)

# Count word frequencies
word_counts = Counter(words)

# Get most and least frequent words
most_frequent = word_counts.most_common(10)
least_frequent = word_counts.most_common()[:-11:-1]  # Last 10 in reverse order

# Report statistics
print(f"Vocabulary Size: {vocab_size}")
print(f"\nTotal words in corpus: {len(words)}")
print(f"\n{'='*50}")
print("10 Most Frequent Words:")
print(f"{'='*50}")
for word, count in most_frequent:
    print(f"{word:20s} : {count:6d}")

print(f"\n{'='*50}")
print("10 Least Frequent Words:")
print(f"{'='*50}")
for word, count in least_frequent:
    print(f"{word:20s} : {count:6d}")

# Save vocabulary and word counts for later use.
# 'vocabulary' is stored in the same (sorted) order used for index assignment.
vocab_data = {
    'vocabulary': list(vocabulary),
    'word_counts': dict(word_counts),
    'vocab_size': vocab_size
}

with open('vocab_data1.pkl', 'wb') as f:
    pickle.dump(vocab_data, f)

print(f"\n{'='*50}")
print("Vocabulary data saved to 'vocab_data1.pkl'")

Vocabulary Size: 113644

Total words in corpus: 759639

10 Most Frequent Words:
*                    :  33504
=                    :  28003
{                    :  18915
if                   :  17702
}                    :  16965
the                  :  16080
*/                   :  13445
/*                   :  12190
struct               :  10997
return               :  10130

10 Least Frequent Words:
context_tracking_init(void) :      1
CONFIG_CONTEXT_TRACKING_FORCE :      1
set_tsk_thread_flag(next, :      1
clear_tsk_thread_flag(prev, :      1
__context_tracking_task_switch(struct :      1
TIF                  :      1
syscalls.            :      1
user-kernel          :      1
__context_tracking_task_switch :      1
NOKPROBE_SYMBOL(context_tracking_user_exit); :      1

Vocabulary data saved to 'vocab_data1.pkl'


In [8]:
# Build the two lookup tables between words and integer ids.
# Ids are handed out in the iteration order of `vocabulary`.
word_to_idx = dict(zip(vocabulary, range(len(vocabulary))))
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))

print(f"Created mappings for {len(word_to_idx)} words")

Created mappings for 113644 words


In [9]:
# Create training sequences (context window = 3 words to predict next word)
def create_sequences(words, window_size=3, mapping=None):
    """Build (context, next-word) index arrays by sliding a window over ``words``.

    Sample ``i`` uses ``words[i:i + window_size]`` as context and
    ``words[i + window_size]`` as the target.

    Args:
        words: Sequence of tokens (must support ``len``).
        window_size: Number of consecutive tokens that form one context.
        mapping: Optional token -> integer id dict. When omitted, the
            module-level ``word_to_idx`` is used.

    Returns:
        Tuple ``(X, y)`` of int64 NumPy arrays. ``X`` has shape
        ``(n, window_size)`` and ``y`` has shape ``(n,)`` with
        ``n = max(len(words) - window_size, 0)``; the shapes and dtype hold
        even when there are too few words to form a single sample.

    Raises:
        KeyError: If a token in ``words`` is missing from the mapping.
    """
    if mapping is None:
        mapping = word_to_idx

    # Translate every token exactly once instead of once per window it appears in.
    token_ids = np.fromiter(
        (mapping[token] for token in words), dtype=np.int64, count=len(words)
    )
    n_samples = max(len(token_ids) - window_size, 0)

    # Column k holds the k-th context word of every sample.
    sequences = np.empty((n_samples, window_size), dtype=np.int64)
    for k in range(window_size):
        sequences[:, k] = token_ids[k:k + n_samples]

    # The target is the token immediately after each window.
    targets = token_ids[window_size:window_size + n_samples]

    return sequences, targets

# Build the dataset for a 3-word context and place both arrays on `device`
window_size = 3
X, y = create_sequences(words, window_size)
X, y = torch.tensor(X).to(device), torch.tensor(y).to(device)

print(f"Created {len(X)} training sequences")
print(f"Sequence shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Hold out 20% of the samples for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

Created 759636 training sequences
Sequence shape: torch.Size([759636, 3])
Target shape: torch.Size([759636])

Training samples: 607708
Validation samples: 151928


In [10]:
# Define MLP Text Generator Model with configurable activation function
class MLPTextGenerator(nn.Module):
    """Next-word predictor: embed a fixed window of word ids, then apply a one-hidden-layer MLP.

    The ``window_size`` embeddings are concatenated into a single vector of
    width ``embedding_dim * window_size`` (exposed as ``input_dim``), passed
    through ``fc1`` -> activation -> dropout, and projected by ``fc2`` to one
    logit per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, window_size,
                 activation_fn, dropout_prob=0.3):
        """Create the layers.

        Args:
            vocab_size: Number of distinct word ids (embedding rows and output logits).
            embedding_dim: Size of each word embedding.
            hidden_dim: Width of the hidden layer.
            window_size: Number of context words fed to the model.
            activation_fn: Case-insensitive name of the hidden activation
                (see ``_get_activation`` for the accepted names).
            dropout_prob: Dropout probability applied after the activation.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Width of the concatenated window of embeddings
        self.input_dim = embedding_dim * window_size

        # Resolve the activation name to a module
        self.activation = self._get_activation(activation_fn)

        # MLP layers
        self.fc1 = nn.Linear(self.input_dim, hidden_dim)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def _get_activation(self, activation_fn):
        """Instantiate the activation module named by ``activation_fn``.

        Raises:
            ValueError: If the (lower-cased) name is not one of the supported keys.
        """
        # Name -> zero-argument factory, so only the requested module is built
        factories = {
            'relu': nn.ReLU,
            'tanh': nn.Tanh,
            'leakyrelu': lambda: nn.LeakyReLU(0.01),
            'gelu': nn.GELU,
            'sigmoid': nn.Sigmoid,
            'elu': nn.ELU,
            'selu': nn.SELU,
            'none': nn.Identity,
        }
        factory = factories.get(activation_fn.lower())
        if factory is None:
            raise ValueError(f"Unsupported activation function: {activation_fn}")
        return factory()

    def forward(self, x):
        """Map word ids of shape (batch_size, window_size) to logits of shape (batch_size, vocab_size)."""
        # (batch_size, window_size, embedding_dim) -> (batch_size, window_size * embedding_dim)
        flat = self.embedding(x).view(x.size(0), -1)

        # Hidden layer: linear -> activation -> dropout
        hidden = self.fc1(flat)
        hidden = self.activation(hidden)
        hidden = self.dropout1(hidden)

        # One logit per vocabulary entry
        return self.fc2(hidden)




In [11]:
# Initialize model 1: 3-word window, 32-dim embeddings, tanh activation
model1 = MLPTextGenerator(
    vocab_size,
    embedding_dim=32,
    hidden_dim=1024,
    window_size=3,
    activation_fn='tanh',
).to(device)

# Count parameters: all of them, and the subset that requires gradients
param_counts = [(p.numel(), p.requires_grad) for p in model1.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model1}")

Total parameters: 120,221,036
Trainable parameters: 120,221,036

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(113644, 32)
  (activation): Tanh()
  (fc1): Linear(in_features=96, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=113644, bias=True)
)


In [12]:
# Model 2: 3-word window, 64-dim embeddings, tanh activation
model2 = MLPTextGenerator(
    vocab_size,
    embedding_dim=64,
    hidden_dim=1024,
    window_size=3,
    activation_fn='tanh',
).to(device)

# Count parameters: all of them, and the subset that requires gradients
param_counts = [(p.numel(), p.requires_grad) for p in model2.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model2}")

Total parameters: 123,955,948
Trainable parameters: 123,955,948

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(113644, 64)
  (activation): Tanh()
  (fc1): Linear(in_features=192, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=113644, bias=True)
)


In [13]:
# Model 3: 5-word window, 64-dim embeddings, ReLU activation
model3 = MLPTextGenerator(
    vocab_size,
    embedding_dim=64,
    hidden_dim=1024,
    window_size=5,
    activation_fn='relu',
).to(device)

# Count parameters: all of them, and the subset that requires gradients
param_counts = [(p.numel(), p.requires_grad) for p in model3.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model3}")

Total parameters: 124,087,020
Trainable parameters: 124,087,020

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(113644, 64)
  (activation): ReLU()
  (fc1): Linear(in_features=320, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=113644, bias=True)
)


In [14]:
# Model 4: 5-word window, 64-dim embeddings, tanh activation
model4 = MLPTextGenerator(
    vocab_size,
    embedding_dim=64,
    hidden_dim=1024,
    window_size=5,
    activation_fn='tanh',
).to(device)

# Count parameters: all of them, and the subset that requires gradients
param_counts = [(p.numel(), p.requires_grad) for p in model4.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model4}")

Total parameters: 124,087,020
Trainable parameters: 124,087,020

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(113644, 64)
  (activation): Tanh()
  (fc1): Linear(in_features=320, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=113644, bias=True)
)


In [15]:
# Model 5: 5-word window, 32-dim embeddings, ReLU activation
model5 = MLPTextGenerator(
    vocab_size,
    embedding_dim=32,
    hidden_dim=1024,
    window_size=5,
    activation_fn='relu',
).to(device)

# Count parameters: all of them, and the subset that requires gradients
param_counts = [(p.numel(), p.requires_grad) for p in model5.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, needs_grad in param_counts if needs_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nModel architecture:\n{model5}")

Total parameters: 120,286,572
Trainable parameters: 120,286,572

Model architecture:
MLPTextGenerator(
  (embedding): Embedding(113644, 32)
  (activation): ReLU()
  (fc1): Linear(in_features=160, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=1024, out_features=113644, bias=True)
)


In [16]:
# Training function
def train_epoch(model, X_train, y_train, optimizer, criterion, batch_size=128):
    """Run one optimisation pass over the training set in shuffled mini-batches.

    Every sample is used exactly once per call: the final batch may be smaller
    than ``batch_size`` rather than being dropped, and a dataset smaller than
    one batch is processed as a single batch.

    Args:
        model: Module mapping a ``(batch, window)`` long tensor to
            ``(batch, n_classes)`` logits. Batches are sent to the device that
            holds the model's parameters.
        X_train: Indexable array of word-id windows, one row per sample.
        y_train: Indexable array of target word ids, aligned with ``X_train``.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss taking ``(logits, targets)``; assumed to return the
            mean over the batch (e.g. ``nn.CrossEntropyLoss()`` defaults).
        batch_size: Maximum number of samples per optimisation step.

    Returns:
        ``(avg_loss, accuracy)``: the per-sample mean loss over the epoch and
        the top-1 accuracy in percent.

    Raises:
        ValueError: If ``X_train`` is empty.
    """
    n_samples = len(X_train)
    if n_samples == 0:
        raise ValueError("X_train is empty; nothing to train on")

    model.train()
    # Follow the model instead of a notebook-level global so the two cannot disagree.
    model_device = next(model.parameters()).device

    total_loss = 0.0
    correct = 0

    # Shuffle training data
    indices = np.random.permutation(n_samples)
    X_train_shuffled = X_train[indices]
    y_train_shuffled = y_train[indices]

    for start_idx in range(0, n_samples, batch_size):
        end_idx = start_idx + batch_size

        batch_X = torch.as_tensor(X_train_shuffled[start_idx:end_idx], dtype=torch.long).to(model_device)
        batch_y = torch.as_tensor(y_train_shuffled[start_idx:end_idx], dtype=torch.long).to(model_device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so a short final batch
        # contributes in proportion to the samples it holds.
        n_batch = batch_y.size(0)
        total_loss += loss.item() * n_batch

        # Calculate accuracy
        predicted = outputs.argmax(dim=1)
        correct += (predicted == batch_y).sum().item()

    avg_loss = total_loss / n_samples
    accuracy = 100 * correct / n_samples
    return avg_loss, accuracy

# Validation function
def validate(model, X_val, y_val, criterion, batch_size=128):
    """Score ``model`` on a held-out set without updating it.

    Every sample is evaluated: the final batch may be smaller than
    ``batch_size`` rather than being dropped, and a dataset smaller than one
    batch is processed as a single batch.

    Args:
        model: Module mapping a ``(batch, window)`` long tensor to
            ``(batch, n_classes)`` logits. Batches are sent to the device that
            holds the model's parameters.
        X_val: Indexable array of word-id windows, one row per sample.
        y_val: Indexable array of target word ids, aligned with ``X_val``.
        criterion: Loss taking ``(logits, targets)``; assumed to return the
            mean over the batch (e.g. ``nn.CrossEntropyLoss()`` defaults).
        batch_size: Maximum number of samples per forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the per-sample mean loss and the top-1
        accuracy in percent.

    Raises:
        ValueError: If ``X_val`` is empty.
    """
    n_samples = len(X_val)
    if n_samples == 0:
        raise ValueError("X_val is empty; nothing to validate on")

    model.eval()
    # Follow the model instead of a notebook-level global so the two cannot disagree.
    model_device = next(model.parameters()).device

    total_loss = 0.0
    correct = 0

    with torch.no_grad():
        for start_idx in range(0, n_samples, batch_size):
            end_idx = start_idx + batch_size

            batch_X = torch.as_tensor(X_val[start_idx:end_idx], dtype=torch.long).to(model_device)
            batch_y = torch.as_tensor(y_val[start_idx:end_idx], dtype=torch.long).to(model_device)

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            # Weight the batch-mean loss by the batch size so a short final
            # batch contributes in proportion to the samples it holds.
            n_batch = batch_y.size(0)
            total_loss += loss.item() * n_batch

            # Calculate accuracy
            predicted = outputs.argmax(dim=1)
            correct += (predicted == batch_y).sum().item()

    avg_loss = total_loss / n_samples
    accuracy = 100 * correct / n_samples
    return avg_loss, accuracy

print("Training and validation functions defined")

Training and validation functions defined


In [17]:
# Models to train, in order; each one's 1-based position is used in its checkpoint file name.
models = [
    model1,
    model2,
    model3,
    model4,
    model5,
]

In [None]:
# Training configuration (identical for every model)
num_epochs = 300
batch_size = 32768

# `i` is the 1-based position of the model in `models`; it names the checkpoint file.
for i, model in enumerate(models, start=1):

    # Recover the context length this model was built for from its layer sizes
    embedding_dim = model.embedding.embedding_dim
    window_size_model = model.input_dim // embedding_dim

    # Recreate sequences for the current model's window size
    X, y = create_sequences(words, window_size_model)
    X = torch.tensor(X).to(device)
    y = torch.tensor(y).to(device)

    # Split into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Host-side NumPy copies handed to train_epoch/validate; made once per
    # model instead of being re-copied off the device on every epoch.
    X_train_np, y_train_np = X_train.cpu().numpy(), y_train.cpu().numpy()
    X_val_np, y_val_np = X_val.cpu().numpy(), y_val.cpu().numpy()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Lists to store metrics (reset for each model)
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    print("Starting training...")
    print(f"{'='*60}")
    print(f"Training model with window size: {window_size_model}")
    print(f"{'='*60}")

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Train
        train_loss, train_acc = train_epoch(model, X_train_np, y_train_np, optimizer, criterion, batch_size)

        # Validate
        val_loss, val_acc = validate(model, X_val_np, y_val_np, criterion, batch_size)

        # Store metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        # Keep the lowest validation loss seen so far, so the summary printed
        # after training reports a real value instead of the initial `inf`.
        if val_loss < best_val_loss:
            best_val_loss = val_loss

        # Print progress
        if (epoch + 1) % 25 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}]")
            print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
            print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.2f}%")
            print(f"{'='*60}")

    print("\nTraining completed!")
    print(f"Best validation loss: {best_val_loss:.4f}")
    # Saves the weights as they are after the final epoch (not the best-validation epoch)
    torch.save(model.state_dict(), f'mlp_text_model_dataset2var{i}.pth')

Starting training...
Training model with window size: 3
Epoch [25/300]
  Train Loss: 3.0642 | Train Acc: 39.22%
  Val Loss:   7.0935 | Val Acc:   21.74%
Epoch [50/300]
  Train Loss: 2.4739 | Train Acc: 46.51%
  Val Loss:   7.4066 | Val Acc:   24.48%
Epoch [75/300]
  Train Loss: 2.1240 | Train Acc: 51.72%
  Val Loss:   7.6499 | Val Acc:   26.11%
Epoch [100/300]
  Train Loss: 1.8860 | Train Acc: 55.90%
  Val Loss:   7.8765 | Val Acc:   27.10%
Epoch [125/300]
  Train Loss: 1.7221 | Train Acc: 59.02%
  Val Loss:   8.0780 | Val Acc:   27.66%
Epoch [150/300]
  Train Loss: 1.5929 | Train Acc: 61.67%
  Val Loss:   8.2562 | Val Acc:   28.09%
Epoch [175/300]
  Train Loss: 1.4993 | Train Acc: 63.62%
  Val Loss:   8.4058 | Val Acc:   28.31%
Epoch [200/300]
  Train Loss: 1.4242 | Train Acc: 65.26%
  Val Loss:   8.5469 | Val Acc:   28.52%
Epoch [225/300]
  Train Loss: 1.3648 | Train Acc: 66.55%
  Val Loss:   8.6568 | Val Acc:   28.73%
Epoch [250/300]
  Train Loss: 1.3174 | Train Acc: 67.59%
  Val Lo