<a href="https://colab.research.google.com/github/vvamsi91/RTML_AS5/blob/main/rtml_as5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchinfo

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time
import numpy as np
import torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks. Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. 
This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants. In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Slice the text into fixed-size windows; the character right after each window is its label
max_length = 10  # Number of characters in every input window
labels = list(text[max_length:])  # label k is the character that follows window k
sequences = [text[k:k + max_length] for k in range(len(labels))]  # window k covers text[k : k + max_length]


In [None]:
# Character vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(text))
char_to_ix = dict(zip(chars, range(len(chars))))  # character -> integer id (its position in `chars`)

# Encode windows and labels as integer tensors
X = torch.tensor(
    [list(map(char_to_ix.__getitem__, seq)) for seq in sequences],
    dtype=torch.long,
)  # one row of character ids per window
y = torch.tensor(list(map(char_to_ix.__getitem__, labels)), dtype=torch.long)  # one id per window

# Hold out 20% of the windows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Transformer-encoder model for next-character classification
class CharTransformer(nn.Module):
    """Predict the next character id from a window of character ids.

    Pipeline: embedding -> stack of Transformer encoder layers -> linear head
    applied to the encoder output at the last sequence position.

    Args:
        input_size: number of distinct character ids (embedding rows).
        hidden_size: embedding width, also used as the encoder's d_model.
        output_size: number of classes produced by the linear head.
        num_layers: number of stacked encoder layers.
        nhead: number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)  # Character embedding layer
        # batch_first=True is required: forward() feeds (batch, seq_len, hidden) tensors.
        # With the default (batch_first=False) the encoder would treat dim 0 -- the batch --
        # as the sequence axis, so attention would mix unrelated samples and the final
        # slice would only ever see the last character of each window.
        encoder_layer = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)  # Fully connected layer for output

    def forward(self, x):
        """Map integer ids of shape (batch, seq_len) to logits of shape (batch, output_size)."""
        # NOTE(review): no positional encoding is added to the embeddings, so attention
        # has no explicit notion of character order -- consider adding one.
        embedded = self.embedding(x)                 # (batch, seq_len, hidden)
        encoded = self.transformer_encoder(embedded)  # (batch, seq_len, hidden)
        output = self.fc(encoded[:, -1, :])           # use the last position of every window
        return output

# Model dimensions
hidden_size = 128
num_layers = 3
nhead = 2

# Optimisation settings
learning_rate = 0.001
epochs = 50

# Input and output sizes both equal the vocabulary size
model = CharTransformer(
    input_size=len(chars),
    hidden_size=hidden_size,
    output_size=len(chars),
    num_layers=num_layers,
    nhead=nhead,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Layer-by-layer summary, traced with the full training tensor as example input
print(torchinfo.summary(model, input_data=X_train))




Layer (type:depth-idx)                        Output Shape              Param #
CharTransformer                               [1900, 44]                --
├─Embedding: 1-1                              [1900, 10, 128]           5,632
├─TransformerEncoder: 1-2                     [1900, 10, 128]           --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1900, 10, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-2      [1900, 10, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-3      [1900, 10, 128]           593,024
├─Linear: 1-3                                 [1900, 44]                5,676
Total params: 1,790,380
Trainable params: 1,790,380
Non-trainable params: 0
Total mult-adds (G): 3.03
Input size (MB): 0.15
Forward/backward pass size (MB): 1129.12
Params size (MB): 6.37
Estimated Total Size (MB): 1135.64


In [None]:
# Full-batch training: each epoch performs one optimizer step on all of X_train
total_start_time = time.time()  # reference point for the whole run
for epoch in range(epochs):
    start_time = time.time()  # reference point for this epoch

    # --- optimisation step ---
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # --- validation pass: evaluation mode, gradients disabled ---
    model.eval()
    with torch.no_grad():
        val_output = model(X_val)
        val_loss = criterion(val_output, y_val)
        _, predicted = val_output.max(dim=1)  # index of the largest logit per sample
        val_accuracy = predicted.eq(y_val).float().mean()  # fraction of correct predictions

    # Only report on every fifth epoch
    if (epoch + 1) % 5 != 0:
        continue
    end_time = time.time()
    execution_time = end_time - start_time  # spans this epoch's training step and validation pass
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}, Execution Time: {execution_time} seconds')

total_end_time = time.time()
total_execution_time = total_end_time - total_start_time  # wall-clock time of all epochs
print(f'Total Execution Time: {total_execution_time} seconds')


Epoch 5, Loss: 2.9149386882781982, Validation Loss: 2.8146414756774902, Validation Accuracy: 0.18907563388347626, Execution Time: 17.4826078414917 seconds
Epoch 10, Loss: 2.628356456756592, Validation Loss: 2.5621261596679688, Validation Accuracy: 0.25, Execution Time: 17.320786476135254 seconds
Epoch 15, Loss: 2.4864165782928467, Validation Loss: 2.4559226036071777, Validation Accuracy: 0.2710084021091461, Execution Time: 17.466740131378174 seconds
Epoch 20, Loss: 2.4133033752441406, Validation Loss: 2.4013073444366455, Validation Accuracy: 0.2899159789085388, Execution Time: 18.022886753082275 seconds
Epoch 25, Loss: 2.356829881668091, Validation Loss: 2.3727054595947266, Validation Accuracy: 0.3025210201740265, Execution Time: 17.724019765853882 seconds
Epoch 30, Loss: 2.313598394393921, Validation Loss: 2.3390276432037354, Validation Accuracy: 0.287815123796463, Execution Time: 18.265181064605713 seconds
Epoch 35, Loss: 2.2797720432281494, Validation Loss: 2.318742513656616, Valida

In [None]:
# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks. Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. 
This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants. In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Slice the text into fixed-size windows; the character right after each window is its label
max_length = 20  # Number of characters in every input window
labels = list(text[max_length:])  # label k is the character that follows window k
sequences = [text[k:k + max_length] for k in range(len(labels))]  # window k covers text[k : k + max_length]

# Character vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(text))
char_to_ix = dict(zip(chars, range(len(chars))))  # character -> integer id (its position in `chars`)

# Encode windows and labels as integer tensors
X = torch.tensor(
    [list(map(char_to_ix.__getitem__, seq)) for seq in sequences],
    dtype=torch.long,
)  # one row of character ids per window
y = torch.tensor(list(map(char_to_ix.__getitem__, labels)), dtype=torch.long)  # one id per window

# Hold out 20% of the windows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Transformer-encoder model for next-character classification
class CharTransformer(nn.Module):
    """Predict the next character id from a window of character ids.

    Pipeline: embedding -> stack of Transformer encoder layers -> linear head
    applied to the encoder output at the last sequence position.

    Args:
        input_size: number of distinct character ids (embedding rows).
        hidden_size: embedding width, also used as the encoder's d_model.
        output_size: number of classes produced by the linear head.
        num_layers: number of stacked encoder layers.
        nhead: number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)  # Embedding layer
        # batch_first=True is required: forward() feeds (batch, seq_len, hidden) tensors.
        # With the default (batch_first=False) the encoder would treat dim 0 -- the batch --
        # as the sequence axis, so attention would mix unrelated samples and the final
        # slice would only ever see the last character of each window.
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)  # Transformer encoder
        self.fc = nn.Linear(hidden_size, output_size)  # Fully connected layer for output

    def forward(self, x):
        """Map integer ids of shape (batch, seq_len) to logits of shape (batch, output_size)."""
        # NOTE(review): no positional encoding is added to the embeddings, so attention
        # has no explicit notion of character order -- consider adding one.
        embedded = self.embedding(x)  # (batch, seq_len, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq_len, hidden)
        output = self.fc(transformer_output[:, -1, :])  # classify from the last sequence position
        return output



In [None]:
# Model dimensions
hidden_size = 128
num_layers = 3
nhead = 2

# Optimisation settings
learning_rate = 0.001
epochs = 50

# Input and output sizes both equal the vocabulary size
model = CharTransformer(
    input_size=len(chars),
    hidden_size=hidden_size,
    output_size=len(chars),
    num_layers=num_layers,
    nhead=nhead,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Layer-by-layer summary, traced with the full training tensor as example input
summary = torchinfo.summary(model, input_data=X_train)
print(summary)

# Full-batch training: each epoch performs one optimizer step on all of X_train
total_start_time = time.time()  # reference point for the whole run
for epoch in range(epochs):
    start_time = time.time()  # reference point for this epoch

    # --- optimisation step ---
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # --- validation pass: evaluation mode, gradients disabled ---
    model.eval()
    with torch.no_grad():
        val_output = model(X_val)
        val_loss = criterion(val_output, y_val)
        _, predicted = val_output.max(dim=1)  # index of the largest logit per sample
        val_accuracy = predicted.eq(y_val).float().mean()  # fraction of correct predictions

    # Only report on every fifth epoch
    if (epoch + 1) % 5 != 0:
        continue
    end_time = time.time()
    execution_time = end_time - start_time  # spans this epoch's training step and validation pass
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}, Execution Time: {execution_time} seconds')

total_end_time = time.time()
total_execution_time = total_end_time - total_start_time  # wall-clock time of all epochs
print(f'Total Execution Time: {total_execution_time} seconds')




Layer (type:depth-idx)                        Output Shape              Param #
CharTransformer                               [1892, 44]                --
├─Embedding: 1-1                              [1892, 20, 128]           5,632
├─TransformerEncoder: 1-2                     [1892, 20, 128]           --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1892, 20, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-2      [1892, 20, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-3      [1892, 20, 128]           593,024
├─Linear: 1-3                                 [1892, 44]                5,676
Total params: 1,790,380
Trainable params: 1,790,380
Non-trainable params: 0
Total mult-adds (G): 3.01
Input size (MB): 0.30
Forward/backward pass size (MB): 2248.06
Params size (MB): 6.37
Estimated Total Size (MB): 2254.73
Epoch 5, Loss: 2.849522113800049, Validation Loss: 2.7331020832061768, Valid

In [None]:
import torchinfo
# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks. Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. 
This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants. In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Slice the text into fixed-size windows; the character right after each window is its label
max_length = 30  # Number of characters in every input window
labels = list(text[max_length:])  # label k is the character that follows window k
sequences = [text[k:k + max_length] for k in range(len(labels))]  # window k covers text[k : k + max_length]

# Character vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(text))
char_to_ix = dict(zip(chars, range(len(chars))))  # character -> integer id (its position in `chars`)

# Encode windows and labels as integer tensors
X = torch.tensor(
    [list(map(char_to_ix.__getitem__, seq)) for seq in sequences],
    dtype=torch.long,
)  # one row of character ids per window
y = torch.tensor(list(map(char_to_ix.__getitem__, labels)), dtype=torch.long)  # one id per window

# Hold out 20% of the windows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Transformer-encoder model for next-character classification
class CharTransformer(nn.Module):
    """Predict the next character id from a window of character ids.

    Pipeline: embedding -> stack of Transformer encoder layers -> linear head
    applied to the encoder output at the last sequence position.

    Args:
        input_size: number of distinct character ids (embedding rows).
        hidden_size: embedding width, also used as the encoder's d_model.
        output_size: number of classes produced by the linear head.
        num_layers: number of stacked encoder layers.
        nhead: number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)  # Embedding layer
        # batch_first=True is required: forward() feeds (batch, seq_len, hidden) tensors.
        # With the default (batch_first=False) the encoder would treat dim 0 -- the batch --
        # as the sequence axis, so attention would mix unrelated samples and the final
        # slice would only ever see the last character of each window.
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)  # Transformer encoder
        self.fc = nn.Linear(hidden_size, output_size)  # Fully connected layer for output

    def forward(self, x):
        """Map integer ids of shape (batch, seq_len) to logits of shape (batch, output_size)."""
        # NOTE(review): no positional encoding is added to the embeddings, so attention
        # has no explicit notion of character order -- consider adding one.
        embedded = self.embedding(x)  # (batch, seq_len, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq_len, hidden)
        output = self.fc(transformer_output[:, -1, :])  # classify from the last sequence position
        return output


# Hyperparameters
hidden_size = 128  # embedding width / encoder d_model
num_layers = 2     # stacked encoder layers
nhead = 2          # attention heads per layer
learning_rate = 0.001
epochs = 50

# Input and output sizes both equal the vocabulary size
model = CharTransformer(
    input_size=len(chars),
    hidden_size=hidden_size,
    output_size=len(chars),
    num_layers=num_layers,
    nhead=nhead,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Layer-by-layer summary, traced with the full training tensor as example input
summary = torchinfo.summary(model, input_data=X_train)
print(summary)

# Full-batch training: each epoch performs one optimizer step on all of X_train
total_start_time = time.time()  # reference point for the whole run
for epoch in range(epochs):
    start_time = time.time()  # reference point for this epoch

    # --- optimisation step ---
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # --- validation pass: evaluation mode, gradients disabled ---
    model.eval()
    with torch.no_grad():
        val_output = model(X_val)
        val_loss = criterion(val_output, y_val)
        _, predicted = val_output.max(dim=1)  # index of the largest logit per sample
        val_accuracy = predicted.eq(y_val).float().mean()  # fraction of correct predictions

    # Only report on every fifth epoch
    if (epoch + 1) % 5 != 0:
        continue
    end_time = time.time()
    execution_time = end_time - start_time  # spans this epoch's training step and validation pass
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}, Execution Time: {execution_time} seconds')

total_end_time = time.time()
total_execution_time = total_end_time - total_start_time  # wall-clock time of all epochs
print(f'Total Execution Time: {total_execution_time} seconds')





Layer (type:depth-idx)                        Output Shape              Param #
CharTransformer                               [1884, 44]                --
├─Embedding: 1-1                              [1884, 30, 128]           5,632
├─TransformerEncoder: 1-2                     [1884, 30, 128]           --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1884, 30, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-2      [1884, 30, 128]           593,024
├─Linear: 1-3                                 [1884, 44]                5,676
Total params: 1,197,356
Trainable params: 1,197,356
Non-trainable params: 0
Total mult-adds (G): 2.01
Input size (MB): 0.45
Forward/backward pass size (MB): 2257.85
Params size (MB): 4.26
Estimated Total Size (MB): 2262.56
Epoch 5, Loss: 2.7680368423461914, Validation Loss: 2.763059377670288, Validation Accuracy: 0.2394067794084549, Execution Time: 30.75780725479126 seconds
Ep

In [None]:
from torch.utils.data import Dataset, DataLoader
import requests
import math

# Run on CUDA when PyTorch reports it as available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Download the tiny-Shakespeare corpus used for the larger experiment
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url, timeout=30)  # bound the wait so a stalled connection cannot hang the notebook
response.raise_for_status()  # stop on 4xx/5xx instead of silently training on an error page
text = response.text  # Entire text data

In [None]:
# Define sequence length (number of characters in every input window)
sequence_length = 20

# Truncate text to a whole multiple of sequence_length
# (drops at most sequence_length - 1 trailing characters)
text = text[:sequence_length * (len(text)//sequence_length)]

# Create a character mapping to integers (and the inverse, for decoding predictions)
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

# At least sequence_length + 1 characters are needed to form one (window, target) pair
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f"text has {len(encoded_text)} usable characters; "
        f"need more than sequence_length={sequence_length} to build any sample"
    )

# Create sequences and targets without materialising one Python list per window:
# unfold(0, size, 1) yields every length-`size` sliding window as a row. The final
# window has no following character, so it is dropped to line up with `targets`.
encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)
sequences = encoded_tensor.unfold(0, sequence_length, 1)[:-1]  # (num_samples, sequence_length), a view onto encoded_tensor
targets = encoded_tensor[sequence_length:]                     # (num_samples,), the id right after each window


In [None]:
class CharDataset(Dataset):
    """Index-aligned pairing of input sequences with their targets.

    Item ``i`` is the tuple ``(sequences[i], targets[i])``; the dataset length
    is ``len(sequences)``.
    """

    def __init__(self, sequences, targets):
        # Both containers are stored as given (no copy, no validation)
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Number of samples, taken from the sequences container."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the (sequence, target) pair stored at ``index``."""
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

# Instantiate the dataset
dataset = CharDataset(sequences, targets)

# 80/20 train/test split
batch_size = 128
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
# Seed the split so the same samples land in train/test on every run
# (same seed value the earlier experiments pass as random_state).
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Create data loaders: shuffle only the training data
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

class CharModel(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear head.

    Args:
        input_size: number of distinct character ids (embedding rows).
        hidden_size: embedding width, also used as the encoder's d_model.
        output_size: number of classes produced by the linear head.
        model_type: architecture selector; only 'Transformer' is supported.
        num_layers: number of stacked encoder layers.
        num_heads: attention heads per encoder layer.
        dim_feedforward: width of each encoder layer's feed-forward sub-network.
        dropout: dropout probability passed to each encoder layer.

    Raises:
        ValueError: if ``model_type`` is anything other than 'Transformer'.
    """

    def __init__(self, input_size, hidden_size, output_size, model_type='Transformer', num_layers=2, num_heads=2, dim_feedforward=256, dropout=0.1):
        super(CharModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        if model_type == 'Transformer':
            # batch_first=True is required: forward() feeds (batch, seq_len, hidden) tensors.
            # With the default (batch_first=False) the encoder would treat dim 0 -- the batch --
            # as the sequence axis, so attention would mix unrelated samples and the final
            # slice would only ever see the last character of each window.
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        else:
            raise ValueError("Invalid model type. Choose 'Transformer'.")
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map integer ids of shape (batch, seq_len) to logits of shape (batch, output_size)."""
        # NOTE(review): no positional encoding is added to the embeddings, so attention
        # has no explicit notion of character order -- consider adding one.
        embedded = self.embedding(x)  # (batch, seq_len, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq_len, hidden)
        output = self.fc(transformer_output[:, -1, :])  # classify from the last sequence position
        return output


In [None]:
def train_evaluate(model_type, train_loader, val_loader, device):
    """Train a freshly built CharModel and report the metrics of its last epoch.

    Besides its arguments, the function reads the module-level names ``chars``,
    ``hidden_size``, ``learning_rate`` and ``epochs``. The model is created as
    ``CharModel(len(chars), hidden_size, len(chars), model_type)`` and optimised
    with Adam against a cross-entropy loss. One progress line is printed per epoch.

    Args:
        model_type: forwarded to ``CharModel`` as its ``model_type`` argument.
        train_loader: iterable of ``(inputs, targets)`` batches used for optimisation.
        val_loader: iterable of ``(inputs, targets)`` batches used for evaluation.
        device: device the model and every batch are moved to.

    Returns:
        Tuple ``(train_loss, val_loss, val_accuracy, execution_time)`` where the
        first three come from the final epoch and ``execution_time`` is the
        wall-clock duration of the whole epoch loop in seconds.
    """
    vocab_size = len(chars)
    model = CharModel(vocab_size, hidden_size, vocab_size, model_type).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    start_time = time.time()
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            # Scale the batch loss by the batch size so the later division by the
            # dataset size yields a per-sample value (assumes the criterion averages
            # over the batch -- it is constructed with its defaults).
            train_loss += loss.item() * inputs.size(0)
        epoch_train_loss = train_loss / len(train_loader.dataset)

        # ---- validation pass: evaluation mode, gradients disabled ----
        model.eval()
        val_loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                val_output = model(inputs)
                val_loss += criterion(val_output, targets).item() * inputs.size(0)
                predicted = val_output.max(dim=1).indices  # class with the largest logit
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_accuracy = correct / total

        # Progress is reported after every epoch
        print(f'Epoch {epoch+1}, Train Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}, Validation Accuracy: {epoch_val_accuracy}')

    execution_time = time.time() - start_time

    return epoch_train_loss, epoch_val_loss, epoch_val_accuracy, execution_time


In [None]:
# Experiment configuration
hidden_size, learning_rate, epochs = 512, 0.0001, 20
num_layers, num_heads = 2, 2
dim_feedforward, dropout = 256, 0.1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): the train_evaluate call below passes only model_type, the two loaders
# and device -- num_layers, num_heads, dim_feedforward and dropout are not forwarded by it.

# Train and evaluate every listed model type for sequence length 20
print("\nTraining models for sequence length: 20")
results = {}
for model_type in ('Transformer',):
    print(f"\nTraining {model_type} model...")
    loss, val_loss, val_accuracy, execution_time = train_evaluate(
        model_type, train_loader, test_loader, device
    )
    # Keep the final metrics under the model's name
    results[model_type] = dict(
        loss=loss,
        val_loss=val_loss,
        val_accuracy=val_accuracy,
        execution_time=execution_time,
    )

# Report the collected metrics, one section per model type
print("\nResults for sequence length: 20")
for model_type, data in results.items():
    print(f"\n{model_type} Model:")
    print(f"Training Loss: {data['loss']}")
    print(f"Validation Loss: {data['val_loss']}")
    print(f"Validation Accuracy: {data['val_accuracy']}")
    print(f"Execution Time: {data['execution_time']} seconds")



Training models for sequence length: 20

Training Transformer model...




Epoch 1, Train Loss: 2.5118092319541607, Validation Loss: 2.4848576777122604, Validation Accuracy: 0.2652551642519007
Epoch 2, Train Loss: 2.4827921046124453, Validation Loss: 2.4753100329604183, Validation Accuracy: 0.26944663606369246
Epoch 3, Train Loss: 2.4774293056811447, Validation Loss: 2.4744577612019802, Validation Accuracy: 0.26935697891263805
Epoch 4, Train Loss: 2.4739716184684766, Validation Loss: 2.471600915177996, Validation Accuracy: 0.26886386458183903
Epoch 5, Train Loss: 2.4721536488260094, Validation Loss: 2.4703363802114597, Validation Accuracy: 0.2688459331516282
Epoch 6, Train Loss: 2.4707283641939406, Validation Loss: 2.466827875332141, Validation Accuracy: 0.2696662960837756
Epoch 7, Train Loss: 2.4696396465469785, Validation Loss: 2.466347221527187, Validation Accuracy: 0.26863972170420314
Epoch 8, Train Loss: 2.4688567626153084, Validation Loss: 2.467693672479781, Validation Accuracy: 0.2693345646248745
Epoch 9, Train Loss: 2.4679829821219283, Validation Loss