In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
from tqdm import tqdm

# Fix the seeds of Python's and PyTorch's random generators so runs are repeatable
random.seed(42)
torch.manual_seed(42)

# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Vocabulary Class
class Vocabulary:
    """Character-level vocabulary over ``string.printable`` plus special tokens.

    Ids 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    the printable characters follow, in the order of ``string.printable``.
    """

    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        # Reserved ids; they line up with the special-token order in _build_vocab.
        self.pad_token, self.sos_token, self.eos_token, self.unk_token = 0, 1, 2, 3
        self._build_vocab()

    def _build_vocab(self):
        """Populate the symbol->id table and its inverse."""
        symbols = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + list(string.printable)
        self.char2idx = {symbol: position for position, symbol in enumerate(symbols)}
        self.idx2char = {position: symbol for symbol, position in self.char2idx.items()}

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.char2idx)

    def encode(self, text):
        """Map each character of ``text`` to its id; unknown characters become ``unk_token``."""
        lookup = self.char2idx.get
        return [lookup(ch, self.unk_token) for ch in text]

    def decode(self, indices):
        """Join the symbols for ``indices``, dropping PAD/SOS/EOS ids.

        Ids missing from the table are rendered as the literal ``'<UNK>'``.
        """
        skipped = {self.pad_token, self.sos_token, self.eos_token}
        pieces = []
        for token_id in indices:
            if token_id in skipped:
                continue
            pieces.append(self.idx2char.get(token_id, '<UNK>'))
        return ''.join(pieces)

# Data Preparation
def load_data(file_path, max_samples=800000):
    """Read input/output text pairs from a CSV file.

    The CSV must provide ``Input`` and ``Output`` columns. Rows whose
    ``Output`` is longer than 200 characters are discarded; when more than
    ``max_samples`` rows remain, a fixed-seed random subset of that size
    is kept.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or buffer).
        max_samples: Upper bound on the number of returned pairs.

    Returns:
        A tuple ``(inputs, outputs)`` of two equally long lists.
    """
    frame = pd.read_csv(file_path)

    # Keep only rows whose target text has at most 200 characters
    short_enough = frame['Output'].str.len() <= 200
    frame = frame[short_enough]

    # Down-sample with a fixed seed when there are too many rows
    if len(frame) > max_samples:
        frame = frame.sample(n=max_samples, random_state=42)

    return frame['Input'].tolist(), frame['Output'].tolist()

# Dataset Class
class CipherDataset(data.Dataset):
    """Dataset of (input, output) text pairs encoded as fixed-length id tensors.

    Each text is wrapped in SOS/EOS, right-padded with the pad id up to
    ``max_length`` and cut off at ``max_length`` if it is longer.
    """

    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs, self.outputs = inputs, outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def _to_fixed_length(self, value):
        """Encode one text as ``[SOS] + ids + [EOS]``, padded/truncated to ``max_length``."""
        tokens = [self.vocab.sos_token, *self.vocab.encode(str(value)), self.vocab.eos_token]
        shortfall = self.max_length - len(tokens)
        if shortfall > 0:
            tokens.extend([self.vocab.pad_token] * shortfall)
        # Over-long sequences are simply cut; this can drop the trailing EOS.
        return torch.tensor(tokens[:self.max_length])

    def __getitem__(self, idx):
        """Return the ``(input_tensor, output_tensor)`` pair at position ``idx``."""
        return self._to_fixed_length(self.inputs[idx]), self._to_fixed_length(self.outputs[idx])

# Transformer Model
class CaesarTransformer(nn.Module):
    """Encoder-decoder Transformer with learned positional embeddings.

    Token and position embeddings are summed, passed through dropout, run
    through stacks of ``nn.TransformerEncoderLayer`` /
    ``nn.TransformerDecoderLayer`` (all ``batch_first=True``) and projected
    to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and
            size of the output logits.
        d_model: Embedding / model width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Feed-forward width inside each layer.
        max_seq_length: Number of learned positions; longer sequences are
            rejected by ``forward``.
        dropout: Dropout probability for the embeddings and the layers.
        pad_idx: Source token id treated as padding when building the
            key-padding mask. Defaults to 0.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff,
                 max_seq_length, dropout, pad_idx=0):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Simplified positional encoding (learned instead of fixed)
        self.encoder_pos = nn.Embedding(max_seq_length, d_model)
        self.decoder_pos = nn.Embedding(max_seq_length, d_model)

        # Encoder layers
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model, num_heads, d_ff, dropout, batch_first=True)
            for _ in range(num_layers)
        ])

        # Decoder layers
        self.decoder_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model, num_heads, d_ff, dropout, batch_first=True)
            for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.max_seq_length = max_seq_length
        self.pad_idx = pad_idx

    def forward(self, src, tgt):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token ids, indexed as ``(batch, src_len)``.
            tgt: Decoder input token ids, indexed as ``(batch, tgt_len)``.

        Returns:
            Logits whose last dimension is ``tgt_vocab_size``, one vector
            per target position.

        Raises:
            ValueError: If either sequence is longer than ``max_seq_length``
                (there is no learned position for the extra tokens).
        """
        src_len, tgt_len = src.size(1), tgt.size(1)
        if max(src_len, tgt_len) > self.max_seq_length:
            raise ValueError(
                f"Sequence length {max(src_len, tgt_len)} exceeds "
                f"max_seq_length={self.max_seq_length}"
            )

        # True where the source holds padding; used as a key-padding mask.
        src_mask = (src == self.pad_idx)
        # Causal mask so a target position cannot attend to later positions.
        # Everything is created on the inputs' own device rather than on a
        # module-level global, so the model works wherever it is placed.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_len).to(tgt.device)

        # Embed and add positional encoding
        src_pos = torch.arange(0, src_len, device=src.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt_len, device=tgt.device).unsqueeze(0)

        src_embedded = self.dropout(self.encoder_embedding(src) + self.encoder_pos(src_pos))
        tgt_embedded = self.dropout(self.decoder_embedding(tgt) + self.decoder_pos(tgt_pos))

        # Encoder
        memory = src_embedded
        for layer in self.encoder_layers:
            memory = layer(memory, src_key_padding_mask=src_mask)

        # Decoder
        output = tgt_embedded
        for layer in self.decoder_layers:
            output = layer(output, memory, tgt_mask=tgt_mask, memory_key_padding_mask=src_mask)

        return self.fc(output)

# Training and Evaluation Functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one teacher-forced training pass over ``train_loader``.

    For every batch the decoder is fed the target without its last token
    and is scored against the target without its first token. Gradients
    are clipped to a total norm of 1.0 before each optimizer step.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for src, tgt in tqdm(train_loader, desc="Training"):
        src = src.to(device)
        tgt = tgt.to(device)
        decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        # Flatten batch and time so the criterion sees one row per token
        loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader, criterion, device):
    """Compute the mean teacher-forced loss over ``val_loader`` without gradients.

    The decoder input is the target minus its last token; the loss is taken
    against the target minus its first token.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc="Evaluating"):
            src = src.to(device)
            tgt = tgt.to(device)
            logits = model(src, tgt[:, :-1])
            expected = tgt[:, 1:]
            batch_loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
            running_loss += batch_loss.item()
    return running_loss / len(val_loader)

def calculate_accuracy(model, data_loader, vocab, device):
    """Return teacher-forced token accuracy over ``data_loader``.

    Predictions are the argmax of the model output when the decoder is fed
    the target minus its last token; they are compared against the target
    minus its first token. Positions whose expected token equals
    ``vocab.pad_token`` are ignored.

    Returns:
        ``correct / total`` over non-pad positions, or 0 when there are none.
    """
    model.eval()
    hits, counted = 0, 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)
            expected = tgt[:, 1:]
            real_tokens = expected != vocab.pad_token
            guessed = model(src, tgt[:, :-1]).argmax(dim=-1)
            hits += (guessed.eq(expected) & real_tokens).sum().item()
            counted += real_tokens.sum().item()
    if counted > 0:
        return hits / counted
    return 0

def decrypt_text(model, text, vocab, max_length, device):
    """Greedily decode the model's output for a single input text.

    The text is encoded as ``[SOS] + ids + [EOS]``, padded/truncated to
    ``max_length`` and fed as the source. Starting from ``[SOS]``, the
    highest-scoring next token is appended until EOS is predicted or
    ``max_length - 1`` tokens have been generated.

    Returns:
        The generated ids decoded through ``vocab.decode``.
    """
    model.eval()
    with torch.no_grad():
        source_ids = [vocab.sos_token, *vocab.encode(str(text)), vocab.eos_token]
        source_ids += [vocab.pad_token] * (max_length - len(source_ids))
        source = torch.tensor(source_ids[:max_length]).unsqueeze(0).to(device)

        generated = torch.tensor([[vocab.sos_token]]).to(device)

        for _step in range(max_length - 1):
            logits = model(source, generated)
            # Only the scores at the last generated position matter here
            candidate = logits[:, -1].argmax(dim=-1).item()
            if candidate == vocab.eos_token:
                break
            appended = torch.tensor([[candidate]]).to(device)
            generated = torch.cat([generated, appended], dim=1)

        return vocab.decode(generated[0].cpu().numpy())

# Main Training Function
def train_with_best_config(data_path='training_newshift_1.csv',
                           checkpoint_path='/content/drive/MyDrive/best_trained_model.pth'):
    """Train the Caesar-cipher Transformer with the tuned hyperparameters.

    Loads the CSV data, trains for up to 30 epochs with early stopping on
    the validation loss (patience 5), saves the best weights to
    ``checkpoint_path``, reloads them, prints final metrics and runs a few
    sample decryptions.

    Args:
        data_path: CSV file passed to ``load_data``.
        checkpoint_path: File the best ``state_dict`` is written to and
            read back from. A single path is used for both saving and
            loading so the two can never drift apart.

    Raises:
        FileNotFoundError: If the directory of ``checkpoint_path`` does not
            exist. This is checked before training starts so a missing
            (e.g. unmounted) directory does not surface only after the
            first epoch has finished.
    """
    import os  # local import: only this function needs it

    # Fail fast on an unusable checkpoint location instead of after an epoch.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir and not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(
            f"Checkpoint directory does not exist: '{checkpoint_dir}'"
        )

    # Load and prepare data
    inputs, outputs = load_data(data_path)
    vocab = Vocabulary()
    max_length = 256

    # Split data
    train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
        inputs, outputs, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
    val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

    # Best hyperparameters from Optuna
    best_config = {
        "d_model": 256,
        "num_heads": 4,
        "num_layers": 4,
        "d_ff": 256,
        "dropout": 0.2207862787473907,
        "batch_size": 1150,
        "learning_rate": 0.0005
    }

    # Create data loaders
    train_loader = data.DataLoader(train_dataset, batch_size=best_config["batch_size"], shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=best_config["batch_size"])

    # Initialize model
    model = CaesarTransformer(
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        d_model=best_config["d_model"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        d_ff=best_config["d_ff"],
        max_seq_length=max_length,
        dropout=best_config["dropout"]
    ).to(device)

    # Training setup
    optimizer = optim.Adam(model.parameters(), lr=best_config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.5)

    # Training parameters
    epochs = 30
    patience = 5
    best_val_loss = float('inf')
    epochs_no_improve = 0
    checkpoint_written = False  # becomes True once this run has saved weights

    # Training loop
    for epoch in range(epochs):
        # Train epoch
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)

        # Evaluate
        val_loss = evaluate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        # Calculate accuracies
        train_acc = calculate_accuracy(model, train_loader, vocab, device)
        val_acc = calculate_accuracy(model, val_loader, vocab, device)

        print(f"\nEpoch {epoch+1} Results:")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
            print("Saved new best model!")
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"\nEarly stopping after {epoch+1} epochs!")
                break

    # Load the best model. Skip this when no epoch ever improved (e.g. the
    # loss was NaN throughout): there is then nothing from this run to load,
    # and a leftover file from an older run must not be picked up silently.
    if checkpoint_written:
        # map_location lets a checkpoint saved on the GPU load on a CPU host.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Final evaluation
    final_train_loss = evaluate(model, train_loader, criterion, device)
    final_val_loss = evaluate(model, val_loader, criterion, device)
    final_train_acc = calculate_accuracy(model, train_loader, vocab, device)
    final_val_acc = calculate_accuracy(model, val_loader, vocab, device)

    print("\nFinal Training Results:")
    print(f"Train Loss: {final_train_loss:.4f} | Train Acc: {final_train_acc:.4f}")
    print(f"Val Loss: {final_val_loss:.4f} | Val Acc: {final_val_acc:.4f}")

    # Test decryption
    test_cases = [
        ("Please decrypt the following using Caesar cipher: gfbs", "fear"),
        ("Please decrypt the following using Caesar cipher: dpnqvufs", "computer"),
        ("Please decrypt the following using Caesar cipher: xibu", "what"),
        ("Please decrypt the following using Caesar cipher: ifmmp", "hello"),
        ("Please decrypt the following using Caesar cipher: uijt", "this")
    ]

    print("\nTest Decryptions:")
    for encrypted, expected in test_cases:
        decrypted = decrypt_text(model, encrypted, vocab, max_length, device)
        print(f"Input: '{encrypted}' | Output: '{decrypted}' | Expected: '{expected}' | {'✓' if decrypted == expected else '✗'}")

# Start training only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    train_with_best_config()

Using device: cuda


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.35it/s]



Epoch 1 Results:
Train Loss: 1.1808 | Train Acc: 0.9732
Val Loss: 0.0934 | Val Acc: 0.9724
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 2 Results:
Train Loss: 0.1089 | Train Acc: 0.9903
Val Loss: 0.0329 | Val Acc: 0.9899
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.34it/s]



Epoch 3 Results:
Train Loss: 0.0570 | Train Acc: 0.9947
Val Loss: 0.0181 | Val Acc: 0.9944
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.35it/s]



Epoch 4 Results:
Train Loss: 0.0381 | Train Acc: 0.9964
Val Loss: 0.0120 | Val Acc: 0.9963
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.30it/s]



Epoch 5 Results:
Train Loss: 0.0277 | Train Acc: 0.9973
Val Loss: 0.0093 | Val Acc: 0.9972
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.11s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 6 Results:
Train Loss: 0.0219 | Train Acc: 0.9977
Val Loss: 0.0077 | Val Acc: 0.9976
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.34it/s]



Epoch 7 Results:
Train Loss: 0.0181 | Train Acc: 0.9979
Val Loss: 0.0069 | Val Acc: 0.9979
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.36it/s]



Epoch 8 Results:
Train Loss: 0.0159 | Train Acc: 0.9981
Val Loss: 0.0063 | Val Acc: 0.9980
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.35it/s]



Epoch 9 Results:
Train Loss: 0.0140 | Train Acc: 0.9984
Val Loss: 0.0055 | Val Acc: 0.9983
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.33it/s]



Epoch 10 Results:
Train Loss: 0.0128 | Train Acc: 0.9984
Val Loss: 0.0053 | Val Acc: 0.9983
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.36it/s]



Epoch 11 Results:
Train Loss: 0.0118 | Train Acc: 0.9985
Val Loss: 0.0049 | Val Acc: 0.9985
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.35it/s]



Epoch 12 Results:
Train Loss: 0.0113 | Train Acc: 0.9986
Val Loss: 0.0046 | Val Acc: 0.9985
Saved new best model!


Training: 100%|██████████| 557/557 [10:20<00:00,  1.11s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.34it/s]



Epoch 13 Results:
Train Loss: 0.0104 | Train Acc: 0.9987
Val Loss: 0.0044 | Val Acc: 0.9986
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.34it/s]



Epoch 14 Results:
Train Loss: 0.0102 | Train Acc: 0.9988
Val Loss: 0.0041 | Val Acc: 0.9987
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.33it/s]



Epoch 15 Results:
Train Loss: 0.0094 | Train Acc: 0.9988
Val Loss: 0.0039 | Val Acc: 0.9988
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.32it/s]



Epoch 16 Results:
Train Loss: 0.0091 | Train Acc: 0.9989
Val Loss: 0.0037 | Val Acc: 0.9988
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 17 Results:
Train Loss: 0.0089 | Train Acc: 0.9989
Val Loss: 0.0036 | Val Acc: 0.9988
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 18 Results:
Train Loss: 0.0086 | Train Acc: 0.9990
Val Loss: 0.0035 | Val Acc: 0.9989
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.31it/s]



Epoch 19 Results:
Train Loss: 0.0083 | Train Acc: 0.9990
Val Loss: 0.0033 | Val Acc: 0.9989
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.32it/s]



Epoch 20 Results:
Train Loss: 0.0080 | Train Acc: 0.9990
Val Loss: 0.0031 | Val Acc: 0.9990
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.34it/s]



Epoch 21 Results:
Train Loss: 0.0080 | Train Acc: 0.9990
Val Loss: 0.0031 | Val Acc: 0.9990


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.32it/s]



Epoch 22 Results:
Train Loss: 0.0074 | Train Acc: 0.9991
Val Loss: 0.0029 | Val Acc: 0.9991
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.32it/s]



Epoch 23 Results:
Train Loss: 0.0073 | Train Acc: 0.9991
Val Loss: 0.0029 | Val Acc: 0.9991
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 24 Results:
Train Loss: 0.0069 | Train Acc: 0.9992
Val Loss: 0.0027 | Val Acc: 0.9991
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.32it/s]



Epoch 25 Results:
Train Loss: 0.0070 | Train Acc: 0.9992
Val Loss: 0.0027 | Val Acc: 0.9991
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.36it/s]



Epoch 26 Results:
Train Loss: 0.0068 | Train Acc: 0.9992
Val Loss: 0.0026 | Val Acc: 0.9991
Saved new best model!


Training: 100%|██████████| 557/557 [10:21<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 27 Results:
Train Loss: 0.0067 | Train Acc: 0.9992
Val Loss: 0.0025 | Val Acc: 0.9992
Saved new best model!


Training: 100%|██████████| 557/557 [10:22<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.30it/s]



Epoch 28 Results:
Train Loss: 0.0064 | Train Acc: 0.9992
Val Loss: 0.0024 | Val Acc: 0.9992
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [00:59<00:00,  2.34it/s]



Epoch 29 Results:
Train Loss: 0.0064 | Train Acc: 0.9993
Val Loss: 0.0024 | Val Acc: 0.9992
Saved new best model!


Training: 100%|██████████| 557/557 [10:23<00:00,  1.12s/it]
Evaluating: 100%|██████████| 140/140 [01:00<00:00,  2.33it/s]



Epoch 30 Results:
Train Loss: 0.0063 | Train Acc: 0.9993
Val Loss: 0.0023 | Val Acc: 0.9992
Saved new best model!


FileNotFoundError: [Errno 2] No such file or directory: 'best_trained_model.pth'