In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import random
import optuna
from tqdm import tqdm
import os
import json

# Set random seeds for reproducibility
torch.manual_seed(42)
random.seed(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

def load_data(file_path):
    df = pd.read_csv(file_path)

    # Convert all inputs and outputs to strings (handles None/NaN/boolean values)
    inputs = df['Input'].astype(str).tolist()
    outputs = df['Output'].astype(str).tolist()

    return inputs, outputs

# Tokenization and Vocabulary
class Vocabulary:
    def __init__(self):
        self.char2idx = {}
        self.idx2char = {}
        self.pad_token = 0
        self.sos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self._build_vocab()

    def _build_vocab(self):
        special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
        all_chars = list(string.printable)
        self.char2idx = {token: idx for idx, token in enumerate(special_tokens)}
        self.char2idx.update({char: idx + len(special_tokens) for idx, char in enumerate(all_chars)})
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}

    def __len__(self):
        return len(self.char2idx)

    def encode(self, text):
        return [self.char2idx.get(char, self.unk_token) for char in text]

    def decode(self, indices):
        return ''.join([self.idx2char.get(idx, '') for idx in indices if idx not in {self.pad_token, self.sos_token, self.eos_token}])

# Dataset Class
class CipherDataset(data.Dataset):
    def __init__(self, inputs, outputs, vocab, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = str(self.inputs[idx])
        output_text = str(self.outputs[idx])
        input_encoded = [self.vocab.sos_token] + self.vocab.encode(input_text) + [self.vocab.eos_token]
        output_encoded = [self.vocab.sos_token] + self.vocab.encode(output_text) + [self.vocab.eos_token]
        input_padded = input_encoded + [self.vocab.pad_token] * (self.max_length - len(input_encoded))
        output_padded = output_encoded + [self.vocab.pad_token] * (self.max_length - len(output_encoded))
        return torch.tensor(input_padded[:self.max_length]), torch.tensor(output_padded[:self.max_length])

# Transformer Model Components
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        B, L, D = x.size()
        return x.view(B, L, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        B, H, L, D = x.size()
        return x.transpose(1, 2).contiguous().view(B, L, H * D)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attn = self.scaled_dot_product_attention(Q, K, V, mask)
        return self.W_o(self.combine_heads(attn))

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super().__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, mask)))
        return self.norm2(x + self.dropout(self.ff(x)))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.cross_attn(x, enc_out, enc_out, src_mask)))
        return self.norm3(x + self.dropout(self.ff(x)))

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_seq_length)
        self.enc_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.dec_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_len = tgt.size(1)
        tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(1)
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_emb = self.dropout(self.pos_enc(self.encoder_embedding(src)))
        tgt_emb = self.dropout(self.pos_enc(self.decoder_embedding(tgt)))
        for layer in self.enc_layers:
            src_emb = layer(src_emb, src_mask)
        for layer in self.dec_layers:
            tgt_emb = layer(tgt_emb, src_emb, src_mask, tgt_mask)
        return self.fc(tgt_emb)

# Training/Evaluation Utilities
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for src, tgt in tqdm(loader, desc="Training"):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        out = model(src, tgt[:, :-1])
        loss = criterion(out.reshape(-1, out.size(-1)), tgt[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(loader, desc="Evaluating"):
            src, tgt = src.to(device), tgt.to(device)
            out = model(src, tgt[:, :-1])
            loss = criterion(out.reshape(-1, out.size(-1)), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

# Data Preparation for Training
inputs, outputs = load_data('train_augmented.csv')
vocab = Vocabulary()
max_length = 512
train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(inputs, outputs, test_size=0.2, random_state=42)
train_dataset = CipherDataset(train_inputs, train_outputs, vocab, max_length)
val_dataset = CipherDataset(val_inputs, val_outputs, vocab, max_length)

# Global best tracking
best_overall_model = None
best_overall_loss = float('inf')
best_config = None

def objective(trial):
    global best_overall_model, best_overall_loss, best_config
    config = {
        "d_model": trial.suggest_categorical("d_model", [64, 128, 256]),
        "num_heads": trial.suggest_categorical("num_heads", [4, 8]),
        "num_layers": trial.suggest_categorical("num_layers", [2, 4, 6]),
        "d_ff": trial.suggest_categorical("d_ff", [512, 1024]),
        "dropout": trial.suggest_float("dropout", 0.1, 0.3),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-3, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [64]),
    }
    train_loader = data.DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = data.DataLoader(val_dataset, batch_size=config["batch_size"])
    model = Transformer(len(vocab), len(vocab), config["d_model"], config["num_heads"], config["num_layers"], config["d_ff"], max_length, config["dropout"]).to(device)
    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.pad_token)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
    val_loss = float('inf')
    for _ in range(5):
        train_epoch(model, train_loader, optimizer, criterion)
        val_loss = evaluate(model, val_loader, criterion)
        scheduler.step(val_loss)
    print(f"Trial {trial.number} completed with validation loss: {val_loss}")
    if val_loss < best_overall_loss:
        best_overall_loss = val_loss
        best_overall_model = copy.deepcopy(model.state_dict())
        best_config = config
        torch.save(best_overall_model, '/content/drive/MyDrive/best_model.pth')
    return val_loss

# Run Optuna Study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, n_jobs=2)  # Use 2 workers

import json

# Define the path in Google Drive
result_path = "/content/drive/MyDrive/best_model_summary.json"

# Save the best trial results
with open(result_path, 'w') as f:
    json.dump({
        "Validation Loss": study.best_trial.value,
        "Params": study.best_trial.params
    }, f, indent=4)

print(f"\n✅ Best trial result saved to Google Drive: {result_path}")


Using device: cuda


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training:  59%|█████▊    | 925/1576 [04:30<03:11,  3.40it/s]
Training:  59%|█████▉    | 926/1576 [04:30<03:08,  3.44it/s]
Training:  59%|█████▉    | 927/1576 [04:30<03:09,  3.43it/s]
Training:  59%|█████▉    | 928/1576 [04:31<03:08,  3.43it/s]
Training:  59%|█████▉    | 929/1576 [04:31<03:07,  3.45it/s]
Training:  59%|█████▉    | 930/1576 [04:31<03:09,  3.42it/s]
Training:  59%|█████▉    | 931/1576 [04:32<03:08,  3.42it/s]
Training:  59%|█████▉    | 932/1576 [04:32<03:07,  3.43it/s]
Training:  59%|█████▉    | 933/1576 [04:32<03:06,  3.44it/s]
Training:  59%|█████▉    | 934/1576 [04:32<03:05,  3.45it/s]
Training:  59%|█████▉    | 935/1576 [04:33<03:07,  3.42it/s]
Training:  59%|█████▉    | 936/1576 [04:33<03:03,  3.48it/s]
Training:  59%|█████▉    | 937/1576 [04:33<03:06,  3.42it/s]
Training:  60%|█████▉    | 938/1576 [04:34<03:06,  3.42it/s]
Training:  60%|█████▉    | 939/1576 [04:34<03:06,  3.42it/s]
Training:  60%|█████

Trial 1 completed with validation loss: 0.8433132363757506


Evaluating:  99%|█████████▊| 389/394 [00:49<00:08,  1.61s/it][A[I 2025-05-07 10:23:32,109] Trial 1 finished with value: 0.8433132363757506 and parameters: {'d_model': 128, 'num_heads': 4, 'num_layers': 2, 'd_ff': 512, 'dropout': 0.11316843996571438, 'learning_rate': 0.0002110077607934071, 'batch_size': 64}. Best is trial 1 with value: 0.8433132363757506.

Evaluating:  99%|█████████▉| 390/394 [00:50<00:04,  1.16s/it][A
Training:   0%|          | 0/1576 [00:00<?, ?it/s]
Training:   0%|          | 1/1576 [00:00<04:00,  6.54it/s]
Evaluating: 100%|██████████| 394/394 [00:50<00:00,  7.80it/s]
[I 2025-05-07 10:23:32,647] Trial 0 finished with value: 0.9257917039588018 and parameters: {'d_model': 64, 'num_heads': 4, 'num_layers': 6, 'd_ff': 512, 'dropout': 0.28233286604022645, 'learning_rate': 0.0002484223532618596, 'batch_size': 64}. Best is trial 1 with value: 0.8433132363757506.
Training:   0%|          | 4/1576 [00:00<03:02,  8.63it/s]

Trial 0 completed with validation loss: 0.9257917039588018


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training:  22%|██▏       | 345/1576 [01:08<04:05,  5.02it/s]
Training:  22%|██▏       | 346/1576 [01:08<04:07,  4.97it/s]
Training:  22%|██▏       | 347/1576 [01:08<04:07,  4.97it/s]
Training:  22%|██▏       | 348/1576 [01:09<04:06,  4.99it/s]
Training:  22%|██▏       | 349/1576 [01:09<04:05,  5.00it/s]
Training:  22%|██▏       | 350/1576 [01:09<04:04,  5.01it/s]
Training:  22%|██▏       | 351/1576 [01:09<04:04,  5.01it/s]
Training:  22%|██▏       | 352/1576 [01:09<04:04,  5.01it/s]
Training:  22%|██▏       | 353/1576 [01:10<04:03,  5.02it/s]
Training:  22%|██▏       | 354/1576 [01:10<04:03,  5.01it/s]
Training:  23%|██▎       | 355/1576 [01:10<04:03,  5.02it/s]
Training:  23%|██▎       | 356/1576 [01:10<04:03,  5.02it/s]
Training:  23%|██▎       | 357/1576 [01:10<04:02,  5.02it/s]
Training:  23%|██▎       | 358/1576 [01:11<04:02,  5.03it/s]
Training:  23%|██▎       | 359/1576 [01:11<04:02,  5.02it/s]
Training:  23%|██▎  

Trial 2 completed with validation loss: 1.0907422975840302



Training:   0%|          | 1/1576 [00:00<04:56,  5.32it/s]
Evaluating: 100%|██████████| 394/394 [00:35<00:00, 11.09it/s]
Training:   0%|          | 2/1576 [00:00<05:08,  5.11it/s]

Trial 3 completed with validation loss: 0.732872376738466


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training:  59%|█████▉    | 928/1576 [05:07<03:36,  2.99it/s]
Training:  59%|█████▉    | 929/1576 [05:08<03:35,  3.00it/s]
Training:  59%|█████▉    | 930/1576 [05:08<03:35,  3.00it/s]
Training:  59%|█████▉    | 931/1576 [05:08<03:34,  3.01it/s]
Training:  59%|█████▉    | 932/1576 [05:09<03:34,  3.01it/s]
Training:  59%|█████▉    | 933/1576 [05:09<03:33,  3.01it/s]
Training:  59%|█████▉    | 934/1576 [05:09<03:33,  3.01it/s]
Training:  59%|█████▉    | 935/1576 [05:10<03:32,  3.01it/s]
Training:  59%|█████▉    | 936/1576 [05:10<03:32,  3.01it/s]
Training:  59%|█████▉    | 937/1576 [05:10<03:32,  3.01it/s]
Training:  60%|█████▉    | 938/1576 [05:11<03:31,  3.01it/s]
Training:  60%|█████▉    | 939/1576 [05:11<03:31,  3.01it/s]
Training:  60%|█████▉    | 940/1576 [05:11<03:31,  3.01it/s]
Training:  60%|█████▉    | 941/1576 [05:12<03:30,  3.01it/s]
Training:  60%|█████▉    | 942/1576 [05:12<03:30,  3.01it/s]
Training:  60%|█████

Trial 4 completed with validation loss: 0.69411048898237



Evaluating:  99%|█████████▊| 389/394 [00:51<00:00, 10.73it/s][A[I 2025-05-07 11:40:52,376] Trial 4 finished with value: 0.69411048898237 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 2, 'd_ff': 512, 'dropout': 0.14770705038105442, 'learning_rate': 0.0001772284150914952, 'batch_size': 64}. Best is trial 4 with value: 0.69411048898237.

Evaluating:  99%|█████████▉| 391/394 [00:51<00:00, 11.72it/s][A
Evaluating: 100%|██████████| 394/394 [00:52<00:00,  7.55it/s]
[I 2025-05-07 11:40:52,680] Trial 5 finished with value: 0.8106605905264163 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 2, 'd_ff': 1024, 'dropout': 0.23842344292315304, 'learning_rate': 0.00012814702904858177, 'batch_size': 64}. Best is trial 4 with value: 0.69411048898237.
Training:   0%|          | 1/1576 [00:00<04:20,  6.05it/s]

Trial 5 completed with validation loss: 0.8106605905264163


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training:  59%|█████▊    | 923/1576 [05:28<03:52,  2.81it/s]
Training:  59%|█████▊    | 924/1576 [05:29<03:52,  2.81it/s]
Training:  59%|█████▊    | 925/1576 [05:29<03:50,  2.82it/s]
Training:  59%|█████▉    | 926/1576 [05:29<03:51,  2.80it/s]
Training:  59%|█████▉    | 927/1576 [05:30<03:53,  2.78it/s]
Training:  59%|█████▉    | 928/1576 [05:30<03:50,  2.81it/s]
Training:  59%|█████▉    | 929/1576 [05:30<04:18,  2.50it/s]
Training:  59%|█████▉    | 930/1576 [05:31<03:44,  2.88it/s]
Training:  59%|█████▉    | 931/1576 [05:31<03:45,  2.86it/s]
Training:  59%|█████▉    | 932/1576 [05:31<03:45,  2.86it/s]
Training:  59%|█████▉    | 933/1576 [05:32<03:47,  2.83it/s]
Training:  59%|█████▉    | 934/1576 [05:32<03:47,  2.82it/s]
Training:  59%|█████▉    | 935/1576 [05:32<03:46,  2.83it/s]
Training:  59%|█████▉    | 936/1576 [05:33<03:46,  2.82it/s]
Training:  59%|█████▉    | 937/1576 [05:33<03:49,  2.78it/s]
Training:  60%|█████

Trial 6 completed with validation loss: 1.1925706687917563



Evaluating:  97%|█████████▋| 382/394 [00:51<00:01,  8.67it/s][A
Evaluating:  97%|█████████▋| 384/394 [00:51<00:01,  9.36it/s][A
Evaluating:  98%|█████████▊| 386/394 [00:52<00:00,  9.80it/s][A
Training:   0%|          | 0/1576 [00:00<?, ?it/s]
Training:   0%|          | 1/1576 [00:00<11:26,  2.29it/s]
Training:   0%|          | 2/1576 [00:00<11:30,  2.28it/s]
Training:   0%|          | 3/1576 [00:01<11:31,  2.27it/s]
Training:   0%|          | 4/1576 [00:01<11:32,  2.27it/s]
Training:   0%|          | 5/1576 [00:02<11:32,  2.27it/s]
Evaluating: 100%|██████████| 394/394 [00:54<00:00,  7.20it/s]


Trial 7 completed with validation loss: 0.54793657347333


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training:  59%|█████▉    | 930/1576 [13:08<09:13,  1.17it/s]
Training:  59%|█████▉    | 931/1576 [13:09<09:11,  1.17it/s]
Training:  59%|█████▉    | 932/1576 [13:10<09:10,  1.17it/s]
Training:  59%|█████▉    | 933/1576 [13:11<09:08,  1.17it/s]
Training:  59%|█████▉    | 934/1576 [13:12<09:08,  1.17it/s]
Training:  59%|█████▉    | 935/1576 [13:13<09:06,  1.17it/s]
Training:  59%|█████▉    | 936/1576 [13:14<09:05,  1.17it/s]
Training:  59%|█████▉    | 937/1576 [13:14<09:04,  1.17it/s]
Training:  60%|█████▉    | 938/1576 [13:15<09:03,  1.17it/s]
Training:  60%|█████▉    | 939/1576 [13:16<09:02,  1.17it/s]
Training:  60%|█████▉    | 940/1576 [13:17<09:02,  1.17it/s]
Training:  60%|█████▉    | 941/1576 [13:18<09:01,  1.17it/s]
Training:  60%|█████▉    | 942/1576 [13:19<09:00,  1.17it/s]
Training:  60%|█████▉    | 943/1576 [13:20<08:59,  1.17it/s]
Training:  60%|█████▉    | 944/1576 [13:20<08:58,  1.17it/s]
Training:  60%|█████

Trial 8 completed with validation loss: 0.7509539659858355



Evaluating:  97%|█████████▋| 382/394 [01:58<00:02,  4.23it/s][A
Evaluating:  97%|█████████▋| 383/394 [01:58<00:02,  4.58it/s][A
Evaluating:  97%|█████████▋| 384/394 [01:58<00:02,  4.87it/s][A
Evaluating:  98%|█████████▊| 385/394 [01:58<00:01,  5.09it/s][A
Evaluating:  98%|█████████▊| 386/394 [01:58<00:01,  5.25it/s][A
Evaluating:  98%|█████████▊| 387/394 [01:58<00:01,  5.37it/s][A
Evaluating:  98%|█████████▊| 388/394 [01:59<00:01,  5.46it/s][A
Evaluating:  99%|█████████▊| 389/394 [01:59<00:00,  5.51it/s][A
Evaluating:  99%|█████████▉| 390/394 [01:59<00:00,  5.55it/s][A
Evaluating:  99%|█████████▉| 391/394 [01:59<00:00,  5.47it/s][A
Evaluating:  99%|█████████▉| 392/394 [01:59<00:00,  5.54it/s][A
Evaluating: 100%|█████████▉| 393/394 [02:00<00:00,  5.59it/s][A
Evaluating: 100%|██████████| 394/394 [02:00<00:00,  3.28it/s]
[I 2025-05-07 14:34:27,371] Trial 9 finished with value: 0.18627729520277322 and parameters: {'d_model': 256, 'num_heads': 8, 'num_layers': 6, 'd_ff': 1024, 

Trial 9 completed with validation loss: 0.18627729520277322

✅ Best trial result saved to Google Drive: /content/drive/MyDrive/best_model_summary.json


In [6]:
!pip install --upgrade torch torchvision torchaudio

Collecting torch
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.5.1.17 (from torch)
  Downloading nvidia_cu

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0
