In [42]:
import sys
sys.path.append("../src/")
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter  # for logging, if you plan to use TensorBoard within Jupyter
from torch.optim.lr_scheduler import StepLR
from data import load_dataset  # Ensure this module is accessible from your notebook
from model import DualStreamTransformer


In [43]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")


In [44]:
config = {
    "protein_vocab_size": 23,
    "selfies_vocab_size": 112,
    "embedding_dim": 128,
    "nhead": 8,
    "nhid": 2048,
    "nlayers": 6,
    "output_dim": 1,
    "learning_rate": 1e-3,
    "epochs": 10,
    "lr_step_size": 30,
    "lr_gamma": 0.1
}


In [45]:
train_loader, val_loader = load_dataset("../data/raw/Enriched_Set_60percent_similarity.csv", test_size=0.2)


In [46]:
model = DualStreamTransformer(
    protein_vocab_size=config["protein_vocab_size"],
    selfies_vocab_size=config["selfies_vocab_size"],
    embedding_dim=config["embedding_dim"],
    nhead=config["nhead"],
    nhid=config["nhid"],
    nlayers=config["nlayers"],
    output_dim=config["output_dim"]
).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
scheduler = StepLR(optimizer, step_size=config["lr_step_size"], gamma=config["lr_gamma"])




In [47]:
# Assuming model, criterion, optimizer, scheduler are already defined as above
writer = SummaryWriter()  # TensorBoard summary writer


In [48]:
def train_one_epoch(epoch_index):
    model.train()  # Set the model to training mode
    total_loss = 0.0
    for i, batch in enumerate(train_loader):
        # Assuming batch['seq'] returns protein sequences and batch['selfies'] returns SELFIES sequences
        protein_seq = batch['seq'].to(device)
        selfies_seq = batch['selfies'].to(device)  # Ensure this is the correct key for your SELFIES data
        labels = batch['isActive'].to(device)

        optimizer.zero_grad()  # Zero the parameter gradients
        
        # Make sure to pass both protein_seq and selfies_seq to the model
        outputs = model(protein_seq, selfies_seq)  # Forward pass

        loss = criterion(outputs.squeeze(), labels.float())  # Compute the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Optimize

        total_loss += loss.item()
        if i % 100 == 99:  # Log every 100 mini-batches
            print(f'Epoch [{epoch_index + 1}/{config["epochs"]}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')


def validate(epoch_index):
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch['seq'].to(device), batch['isActive'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)
    writer.add_scalar('validation loss', avg_val_loss, epoch_index)
    return avg_val_loss


In [49]:
best_val_loss = float('inf')
early_stopping_patience = 5
patience_counter = 0

for epoch in range(config["epochs"]):
    train_one_epoch(epoch)
    current_val_loss = validate(epoch)
    print(f"Epoch {epoch}, Validation Loss: {current_val_loss}")
    
    # Checkpointing
    if current_val_loss < best_val_loss:
        best_val_loss = current_val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        patience_counter = 0  # reset counter
    else:
        patience_counter += 1
    
    # Early Stopping
    if patience_counter >= early_stopping_patience:
        print("Early stopping triggered.")
        break
    
    scheduler.step()  # Adjust learning rate

writer.close()  # Close the TensorBoard writer


RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 780 but got size 350 for tensor number 1 in the list.

In [50]:
from model import DualStreamTransformer

In [51]:
import torch
import torch.nn as nn
from data import load_dataset  # Ensure you import or define your DataLoader

# Assuming model is already defined and device is set up
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(model, train_loader, optimizer, loss_function, device):
    model.train()  # Set the model to training mode
    total_loss = 0

    for batch in train_loader:
        protein_seq = batch['seq'].to(device)
        selfies_seq = batch['selfies'].to(device)
        labels = batch['isActive'].view(-1, 1).to(device)  # Ensure labels are correctly sized and on the right device

        optimizer.zero_grad()  # Clear previous gradients
        outputs = model(protein_seq, selfies_seq)
        loss = loss_function(outputs, labels)
        loss.backward()  # Compute gradients
        optimizer.step()  # Update model parameters

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    return average_loss

def validate(model, test_loader, loss_function, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No need to track gradients during validation
        for batch in test_loader:
            protein_seq = batch['seq'].to(device)
            selfies_seq = batch['selfies'].to(device)
            labels = batch['isActive'].view(-1, 1).to(device)

            outputs = model(protein_seq, selfies_seq)
            loss = loss_function(outputs, labels)
            total_loss += loss.item()

    average_loss = total_loss / len(test_loader)
    return average_loss

# Training Loop
num_epochs = 5  # Number of epochs to train for
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, loss_function, device)
    valid_loss = validate(model, test_loader, loss_function, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}')


RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 780 but got size 350 for tensor number 1 in the list.

In [27]:
# Mock Data Creation
mock_protein_seq = torch.randint(0, 26, (4, 100))  # (batch_size, seq_len)
mock_selfies_seq = torch.randint(0, 115, (4, 100))  # (batch_size, seq_len)
mock_protein_seq, mock_selfies_seq = mock_protein_seq.to(device), mock_selfies_seq.to(device)

# Forward Pass
with torch.no_grad():
    mock_output = model(mock_protein_seq, mock_selfies_seq)
print(mock_output)  # Check output shapes and values


tensor([[0.5442],
        [0.5452],
        [0.5429],
        [0.5435],
        [0.5445],
        [0.5448],
        [0.5449],
        [0.5447],
        [0.5447],
        [0.5445],
        [0.5449],
        [0.5455],
        [0.5450],
        [0.5441],
        [0.5450],
        [0.5436],
        [0.5454],
        [0.5454],
        [0.5444],
        [0.5448],
        [0.5453],
        [0.5449],
        [0.5448],
        [0.5454],
        [0.5458],
        [0.5444],
        [0.5450],
        [0.5444],
        [0.5439],
        [0.5454],
        [0.5459],
        [0.5455],
        [0.5449],
        [0.5445],
        [0.5446],
        [0.5446],
        [0.5448],
        [0.5458],
        [0.5454],
        [0.5454],
        [0.5451],
        [0.5449],
        [0.5456],
        [0.5440],
        [0.5449],
        [0.5450],
        [0.5443],
        [0.5456],
        [0.5463],
        [0.5457],
        [0.5451],
        [0.5447],
        [0.5450],
        [0.5452],
        [0.5462],
        [0

NameError: name 'self' is not defined