In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/home/dreamtim/Coding/ITMO/itmo-cpp/output_data/all_peptides_with_smiles.csv', index_col=0)
df.columns

Index(['id', 'sequence', 'extra_name', 'cpp_category', 'is_cpp', 'cpp_type',
       'origin', 'id_uptake', 'peptide', 'uptake_type', 'raw_efficiency',
       'raw_toxicity', 'raw_concentration', 'id_experiment',
       'peptide_experiment', 'raw_time', 'method', 'cell_line', 'cargo',
       'mechanism', 'raw_temperature', 'id_article', 'doi', 'pubmed_id',
       'title', 'sequence_category', 'standard_sequence', 'nh3_tail',
       'po3_pos', 'biotinylated', 'acylated_n_terminal', 'cyclic', 'amidated',
       'stearyl_uptake', 'hexahistidine_tagged', 'modifications',
       'smiles_sequence'],
      dtype='object')

In [3]:
df.is_cpp.value_counts()

is_cpp
True     1601
False    1321
Name: count, dtype: int64

In [4]:
# Create combined DataFrame with essential columns
df_working = df[['smiles_sequence', 'is_cpp']].copy()

# Filter df: only not na and not empty smiles_sequence
df_working = df_working.dropna(subset=['smiles_sequence'])

# First split: train (80%) vs temp (20%)
train_df, temp_df = train_test_split(
    df_working,
    test_size=0.2,
    random_state=42,
    stratify=df_working['is_cpp']  # Maintain class balance
)

# Second split: validation (10%) and test (10%)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['is_cpp']
)

In [5]:
def add_condition_token(row):
    return f"[CPP]{row['smiles_sequence']}" if row['is_cpp'] else f"[NON]{row['smiles_sequence']}"

for df_split in [train_df, val_df, test_df]:
    df_split['conditional_smiles'] = df_split.apply(add_condition_token, axis=1)

In [6]:
print(f"Train CPP ratio: {train_df['is_cpp'].mean():.2f}")
print(f"Val CPP ratio: {val_df['is_cpp'].mean():.2f}")
print(f"Test CPP ratio: {test_df['is_cpp'].mean():.2f}")

Train CPP ratio: 0.55
Val CPP ratio: 0.55
Test CPP ratio: 0.55


In [7]:
from tokenizers import Tokenizer, models, trainers

# Train BPE tokenizer
tokenizer = Tokenizer(models.BPE())
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CPP]", "[NON]"],
    min_frequency=2
)

In [8]:
# Add special tokens and post-processing
tokenizer.add_special_tokens(["[CPP]", "[NON]", "[PAD]"])

# Set padding and truncation
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
    length=128  # Adjust based on your max SMILES length
)
tokenizer.enable_truncation(max_length=128)

In [9]:
import torch
from torch.nn.utils.rnn import pad_sequence

class CPPDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_sequences):
        self.sequences = tokenized_sequences
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx])

def collate_fn(batch):
    return pad_sequence(
        batch,
        batch_first=True,
        padding_value=tokenizer.token_to_id("[PAD]")
    )

# Tokenize all splits
train_enc = [tokenizer.encode(s).ids for s in train_df['conditional_smiles']]
val_enc = [tokenizer.encode(s).ids for s in val_df['conditional_smiles']]

train_dataset = CPPDataset(train_enc)
val_dataset = CPPDataset(val_enc)

In [10]:
from transformers import AutoTokenizer, GPTNeoModel


# 1. Load pre-trained chemical model
base_model = GPTNeoModel.from_pretrained("ncfrey/ChemGPT-19M")

# 2. Modify for CPP generation
class CPPGenerator(base_model.__class__):
    def __init__(self, config):
        super().__init__(config)
        # Add CPP-specific conditioning
        self.cpp_embed = torch.nn.Embedding(2, config.hidden_size)  # 0=non-CPP, 1=CPP

# 3. Initialize with pre-trained weights
model = CPPGenerator.from_pretrained("ncfrey/ChemGPT-19M", config=base_model.config)

Some weights of CPPGenerator were not initialized from the model checkpoint at ncfrey/ChemGPT-19M and are newly initialized: ['transformer.cpp_embed.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import GPTNeoPreTrainedModel, GPTNeoModel

class CPPGenerator(GPTNeoPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = GPTNeoModel(config)
        self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        
        # Add CPP-specific embedding (0=non-CPP, 1=CPP)
        self.cpp_embed = torch.nn.Embedding(2, config.hidden_size)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, cpp_labels=None):
        # Get hidden states from ChemGPT
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # If CPP labels are provided, add the CPP embedding
        if cpp_labels is not None:
            cpp_embeddings = self.cpp_embed(cpp_labels).unsqueeze(1)  # Shape: (batch, 1, hidden_size)
            hidden_states = hidden_states + cpp_embeddings

        logits = self.lm_head(hidden_states)
        return logits


# Bad model

In [19]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("ncfrey/ChemGPT-19M")
model = CPPGenerator.from_pretrained("ncfrey/ChemGPT-19M", config=config)


Some weights of CPPGenerator were not initialized from the model checkpoint at ncfrey/ChemGPT-19M and are newly initialized: ['transformer.cpp_embed.weight', 'transformer.lm_head.weight', 'transformer.model.h.0.attn.attention.k_proj.weight', 'transformer.model.h.0.attn.attention.out_proj.bias', 'transformer.model.h.0.attn.attention.out_proj.weight', 'transformer.model.h.0.attn.attention.q_proj.weight', 'transformer.model.h.0.attn.attention.v_proj.weight', 'transformer.model.h.0.ln_1.bias', 'transformer.model.h.0.ln_1.weight', 'transformer.model.h.0.ln_2.bias', 'transformer.model.h.0.ln_2.weight', 'transformer.model.h.0.mlp.c_fc.bias', 'transformer.model.h.0.mlp.c_fc.weight', 'transformer.model.h.0.mlp.c_proj.bias', 'transformer.model.h.0.mlp.c_proj.weight', 'transformer.model.h.1.attn.attention.k_proj.weight', 'transformer.model.h.1.attn.attention.out_proj.bias', 'transformer.model.h.1.attn.attention.out_proj.weight', 'transformer.model.h.1.attn.attention.q_proj.weight', 'transformer.m

In [20]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)


In [21]:
train_labels = torch.tensor(train_df['is_cpp'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['is_cpp'].values, dtype=torch.long)

In [22]:
from torch.optim import AdamW

criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        batch = batch.to(device)

        # Generate logits
        logits = model(batch)
        
        # Shift targets (next token prediction)
        targets = batch[:, 1:].contiguous().view(-1)
        logits = logits[:, :-1, :].contiguous().view(-1, logits.size(-1))

        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


Epoch 1, Loss: 1.1694699436017912
Epoch 2, Loss: 0.39172834709082566
Epoch 3, Loss: 0.1755400059157855
Epoch 4, Loss: 0.09764893138653612
Epoch 5, Loss: 0.06253999374704818
Epoch 6, Loss: 0.04374667650012121
Epoch 7, Loss: 0.03245650874833538
Epoch 8, Loss: 0.025107077692877757
Epoch 9, Loss: 0.02003458711280398
Epoch 10, Loss: 0.016374668374351444


# Better model

In [12]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, GPT2LMHeadModel, AdamW
from tokenizers import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from rdkit import Chem
import pandas as pd
import numpy as np

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
class CPPDataset(Dataset):
    def __init__(self, smiles, labels, tokenizer, max_length=128):
        self.sequences = [
            tokenizer.encode(f"[{'CPP' if label else 'NON'}]{s}").ids[:max_length]
            for s, label in zip(smiles, labels)
        ]
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long)

In [15]:
class CPPGenerator(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        self.condition_embed = nn.Embedding(2, config.n_embd)
        
    def forward(self, input_ids=None, labels=None, condition=None):
        if condition is not None:
            cond_emb = self.condition_embed(condition)
            inputs_embeds = self.transformer.wte(input_ids) + cond_emb.unsqueeze(1)
            return super().forward(inputs_embeds=inputs_embeds, labels=labels)
        return super().forward(input_ids=input_ids, labels=labels)

def initialize_generator(tokenizer):
    config = GPT2Config(
        vocab_size=tokenizer.get_vocab_size(),
        n_positions=128,
        n_embd=256,
        n_layer=6,
        n_head=8,
        pad_token_id=tokenizer.token_to_id("[PAD]"),
        bos_token_id=tokenizer.token_to_id("[CPP]"),
    )
    return CPPGenerator(config).to(device)

In [16]:
class CPPClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        return torch.sigmoid(self.classifier(h_n[-1]))

In [17]:
# --------------------------------------------------
# 4. Training Utilities
# --------------------------------------------------
def train_generator(model, train_loader, val_loader, epochs=10):
    optimizer = AdamW(model.parameters(), lr=1e-4)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            inputs = batch.to(device)
            condition = (inputs[:, 0] == tokenizer.token_to_id("[CPP]")).long()
            
            # Remove condition token for labels
            labels = torch.where(inputs == tokenizer.token_to_id("[CPP]"), -100, inputs)
            labels = torch.where(inputs == tokenizer.token_to_id("[NON]"), -100, labels)
            
            outputs = model(inputs[:, 1:], labels=labels[:, 1:].to(device), condition=condition)
            loss = outputs.loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        # Validation
        val_loss = 0
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch.to(device)
                condition = (inputs[:, 0] == tokenizer.token_to_id("[CPP]")).long()
                labels = torch.where(inputs == tokenizer.token_to_id("[CPP]"), -100, inputs)
                labels = torch.where(inputs == tokenizer.token_to_id("[NON]"), -100, labels)
                
                outputs = model(inputs[:, 1:], labels=labels[:, 1:].to(device), condition=condition)
                val_loss += outputs.loss.item()
        
        print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")

def train_classifier(model, train_loader, val_loader, epochs=10):
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            inputs = batch[0].to(device)
            labels = batch[1].float().to(device)
            
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        # Validation
        val_loss = 0
        val_acc = 0
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch[0].to(device)
                labels = batch[1].cpu().numpy()
                
                outputs = model(inputs).squeeze().cpu().numpy()
                preds = (outputs > 0.5).astype(int)
                
                val_loss += criterion(torch.tensor(outputs), torch.tensor(labels)).item()
                val_acc += accuracy_score(labels, preds)
        
        print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f} | Val Acc: {val_acc/len(val_loader):.4f}")

In [18]:
# 6. Initialize models with strict validation
def initialize_generator(tokenizer):
    # Verify special tokens first
    assert tokenizer.token_to_id("[PAD]") is not None, "Missing [PAD] token"
    assert tokenizer.token_to_id("[CPP]") is not None, "Missing [CPP] token"
    assert tokenizer.token_to_id("[NON]") is not None, "Missing [NON] token"
    
    config = GPT2Config(
        vocab_size=tokenizer.get_vocab_size(),
        n_positions=128,
        n_embd=256,
        n_layer=6,
        n_head=8,
        pad_token_id=tokenizer.token_to_id("[PAD]"),
        bos_token_id=tokenizer.token_to_id("[CPP]"),
        eos_token_id=tokenizer.token_to_id("[PAD]"),
    )
    model = CPPGenerator(config)
    
    # Initialize embeddings properly
    model.transformer.wte.weight.data.normal_(mean=0.0, std=0.02)
    model.condition_embed.weight.data.normal_(mean=0.0, std=0.01)
    
    return model.to(device)

In [19]:
# 7. Enhanced batch validation with memory safety
def validate_batch(batch):
    # Move to CPU for safe validation
    cpu_batch = batch.cpu()
    valid_ids = set(tokenizer.get_vocab().values())
    
    # Check token range
    min_id, max_id = min(valid_ids), max(valid_ids)
    invalid = (cpu_batch < min_id) | (cpu_batch > max_id)
    
    if invalid.any():
        bad_tokens = cpu_batch[invalid].unique().tolist()
        print(f"Invalid tokens found: {bad_tokens}")
        print(f"Valid token range: {min_id}-{max_id}")
        raise ValueError("Batch contains invalid token IDs")
        
    # Check special tokens
    pad_present = (cpu_batch == tokenizer.token_to_id("[PAD]")).any()
    cpp_present = (cpu_batch == tokenizer.token_to_id("[CPP]")).any()
    
    if not pad_present:
        print("Warning: No padding tokens in batch")
    if not cpp_present:
        print("Warning: No CPP tokens in batch")

In [20]:
# 8. Add CUDA memory management
def clear_cuda_cache():
    torch.cuda.empty_cache()
    allocated = torch.cuda.memory_allocated() / 1e6
    reserved = torch.cuda.memory_reserved() / 1e6
    print(f"CUDA memory: {allocated:.1f}MB allocated, {reserved:.1f}MB reserved")

In [21]:
# 9. Modified training loop
def safe_train_generator(model, train_loader, val_loader, epochs=10):
    optimizer = AdamW(model.parameters(), lr=1e-4)
    scaler = torch.cuda.amp.GradScaler()
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch_idx, batch in enumerate(train_loader):
            clear_cuda_cache()
            
            try:
                # Validate before moving to GPU
                validate_batch(batch)
                batch = batch.to(device)
                
                # Create labels
                labels = batch.clone()
                labels[labels == tokenizer.token_to_id("[CPP]")] = -100
                labels[labels == tokenizer.token_to_id("[NON]")] = -100
                
                # Forward pass with mixed precision
                with torch.amp.autocast('cuda'):
                    outputs = model(
                        input_ids=batch[:, 1:],
                        labels=labels[:, 1:],
                        condition=(batch[:, 0] == tokenizer.token_to_id("[CPP]")).long()
                    )
                    loss = outputs.loss
                
                # Backward pass with gradient clipping
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                
                total_loss += loss.item()
                
                # Every 10 batches, validate memory
                if batch_idx % 10 == 0:
                    clear_cuda_cache()
                    
            except RuntimeError as e:
                if 'CUDA out of memory' in str(e):
                    print("OOM detected, reducing batch size")
                    del batch, outputs, loss
                    clear_cuda_cache()
                    continue
                else:
                    raise
        
        print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")

In [22]:
# Train generator
train_dataset = CPPDataset(train_df['smiles_sequence'], train_df['is_cpp'], tokenizer)
val_dataset = CPPDataset(val_df['smiles_sequence'], val_df['is_cpp'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

model = initialize_generator(tokenizer)
safe_train_generator(model, train_loader, val_loader, epochs=10)

  scaler = torch.cuda.amp.GradScaler()


CUDA memory: 19.2MB allocated, 21.0MB reserved
CUDA memory: 73.9MB allocated, 165.7MB reserved
CUDA memory: 73.9MB allocated, 165.7MB reserved
CUDA memory: 112.8MB allocated, 216.0MB reserved
CUDA memory: 111.9MB allocated, 209.7MB reserved
CUDA memory: 112.7MB allocated, 201.3MB reserved
CUDA memory: 111.9MB allocated, 216.0MB reserved
CUDA memory: 112.2MB allocated, 201.3MB reserved
CUDA memory: 111.9MB allocated, 211.8MB reserved
CUDA memory: 112.7MB allocated, 222.3MB reserved
CUDA memory: 111.9MB allocated, 211.8MB reserved
CUDA memory: 112.6MB allocated, 201.3MB reserved
CUDA memory: 112.0MB allocated, 216.0MB reserved
CUDA memory: 111.9MB allocated, 216.0MB reserved
CUDA memory: 112.2MB allocated, 201.3MB reserved
CUDA memory: 111.9MB allocated, 211.8MB reserved
CUDA memory: 112.7MB allocated, 222.3MB reserved
CUDA memory: 111.9MB allocated, 211.8MB reserved
CUDA memory: 112.6MB allocated, 201.3MB reserved
CUDA memory: 111.9MB allocated, 216.0MB reserved
CUDA memory: 112.2MB all

In [42]:
def generate_cpp(
    model, 
    tokenizer,
    num_samples=10,
    max_length=100,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Generate CPP SMILES with robust sampling controls"""
    model.eval()
    valid_smiles = []
    start_token = tokenizer.token_to_id("[CPP]")
    pad_token = tokenizer.token_to_id("[PAD]")
    
    with torch.no_grad():
        for _ in range(num_samples):
            input_ids = torch.tensor([[start_token]], dtype=torch.long, device=device)
            condition = torch.tensor([1], device=device)
            
            for _ in range(max_length):
                outputs = model(input_ids, condition=condition)
                logits = outputs.logits[:, -1, :] / temperature
                
                # Filter invalid tokens first
                valid_mask = torch.ones_like(logits, dtype=torch.bool)
                valid_mask[:, [pad_token]] = False  # Always exclude pad token
                logits[~valid_mask] = -float('inf')
                
                # Dynamic top_k adjustment
                num_valid = valid_mask.sum().item()
                current_top_k = min(top_k, num_valid) if num_valid > 0 else 1
                
                if current_top_k > 0:
                    top_values = torch.topk(logits, current_top_k)[0]
                    logits[logits < top_values[:, [-1]]] = -float('inf')
                
                if top_p > 0.0:
                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    cumulative_probs = torch.cumsum(
                        torch.softmax(sorted_logits, dim=-1), 
                        dim=-1
                    )
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = False
                    indices_to_remove = sorted_indices_to_remove.scatter(
                        -1, sorted_indices, sorted_indices_to_remove
                    )
                    logits[indices_to_remove] = -float('inf')

                # Check for valid tokens
                if (logits == -float('inf')).all():
                    break
                
                probs = torch.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                
                if next_token.item() == pad_token:
                    break
                    
                input_ids = torch.cat([input_ids, next_token], dim=-1)

            # Decode and validate
            generated = tokenizer.decode(input_ids[0].cpu().tolist())
            generated = generated.replace("[CPP]", "").replace("[PAD]", "").strip()
            
            try:
                if Chem.MolFromSmiles(generated) is not None:
                    valid_smiles.append(generated)
            except:
                continue
    
    return valid_smiles

In [43]:
def save_model(model, tokenizer, save_dir="./cpp_generator"):
    """Save model and tokenizer"""
    import os
    os.makedirs(save_dir, exist_ok=True)
    
    # Save model
    torch.save({
        "model_state": model.state_dict(),
        "config": model.config.to_dict()
    }, f"{save_dir}/model.pth")
    
    # Save tokenizer
    tokenizer.save(f"{save_dir}/tokenizer.json")

In [44]:
def load_model(save_dir="./cpp_generator", device="cpu"):
    """Load model and tokenizer"""
    from tokenizers import Tokenizer
    from transformers import GPT2Config
    
    # Load tokenizer
    tokenizer = Tokenizer.from_file(f"{save_dir}/tokenizer.json")
    
    # Load model config
    checkpoint = torch.load(f"{save_dir}/model.pth", map_location=device)
    config = GPT2Config.from_dict(checkpoint["config"])
    
    # Initialize model
    model = CPPGenerator(config).to(device)
    model.load_state_dict(checkpoint["model_state"])
    
    return model, tokenizer

In [45]:
# Save the trained model
save_model(model, tokenizer)

In [46]:
# Generate with safe parameters
generated_smiles = generate_cpp(
    model=model,
    tokenizer=tokenizer,
    num_samples=10,
    temperature=0.9,
    top_k=40,  # Will auto-adjust if needed
    top_p=0.9,
    device=device
)

print("Generated CPPs:")
for i, smi in enumerate(generated_smiles, 1):
    print(f"{i}. {smi}")


Generated CPPs:
1. 
2. 
3. 
4. 
5. 
6. 
7. 
8. 
9. 
10. 


In [34]:
generated_smiles

['', '', '', '', '', '', '', '', '', '']

In [47]:
def debug_generation(model, tokenizer, device):
    model.eval()
    start_token = tokenizer.token_to_id("[CPP]")
    
    with torch.no_grad():
        input_ids = torch.tensor([[start_token]], device=device)
        print("Initial tokens:", tokenizer.decode(input_ids[0].cpu().tolist()))
        
        for step in range(5):  # First 5 generation steps
            outputs = model(input_ids)
            logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(logits, dim=-1)
            print(f"Step {step+1} token: {next_token.item()} ({tokenizer.id_to_token(next_token.item())})")
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=-1)
    
    raw_output = tokenizer.decode(input_ids[0].cpu().tolist())
    print("Raw generated:", raw_output)
    return raw_output

# Usage
debug_output = debug_generation(model, tokenizer, device)

Initial tokens: 
Step 1 token: 1 ([NON])
Step 2 token: 1 ([NON])
Step 3 token: 1 ([NON])
Step 4 token: 1 ([NON])
Step 5 token: 1 ([NON])
Raw generated: 


In [49]:
def debug_generation(model, tokenizer, device):
    model.eval()
    start_token = tokenizer.token_to_id("[CPP]")
    
    with torch.no_grad():
        input_ids = torch.tensor([[start_token]], device=device)
        condition = torch.tensor([1], device=device)  # CPP condition
        
        print("Initial tokens:", tokenizer.decode(input_ids[0].cpu().tolist()))
        
        for step in range(5):
            outputs = model(input_ids, condition=condition)  # Pass condition
            logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(logits, dim=-1)
            print(f"Step {step+1} token: {next_token.item()} ({tokenizer.id_to_token(next_token.item())})")
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=-1)
    
    raw_output = tokenizer.decode(input_ids[0].cpu().tolist())
    print("Raw generated:", raw_output)
    return raw_output

In [55]:
# Should show "[CPP]SMILES..." for positive, "[NON]SMILES..." for negative
print("Sample training sequences:")
for i in range(3):
    # Get complete sequence for sample i
    token_sequence = train_dataset[i].tolist()
    
    # Decode properly with batch dimension
    decoded_seq = tokenizer.decode(ids=token_sequence)
    
    print(f"Sample {i+1}: {decoded_seq}")
    print(f"Token IDs: {token_sequence}")
    print("-" * 50)

Sample training sequences:
Sample 1: 
Token IDs: [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
--------------------------------------------------
Sample 2: 
Token IDs: [1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
--------------------------------------------------
Sample 3: 
Token IDs: [0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2