In [1]:
import os
# Set before torch is imported in the next cell.
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# (OMP: Error #15) seen with some Windows/conda installs -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pandas as pd
import ast
from transformers import BertModel
import torch.nn as nn
import asyncio
import time



In [3]:
import torch

def clear_cuda_cache():
    """Release cached CUDA memory and report the outcome on stdout.

    When CUDA is unavailable, only a notice is printed.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available on this system.")
        return

    # Hand cached blocks back to the driver, then wait for pending GPU work.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("CUDA cache cleared.")

In [4]:
import torch

def clear_cuda_cache(show_memory_info=False):
    """Release cached CUDA memory, optionally printing usage before and after.

    Args:
        show_memory_info: When true, print allocated and reserved ("cached")
            memory in GB both before and after the cache is emptied.

    Prints a notice and does nothing else when CUDA is unavailable.
    """
    cuda = torch.cuda
    if cuda.is_available():
        if show_memory_info:
            allocated_gb = cuda.memory_allocated() / 1e9
            print(f"Before clearing - Allocated: {allocated_gb:.2f} GB")
            reserved_gb = cuda.memory_reserved() / 1e9
            print(f"Before clearing - Cached: {reserved_gb:.2f} GB")

        # Hand cached blocks back to the driver, then wait for pending GPU work.
        cuda.empty_cache()
        cuda.synchronize()

        if show_memory_info:
            allocated_gb = cuda.memory_allocated() / 1e9
            print(f"After clearing - Allocated: {allocated_gb:.2f} GB")
            reserved_gb = cuda.memory_reserved() / 1e9
            print(f"After clearing - Cached: {reserved_gb:.2f} GB")

        print("CUDA cache cleared.")
    else:
        print("CUDA is not available on this system.")

In [5]:
class ProteinDataset(Dataset):
    """Dataset of proteins, each tokenized as three sequence+description texts.

    The CSV must provide the columns read in ``__getitem__``: ``sequence``,
    ``advanced_description``, ``simple_description``, ``function``, and the
    ``go_terms`` / ``keywords`` columns holding Python list literals.

    Args:
        data_file: Path to the comma-separated file (first row is the header).
        tokenizer: Tokenizer exposing ``encode_plus`` with the Hugging Face
            keyword arguments used below.
        max_length: Length every encoded text is padded/truncated to.
    """

    def __init__(self, data_file, tokenizer, max_length=512):
        self.data = pd.read_csv(data_file, sep=',', header=0)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (proteins) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of stacked tensors.

        Returns:
            dict with
              * ``input_ids``: tensor of shape ``(3, max_length)``, one row per
                description variant (advanced, simple, function);
              * ``attention_mask``: tensor of the same shape;
              * ``sequence``: the raw sequence value from the CSV.

        A single dict (rather than a list of per-description dicts) is returned
        so that the default DataLoader collate produces ``batch['input_ids']``
        of shape ``(batch, 3, max_length)``, which is what ``ProteinVAE`` and
        ``train_vae`` index into.
        """
        row = self.data.iloc[idx]
        sequence = row['sequence']
        # The list-valued columns are stored as Python literals in the CSV.
        go_terms_text = ' '.join(ast.literal_eval(row['go_terms']))
        keywords_text = ' '.join(ast.literal_eval(row['keywords']))

        # Combine sequence and metadata for richer context; one text per description.
        descriptions = (
            row['advanced_description'],
            row['simple_description'],
            row['function'],
        )
        input_texts = [
            f"Sequence: {sequence} Description: {description} GO Terms: {go_terms_text} Keywords: {keywords_text}"
            for description in descriptions
        ]

        encodings = [self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        ) for text in input_texts]

        # Each encoding is (1, max_length); concatenating gives (3, max_length).
        return {
            'input_ids': torch.cat([enc['input_ids'] for enc in encodings], dim=0),
            'attention_mask': torch.cat([enc['attention_mask'] for enc in encodings], dim=0),
            'sequence': sequence,
        }

In [19]:
class ProteinVAE(nn.Module):
    """VAE over token ids that uses a no-grad BERT encoder as feature extractor.

    The encoder mean-pools BERT's per-token output over all descriptions and
    positions and maps it to ``(mu, logvar)``. The decoder maps a latent vector
    to logits over the vocabulary for every one of the
    ``num_descriptions * seq_length`` token positions.

    Args:
        bert_model: Module exposing ``config.hidden_size`` and
            ``config.vocab_size``; calling it with ``(input_ids,
            attention_mask=...)`` must return a sequence whose first element
            has shape ``(N, seq_length, hidden_size)``.
        latent_dim: Size of the latent vector.
        num_descriptions: Number of texts per sample (default 3).
        seq_length: Tokens per text (default 512). The defaults reproduce the
            previously hard-coded decoder width of 3 * 512 = 1536.
    """

    def __init__(self, bert_model, latent_dim=128, num_descriptions=3, seq_length=512):
        super(ProteinVAE, self).__init__()
        self.bert = bert_model
        self.bert_output_dim = bert_model.config.hidden_size
        self.vocab_size = bert_model.config.vocab_size
        self.latent_dim = latent_dim
        self.num_descriptions = num_descriptions
        self.seq_length = seq_length
        # Total number of token positions the decoder has to reconstruct.
        self.num_positions = num_descriptions * seq_length

        self.encoder = nn.Sequential(
            nn.Linear(self.bert_output_dim, 512),
            nn.ReLU(),
            nn.Linear(512, latent_dim * 2)  # first half mu, second half logvar
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512),
            nn.ReLU(),
            nn.Linear(512, self.num_positions)
        )
        # One logit per (position, vocabulary entry) pair.
        self.output = nn.Linear(self.num_positions, self.vocab_size * self.num_positions)

    def train(self, mode=True):
        """Set training mode, but always keep the BERT sub-module in eval mode.

        BERT is only ever run under ``torch.no_grad()`` in ``encode``, so it is
        a fixed feature extractor; leaving it in train mode would inject
        dropout noise into those features.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def encode(self, x, attention_mask=None):
        """Encode token ids into the latent mean and log-variance.

        Args:
            x: Token ids of shape ``(batch, num_descriptions, seq_length)``.
            attention_mask: Optional mask of the same shape; ``None`` is
                forwarded to BERT unchanged.

        Returns:
            Tuple ``(mu, logvar)``, each of shape ``(batch, latent_dim)``.
        """
        batch_size, num_descriptions, seq_length = x.shape
        flat_ids = x.view(-1, seq_length)
        flat_mask = None if attention_mask is None else attention_mask.view(-1, seq_length)

        with torch.no_grad():
            bert_output = self.bert(flat_ids, attention_mask=flat_mask)[0]
        # NOTE(review): the mean includes padded positions; a mask-weighted mean
        # may be preferable but would change the features existing checkpoints saw.
        pooled = bert_output.view(batch_size, num_descriptions, seq_length, -1).mean(dim=[1, 2])
        h = self.encoder(pooled)
        return h[:, :self.latent_dim], h[:, self.latent_dim:]

    def decode(self, z):
        """Map latent vectors ``(batch, latent_dim)`` to flat logits.

        Returns:
            Tensor of shape ``(batch, num_positions * vocab_size)``; view it as
            ``(batch, num_positions, vocab_size)`` to get per-token logits.
        """
        h = self.decoder(z)
        return self.output(h)

    def forward(self, x, attention_mask=None):
        """Run encode -> reparameterize -> decode.

        Returns:
            Tuple ``(recon_x, mu, logvar)`` where ``recon_x`` has shape
            ``(batch, num_descriptions, seq_length, vocab_size)``.

        Raises:
            ValueError: If the input's token positions per sample do not match
                the number the decoder was built for.
        """
        batch_size, num_descriptions, seq_length = x.shape
        if num_descriptions * seq_length != self.num_positions:
            raise ValueError(
                f"Input has {num_descriptions} x {seq_length} token positions per sample, "
                f"but the decoder was built for {self.num_positions}."
            )
        mu, logvar = self.encode(x, attention_mask)
        z = self.reparameterize(mu, logvar)
        decoded = self.decode(z)
        # Reshape decoded output to match input shape
        recon_x = decoded.view(batch_size, num_descriptions, seq_length, self.vocab_size)
        return recon_x, mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

def vae_loss(recon_x, x, mu, logvar):
    """Return the summed reconstruction cross-entropy plus the KL divergence.

    Args:
        recon_x: Logits whose last dimension is the number of classes.
        x: Integer class targets with one entry per logit row.
        mu: Latent means.
        logvar: Latent log-variances.
    """
    num_classes = recon_x.size(-1)
    logits = recon_x.view(-1, num_classes)
    targets = x.view(-1)

    # Token-level cross-entropy, summed over every position.
    reconstruction = nn.functional.cross_entropy(logits, targets, reduction='sum')

    # KL divergence between N(mu, exp(logvar)) and the standard normal.
    kl_divergence = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum()

    return reconstruction + kl_divergence

# The rest of your code remains the same

In [20]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import os
import gc
import numpy as np

def load_protein_data(file_path, tokenizer, batch_size=8, val_split=0.1, seed=None):
    """Build training and validation DataLoaders from a protein CSV.

    Args:
        file_path: CSV path handed to ``ProteinDataset``.
        tokenizer: Tokenizer handed to ``ProteinDataset``.
        batch_size: Batch size for both loaders.
        val_split: Fraction of rows held out for validation, in ``[0, 1)``.
        seed: Optional integer seeding the split and the training shuffle so
            that runs are reproducible. ``None`` (default) keeps the previous
            behaviour of drawing from the global RNG.

    Returns:
        Tuple ``(train_loader, val_loader)``.

    Raises:
        ValueError: If ``val_split`` is outside ``[0, 1)``.
    """
    if not 0 <= val_split < 1:
        raise ValueError(f"val_split must be in [0, 1), got {val_split!r}")

    dataset = ProteinDataset(file_path, tokenizer)
    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size

    if seed is None:
        split_generator = None
        train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    else:
        # A dedicated generator keeps the split independent of other RNG use.
        split_generator = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, [train_size, val_size], generator=split_generator
        )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=split_generator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

def train_vae(vae_model, train_loader, val_loader, epochs=20, learning_rate=1e-4, save_dir='models'):
    """Train the VAE with Adam and keep the checkpoint with the best validation loss.

    Args:
        vae_model: Model called as ``vae_model(input_ids, attention_mask=...)``
            and returning ``(recon, mu, logvar)``.
        train_loader: Sized iterable of batches with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        val_loader: Same format, used for model selection.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Adam learning rate.
        save_dir: Directory that receives ``best_vae_model.pth``.

    Raises:
        ValueError: If either loader yields no batches (previously this only
            surfaced as a ZeroDivisionError after a full training epoch).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    # Move batches to wherever the model lives instead of relying on a
    # module-level `device` that may not be defined yet.
    first_param = next(vae_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    optimizer = torch.optim.Adam(vae_model.parameters(), lr=learning_rate)
    best_val_loss = float('inf')

    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(epochs):
        vae_model.train()
        total_train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            recon_batch, mu, logvar = vae_model(input_ids, attention_mask=attention_mask)
            loss = vae_loss(recon_batch, input_ids.view(-1, input_ids.size(-1)), mu, logvar)

            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

            print(f"Train Loss: {loss.item()}")
            torch.cuda.empty_cache()
            gc.collect()

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        vae_model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                recon_batch, mu, logvar = vae_model(input_ids, attention_mask=attention_mask)
                loss = vae_loss(recon_batch, input_ids.view(-1, input_ids.size(-1)), mu, logvar)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save the best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(vae_model.state_dict(), os.path.join(save_dir, 'best_vae_model.pth'))
            print(f"New best model saved with validation loss: {best_val_loss:.4f}")

    print(f"Training completed. Best validation loss: {best_val_loss:.4f}")

# Main execution: load ProtBERT, freeze it, wrap it in the VAE and start training.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
bert_model = BertModel.from_pretrained("Rostlab/prot_bert")

# BERT is used as a fixed feature extractor, so none of its weights need gradients.
for param in bert_model.parameters():
    param.requires_grad = False

vae_model = ProteinVAE(bert_model).to(device)

train_loader, val_loader = load_protein_data(
    r"C:\Users\wes\vectordb_data_good\molT5_custom\fully_merged_dataset.csv",
    tokenizer,
    batch_size=8,
)
train_vae(
    vae_model,
    train_loader,
    val_loader,
    epochs=20,
    save_dir='vae_models',
)



Train Loss: 41910.68359375
Train Loss: 41483.14453125
Train Loss: 41095.48046875
Train Loss: 40734.8828125
Train Loss: 40223.5390625
Train Loss: 39738.07421875
Train Loss: 39259.85546875
Train Loss: 38566.21484375
Train Loss: 37798.703125
Train Loss: 37317.73828125
Train Loss: 36139.05078125
Train Loss: 35739.359375
Train Loss: 34351.58984375
Train Loss: 33700.54296875
Train Loss: 32220.51171875


KeyboardInterrupt: 

In [None]:
def generate_protein_sequence(vae_model, tokenizer, max_length=256):
    """Sample a latent vector from the prior and decode it into a sequence string.

    Args:
        vae_model: Model exposing ``latent_dim``, ``vocab_size`` and
            ``decode(z)``; ``decode`` must return flat logits of shape
            ``(1, num_positions * vocab_size)``.
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=True)``.
        max_length: Maximum number of leading token positions to decode.

    Returns:
        The decoded text; when it contains ``"Sequence:"``, only the first
        whitespace-delimited word after that marker (or ``""`` if none).
    """
    vae_model.eval()
    # Sample on whatever device the model lives on rather than a global `device`.
    first_param = next(vae_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        z = torch.randn(1, vae_model.latent_dim).to(model_device)
        decoded = vae_model.decode(z)
        # `decode` returns one flat vector of logits; split it into per-position
        # vocabulary logits before argmax, otherwise a single meaningless index
        # into the flattened vector is produced.
        logits = decoded.view(1, -1, vae_model.vocab_size)
        tokens = torch.argmax(logits, dim=-1)
        token_ids = tokens[0, :max_length].tolist()
        sequence = tokenizer.decode(token_ids, skip_special_tokens=True)

    # Extract only the sequence part
    if "Sequence:" in sequence:
        words_after_marker = sequence.split("Sequence:")[1].split()
        sequence = words_after_marker[0] if words_after_marker else ""

    return sequence

In [None]:

# Generate and optimize sequences
novel_sequences = [generate_protein_sequence(vae_model, tokenizer) for _ in range(5)]

# Use your existing optimization pipeline here
optimized_results = await run_optimization_pipeline(novel_sequences)

for result in optimized_results:
    print(f"Generated sequence: {result['original_sequence'][:50]}...")
    print(f"Optimized sequence: {result['optimized_sequence'][:50]}...")
    print(f"Optimized score: {result['optimized_score']}")
    print(f"Properties: {result['properties']}")
    print("---"

In [None]:

def generate_protein_sequence(vae_model, tokenizer, max_length=256):
    """Sample a latent vector from the prior and decode it into a sequence string.

    Args:
        vae_model: Model exposing ``latent_dim``, ``vocab_size`` and
            ``decode(z)``; ``decode`` must return flat logits of shape
            ``(1, num_positions * vocab_size)``.
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=True)``.
        max_length: Maximum number of leading token positions to decode.

    Returns:
        The decoded text; when it contains ``"Sequence:"``, only the first
        whitespace-delimited word after that marker (or ``""`` if none).
    """
    vae_model.eval()
    # Sample on whatever device the model lives on rather than a global `device`.
    first_param = next(vae_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        z = torch.randn(1, vae_model.latent_dim).to(model_device)
        decoded = vae_model.decode(z)
        # `decode` returns one flat vector of logits; split it into per-position
        # vocabulary logits before argmax, otherwise a single meaningless index
        # into the flattened vector is produced.
        logits = decoded.view(1, -1, vae_model.vocab_size)
        tokens = torch.argmax(logits, dim=-1)
        token_ids = tokens[0, :max_length].tolist()
        sequence = tokenizer.decode(token_ids, skip_special_tokens=True)

    # Extract only the sequence part
    if "Sequence:" in sequence:
        words_after_marker = sequence.split("Sequence:")[1].split()
        sequence = words_after_marker[0] if words_after_marker else ""

    return sequence

# Main execution
async def main():
    """Train the VAE, sample five sequences and print their optimization results.

    Relies on module-level ``device``, ``ProteinVAE``, ``load_protein_data``,
    ``train_vae`` and ``generate_protein_sequence``.
    NOTE(review): ``run_optimization_pipeline`` is not defined in the visible
    part of this notebook -- it must be defined or imported before this runs.
    """
    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
    bert_model = BertModel.from_pretrained("Rostlab/prot_bert")
    vae_model = ProteinVAE(bert_model).to(device)

    # load_protein_data returns (train_loader, val_loader) and train_vae needs both.
    train_loader, val_loader = load_protein_data("path_to_your_protein_data.csv", tokenizer)
    train_vae(vae_model, train_loader, val_loader, epochs=20)

    # Generate and optimize sequences
    novel_sequences = [generate_protein_sequence(vae_model, tokenizer) for _ in range(5)]

    # Use your existing optimization pipeline here
    optimized_results = await run_optimization_pipeline(novel_sequences)

    for result in optimized_results:
        print(f"Generated sequence: {result['original_sequence'][:50]}...")
        print(f"Optimized sequence: {result['optimized_sequence'][:50]}...")
        print(f"Optimized score: {result['optimized_score']}")
        print(f"Properties: {result['properties']}")
        print("---")

if __name__ == "__main__":
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # Plain script: no event loop yet, so asyncio.run can create one.
        asyncio.run(main())
    else:
        # Inside a notebook an event loop is already running and asyncio.run()
        # would raise RuntimeError; schedule main() on that loop instead.
        # Keep a reference so the task is not garbage-collected early.
        main_task = running_loop.create_task(main())

In [22]:
def train_vae(vae_model, train_loader, val_loader, epochs=20, learning_rate=5e-5, save_dir='models', kl_weight=0.01):
    """Train the VAE with a down-weighted KL term, LR scheduling and gradient clipping.

    Args:
        vae_model: Model called as ``vae_model(input_ids, attention_mask=...)``
            and returning ``(recon, mu, logvar)``.
        train_loader: Sized iterable of batches with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        val_loader: Same format; its loss drives the scheduler and model selection.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Initial Adam learning rate.
        save_dir: Directory that receives ``best_vae_model.pth``.
        kl_weight: Multiplier on the KL term of the loss (default 0.01, the
            value that was previously hard-coded).

    Raises:
        ValueError: If either loader yields no batches (previously this only
            surfaced as a ZeroDivisionError after a full training epoch).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    # Move batches to wherever the model lives instead of relying on a
    # module-level `device` that is only defined further down the cell.
    first_param = next(vae_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    optimizer = torch.optim.Adam(vae_model.parameters(), lr=learning_rate)
    # `verbose` is deprecated on LR schedulers; LR changes are logged manually below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    best_val_loss = float('inf')

    os.makedirs(save_dir, exist_ok=True)

    def compute_loss(recon_x, x, mu, logvar):
        """Return (weighted total, reconstruction cross-entropy, KL divergence)."""
        recon_x_flat = recon_x.view(-1, recon_x.size(-1))
        x_flat = x.view(-1)
        BCE = nn.functional.cross_entropy(recon_x_flat, x_flat, reduction='sum')
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return BCE + kl_weight * KLD, BCE, KLD

    for epoch in range(epochs):
        vae_model.train()
        total_train_loss = 0
        total_train_bce = 0
        total_train_kld = 0
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            recon_batch, mu, logvar = vae_model(input_ids, attention_mask=attention_mask)
            loss, bce, kld = compute_loss(recon_batch, input_ids, mu, logvar)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(vae_model.parameters(), max_norm=1.0)
            optimizer.step()

            total_train_loss += loss.item()
            total_train_bce += bce.item()
            total_train_kld += kld.item()
            print(f"Train Loss: {loss.item()}")
            torch.cuda.empty_cache()
            gc.collect()

        avg_train_loss = total_train_loss / len(train_loader)
        avg_train_bce = total_train_bce / len(train_loader)
        avg_train_kld = total_train_kld / len(train_loader)

        # Validation
        vae_model.eval()
        total_val_loss = 0
        total_val_bce = 0
        total_val_kld = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                recon_batch, mu, logvar = vae_model(input_ids, attention_mask=attention_mask)
                loss, bce, kld = compute_loss(recon_batch, input_ids, mu, logvar)
                total_val_loss += loss.item()
                total_val_bce += bce.item()
                total_val_kld += kld.item()

        avg_val_loss = total_val_loss / len(val_loader)
        avg_val_bce = total_val_bce / len(val_loader)
        avg_val_kld = total_val_kld / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}")
        # The training averages were computed but never reported before.
        print(f"Train Loss: {avg_train_loss:.4f} (BCE: {avg_train_bce:.4f}, KLD: {avg_train_kld:.4f})")
        print(f"Val Loss: {avg_val_loss:.4f} (BCE: {avg_val_bce:.4f}, KLD: {avg_val_kld:.4f})")

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(avg_val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Learning rate reduced from {lr_before:.2e} to {lr_after:.2e}")

        # Save the best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(vae_model.state_dict(), os.path.join(save_dir, 'best_vae_model.pth'))
            print(f"New best model saved with validation loss: {best_val_loss:.4f}")

    print(f"Training completed. Best validation loss: {best_val_loss:.4f}")
    
# Main execution: load ProtBERT, freeze it, wrap it in the VAE and start training.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
bert_model = BertModel.from_pretrained("Rostlab/prot_bert")

# BERT is used as a fixed feature extractor, so none of its weights need gradients.
for param in bert_model.parameters():
    param.requires_grad = False

vae_model = ProteinVAE(bert_model).to(device)

train_loader, val_loader = load_protein_data(
    r"C:\Users\wes\vectordb_data_good\molT5_custom\fully_merged_dataset.csv",
    tokenizer,
    batch_size=8,
)
train_vae(
    vae_model,
    train_loader,
    val_loader,
    epochs=20,
    save_dir='vae_models',
)



KeyboardInterrupt: 