# Deep Learning Homework 2 - Question 2.1
## RNA Binding Protein (RBP) Interaction Prediction

## 1. Setup and Imports

In [1]:
# Install required packages.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook.
%pip install -q openpyxl

In [2]:
import os
import random
import time
import itertools
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from typing import List, Tuple

# Directory that receives every checkpoint and results file written by this notebook
OUTPUT_DIR = 'Outputs_bestTuning'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU


## 2. Download Data

Download the data files from the Google Drive link provided in the homework:
- `norm_data.txt`
- `metadata.xlsx`

Upload them to Colab or mount your Google Drive.

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
# Set the path to the folder holding your data files (norm_data.txt, metadata.xlsx)
# NOTE(review): DATA_DIR is not referenced by any other cell visible in this notebook;
# confirm whether RNAConfig carries its own data path.
DATA_DIR = 'data'  # Change this to your folder path

## 3. Configuration and Utility Functions

In [5]:
from config import RNAConfig

In [6]:
from utils import configure_seed, masked_mse_loss, masked_spearman_correlation

# Fix the seed once up front for reproducibility.
# NOTE(review): presumably seeds the Python / NumPy / PyTorch RNGs; confirm in utils.configure_seed.
configure_seed(42)

## 4. Data Loader

In [7]:
from utils import RNACompeteLoader, load_rnacompete_data

## 5. Model Definitions

### 5.1 CNN Model

In [8]:
class RNABindingCNN(nn.Module):
    """
    Convolutional regressor that maps an encoded RNA sequence to one binding score.

    Pipeline:
    - three length-preserving Conv1d stages (64, 128, 256 channels; kernels 5, 7, 9),
      each followed by BatchNorm1d, ReLU and dropout
    - global max pooling and global average pooling over the sequence axis,
      concatenated into a 512-dimensional feature vector
    - a hidden linear layer (ReLU + dropout) followed by a single-output linear layer

    Args:
        input_channels: number of channels per sequence position (default 4).
        seq_length: accepted for interface compatibility; not read by the model,
            since global pooling makes it independent of the input length.
        hidden_dim: width of the hidden fully connected layer.
        dropout: dropout probability shared by every dropout application.
    """

    def __init__(self, input_channels=4, seq_length=41, hidden_dim=128, dropout=0.3):
        super().__init__()

        # With stride 1, L_out = L_in + 2P - K + 1, so P = (K - 1) / 2 keeps the
        # sequence length unchanged through every stage. Growing kernels let the
        # stages respond to motifs of different lengths.
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(64)

        self.conv2 = nn.Conv1d(64, 128, kernel_size=7, padding=3)
        self.bn2 = nn.BatchNorm1d(128)

        self.conv3 = nn.Conv1d(128, 256, kernel_size=9, padding=4)
        self.bn3 = nn.BatchNorm1d(256)

        self.dropout = nn.Dropout(dropout)

        # Max pooling keeps the strongest response of each feature along the sequence;
        # average pooling summarises how present the feature is overall.
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

        # The regression head consumes both pooled views, hence 2 * 256 inputs.
        self.fc1 = nn.Linear(256 * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        # (batch, seq_length, channels) -> (batch, channels, seq_length), as Conv1d expects
        features = x.permute(0, 2, 1)

        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in stages:
            features = self.dropout(self.relu(norm(conv(features))))

        # Collapse the sequence axis two ways and concatenate: (batch, 512)
        pooled = torch.cat(
            [
                self.global_max_pool(features).squeeze(-1),
                self.global_avg_pool(features).squeeze(-1),
            ],
            dim=1,
        )

        hidden = self.dropout(self.relu(self.fc1(pooled)))
        return self.fc2(hidden)

### 5.2 LSTM Model

In [9]:
class RNABindingLSTM(nn.Module):
    """
    Recurrent regressor that maps an encoded RNA sequence to one binding score.

    Pipeline:
    - stacked LSTM (2 layers, bidirectional by default) run over the sequence
    - final hidden state of the top layer (both directions concatenated when
      bidirectional) used as the sequence summary
    - batch normalization and dropout on that summary
    - a hidden linear layer (ReLU + dropout) followed by a single-output linear layer

    Args:
        input_dim: number of features per sequence position (default 4).
        hidden_dim: LSTM hidden size, also the width of the hidden linear layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability for the head and between LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
    """

    def __init__(self, input_dim=4, hidden_dim=128, num_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        summary_dim = hidden_dim * self.num_directions

        # nn.LSTM only applies dropout between stacked layers, so it is switched
        # off when there is a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional,
        )

        self.bn = nn.BatchNorm1d(summary_dim)
        self.dropout = nn.Dropout(dropout)

        self.fc1 = nn.Linear(summary_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        # x: (batch, seq_length, input_dim); only the final hidden states are needed
        _, (h_n, _) = self.lstm(x)

        if not self.bidirectional:
            summary = h_n[-1]
        else:
            # The last two entries of h_n are the top layer's forward and backward states
            summary = torch.cat([h_n[-2], h_n[-1]], dim=1)

        summary = self.dropout(self.bn(summary))

        hidden = self.dropout(self.relu(self.fc1(summary)))
        return self.fc2(hidden)

## 6. Training and Evaluation Functions

In [10]:
def train_epoch(model, train_loader, optimizer, device):
    """
    Run one optimization pass over `train_loader`.

    Each batch is unpacked as (x, y, mask), moved to `device`, and used for one
    optimizer step on `masked_mse_loss`, with the gradient norm clipped to 1.0.

    Returns:
        (mean_loss, spearman): the average of the per-batch losses and the
        Spearman correlation computed once over all predictions of the epoch,
        both as Python floats.
    """
    model.train()

    running_loss = 0.0
    batches_seen = 0
    pred_chunks, target_chunks, mask_chunks = [], [], []

    for x, y, mask in train_loader:
        x = x.to(device)
        y = y.to(device)
        mask = mask.to(device)

        optimizer.zero_grad()
        predictions = model(x)
        loss = masked_mse_loss(predictions, y, mask)
        loss.backward()
        # Cap the global gradient norm before stepping to keep updates bounded
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

        # Keep detached CPU copies so the epoch-level metric can be computed afterwards
        pred_chunks.append(predictions.detach().cpu())
        target_chunks.append(y.detach().cpu())
        mask_chunks.append(mask.detach().cpu())

    epoch_preds = torch.cat(pred_chunks, dim=0)
    epoch_targets = torch.cat(target_chunks, dim=0)
    epoch_masks = torch.cat(mask_chunks, dim=0)

    spearman_corr = masked_spearman_correlation(epoch_preds, epoch_targets, epoch_masks)

    return running_loss / batches_seen, spearman_corr.item()


def evaluate(model, data_loader, device):
    """
    Score `model` on `data_loader` without updating it.

    The model is switched to eval mode and every batch, unpacked as (x, y, mask),
    is run under `torch.no_grad()`.

    Returns:
        (mean_loss, spearman): the average of the per-batch `masked_mse_loss`
        values and the Spearman correlation computed once over all predictions,
        both as Python floats.
    """
    model.eval()

    running_loss = 0.0
    batches_seen = 0
    pred_chunks, target_chunks, mask_chunks = [], [], []

    with torch.no_grad():
        for x, y, mask in data_loader:
            x = x.to(device)
            y = y.to(device)
            mask = mask.to(device)

            predictions = model(x)
            running_loss += masked_mse_loss(predictions, y, mask).item()
            batches_seen += 1

            # Move results to the CPU so the full set can be concatenated afterwards
            pred_chunks.append(predictions.cpu())
            target_chunks.append(y.cpu())
            mask_chunks.append(mask.cpu())

    eval_preds = torch.cat(pred_chunks, dim=0)
    eval_targets = torch.cat(target_chunks, dim=0)
    eval_masks = torch.cat(mask_chunks, dim=0)

    spearman_corr = masked_spearman_correlation(eval_preds, eval_targets, eval_masks)

    return running_loss / batches_seen, spearman_corr.item()


def train_model(model, train_loader, val_loader, optimizer, scheduler, device,
                num_epochs, model_name, patience=15, save_every=10):
    """
    Train `model` for up to `num_epochs`, with early stopping on validation Spearman.

    Per epoch: one `train_epoch` pass, one `evaluate` pass on `val_loader`, then
    `scheduler.step(val_corr)` when a scheduler is given (so the scheduler must
    accept a metric, e.g. ReduceLROnPlateau).

    Files written under OUTPUT_DIR:
    - '<model_name>_best_tun.pth': CPU copy of the state dict, rewritten whenever
      the validation Spearman correlation improves.
    - '<model_name>_checkpoint_epoch<N>_tun.pth': model/optimizer state plus the
      metric history, every `save_every` epochs.

    Training stops early after `patience` consecutive epochs without improvement.
    On exit the best weights seen (if any) are loaded back into `model`.

    Returns:
        dict with the per-epoch lists 'train_losses', 'train_correlations',
        'val_losses', 'val_correlations', plus 'best_val_corr' and
        'training_time' (seconds).
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training {model_name}")
    print(f"{banner}")

    # Per-epoch metric curves; these same lists are written into checkpoints and returned
    history = {
        'train_losses': [],
        'train_correlations': [],
        'val_losses': [],
        'val_correlations': [],
    }

    best_val_corr = -float('inf')
    best_model_state = None
    stale_epochs = 0

    start_time = time.time()

    for epoch_idx in range(num_epochs):
        epoch_no = epoch_idx + 1  # 1-based epoch number used in logs and file names
        epoch_start = time.time()

        train_loss, train_corr = train_epoch(model, train_loader, optimizer, device)
        val_loss, val_corr = evaluate(model, val_loader, device)

        if scheduler is not None:
            scheduler.step(val_corr)

        history['train_losses'].append(train_loss)
        history['train_correlations'].append(train_corr)
        history['val_losses'].append(val_loss)
        history['val_correlations'].append(val_corr)

        if val_corr > best_val_corr:
            best_val_corr = val_corr
            # Snapshot on the CPU so later updates to the live model do not alter it
            best_model_state = {
                name: tensor.cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            stale_epochs = 0
            torch.save(best_model_state, f'{OUTPUT_DIR}/{model_name}_best_tun.pth')
            print(f"  → Saved new best model (Spearman: {val_corr:.4f})")
        else:
            stale_epochs += 1

        epoch_time = time.time() - epoch_start

        # Log the first epoch and then every fifth one
        if epoch_no == 1 or epoch_no % 5 == 0:
            print(f"Epoch {epoch_no:3d}/{num_epochs} | "
                  f"Train Loss: {train_loss:.4f} | "
                  f"Val Loss: {val_loss:.4f} | "
                  f"Train Spearman: {train_corr:.4f} | "
                  f"Val Spearman: {val_corr:.4f} | "
                  f"Time: {epoch_time:.2f}s")

        # Periodic full checkpoint
        if epoch_no % save_every == 0:
            checkpoint = {
                'epoch': epoch_no,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                **history,
                'best_val_corr': best_val_corr,
            }
            torch.save(checkpoint, f'{OUTPUT_DIR}/{model_name}_checkpoint_epoch{epoch_no}_tun.pth')
            print(f"  → Checkpoint saved at epoch {epoch_no}")

        if stale_epochs >= patience:
            print(f"\nEarly stopping at epoch {epoch_no}")
            break

    total_time = time.time() - start_time

    # Leave the caller with the best weights rather than those of the last epoch
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    print(f"\nTraining completed in {total_time:.2f}s")
    print(f"Best validation Spearman correlation: {best_val_corr:.4f}")

    return {
        **history,
        'best_val_corr': best_val_corr,
        'training_time': total_time,
    }

## 7. Load Data

In [11]:
PROTEIN_NAME = 'RBFOX1'
BATCH_SIZE = 64

# Load the three splits for the selected protein, in train / val / test order
print(f"Loading data for protein: {PROTEIN_NAME}")
config = RNAConfig()

train_dataset, val_dataset, test_dataset = (
    load_rnacompete_data(PROTEIN_NAME, split=split_name, config=config)
    for split_name in ('train', 'val', 'test')
)

print(
    "\nDataset sizes:",
    f"  Train: {len(train_dataset)}",
    f"  Val: {len(val_dataset)}",
    f"  Test: {len(test_dataset)}",
    sep="\n",
)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Loading data for protein: RBFOX1
Found cached data for RBFOX1 (train). Loading from data/RBFOX1_train_data.pt...
Found cached data for RBFOX1 (val). Loading from data/RBFOX1_val_data.pt...
Found cached data for RBFOX1 (test). Loading from data/RBFOX1_test_data.pt...

Dataset sizes:
  Train: 96261
  Val: 24065
  Test: 121031


## 8. Hyperparameter Search and Training

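Here we define a short list of hand-picked hyperparameter configurations for each model, together with a function that trains the model once per configuration and keeps the one with the best validation Spearman correlation.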

In [12]:
# Hyperparameter sets to try for the CNN: optimizer learning rate,
# hidden width of the fully connected layer, and dropout probability.
cnn_configs = [
    dict(learning_rate=0.003, hidden_dim=256, dropout=0.2),
    dict(learning_rate=0.001, hidden_dim=256, dropout=0.3),
]

# Hyperparameter sets to try for the LSTM; here hidden_dim is both the LSTM
# hidden size and the hidden width of the fully connected layer.
lstm_configs = [
    dict(learning_rate=0.0001, hidden_dim=256, dropout=0.1),
    dict(learning_rate=0.0001, hidden_dim=256, dropout=0.4),
]

def run_specific_experiments(model_class, model_name, configurations, train_loader, val_loader, device,
                             epochs=50, patience=7, save_every=100):
    """
    Train `model_class` once per hyperparameter dict and keep the best run.

    For every entry of `configurations` the seed is reset to 42, a fresh model is
    built from the entry's 'hidden_dim' and 'dropout', and it is trained with
    Adam (lr = entry's 'learning_rate', weight_decay = 1e-5) and a
    ReduceLROnPlateau scheduler that maximises the validation Spearman correlation.

    Files written under OUTPUT_DIR:
    - '<model_name>_specific_results.json': results of all runs so far, rewritten
      after every run so progress survives an interruption.
    - 'best_<model_name lowercased>_specific_model.pth': weights of the best run.

    Args:
        model_class: model constructor accepting `hidden_dim` and `dropout` keywords.
        model_name: label used in log messages and file names.
        configurations: list of dicts with keys 'learning_rate', 'hidden_dim', 'dropout'.
        train_loader, val_loader: data loaders forwarded to `train_model`.
        device: device the model is moved to.
        epochs: maximum number of epochs per run.
        patience: early-stopping patience forwarded to `train_model`.
        save_every: periodic-checkpoint interval forwarded to `train_model`.

    Returns:
        (best_config, best_history): the configuration with the highest
        'best_val_corr' and the history dict returned by `train_model` for it
        (both None when no run produced a comparable score).
    """
    best_val_corr = -float('inf')
    best_config = None
    best_history = None

    best_model_path = f'{OUTPUT_DIR}/best_{model_name.lower()}_specific_model.pth'
    # True only while best_model_path holds the weights of the current best run
    best_model_saved = False

    print(f"Starting Specific Experiments for {model_name} with {len(configurations)} configurations...")

    results = []
    # Save results to a specific JSON file to avoid overwriting grid search results
    results_path = f'{OUTPUT_DIR}/{model_name}_specific_results.json'

    for i, config in enumerate(configurations):
        print(f"\nRunning specific experiment {i+1}/{len(configurations)}: {config}")

        # Reset the seed so every configuration starts from the same random state
        configure_seed(42)

        # Both model classes take the same keyword arguments, so no per-model branching is needed
        model = model_class(
            hidden_dim=config['hidden_dim'],
            dropout=config['dropout']
        ).to(device)

        optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=1e-5)
        # mode='max' because the monitored metric (validation Spearman) should increase
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

        # Unique per-run name so the checkpoints of different runs do not overwrite each other
        temp_name = f"{model_name}_specific_exp{i}"

        history = train_model(
            model, train_loader, val_loader, optimizer, scheduler,
            device, num_epochs=epochs, model_name=temp_name,
            patience=patience, save_every=save_every
        )

        # --- SAVE PROGRESS IMMEDIATELY ---
        run_result = {
            'experiment_index': i,
            'config': config,
            'best_val_corr': history['best_val_corr'],
            'training_time': history['training_time'],
            'history': history
        }
        results.append(run_result)

        # Best effort: a failed write must not abort the remaining experiments
        try:
            with open(results_path, 'w') as f:
                json.dump(results, f, indent=4)
            print(f"  -> Progress saved to {results_path}")
        except Exception as e:
            print(f"  -> Warning: Could not save progress to JSON: {e}")

        # Check if this specific run is the new best
        if history['best_val_corr'] > best_val_corr:
            best_val_corr = history['best_val_corr']
            best_config = config
            best_history = history

            # train_model stores its best weights as '<model_name>_best_tun.pth';
            # the path here has to match that name exactly.
            run_checkpoint_name = f'{temp_name}_best_tun.pth'
            try:
                best_state = torch.load(f'{OUTPUT_DIR}/{run_checkpoint_name}')
                torch.save(best_state, best_model_path)
                best_model_saved = True
            except FileNotFoundError:
                # Any file already at best_model_path now belongs to an older, worse run
                best_model_saved = False
                print(f"  -> Warning: Could not find checkpoint {run_checkpoint_name} to save as best model.")

    print(f"\n{'='*60}")
    print(f"Best {model_name} specific configuration: {best_config}")
    print(f"Best validation Spearman: {best_val_corr:.4f}")
    if best_model_saved:
        print(f"Best model saved to {best_model_path}")
    else:
        print(f"Warning: best model weights were NOT saved to {best_model_path}")
    print(f"{'='*60}")

    return best_config, best_history



In [13]:
# 8.1 Train and Search CNN
# epochs=2 keeps this run short; run_specific_experiments defaults to 50 epochs.
print("---- Tuning CNN ----")
best_cnn_config, cnn_history = run_specific_experiments(
    model_class=RNABindingCNN,
    model_name='CNN',
    configurations=cnn_configs,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=2,
)

---- Tuning CNN ----
Starting Specific Experiments for CNN with 2 configurations...

Running specific experiment 1/2: {'learning_rate': 0.003, 'hidden_dim': 256, 'dropout': 0.2}

Training CNN_specific_exp0
  → Saved new best model (Spearman: 0.5456)
Epoch   1/2 | Train Loss: 0.6562 | Val Loss: 0.4849 | Train Spearman: 0.4529 | Val Spearman: 0.5456 | Time: 14.86s
  → Saved new best model (Spearman: 0.5778)

Training completed in 25.78s
Best validation Spearman correlation: 0.5778
  -> Progress saved to Outputs_bestTuning/CNN_specific_results.json

Running specific experiment 2/2: {'learning_rate': 0.001, 'hidden_dim': 256, 'dropout': 0.3}

Training CNN_specific_exp1
  → Saved new best model (Spearman: 0.5484)
Epoch   1/2 | Train Loss: 0.6328 | Val Loss: 0.5268 | Train Spearman: 0.4396 | Val Spearman: 0.5484 | Time: 15.02s
  → Saved new best model (Spearman: 0.5775)

Training completed in 27.46s
Best validation Spearman correlation: 0.5775
  -> Progress saved to Outputs_bestTuning/CNN_sp

In [14]:
# 8.2 Train and Search LSTM
# epochs=2 keeps this run short; run_specific_experiments defaults to 50 epochs.
print("---- Tuning LSTM ----")
best_lstm_config, lstm_history = run_specific_experiments(
    model_class=RNABindingLSTM,
    model_name='LSTM',
    configurations=lstm_configs,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=2,
)

---- Tuning LSTM ----
Starting Specific Experiments for LSTM with 2 configurations...

Running specific experiment 1/2: {'learning_rate': 0.0001, 'hidden_dim': 256, 'dropout': 0.1}

Training LSTM_specific_exp0
  → Saved new best model (Spearman: 0.3076)
Epoch   1/2 | Train Loss: 0.9641 | Val Loss: 1.3382 | Train Spearman: 0.2400 | Val Spearman: 0.3076 | Time: 41.02s
  → Saved new best model (Spearman: 0.3744)

Training completed in 69.86s
Best validation Spearman correlation: 0.3744
  -> Progress saved to Outputs_bestTuning/LSTM_specific_results.json

Running specific experiment 2/2: {'learning_rate': 0.0001, 'hidden_dim': 256, 'dropout': 0.4}

Training LSTM_specific_exp1
  → Saved new best model (Spearman: 0.2893)
Epoch   1/2 | Train Loss: 1.0019 | Val Loss: 0.9478 | Train Spearman: 0.1671 | Val Spearman: 0.2893 | Time: 22.79s
  → Saved new best model (Spearman: 0.3486)

Training completed in 48.66s
Best validation Spearman correlation: 0.3486
  -> Progress saved to Outputs_bestTuning