# Predicting Antibody Binding from Amino Acid Sequences

## Data Preprocessing and Cleaning

This notebook focuses on data preprocessing and cleaning steps for the antibody binding prediction project.

## 1. Import Libraries

In [3]:
# Data processing and analysis
import numpy as np
import pandas as pd
import scipy.stats as stats

# Bioinformatics
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
from tqdm.notebook import tqdm
import os
import sys
import warnings

# Set plotting style
# NOTE(review): set_theme(style="whitegrid") presumably already applies the
# "whitegrid" style, which would make the set_style call redundant — confirm
# against the seaborn docs before removing either line.
sns.set_style("whitegrid")
sns.set_theme(style="whitegrid")

# Ignore warnings
# NOTE: no category or module filter is given, so this suppresses every
# warning for the rest of the session, not only noisy ones.
warnings.filterwarnings('ignore')

# Display settings (affect how DataFrames are rendered, not the stored values):
# show all columns, cap printed rows at 100, print floats with 3 decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.3f}'.format)

# Set random seed for reproducibility
# This seeds NumPy's global random state only.
np.random.seed(42)

## 2. Define Paths

In [4]:
# Project directories (relative paths, resolved against the current working directory)
DATA_RAW_DIR, DATA_PROCESSED_DIR = '../data/raw', '../data/processed'
RESULTS_DIR = '../results'
# Figures live in a sub-folder of the results directory
FIGURES_DIR = f'{RESULTS_DIR}/figures'

## 3. Load Raw Data

In this section, we'll load the AVIDa-SARS-CoV-2 dataset. The dataset contains 77,003 full-length VHH (alpaca) antibody sequences, including 22,002 binders and 55,001 non-binders across 12 SARS-CoV-2 variants.

In [5]:
def load_raw_data(filepath):
    """
    Read the raw AVIDa-SARS-CoV-2 dataset from disk.

    NOTE: this is still a placeholder. It assumes the data is stored as CSV
    and should be revisited once the real file format is known.

    Parameters:
    -----------
    filepath : str
        Location of the raw data file

    Returns:
    --------
    pd.DataFrame
        The file contents as parsed by ``pd.read_csv``

    Raises:
    -------
    FileNotFoundError
        If nothing exists at ``filepath``
    Exception
        Any error raised while reading the file is reported and re-raised
    """
    print(f"Loading data from {filepath}...")

    # Fail early, with a clear message, when the path does not exist
    path_exists = os.path.exists(filepath)
    if not path_exists:
        raise FileNotFoundError(f"Data file not found at {filepath}")

    try:
        raw_frame = pd.read_csv(filepath)
        n_rows, n_cols = raw_frame.shape[0], raw_frame.shape[1]
        print(f"Successfully loaded data with {n_rows} rows and {n_cols} columns")
        return raw_frame
    except Exception as load_error:
        # Report the problem, then let the caller see the original exception
        print(f"Error loading data: {load_error}")
        raise

## 4. Data Cleaning

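In this section, we'll clean the raw data by dropping rows with missing sequences, removing sequences that contain characters outside the 20 standard amino acid codes, and converting the remaining sequences to uppercase. Duplicate removal and other data quality checks are not yet part of the function below.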

In [6]:
def clean_amino_acid_sequences(df, sequence_col='amino_acid_sequence'):
    """
    Clean and validate amino acid sequences.

    A row is dropped when its sequence is missing, is not a string, is empty,
    or contains any character outside the 20 standard amino acid codes
    (checked case-insensitively). The surviving sequences are converted to
    uppercase. The input DataFrame is left untouched and the surviving rows
    keep their original index labels.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the raw data
    sequence_col : str, optional
        Name of the column containing amino acid sequences

    Returns:
    --------
    pd.DataFrame
        New DataFrame holding only the rows with valid, upper-cased sequences

    Raises:
    -------
    KeyError
        If ``sequence_col`` is not a column of ``df``
    """
    print("Cleaning amino acid sequences...")

    # Report, then remove, rows with missing sequences.
    # dropna returns a new DataFrame, so the caller's object is never modified.
    missing_seq = df[sequence_col].isna().sum()
    print(f"Found {missing_seq} missing sequences")
    df_clean = df.dropna(subset=[sequence_col])

    # Valid amino acid codes are: A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y
    valid_aa = frozenset('ACDEFGHIKLMNPQRSTVWY')

    def is_valid_sequence(seq):
        # Non-string cells (e.g. numbers) and empty strings are treated as
        # invalid instead of crashing on .upper() or passing vacuously.
        return isinstance(seq, str) and bool(seq) and valid_aa.issuperset(seq.upper())

    # Evaluate validity once; astype(bool) keeps the mask boolean even when
    # no rows are left, so it is always usable for row filtering.
    valid_mask = df_clean[sequence_col].map(is_valid_sequence).astype(bool)
    n_invalid = int((~valid_mask).sum())
    print(f"Found {n_invalid} sequences with invalid amino acid codes")

    # Copy the filtered rows so the assignment below writes to an independent
    # frame rather than to a slice of the intermediate one.
    df_clean = df_clean.loc[valid_mask].copy()

    # Convert sequences to uppercase (every remaining value is a str)
    df_clean[sequence_col] = df_clean[sequence_col].map(str.upper)

    print(f"Cleaning complete. Final dataset has {len(df_clean)} rows")
    return df_clean

## 5. Handle Class Imbalance

The dataset has a class imbalance with 22,002 binders and 55,001 non-binders. We'll address this imbalance to ensure our models are not biased.

In [7]:
def _resample_classes(df, target_col, class_counts, oversample):
    """Resample each class to the largest (oversample) or smallest class size."""
    if class_counts.empty:
        # No labelled rows: nothing to balance
        return df.copy()

    from sklearn.utils import resample

    n_target = int(class_counts.max() if oversample else class_counts.min())

    parts = []
    # class_counts is ordered by descending count, so the largest class comes first
    for label, count in class_counts.items():
        class_rows = df[df[target_col] == label]
        if count != n_target:
            class_rows = resample(
                class_rows,
                replace=oversample,  # sampling with replacement is only needed when growing a class
                n_samples=n_target,
                random_state=42
            )
        parts.append(class_rows)

    return pd.concat(parts)


def handle_class_imbalance(df, target_col='binding_label', method='class_weight'):
    """
    Handle class imbalance in the dataset.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the data
    target_col : str, optional
        Name of the column containing the target variable
    method : str, optional
        Method to handle class imbalance:

        - 'class_weight': leave the data unchanged and compute a weight
          ``n_samples / (n_classes * class_count)`` for every class label
        - 'undersampling': sample every larger class down (without
          replacement) to the size of the smallest class
        - 'oversampling': sample every smaller class up (with replacement)
          to the size of the largest class
        - 'smote': generate synthetic samples with imbalanced-learn's SMOTE;
          every column other than ``target_col`` is passed to SMOTE as a feature

    Returns:
    --------
    pd.DataFrame or tuple
        For 'class_weight', a tuple ``(df, class_weights)`` where
        ``class_weights`` maps each class label to its weight. For every other
        method, a DataFrame. The original DataFrame is returned unchanged when
        the method is unknown or when imbalanced-learn is not installed.
    """
    print(f"Handling class imbalance using method: {method}")

    # Check class distribution (rows with a missing label are not counted)
    class_counts = df[target_col].value_counts()
    print(f"Class distribution:\n{class_counts}")

    if method == 'class_weight':
        # Only labelled rows belong to a class, so only they count as samples
        n_samples = int(class_counts.sum())
        n_classes = len(class_counts)
        # Key the weights by the actual class label, not by its rank in the
        # count ordering, so they can be looked up by label.
        class_weights = {
            label: n_samples / (n_classes * count)
            for label, count in class_counts.items()
        }
        print(f"Class weights: {class_weights}")
        return df, class_weights

    elif method == 'undersampling':
        df_balanced = _resample_classes(df, target_col, class_counts, oversample=False)
        print(f"After undersampling, class distribution:\n{df_balanced[target_col].value_counts()}")
        return df_balanced

    elif method == 'oversampling':
        df_balanced = _resample_classes(df, target_col, class_counts, oversample=True)
        print(f"After oversampling, class distribution:\n{df_balanced[target_col].value_counts()}")
        return df_balanced

    elif method == 'smote':
        # Keep the try body to the import so that only a missing package
        # triggers the fallback below.
        try:
            from imblearn.over_sampling import SMOTE
        except ImportError:
            print("SMOTE requires the imbalanced-learn package. Please install it using 'pip install imbalanced-learn'")
            return df

        # Every column other than the target is handed to SMOTE as a feature
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Apply SMOTE
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        # Create a new DataFrame with balanced classes
        df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
        df_balanced[target_col] = y_resampled

        print(f"After SMOTE, class distribution:\n{df_balanced[target_col].value_counts()}")
        return df_balanced

    else:
        print(f"Unknown method: {method}. Returning original DataFrame.")
        return df

## 6. Split Data

We'll split the data into training, validation, and test sets while ensuring proper stratification.

In [8]:
def split_data(df, target_col='binding_label', test_size=0.2, val_size=0.2, random_state=42):
    """
    Split the data into training, validation, and test sets.

    Both splits are stratified by the target variable. ``test_size`` and
    ``val_size`` are both fractions of the FULL dataset, so the defaults give
    a 60% / 20% / 20% train / validation / test split.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the preprocessed data
    target_col : str, optional
        Name of the column containing the target variable
    test_size : float, optional
        Proportion of the full dataset to use for testing (between 0 and 1)
    val_size : float, optional
        Proportion of the full dataset to use for validation (between 0 and 1)
    random_state : int, optional
        Random seed for reproducibility

    Returns:
    --------
    tuple
        (X_train, X_val, X_test, y_train, y_val, y_test)

    Raises:
    -------
    ValueError
        If ``test_size`` or ``val_size`` is not strictly between 0 and 1, or
        if together they leave no data for training
    """
    # Validate up front: otherwise the derived validation fraction below falls
    # outside (0, 1) and the failure surfaces as a confusing downstream error.
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each be between 0 and 1 and sum to less than 1 "
            f"(got test_size={test_size}, val_size={val_size})"
        )

    from sklearn.model_selection import train_test_split

    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # First, split into training+validation and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y  # Stratify by the target variable
    )

    # Then split training+validation into training and validation.
    # val_size refers to the full dataset, so rescale it to a fraction of the
    # data that remains after the test set has been removed.
    adjusted_val_size = val_size / (1 - test_size)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=adjusted_val_size,
        random_state=random_state,
        stratify=y_train_val  # Stratify by the target variable
    )

    print(f"Data split complete:")
    print(f"  Training set: {X_train.shape[0]} samples")
    print(f"  Validation set: {X_val.shape[0]} samples")
    print(f"  Test set: {X_test.shape[0]} samples")

    return X_train, X_val, X_test, y_train, y_val, y_test

## 7. Save Processed Data

We'll save the processed data for use in subsequent notebooks.

In [9]:
def save_processed_data(X_train, X_val, X_test, y_train, y_val, y_test, output_dir=DATA_PROCESSED_DIR):
    """
    Write the six train/validation/test objects to CSV files.

    Each object is saved as ``<name>.csv`` inside ``output_dir`` (for example
    ``X_train.csv``), without its index.

    Parameters:
    -----------
    X_train, X_val, X_test : pd.DataFrame
        Feature matrices for training, validation, and test sets
    y_train, y_val, y_test : pd.Series
        Target variables for training, validation, and test sets
    output_dir : str, optional
        Directory to save the processed data; created if it does not exist
    """
    os.makedirs(output_dir, exist_ok=True)

    # File stem -> object to write, listed in the order the files are produced
    outputs = {
        'X_train': X_train,
        'X_val': X_val,
        'X_test': X_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
    }
    for stem, table in outputs.items():
        destination = os.path.join(output_dir, f'{stem}.csv')
        table.to_csv(destination, index=False)

    print(f"Processed data saved to {output_dir}")

## 8. Next Steps

The data preprocessing and cleaning steps are now defined. In the next notebook, we'll focus on feature engineering from amino acid sequences.