# Predicting Antibody Binding from Amino Acid Sequences

## Environment Setup and Project Structure

This notebook verifies the environment setup and project structure for the antibody binding prediction project.

## 1. Verify Python Environment

In [1]:
# Check Python version
import sys
print(f"Python version: {sys.version}")

# Check if we're in the correct environment
import os
print(f"Current working directory: {os.getcwd()}")

# Check if conda environment is active (if using conda)
conda_env = os.environ.get('CONDA_DEFAULT_ENV')
if conda_env:
    print(f"Active conda environment: {conda_env}")
else:
    print("No conda environment detected. Using system Python or a non-conda virtual environment.")

Python version: 3.10.16 (main, Apr 29 2025, 11:15:06) [Clang 17.0.0 (clang-1700.0.13.3)]
Current working directory: /Users/wendwise/dev/Home/UCB/Final_Project/notebooks
No conda environment detected. Using system Python or a non-conda virtual environment.


## 2. Verify Required Libraries

In [2]:
# Function to check if a package is installed and get its version
def check_package(package_name):
    try:
        package = __import__(package_name)
        if hasattr(package, '__version__'):
            return f"{package_name}: {package.__version__}"
        else:
            return f"{package_name}: Installed (version unknown)"
    except ImportError:
        return f"{package_name}: Not installed"

# List of required packages
required_packages = [
    'numpy', 'pandas', 'scipy', 'matplotlib', 'seaborn', 'plotly',
    'scikit-learn', 'xgboost', 'lightgbm', 'tensorflow', 'keras',
    'biopython', 'biotite', 'datasets', 'huggingface_hub',
    'tqdm', 'ipywidgets', 'statsmodels', 'umap'
]

# Check each package
for package in required_packages:
    print(check_package(package))

numpy: 2.2.6
pandas: 2.2.3
scipy: 1.15.3
matplotlib: 3.10.3
seaborn: 0.13.2
plotly: Not installed
scikit-learn: Not installed
xgboost: Not installed
lightgbm: Not installed
tensorflow: Not installed
keras: Not installed
biopython: Not installed
biotite: Not installed
datasets: 4.0.0
huggingface_hub: 0.33.4
tqdm: 4.67.1
ipywidgets: 8.1.7
statsmodels: Not installed
umap: Not installed


## 3. Verify Project Structure

In [3]:
import os

# Function to display directory structure
def display_directory_structure(startpath, max_depth=3):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        if level > max_depth:
            continue
        indent = ' ' * 4 * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = ' ' * 4 * (level + 1)
        for f in files:
            print(f"{sub_indent}{f}")

# Get the project root directory (assuming this notebook is in the notebooks directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
print(f"Project root directory: {project_root}")
print("\nProject structure:")
display_directory_structure(project_root)

Project root directory: /Users/wendwise/dev/Home/UCB/Final_Project

Project structure:
Final_Project/
    .DS_Store
    requirements.txt
    environment.yml
    README.md
    .gitignore
    results/
        models/
            .gitkeep
        figures/
            .gitkeep
    .kiro/
        specs/
            supervised-learning-project/
                requirements.md
                tasks.md
                design.md
    .git/
        config
        HEAD
        description
        index
        COMMIT_EDITMSG
        objects/
            0d/
                19c50820d668a35c134f14ceb14c259bf48d6e
            d0/
                77e545885488ff113c00aaee1ab2a92d105dac
            pack/
            45/
                9dd354cdbeb1cb4d98d41d099b9720a626276c
            26/
                602d18f0201eba8658571ef1e5901d0a779519
            info/
            e6/
                9de29bb2d1d6434b8b29ae775ad8c2e48c5391
            13/
                0c5513822e2db3f42d2f4bffbaf2bdd0e79699
  

## 4. Create Helper Functions for Project

In [4]:
# Define paths
DATA_RAW_DIR = os.path.join(project_root, 'data', 'raw')
DATA_PROCESSED_DIR = os.path.join(project_root, 'data', 'processed')
RESULTS_DIR = os.path.join(project_root, 'results')
FIGURES_DIR = os.path.join(RESULTS_DIR, 'figures')
MODELS_DIR = os.path.join(RESULTS_DIR, 'models')

# Ensure all directories exist
for directory in [DATA_RAW_DIR, DATA_PROCESSED_DIR, RESULTS_DIR, FIGURES_DIR, MODELS_DIR]:
    os.makedirs(directory, exist_ok=True)
    print(f"Directory exists: {directory}")

Directory exists: /Users/wendwise/dev/Home/UCB/Final_Project/data/raw
Directory exists: /Users/wendwise/dev/Home/UCB/Final_Project/data/processed
Directory exists: /Users/wendwise/dev/Home/UCB/Final_Project/results
Directory exists: /Users/wendwise/dev/Home/UCB/Final_Project/results/figures
Directory exists: /Users/wendwise/dev/Home/UCB/Final_Project/results/models


## 5. Create Utility Functions for Data Loading

In [5]:
import pandas as pd

def load_data(split='train'):
    """Load data from CSV files.
    
    Parameters
    ----------
    split : str, optional
        Which data split to load ('train' or 'test'), by default 'train'
    
    Returns
    -------
    pandas.DataFrame
        The loaded data
    """
    file_path = os.path.join(DATA_RAW_DIR, f"{split}.csv")
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    else:
        raise FileNotFoundError(f"Data file not found: {file_path}")

def save_processed_data(df, name):
    """Save processed data to CSV.
    
    Parameters
    ----------
    df : pandas.DataFrame
        The data to save
    name : str
        Name of the file (without extension)
    """
    file_path = os.path.join(DATA_PROCESSED_DIR, f"{name}.csv")
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

# Test loading the data
try:
    train_data = load_data('train')
    test_data = load_data('test')
    print(f"Train data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")
except FileNotFoundError as e:
    print(f"Error: {e}")

Train data shape: (49685, 6)
Test data shape: (27318, 6)


## 6. Create Utility Functions for Visualization

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

def save_figure(fig, filename):
    """Save a matplotlib figure to the figures directory.
    
    Parameters
    ----------
    fig : matplotlib.figure.Figure
        The figure to save
    filename : str
        Name of the file (without extension)
    """
    file_path = os.path.join(FIGURES_DIR, f"{filename}.png")
    fig.savefig(file_path, dpi=300, bbox_inches='tight')
    print(f"Figure saved to {file_path}")

# Set default plotting style
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

## 7. Create Utility Functions for Model Saving and Loading

In [7]:
import pickle
import joblib

def save_model(model, model_name):
    """Save a trained model to the models directory.
    
    Parameters
    ----------
    model : object
        The trained model to save
    model_name : str
        Name of the model file (without extension)
    """
    file_path = os.path.join(MODELS_DIR, f"{model_name}.pkl")
    joblib.dump(model, file_path)
    print(f"Model saved to {file_path}")

def load_model(model_name):
    """Load a trained model from the models directory.
    
    Parameters
    ----------
    model_name : str
        Name of the model file (without extension)
    
    Returns
    -------
    object
        The loaded model
    """
    file_path = os.path.join(MODELS_DIR, f"{model_name}.pkl")
    if os.path.exists(file_path):
        return joblib.load(file_path)
    else:
        raise FileNotFoundError(f"Model file not found: {file_path}")

## 8. Summary

The project environment has been set up with the following components:

1. Verified Python environment and required libraries
2. Confirmed project directory structure
3. Created utility functions for:
   - Data loading and saving
   - Visualization and figure saving
   - Model saving and loading

The project is now ready for data acquisition, exploration, and model development.