# Data Preprocessing for Lifespan Prediction

This notebook demonstrates how to use the refactored `lifespan_predictor` package to:
1. Load and clean CSV data
2. Run featurization with new modules
3. Generate fingerprints
4. Save processed data

The refactored code provides a cleaner, more modular interface than the original notebooks.

## 1. Setup and Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# Import from the refactored package
from lifespan_predictor.config import Config
from lifespan_predictor.data.preprocessing import load_and_clean_csv, validate_smiles_list
from lifespan_predictor.data.featurizers import CachedGraphFeaturizer
from lifespan_predictor.data.fingerprints import FingerprintGenerator
from lifespan_predictor.utils.logging import setup_logger

# Create the notebook's logger up front; the cells below report progress through it.
logger = setup_logger(
    "preprocessing",
    level="INFO",
)
logger.info("Starting data preprocessing notebook")

## 2. Load Configuration

Load the configuration from a YAML file. You can modify `default_config.yaml` or create your own.

In [None]:
# Read the settings from the packaged default YAML file.
# The path is relative (note the leading "..") — presumably resolved against
# the kernel's working directory; adjust it if the notebook is run elsewhere.
config_path = "../lifespan_predictor/config/default_config.yaml"
config = Config.from_yaml(config_path)

# Echo the settings that determine where data is read from and written to.
logger.info(f"Configuration loaded from: {config_path}")
for _label, _field in (
    ("Train CSV", "train_csv"),
    ("Test CSV", "test_csv"),
    ("Output directory", "output_dir"),
):
    logger.info(f"{_label}: {getattr(config.data, _field)}")
logger.info(f"Using cache: {config.featurization.use_cache}")

# Ensure every directory written to later in the notebook exists.
# exist_ok=True makes re-running this cell harmless.
for _field in ("graph_features_dir", "fingerprints_dir", "output_dir"):
    os.makedirs(getattr(config.data, _field), exist_ok=True)

## 3. Load and Clean Training Data

The `load_and_clean_csv` function handles:
- Loading CSV files
- Cleaning and canonicalizing SMILES
- Removing invalid molecules
- Logging statistics

In [None]:
# Read the training CSV through the package's loading/cleaning helper.
logger.info("Loading training data...")
train_csv_options = {
    "csv_path": config.data.train_csv,
    "smiles_column": config.data.smiles_column,
    "label_column": config.data.label_column,
}
train_df = load_and_clean_csv(**train_csv_options)

# Report how many rows came back and how the label values are distributed.
logger.info(f"Training data loaded: {len(train_df)} molecules")
train_label_counts = train_df[config.data.label_column].value_counts()
logger.info(f"Label distribution:\n{train_label_counts}")

# Keep the preview as the final expression so the notebook renders it.
train_df.head()

## 4. Load and Clean Test Data

In [None]:
# Read the test CSV with the same helper used for the training set.
logger.info("Loading test data...")
test_csv_options = dict(
    csv_path=config.data.test_csv,
    smiles_column=config.data.smiles_column,
    label_column=None,  # Test data may not have labels
)
test_df = load_and_clean_csv(**test_csv_options)

n_test_molecules = len(test_df)
logger.info(f"Test data loaded: {n_test_molecules} molecules")

# Final expression: rendered by the notebook as a table preview.
test_df.head()

## 5. Generate Graph Features

The `CachedGraphFeaturizer` provides:
- Automatic caching to disk
- Parallel processing
- Progress bars
- Robust error handling

In [None]:
# Collect the featurizer settings from the loaded configuration, then build
# one featurizer instance that is reused for the training and test sets.
graph_featurizer_settings = dict(
    cache_dir=config.data.graph_features_dir,
    max_atoms=config.featurization.max_atoms,
    atom_feature_dim=config.featurization.atom_feature_dim,
    n_jobs=config.featurization.n_jobs,
)
graph_featurizer = CachedGraphFeaturizer(**graph_featurizer_settings)

logger.info("Generating graph features for training data...")

In [None]:
# Pull the SMILES strings and label values out of the cleaned training frame.
train_smiles = train_df[config.data.smiles_column].tolist()
train_labels = train_df[config.data.label_column].values

# force_recompute=False lets the featurizer use its cache if available.
train_graph_outputs = graph_featurizer.featurize(
    smiles_list=train_smiles,
    labels=train_labels,
    force_recompute=False,
)
# The featurizer hands back its own labels array alongside the graph arrays.
# NOTE(review): presumably it can differ from train_labels if molecules are
# skipped during featurization — confirm against the featurizer.
train_adj, train_features, train_labels_out = train_graph_outputs

train_adj_shape, train_features_shape = train_adj.shape, train_features.shape
logger.info(f"Training graph features shape: adj={train_adj_shape}, features={train_features_shape}")
logger.info(f"Training labels shape: {train_labels_out.shape}")

In [None]:
# Run the same featurizer over the test SMILES; no labels are supplied.
logger.info("Generating graph features for test data...")
test_smiles = test_df[config.data.smiles_column].tolist()

test_graph_outputs = graph_featurizer.featurize(
    smiles_list=test_smiles,
    labels=None,
    force_recompute=False,
)
# The third element (the labels slot) is not needed for the test set.
test_adj, test_features, _ = test_graph_outputs

test_adj_shape, test_features_shape = test_adj.shape, test_features.shape
logger.info(f"Test graph features shape: adj={test_adj_shape}, features={test_features_shape}")

## 6. Generate Molecular Fingerprints

The `FingerprintGenerator` generates:
- Morgan fingerprints (hashed)
- RDKit topological fingerprints (hashed)
- MACCS keys (non-hashed)

All of these are generated with automatic caching and parallel processing.

In [None]:
# Build the fingerprint generator from the featurization section of the config.
# The same instance is used for both the training and the test SMILES.
fingerprint_settings = {
    "morgan_radius": config.featurization.morgan_radius,
    "morgan_nbits": config.featurization.morgan_nbits,
    "rdkit_fp_nbits": config.featurization.rdkit_fp_nbits,
    "n_jobs": config.featurization.n_jobs,
}
fp_generator = FingerprintGenerator(**fingerprint_settings)

logger.info("Generating fingerprints for training data...")

In [None]:
# Compute fingerprints for the training SMILES, passing the configured
# fingerprints directory as the generator's cache location.
train_fingerprints = fp_generator.generate_fingerprints(
    smiles_list=train_smiles,
    cache_dir=config.data.fingerprints_dir,
)
# The generator returns a pair: (hashed, non-hashed) fingerprint arrays.
train_fp_hashed, train_fp_nonhashed = train_fingerprints

train_hashed_shape = train_fp_hashed.shape
train_nonhashed_shape = train_fp_nonhashed.shape
logger.info(f"Training fingerprints shape: hashed={train_hashed_shape}, non-hashed={train_nonhashed_shape}")

In [None]:
# Compute fingerprints for the test SMILES with the same generator and the
# same cache directory used for the training set.
logger.info("Generating fingerprints for test data...")
test_fingerprints = fp_generator.generate_fingerprints(
    smiles_list=test_smiles,
    cache_dir=config.data.fingerprints_dir,
)
test_fp_hashed, test_fp_nonhashed = test_fingerprints

test_hashed_shape = test_fp_hashed.shape
test_nonhashed_shape = test_fp_nonhashed.shape
logger.info(f"Test fingerprints shape: hashed={test_hashed_shape}, non-hashed={test_nonhashed_shape}")

## 7. Save Processed Data

Save all processed features for later use in training.

In [None]:
# Save training data: one .npy file per array plus the cleaned DataFrame,
# all under <output_dir>/train.
train_output_dir = os.path.join(config.data.output_dir, "train")
os.makedirs(train_output_dir, exist_ok=True)

# File stem -> array. Insertion order is the order the files are written in.
train_arrays = {
    "adj": train_adj,
    "features": train_features,
    "labels": train_labels_out,
    "fp_hashed": train_fp_hashed,
    "fp_nonhashed": train_fp_nonhashed,
}

# The graph arrays and the fingerprints come from two independent steps, so
# check that everything about to be written has the same number of rows.
# This assumes the first axis of each array indexes molecules — TODO confirm.
# A mismatch is only reported (not raised) so existing runs keep working.
train_row_counts = {array_name: len(array) for array_name, array in train_arrays.items()}
train_row_counts["processed_data.csv"] = len(train_df)
if len(set(train_row_counts.values())) > 1:
    logger.warning(f"Training outputs have inconsistent lengths: {train_row_counts}")

for array_name, array in train_arrays.items():
    np.save(os.path.join(train_output_dir, f"{array_name}.npy"), array)

# Save the cleaned DataFrame (all of its columns, not only SMILES) for reference
train_df.to_csv(os.path.join(train_output_dir, "processed_data.csv"), index=False)

logger.info(f"Training data saved to: {train_output_dir}")

In [None]:
# Save test data: one .npy file per array plus the cleaned DataFrame,
# all under <output_dir>/test. No labels file is written for the test set.
test_output_dir = os.path.join(config.data.output_dir, "test")
os.makedirs(test_output_dir, exist_ok=True)

# File stem -> array. Insertion order is the order the files are written in.
test_arrays = {
    "adj": test_adj,
    "features": test_features,
    "fp_hashed": test_fp_hashed,
    "fp_nonhashed": test_fp_nonhashed,
}

# The graph arrays and the fingerprints come from two independent steps, so
# check that everything about to be written has the same number of rows.
# This assumes the first axis of each array indexes molecules — TODO confirm.
# A mismatch is only reported (not raised) so existing runs keep working.
test_row_counts = {array_name: len(array) for array_name, array in test_arrays.items()}
test_row_counts["processed_data.csv"] = len(test_df)
if len(set(test_row_counts.values())) > 1:
    logger.warning(f"Test outputs have inconsistent lengths: {test_row_counts}")

for array_name, array in test_arrays.items():
    np.save(os.path.join(test_output_dir, f"{array_name}.npy"), array)

# Save the cleaned DataFrame (all of its columns, not only SMILES) for reference
test_df.to_csv(os.path.join(test_output_dir, "processed_data.csv"), index=False)

logger.info(f"Test data saved to: {test_output_dir}")

## 8. Summary Statistics

In [None]:
# Print a plain-text recap of what was produced and where it was written.
banner = "=" * 60  # separator line reused for the header and footer

print("\n" + banner)
print("DATA PREPROCESSING SUMMARY")
print(banner)

# Section headings are plain strings: they have no placeholders, so no f-prefix.
print("\nTraining Data:")
print(f"  - Molecules: {len(train_df)}")
print(f"  - Graph features: {train_adj.shape}")
print(f"  - Hashed fingerprints: {train_fp_hashed.shape}")
print(f"  - Non-hashed fingerprints: {train_fp_nonhashed.shape}")
print(f"  - Labels: {train_labels_out.shape}")

print("\nTest Data:")
print(f"  - Molecules: {len(test_df)}")
print(f"  - Graph features: {test_adj.shape}")
print(f"  - Hashed fingerprints: {test_fp_hashed.shape}")
print(f"  - Non-hashed fingerprints: {test_fp_nonhashed.shape}")

print("\nOutput Directories:")
print(f"  - Training: {train_output_dir}")
print(f"  - Test: {test_output_dir}")
print(f"  - Cache: {config.data.graph_features_dir}")
# The fingerprint step caches to its own directory, so list that one as well.
print(f"  - Fingerprint cache: {config.data.fingerprints_dir}")

print("\n" + banner)
print("Preprocessing complete! Ready for model training.")
print(banner)