# Aptamer Feature Engineering

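This notebook extracts sequence and predicted secondary-structure features from aptamer sequences, explores them (correlations, distributions, PCA, G-quadruplex propensity, thermodynamic stability), and saves the feature-enriched dataset for use in binding affinity and cross-reactivity prediction models.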

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots below
sns.set_style('whitegrid')
# Default figure size (width, height) for figures that do not pass their own figsize
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Make the project's `src` package importable: the project root is taken to be
# the parent of the current working directory.
working_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(working_dir, '..'))

# Append only once so re-running this cell does not grow sys.path
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
from src.data_processing.data_loader import AptamerDataLoader
from src.feature_extraction.sequence_features import SequenceFeatureExtractor
from src.feature_extraction.structure_prediction import StructurePredictor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Load Preprocessed Data

In [None]:
# Use the cached preprocessed CSV when it exists; otherwise load the raw CSV
# and run the preprocessing steps in this cell.
processed_path = '../data/processed/preprocessed_aptamers.csv'

if not os.path.exists(processed_path):
    # No cached file: fall back to the raw data
    from src.data_processing.preprocessor import AptamerPreprocessor

    data_loader = AptamerDataLoader()
    raw_df = data_loader.load_from_csv('../data/raw/fentanyl.csv')

    preprocessor = AptamerPreprocessor()
    df = preprocessor.clean_data(raw_df)

    # Target names are only normalized when the column is present
    if 'Target_Name' in df.columns:
        df = preprocessor.normalize_target_names(df)

    print(f"Loaded and preprocessed raw data: {len(df)} rows")
else:
    df = pd.read_csv(processed_path)
    print(f"Loaded preprocessed data: {len(df)} rows")

In [None]:
# Preview the first rows of the loaded data; as the cell's last expression
# it is rendered as a table by the notebook.
df.head()

## Extract Sequence Features

In [None]:
# Determine the sequence column name (either capitalization is accepted)
seq_col = 'Sequence' if 'Sequence' in df.columns else 'sequence'

# Fail fast: every later cell needs the sequence column, so stop here with a
# clear message instead of hitting a NameError further down the notebook.
if seq_col not in df.columns:
    raise KeyError("No sequence column ('Sequence' or 'sequence') found in the dataset")

# Initialize feature extractor
feature_extractor = SequenceFeatureExtractor()

# Extract all features from the list of sequence strings
print("Extracting sequence features...")
sequence_features = feature_extractor.extract_all_features(df[seq_col].tolist())

print(f"Extracted {len(sequence_features.columns)} sequence features")

# Kept as the last top-level expression so the notebook actually renders it
# (an expression nested inside an if/else block is not auto-displayed).
sequence_features.head()

## Extract Structural Features

In [None]:
# The structure predictor needs the sequence column; stop with a clear message
# if it is missing, because later cells use `structure_features` unconditionally.
if seq_col not in df.columns:
    raise KeyError(f"Sequence column '{seq_col}' not found in the dataset")

# Initialize structure predictor
structure_predictor = StructurePredictor()

# Predict structures and extract features from the list of sequence strings
print("Predicting structures and extracting structural features...")
structure_features = structure_predictor.predict_and_analyze_structures(df[seq_col].tolist())

print(f"Extracted {len(structure_features.columns)} structural features")

# Kept as the last top-level expression so the notebook actually renders it
# (an expression nested inside an if block is not auto-displayed).
structure_features.head()

## Visualize Secondary Structures

In [None]:
def simple_structure_visualization(sequence, structure):
    """Draw a sequence with arcs joining its bracket-matched positions.

    Each character of ``sequence`` is drawn in a circle along a horizontal
    line. Every matched '(' / ')' pair in ``structure`` is connected by a
    half-ellipse arc whose height is proportional to the distance between the
    two positions, so nested pairs show up as nested arcs instead of
    overlapping flat segments.

    Args:
        sequence: String whose characters are drawn one per position.
        structure: Bracket string; only '(' and ')' are interpreted. A ')'
            with no open '(' and any '(' left unclosed are ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(12, 2))

    # Draw sequence
    for i, nt in enumerate(sequence):
        ax.text(i, 0, nt, ha='center', va='center',
                bbox=dict(boxstyle='circle', facecolor='white', edgecolor='black'))

    # Match brackets with a stack to collect (open, close) position pairs
    pairs = []
    stack = []
    for i, char in enumerate(structure):
        if char == '(':
            stack.append(i)
        elif char == ')' and stack:
            pairs.append((stack.pop(), i))

    if pairs:
        baseline = 0.5        # arcs start just above the nucleotide row
        max_arc_height = 2.0  # tallest arc peaks at 2.5, inside the y-limit of 3
        longest_span = max(close - start for start, close in pairs)
        theta = np.linspace(0, np.pi, 50)

        for start, close in pairs:
            span = close - start
            # Upper half of an ellipse centred between the two paired positions;
            # heights are scaled relative to the longest pair so nothing is clipped.
            xs = (start + close) / 2 + (span / 2) * np.cos(theta)
            ys = baseline + max_arc_height * (span / longest_span) * np.sin(theta)
            ax.plot(xs, ys, 'k-', alpha=0.3)

    # Cover the longer of the two strings so no arc falls outside the axes
    ax.set_xlim(-1, max(len(sequence), len(structure)))
    ax.set_ylim(-1, 3)
    ax.axis('off')
    plt.tight_layout()
    plt.show()

# Print and draw the first predicted structures (at most three)
if 'predicted_structure' in structure_features.columns:
    n_examples = min(3, len(structure_features))
    for idx in range(n_examples):
        example = structure_features.iloc[idx]
        seq = example['sequence']
        struct = example['predicted_structure']
        energy = example['energy']
        number = idx + 1

        print(f"Sequence {number}:")
        print(f"Sequence: {seq}")
        print(f"Structure: {struct}")
        print(f"Energy: {energy:.2f} kcal/mol")
        simple_structure_visualization(seq, struct)
        print("\n" + "-"*80 + "\n")

## Combine Features

In [None]:
# Combine all features column-wise. The frames are joined purely by row
# position, so they must describe the same rows in the same order.
row_counts = {
    'df': len(df),
    'sequence_features': len(sequence_features),
    'structure_features': len(structure_features),
}
if len(set(row_counts.values())) != 1:
    # concat(axis=1) would silently pad the shorter frames with NaN and
    # misalign features with their sequences.
    raise ValueError(f"Cannot combine features: row counts differ {row_counts}")

combined_df = pd.concat([
    df.reset_index(drop=True),
    sequence_features.drop(columns=['sequence']).reset_index(drop=True),
    structure_features.drop(columns=['sequence']).reset_index(drop=True)
], axis=1)

# Duplicate names make combined_df[name] return a DataFrame instead of a
# Series, which breaks the per-feature plots below, so surface them early.
duplicate_columns = combined_df.columns[combined_df.columns.duplicated()].unique().tolist()
if duplicate_columns:
    print(f"WARNING: duplicate column names after combining: {duplicate_columns}")

print(f"Original DataFrame: {df.shape[1]} columns")
print(f"Combined DataFrame: {combined_df.shape[1]} columns")
print(f"Added {combined_df.shape[1] - df.shape[1]} new feature columns")

## Feature Analysis

In [None]:
# Numeric columns are the only ones a correlation can be computed for
numeric_columns = combined_df.select_dtypes(include=[np.number]).columns

# Hand-picked subset that keeps the heatmap readable
candidate_features = [
    'gc_content', 'length', 'purine_pyrimidine_ratio',
    'energy', 'ensemble_diversity', 'stem_count', 'hairpin_loop_count',
    'paired_percentage', 'unpaired_percentage'
]

# Keep only the candidates that are present as numeric columns
important_features = [name for name in candidate_features if name in numeric_columns]

# Pairwise correlation between the selected features
corr_matrix = combined_df[important_features].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Plot the distribution of each selected feature on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(16, 12))
axes = axes.flatten()

# Keep features that exist in the frame, limited to the number of panels
plotted_features = [f for f in important_features if f in combined_df.columns][:len(axes)]

for ax, feature in zip(axes, plotted_features):
    sns.histplot(combined_df[feature], kde=True, ax=ax)
    ax.set_title(f'{feature} Distribution')

# Hide panels that received no feature so the grid has no empty axes
for ax in axes[len(plotted_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Dimensionality Reduction

In [None]:
# Prepare numeric features for PCA
numeric_df = combined_df.select_dtypes(include=[np.number])

# PCA rejects NaN/inf input, so treat non-finite values as missing and impute
# them with the column median. With complete data this step changes nothing.
numeric_df = numeric_df.replace([np.inf, -np.inf], np.nan)
n_missing = int(numeric_df.isna().sum().sum())
if n_missing:
    print(f"Imputing {n_missing} missing/non-finite values with column medians")
    numeric_df = numeric_df.fillna(numeric_df.median())

# Remove constant columns (all-NaN columns have a NaN std and are dropped too)
numeric_df = numeric_df.loc[:, numeric_df.std() > 0]

if numeric_df.shape[0] < 2 or numeric_df.shape[1] < 2:
    raise ValueError(
        "PCA with 2 components needs at least 2 rows and 2 varying numeric "
        f"columns; got shape {numeric_df.shape}"
    )

# Scale the data so every feature contributes on the same scale
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# Apply PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Create DataFrame with PCA results
pca_df = pd.DataFrame({
    'PC1': pca_result[:, 0],
    'PC2': pca_result[:, 1]
})

# Add target information if available; assigned by position so the labels
# cannot be misaligned by a non-default index on combined_df
if 'Target_Name' in combined_df.columns:
    pca_df['Target'] = combined_df['Target_Name'].to_numpy()

# Plot PCA results, coloured by target when the labels are present
plt.figure(figsize=(12, 8))
if 'Target' in pca_df.columns:
    sns.scatterplot(x='PC1', y='PC2', hue='Target', data=pca_df, s=100, alpha=0.7)
    plt.legend(title='Target', bbox_to_anchor=(1.05, 1), loc='upper left')
else:
    sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100, alpha=0.7)

plt.title('PCA of Aptamer Features')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Print explained variance ratio
print("Explained variance ratio:")
print(f"PC1: {pca.explained_variance_ratio_[0]:.2%}")
print(f"PC2: {pca.explained_variance_ratio_[1]:.2%}")
print(f"Total: {sum(pca.explained_variance_ratio_[:2]):.2%}")

## Feature Importance Analysis

In [None]:
# Tabulate each input feature's loading (weight) on the two principal components
pc1_weights, pc2_weights = pca.components_[0], pca.components_[1]
feature_loadings = pd.DataFrame({
    'Feature': numeric_df.columns,
    'PC1_loading': pc1_weights,
    'PC2_loading': pc2_weights
})

# Absolute loadings are what the ranking below sorts on
feature_loadings['PC1_abs'] = feature_loadings['PC1_loading'].abs()
feature_loadings['PC2_abs'] = feature_loadings['PC2_loading'].abs()

# Report the ten strongest features for each component
reports = (
    ("Top features for PC1:", 'PC1_abs', 'PC1_loading'),
    ("\nTop features for PC2:", 'PC2_abs', 'PC2_loading'),
)
for header, rank_column, loading_column in reports:
    print(header)
    ranked = feature_loadings.sort_values(rank_column, ascending=False)
    print(ranked.head(10)[['Feature', loading_column]])

In [None]:
# Scatter every feature by its loadings on the first two components
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(feature_loadings['PC1_loading'], feature_loadings['PC2_loading'], alpha=0.7)

# Label only the features whose loading exceeds 0.2 in magnitude on either
# component, to keep the plot readable
labelled_points = zip(
    feature_loadings['Feature'],
    feature_loadings['PC1_loading'],
    feature_loadings['PC2_loading'],
)
for name, pc1_value, pc2_value in labelled_points:
    if abs(pc1_value) > 0.2 or abs(pc2_value) > 0.2:
        ax.annotate(name, (pc1_value, pc2_value), fontsize=9)

ax.set_title('PCA Feature Loadings')
ax.set_xlabel('PC1 Loading')
ax.set_ylabel('PC2 Loading')
# Zero lines make the sign of each loading easy to read
ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
ax.axvline(x=0, color='k', linestyle='-', alpha=0.3)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## G-Quadruplex Analysis

In [None]:
# Score every sequence with the structure predictor's G-quadruplex propensity
# and plot the scores overall and, when targets are known, per target
if seq_col in combined_df.columns:
    g4_scores = structure_predictor.get_g_quadruplex_propensity(combined_df[seq_col].tolist())
    combined_df['g4_propensity'] = g4_scores

    # Overall distribution of the scores
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(combined_df['g4_propensity'], kde=True, bins=20, ax=ax)
    ax.set_title('G-Quadruplex Forming Propensity Distribution')
    ax.set_xlabel('G4 Propensity Score')
    ax.set_ylabel('Count')
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Per-target comparison, only when target labels are available
    if 'Target_Name' in combined_df.columns:
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.boxplot(x='Target_Name', y='g4_propensity', data=combined_df, ax=ax)
        ax.set_title('G-Quadruplex Propensity by Target')
        ax.set_xlabel('Target')
        ax.set_ylabel('G4 Propensity Score')
        # Rotate the target names so long labels do not overlap
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.grid(alpha=0.3)
        plt.tight_layout()
        plt.show()

## Thermodynamic Stability Analysis

In [None]:
# Compute per-sequence thermodynamic results, store two of their fields as
# new columns, and plot both distributions side by side
if seq_col in combined_df.columns:
    thermo_results = structure_predictor.calculate_thermodynamic_stability(combined_df[seq_col].tolist())

    # Each result is indexed by key; pull out the two fields of interest
    stability_scores = [entry['stability_score'] for entry in thermo_results]
    tm_estimates = [entry['approximated_tm'] for entry in thermo_results]

    combined_df['stability_score'] = stability_scores
    combined_df['estimated_tm'] = tm_estimates

    # One histogram panel per new column: (column, title, x-axis label)
    panels = [
        ('stability_score', 'Aptamer Stability Score Distribution', 'Stability Score'),
        ('estimated_tm', 'Estimated Melting Temperature (Tm) Distribution', 'Estimated Tm (°C)'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    for ax, (column, title, xlabel) in zip(axes, panels):
        sns.histplot(combined_df[column], kde=True, bins=20, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')

    plt.tight_layout()
    plt.show()

## Save Feature-Enriched Dataset

In [None]:
# Save the combined dataset with all features
output_path = '../data/processed/aptamers_with_features.csv'

# Create the output directory first: on a fresh checkout it may not exist yet,
# and to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

combined_df.to_csv(output_path, index=False)
print(f"Feature-enriched dataset saved to {output_path}")

## Conclusions

Key findings from feature engineering:

1. Sequence features: **TODO** — fill in after running the notebook
2. Structural features: **TODO** — fill in after running the notebook
3. Most important features: **TODO** — fill in after running the notebook
4. Target-specific patterns: **TODO** — fill in after running the notebook

These features will be used for training machine learning models in the next notebook.