# Aptamer Cross-Reactivity Analysis

This notebook analyzes and visualizes aptamer cross-reactivity in order to identify aptamers that are highly specific to their intended target.

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# 10x6 inch default figure size (individual cells override figsize as needed).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Put the parent of the current working directory on the import path so the
# project-local `src` package used below can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
from src.data_processing.data_loader import AptamerDataLoader
from src.models.cross_reactivity import CrossReactivityAnalyzer
from src.visualization.plot_utils import AptamerVisualizer

## Load Models and Data

In [None]:
# Load the feature-enriched dataset, falling back to the preprocessed dataset
# and finally to an empty frame so that `df` is always defined.
feature_path = '../data/processed/aptamers_with_features.csv'

if not os.path.exists(feature_path):
    print(f"Feature-enriched data not found at {feature_path}")
    print("Please run the 02_feature_engineering.ipynb notebook first")

    # Fallback: the preprocessed (not feature-enriched) dataset
    processed_path = '../data/processed/preprocessed_aptamers.csv'
    if not os.path.exists(processed_path):
        df = pd.DataFrame()
        print("No data available. Please run the previous notebooks first.")
    else:
        df = pd.read_csv(processed_path)
        print(f"Loaded preprocessed data instead: {len(df)} rows")
else:
    df = pd.read_csv(feature_path)
    print(f"Loaded feature-enriched data: {len(df)} rows, {len(df.columns)} columns")

In [None]:
# Report the target labels available for analysis; warn when a non-empty
# dataset carries no 'Target_Name' column. An empty dataset prints nothing.
if len(df) > 0 and 'Target_Name' in df.columns:
    print(f"Found {df['Target_Name'].nunique()} unique targets in the dataset")
    print(df['Target_Name'].value_counts())
elif len(df) > 0:
    print("WARNING: No 'Target_Name' column found in the dataset. Cross-reactivity analysis requires target information.")

In [None]:
# Load a previously trained cross-reactivity model if one is available;
# otherwise (or if loading fails) train a new one on the loaded dataset.
cross_reactivity_model = CrossReactivityAnalyzer()
model_path = '../models/cross_reactivity_model.pkl'

model_loaded = False
if os.path.exists(model_path):
    try:
        # NOTE(review): the .pkl extension suggests pickle deserialization inside
        # load_model; only load model files from a trusted source - confirm.
        cross_reactivity_model.load_model(model_path)
        model_loaded = True
        print(f"Loaded cross-reactivity model from {model_path}")
    except Exception as e:
        # Deliberate best effort: report the failure and fall through to training.
        print(f"Error loading model: {str(e)}")
else:
    print(f"No saved model found at {model_path}")

# Single training path shared by the "no saved model" and "load failed" cases.
if not model_loaded:
    if 'Target_Name' in df.columns and len(df) > 0:
        print("Training a new cross-reactivity model...")
        cross_reactivity_model.train_cross_reactivity_model(df)
    else:
        # Previously the "Training..." message was printed even though nothing was trained.
        print("WARNING: Cannot train a cross-reactivity model: the dataset is empty or has no 'Target_Name' column.")

## Cross-Reactivity Prediction

In [None]:
# Predict cross-reactivity
# Start from an empty frame so that the `... in predictions_df.columns` guards
# in later cells evaluate to False (instead of raising NameError) whenever
# prediction is skipped.
predictions_df = pd.DataFrame()

if 'Target_Name' in df.columns and len(df) > 0 and hasattr(cross_reactivity_model, 'model'):
    # Make predictions
    predictions_df = cross_reactivity_model.predict_cross_reactivity(df)

    # Assemble the preview: sequence (if present), true/predicted target,
    # and the first few per-target probability columns.
    cols_to_show = ['Target_Name', 'predicted_target']
    seq_col = 'Sequence' if 'Sequence' in predictions_df.columns else 'sequence'
    if seq_col in predictions_df.columns:
        cols_to_show = [seq_col] + cols_to_show

    prob_cols = [col for col in predictions_df.columns if col.endswith('_probability')]
    cols_to_show = cols_to_show + prob_cols[:3]  # Show a few probability columns

    # Later cells treat 'predicted_target' as optional, so only select
    # columns that are actually present to avoid a KeyError here.
    cols_to_show = [col for col in cols_to_show if col in predictions_df.columns]

    print("Sample of cross-reactivity predictions:")
    display(predictions_df[cols_to_show].head())
else:
    print("Skipping cross-reactivity prediction: no target-labelled data or no trained model is available.")

In [None]:
# Calculate and plot the confusion matrix of actual vs predicted targets
if ('Target_Name' in df.columns
        and 'Target_Name' in predictions_df.columns
        and 'predicted_target' in predictions_df.columns):
    from sklearn.metrics import confusion_matrix

    actual_targets = predictions_df['Target_Name']
    predicted_targets = predictions_df['predicted_target']

    # Axis labels: the model's target names first, followed by any label that
    # occurs in the data but is unknown to the model. Passing this list as
    # `labels=` pins the matrix row/column order to the tick labels; without it
    # sklearn orders by the sorted labels found in the data, which can differ
    # from the model's list in order or in count.
    target_names = list(cross_reactivity_model.target_names.tolist())
    observed_labels = set(actual_targets.dropna()) | set(predicted_targets.dropna())
    target_names = target_names + sorted(observed_labels - set(target_names), key=str)

    conf_matrix = confusion_matrix(actual_targets, predicted_targets, labels=target_names)

    # Annotated heatmap: rows are actual targets, columns are predicted targets
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
               xticklabels=target_names, yticklabels=target_names)
    plt.title('Confusion Matrix for Target Prediction')
    plt.xlabel('Predicted Target')
    plt.ylabel('Actual Target')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Overall accuracy: fraction of rows whose predicted target equals the actual one
    accuracy = (actual_targets == predicted_targets).mean()
    print(f"Overall prediction accuracy: {accuracy:.4f}")

## Cross-Reactivity Analysis

In [None]:
# Identify cross-reactive aptamers
# Probability threshold handed to identify_cross_reactive_aptamers.
CROSS_REACTIVITY_THRESHOLD = 0.3

# Start from an empty frame so that the `... in crossreact_df.columns` guards
# in later cells evaluate to False (instead of raising NameError) when this
# step is skipped.
crossreact_df = pd.DataFrame()

if 'predicted_target' in predictions_df.columns:
    crossreact_df = cross_reactivity_model.identify_cross_reactive_aptamers(
        predictions_df, threshold=CROSS_REACTIVITY_THRESHOLD
    )

    # Count cross-reactive aptamers (guarding the percentage against an empty result)
    total_count = len(crossreact_df)
    cross_reactive_count = crossreact_df['is_cross_reactive'].sum()
    cross_reactive_share = cross_reactive_count / total_count if total_count else 0.0
    print(f"Identified {cross_reactive_count} potentially cross-reactive aptamers out of {total_count} ({cross_reactive_share:.1%})")

    # Display cross-reactive aptamers
    if cross_reactive_count > 0:
        cr_examples = crossreact_df[crossreact_df['is_cross_reactive']].head(10)

        display_cols = ['Target_Name', 'predicted_target', 'cross_reactive_targets', 'specificity_score']
        seq_col = 'Sequence' if 'Sequence' in cr_examples.columns else 'sequence'
        if seq_col in cr_examples.columns:
            display_cols = [seq_col] + display_cols

        # The following cell adds 'specificity_score' only if it is missing, so
        # it is not guaranteed here; keep only columns that exist.
        display_cols = [col for col in display_cols if col in cr_examples.columns]

        print("\nExamples of cross-reactive aptamers:")
        display(cr_examples[display_cols])

In [None]:
# Calculate specificity scores and plot their distributions
if 'predicted_target' in predictions_df.columns:
    specificity_df = cross_reactivity_model.calculate_specificity_score(predictions_df)

    # Copy over whichever specificity metric the cross-reactivity frame lacks
    for metric in ('specificity_score', 'entropy_specificity'):
        if metric not in crossreact_df.columns:
            crossreact_df[metric] = specificity_df[metric]

    # (column, panel title, x-axis label) for the two side-by-side histograms
    histogram_panels = (
        ('specificity_score', 'Specificity Score Distribution', 'Specificity Score'),
        ('entropy_specificity', 'Entropy-Based Specificity Distribution', 'Entropy-Based Specificity'),
    )

    plt.figure(figsize=(12, 6))
    for position, (metric, panel_title, x_label) in enumerate(histogram_panels, start=1):
        plt.subplot(1, 2, position)
        sns.histplot(crossreact_df[metric], bins=20, kde=True)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel('Count')

    plt.tight_layout()
    plt.show()

In [None]:
# Box plot of specificity score per target, followed by the per-target means
if {'Target_Name', 'specificity_score'}.issubset(crossreact_df.columns):
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=crossreact_df, x='Target_Name', y='specificity_score')
    plt.title('Specificity Score by Target')
    plt.xlabel('Target')
    plt.ylabel('Specificity Score')
    plt.xticks(rotation=45, ha='right')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Mean specificity per target, highest first
    per_target_scores = crossreact_df.groupby('Target_Name')['specificity_score']
    avg_specificity = per_target_scores.mean().sort_values(ascending=False)

    print("Average specificity score by target:")
    for target, score in avg_specificity.items():
        print(f"{target}: {score:.4f}")

## Cross-Reactivity Matrix

In [None]:
# Create cross-reactivity matrix
# Only attempted when the cross-reactivity frame carries target labels.
if 'Target_Name' in crossreact_df.columns:
    # Use the AptamerVisualizer
    visualizer = AptamerVisualizer()
    # output_path=None is passed explicitly; presumably this displays the
    # figure rather than writing it to disk - TODO confirm in plot_utils.
    visualizer.plot_cross_reactivity_matrix(crossreact_df, output_path=None)

## Dimensionality Reduction Visualization

In [None]:
# Visualize cross-reactivity using t-SNE
# Requires target labels in the dataset and a `model` attribute on the analyzer.
# NOTE(review): t-SNE is stochastic; whether visualize_cross_reactivity fixes a
# random seed is not visible from this notebook - confirm for reproducibility.
if 'Target_Name' in df.columns and hasattr(cross_reactivity_model, 'model'):
    cross_reactivity_model.visualize_cross_reactivity(
        df, method='tsne', output_path=None, show_plot=True
    )

In [None]:
# Visualize using PCA as an alternative
# Same call and guard as the t-SNE cell, with method='pca'.
if 'Target_Name' in df.columns and hasattr(cross_reactivity_model, 'model'):
    cross_reactivity_model.visualize_cross_reactivity(
        df, method='pca', output_path=None, show_plot=True
    )

## High Specificity Aptamers

In [None]:
# Find the most specific aptamers for each target, print a short summary and
# save the combined selection to ../data/processed/high_specificity_aptamers.csv
if 'Target_Name' in crossreact_df.columns and 'specificity_score' in crossreact_df.columns:
    targets = crossreact_df['Target_Name'].unique()
    # The sequence column name does not change per row, so resolve it once.
    seq_col = 'Sequence' if 'Sequence' in crossreact_df.columns else 'sequence'

    print("Most specific aptamers for each target:")
    top_specific = []

    for target in targets:
        # Filter for this target
        target_df = crossreact_df[crossreact_df['Target_Name'] == target]

        # Get top 3 most specific aptamers
        top_3 = target_df.sort_values('specificity_score', ascending=False).head(3)
        top_specific.append(top_3)

        print(f"\n--- {target} ---")
        for i, (_, row) in enumerate(top_3.iterrows(), 1):
            # A missing sequence (absent column or NaN cell) is shown as 'N/A'
            # instead of failing on slicing a non-string value.
            raw_seq = row.get(seq_col, 'N/A')
            seq = raw_seq if isinstance(raw_seq, str) else 'N/A'
            print(f"#{i}: {seq[:30]}{'...' if len(seq) > 30 else ''}")
            print(f"   Specificity Score: {row['specificity_score']:.4f}")
            # Treat a missing flag as "not cross-reactive" (NaN is otherwise truthy).
            cr_flag = row.get('is_cross_reactive', False)
            is_cr = bool(cr_flag) if pd.notna(cr_flag) else False
            print(f"   Cross-reactive: {'Yes' if is_cr else 'No'}")

    if top_specific:
        # Combine all highly specific aptamers
        high_specificity_df = pd.concat(top_specific, ignore_index=True)

        # Save for later use
        output_dir = '../data/processed'
        os.makedirs(output_dir, exist_ok=True)
        specificity_path = os.path.join(output_dir, 'high_specificity_aptamers.csv')
        high_specificity_df.to_csv(specificity_path, index=False)
        print(f"\nHigh specificity aptamers saved to {specificity_path}")
    else:
        # pd.concat raises on an empty list, which happens when there are no targets.
        print("\nNo aptamers available; nothing was saved.")

## Sequence and Structure Analysis of Specific vs Cross-Reactive Aptamers

In [None]:
# Compare specific vs cross-reactive aptamers (GC content and sequence length)
if 'is_cross_reactive' in crossreact_df.columns:
    seq_col = 'Sequence' if 'Sequence' in crossreact_df.columns else 'sequence'
    has_sequences = seq_col in crossreact_df.columns

    # Derived columns are added BEFORE the frame is split into groups; slicing
    # first left the subsets without 'gc_content' and raised a KeyError below.
    gc_col = 'GC_Content' if 'GC_Content' in crossreact_df.columns else 'gc_content'
    if has_sequences and gc_col not in crossreact_df.columns:
        def _gc_percent(seq):
            """Return the GC percentage (0-100) of a sequence string.

            Empty strings give 0; non-string values (e.g. NaN for a missing
            sequence) give NaN so they do not bias the group means.
            """
            if not isinstance(seq, str):
                return np.nan
            if len(seq) == 0:
                return 0
            upper_seq = seq.upper()
            return (upper_seq.count('G') + upper_seq.count('C')) / len(upper_seq) * 100

        crossreact_df['gc_content'] = crossreact_df[seq_col].apply(_gc_percent)
        gc_col = 'gc_content'

    # Human-readable group label; the structural-analysis cell plots against it,
    # so it is created even when no sequence column is available.
    crossreact_df['Specificity'] = crossreact_df['is_cross_reactive'].map({True: 'Cross-Reactive', False: 'Specific'})

    # Create groups
    specific_df = crossreact_df[~crossreact_df['is_cross_reactive']]
    crossreact_df_subset = crossreact_df[crossreact_df['is_cross_reactive']]

    if has_sequences:
        print("Comparing specific vs cross-reactive aptamers:")
        print(f"Specific aptamers: {len(specific_df)}")
        print(f"Cross-reactive aptamers: {len(crossreact_df_subset)}")

        # Compare GC content
        specific_gc = specific_df[gc_col].mean()
        crossreact_gc = crossreact_df_subset[gc_col].mean()
        print("\nAverage GC content:")
        print(f"Specific aptamers: {specific_gc:.2f}%")
        print(f"Cross-reactive aptamers: {crossreact_gc:.2f}%")

        # Compare length
        has_length = 'length' in crossreact_df.columns
        if has_length:
            specific_len = specific_df['length'].mean()
            crossreact_len = crossreact_df_subset['length'].mean()
            print("\nAverage sequence length:")
            print(f"Specific aptamers: {specific_len:.2f} nucleotides")
            print(f"Cross-reactive aptamers: {crossreact_len:.2f} nucleotides")

        # Plot comparisons: one panel per available metric, so no empty axes
        # are drawn when the 'length' column is missing.
        comparison_panels = [(gc_col, 'GC Content Comparison', 'GC Content (%)')]
        if has_length:
            comparison_panels.append(('length', 'Sequence Length Comparison', 'Sequence Length'))

        fig, axes = plt.subplots(1, len(comparison_panels), figsize=(12, 6), squeeze=False)
        for ax, (column, panel_title, y_label) in zip(axes[0], comparison_panels):
            sns.boxplot(x='Specificity', y=column, data=crossreact_df, ax=ax)
            ax.set_title(panel_title)
            ax.set_ylabel(y_label)

        fig.tight_layout()
        plt.show()

## Structural Analysis of Cross-Reactive Aptamers

In [None]:
# Analyze structural differences between specific and cross-reactive aptamers
if 'is_cross_reactive' in crossreact_df.columns:
    # Check for structural features
    structural_features = ['paired_percentage', 'unpaired_percentage', 'stem_count', 'hairpin_loop_count', 'energy']
    available_features = [f for f in structural_features if f in crossreact_df.columns]

    if available_features:
        print("Structural comparison between specific and cross-reactive aptamers:")

        # Make this cell self-contained: the group label and the two subsets
        # were previously only available if a conditional branch of the
        # preceding comparison cell had run.
        if 'Specificity' not in crossreact_df.columns:
            crossreact_df['Specificity'] = crossreact_df['is_cross_reactive'].map({True: 'Cross-Reactive', False: 'Specific'})
        specific_df = crossreact_df[~crossreact_df['is_cross_reactive']]
        crossreact_df_subset = crossreact_df[crossreact_df['is_cross_reactive']]

        # One box plot per available feature; squeeze=False always yields a 2-D
        # axes array, so the single-feature case needs no special handling.
        fig, axes = plt.subplots(
            1, len(available_features),
            figsize=(4*len(available_features), 5),
            squeeze=False,
        )

        for i, (ax, feature) in enumerate(zip(axes[0], available_features)):
            sns.boxplot(x='Specificity', y=feature, data=crossreact_df, ax=ax)
            ax.set_title(f'{feature}')
            # Only show y-label for first subplot
            ax.set_ylabel('Value' if i == 0 else '')

        plt.tight_layout()
        plt.show()

        # Print mean values for comparison
        print("\nMean structural feature values:")
        for feature in available_features:
            specific_val = specific_df[feature].mean()
            crossreact_val = crossreact_df_subset[feature].mean()
            print(f"{feature}:")
            print(f"  Specific aptamers: {specific_val:.2f}")
            print(f"  Cross-reactive aptamers: {crossreact_val:.2f}")
    else:
        print("No structural features found for comparison.")

## Target-Specific Cross-Reactivity Patterns

In [None]:
# Per-target cross-reactivity rate, then the most frequent (target -> cross-target) pairs
if 'Target_Name' in crossreact_df.columns and 'is_cross_reactive' in crossreact_df.columns:
    # Share of cross-reactive aptamers per target, in percent, highest first
    cross_reactive_rate = crossreact_df.groupby('Target_Name')['is_cross_reactive'].mean()
    target_crossreact = cross_reactive_rate.sort_values(ascending=False) * 100

    # Tabular form for plotting
    target_cr_df = pd.DataFrame({
        'Target': target_crossreact.index,
        'Cross_Reactivity_Percent': target_crossreact.values
    })

    # Bar chart of the percentages
    plt.figure(figsize=(10, 6))
    bars = plt.bar(target_cr_df['Target'], target_cr_df['Cross_Reactivity_Percent'], color='salmon')
    plt.title('Cross-Reactivity Percentage by Target')
    plt.xlabel('Target')
    plt.ylabel('Cross-Reactive Aptamers (%)')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)

    # Annotate each bar with its value, placed just above the bar top
    for bar in bars:
        height = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2.
        plt.text(label_x, height + 1, f'{height:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # Target pairs that show cross-reactivity
    if 'cross_reactive_targets' in crossreact_df.columns:
        # Restrict to aptamers flagged as cross-reactive
        cr_only = crossreact_df[crossreact_df['is_cross_reactive']]

        # One (main target, cross target) tuple per listed cross-target; the
        # list is a ', '-separated string, and non-string cells and empty
        # entries are ignored.
        target_pairs = [
            (main_target, cross_target)
            for main_target, listed_targets in zip(cr_only['Target_Name'], cr_only['cross_reactive_targets'])
            if isinstance(listed_targets, str)
            for cross_target in listed_targets.split(', ')
            if cross_target
        ]

        if target_pairs:
            # Frequency of each pair, most common first
            pair_counts = pd.Series(target_pairs).value_counts().reset_index()
            pair_counts.columns = ['Target_Pair', 'Count']

            # Readable "from → to" label for display
            pair_counts['Target_Pair_Str'] = pair_counts['Target_Pair'].apply(lambda x: f"{x[0]} → {x[1]}")

            print("\nMost common cross-reactivity pairs:")
            display(pair_counts.head(10))

            # Bar chart of the ten most common pairs
            plt.figure(figsize=(12, 6))
            sns.barplot(data=pair_counts.head(10), x='Target_Pair_Str', y='Count')
            plt.title('Most Common Cross-Reactivity Pairs')
            plt.xlabel('Target Pair (From → To)')
            plt.ylabel('Occurrence Count')
            plt.xticks(rotation=45, ha='right')
            plt.grid(axis='y', alpha=0.3)
            plt.tight_layout()
            plt.show()

## Identify Sequence Motifs Associated with Cross-Reactivity

In [None]:
# Analyze sequence motifs that might be associated with cross-reactivity
import re
from collections import Counter

def find_common_kmers(sequences, k=4, top_n=10):
    """Find the most common k-mers in a collection of sequences.

    Every overlapping window of length ``k`` in every sequence is counted.
    Counting is case-sensitive and uses the sequences exactly as given.
    Sequences shorter than ``k`` contribute nothing, and entries that are not
    strings (e.g. ``NaN``/``None`` for a missing sequence) are skipped.

    Args:
        sequences: Iterable of sequence strings.
        k: Window length; must be at least 1.
        top_n: Number of k-mers to return; ``None`` returns all of them.

    Returns:
        List of ``(kmer, count)`` tuples ordered from most to least common.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    kmer_counts = Counter()
    for seq in sequences:
        # Missing values read from a DataFrame arrive as float NaN / None
        if not isinstance(seq, str):
            continue
        kmer_counts.update(seq[i:i + k] for i in range(len(seq) - k + 1))

    return kmer_counts.most_common(top_n)

# Compare 4-mer usage between specific and cross-reactive aptamers
if 'is_cross_reactive' in crossreact_df.columns:
    seq_col = 'Sequence' if 'Sequence' in crossreact_df.columns else 'sequence'

    if seq_col in crossreact_df.columns:
        # Derive both groups directly from crossreact_df so this cell does not
        # depend on variables created in an earlier cell; missing sequences
        # are dropped.
        cross_reactive_rows = crossreact_df[crossreact_df['is_cross_reactive']]
        specific_rows = crossreact_df[~crossreact_df['is_cross_reactive']]
        specific_seqs = specific_rows[seq_col].dropna().tolist()
        crossreact_seqs = cross_reactive_rows[seq_col].dropna().tolist()

        # Find common motifs
        if specific_seqs and crossreact_seqs:
            # Full ranked k-mer tables (top_n=None returns every k-mer). The
            # top 10 are listed below, but the enrichment ratios use the FULL
            # counts so that a k-mer ranked 11th in one group is not treated
            # as absent from it.
            specific_ranked = find_common_kmers(specific_seqs, k=4, top_n=None)
            crossreact_ranked = find_common_kmers(crossreact_seqs, k=4, top_n=None)
            specific_kmers = specific_ranked[:10]
            crossreact_kmers = crossreact_ranked[:10]

            print("Common sequence motifs in specific aptamers:")
            for kmer, count in specific_kmers:
                print(f"{kmer}: {count} occurrences")

            print("\nCommon sequence motifs in cross-reactive aptamers:")
            for kmer, count in crossreact_kmers:
                print(f"{kmer}: {count} occurrences")

            specific_kmer_dict = dict(specific_ranked)
            crossreact_kmer_dict = dict(crossreact_ranked)

            # Candidate motifs: the union of both top-10 lists, sorted so the
            # row order (and tie-breaking in the tables) is reproducible.
            all_kmers = sorted({kmer for kmer, _ in specific_kmers} | {kmer for kmer, _ in crossreact_kmers})

            # Occurrences per sequence in each group
            comparison_df = pd.DataFrame([
                {
                    'k-mer': kmer,
                    'Specific': specific_kmer_dict.get(kmer, 0) / len(specific_seqs),
                    'Cross-Reactive': crossreact_kmer_dict.get(kmer, 0) / len(crossreact_seqs),
                }
                for kmer in all_kmers
            ])

            # Enrichment ratio (cross-reactive / specific). A k-mer that never
            # occurs in specific aptamers divides by zero and yields inf, which
            # is replaced by the finite placeholder 10; NaN becomes 0.
            enrichment = comparison_df['Cross-Reactive'] / comparison_df['Specific']
            comparison_df['Enrichment'] = enrichment.replace([np.inf], 10).fillna(0)

            # Sort by enrichment
            enriched_in_cr = comparison_df.sort_values('Enrichment', ascending=False).head(10)
            enriched_in_specific = comparison_df.sort_values('Enrichment').head(10)

            print("\nMotifs enriched in cross-reactive aptamers:")
            display(enriched_in_cr[['k-mer', 'Cross-Reactive', 'Specific', 'Enrichment']])

            print("\nMotifs enriched in specific aptamers:")
            display(enriched_in_specific[['k-mer', 'Specific', 'Cross-Reactive', 'Enrichment']])

## Save Cross-Reactivity Analysis Results

In [None]:
# Persist the annotated cross-reactivity frame as CSV for later notebooks
if 'is_cross_reactive' in crossreact_df.columns:
    output_dir = '../data/processed'
    crossreact_path = os.path.join(output_dir, 'cross_reactivity_analysis.csv')

    # Create the target directory on first use, then write without the index
    os.makedirs(output_dir, exist_ok=True)
    crossreact_df.to_csv(crossreact_path, index=False)
    print(f"Cross-reactivity analysis results saved to {crossreact_path}")

## Conclusions

Based on the cross-reactivity analysis, we've identified:

1. The overall level of cross-reactivity among aptamers: [Fill in after running the notebook]

2. Which targets have the most specific aptamers: [Fill in after running the notebook]

3. Distinguishing features between specific and cross-reactive aptamers: [Fill in after running the notebook]

4. Recommendations for selecting highly specific aptamers: [Fill in after running the notebook]

In the next notebook, we'll use these insights to select optimal aptamers for each target with minimal cross-reactivity.