# GNN Building Notebook

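This notebook prepares the data for a Graph Neural Network (GNN) link-prediction model: it extracts predator-prey interactions from the animal network, joins per-species features onto each interaction, and builds labelled training, validation, and test sets.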

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

## Creating Interactions DataFrame

Extract predator-prey relationships from the directed network to create edge data for the GNN.

In [2]:
# Location of the GEXF export of the largest connected component
network_file = '../data/largest_component.gexf'
print(f"Loading network file: {network_file}")


def _load_network(path):
    """Read the GEXF file at `path` and print its size and directedness.

    Returns the graph produced by `nx.read_gexf`, or None if reading or
    reporting raises. The error is printed instead of re-raised, so the
    caller can test the result against None.
    """
    try:
        # Read the GEXF file using NetworkX
        G = nx.read_gexf(path)
        print(f"Successfully loaded network with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
        print(f"Network is directed: {G.is_directed()}")
    except Exception as e:
        print(f"Error loading network file: {e}")
        return None
    return G


G = _load_network(network_file)

Loading network file: ../data/largest_component.gexf
Successfully loaded network with 1899 nodes and 2313 edges
Network is directed: True


In [3]:
# Explore the network structure
if G is None:
    print("Cannot analyze network - loading failed")
else:
    print("Network Structure Analysis:")
    print("="*50)

    # Overall size, directedness and density of the graph
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")
    print(f"Is directed: {G.is_directed()}")
    print(f"Network density: {nx.density(G):.6f}")

    # First three nodes together with their attribute dictionaries
    sample_nodes = list(G.nodes(data=True))[:3]
    print(f"\nSample nodes with attributes:")
    for node_id, attributes in sample_nodes:
        print(f"  Node {node_id}: {attributes}")

    # First three edges together with their attribute dictionaries (if any)
    sample_edges = list(G.edges(data=True))[:3]
    print(f"\nSample edges with attributes:")
    for source, target, attributes in sample_edges:
        print(f"  Edge {source} -> {target}: {attributes}")

    # Keep only the degree value from each (node, degree) pair
    in_degrees = [degree for _, degree in G.in_degree()]
    out_degrees = [degree for _, degree in G.out_degree()]

    print(f"\nDegree statistics:")
    print(f"  In-degree - Mean: {np.mean(in_degrees):.2f}, Max: {max(in_degrees)}, Min: {min(in_degrees)}")
    print(f"  Out-degree - Mean: {np.mean(out_degrees):.2f}, Max: {max(out_degrees)}, Min: {min(out_degrees)}")

Network Structure Analysis:
Number of nodes: 1899
Number of edges: 2313
Is directed: True
Network density: 0.000642

Sample nodes with attributes:
  Node 526: {'scientific_name': 'Asimina reticulata', 'common_name': 'netted pawpaw', 'category': 'Plantae', 'label': '526'}
  Node 4755: {'scientific_name': 'Thomisus onustus', 'common_name': 'Heather crab spider', 'category': 'Arachnida', 'label': '4755'}
  Node 3161: {'scientific_name': 'Nerodia', 'common_name': 'Watersnakes', 'category': 'Reptilia', 'label': '3161'}

Sample edges with attributes:
  Edge 4755 -> 1835: {'id': '2663', 'weight': 1.0}
  Edge 4755 -> 319: {'id': '2664', 'weight': 1.0}
  Edge 4755 -> 2539: {'id': '2665', 'weight': 1.0}

Degree statistics:
  In-degree - Mean: 1.22, Max: 47, Min: 0
  Out-degree - Mean: 1.22, Max: 46, Min: 0


In [4]:
# Create interactions dataframe from network edges
if G is None:
    print("Cannot create interactions dataframe - network not loaded")
    interactions_df = pd.DataFrame()
else:
    print("Creating interactions dataframe...")
    print("="*50)

    # One record per directed edge (source = predator, target = prey).
    # Ids are 1-based and follow the edge iteration order; a node without a
    # 'scientific_name' attribute falls back to 'Unknown_<node id>'.
    interactions_data = [
        {
            'id': edge_number,  # Unique interaction ID
            'predator_scientific_name': G.nodes[source_id].get('scientific_name', f'Unknown_{source_id}'),
            'prey_scientific_name': G.nodes[target_id].get('scientific_name', f'Unknown_{target_id}'),
            'Y': 1,  # All interactions are positive (observed relationships)
        }
        for edge_number, (source_id, target_id) in enumerate(G.edges(), start=1)
    ]
    edge_count = len(interactions_data)

    # Create DataFrame
    interactions_df = pd.DataFrame(interactions_data)

    print(f"Interactions dataframe created!")
    print(f"Shape: {interactions_df.shape}")
    print(f"Columns: {list(interactions_df.columns)}")

    # Data summary
    print(f"\nInteractions summary:")
    print(f"  Total interactions: {len(interactions_df)}")
    print(f"  Unique predators: {interactions_df['predator_scientific_name'].nunique()}")
    print(f"  Unique prey: {interactions_df['prey_scientific_name'].nunique()}")
    print(f"  All Y values: {interactions_df['Y'].unique()}")

    # Count names that carry the 'Unknown_' fallback marker
    unknown_predators = interactions_df['predator_scientific_name'].str.contains('Unknown_', na=False).sum()
    unknown_prey = interactions_df['prey_scientific_name'].str.contains('Unknown_', na=False).sum()

    print(f"\nData quality:")
    print(f"  Unknown predator names: {unknown_predators}")
    print(f"  Unknown prey names: {unknown_prey}")

Creating interactions dataframe...
Interactions dataframe created!
Shape: (2313, 4)
Columns: ['id', 'predator_scientific_name', 'prey_scientific_name', 'Y']

Interactions summary:
  Total interactions: 2313
  Unique predators: 963
  Unique prey: 1194
  All Y values: [1]

Data quality:
  Unknown predator names: 0
  Unknown prey names: 0


In [5]:
# Display sample interactions
if interactions_df.empty:
    print("No interactions data to display")
else:
    print("Sample interactions:")
    print("="*60)
    print(interactions_df.head(10))

    # Five randomly chosen rows; unseeded, so they differ between runs
    print(f"\nRandom sample of interactions:")
    if len(interactions_df) > 5:
        sample_interactions = interactions_df.sample(5)
        print(sample_interactions)

    # Column dtypes
    print(f"\nData types:")
    print(interactions_df.dtypes)

    # Every row built from an edge is expected to carry Y == 1
    print(f"\nY column verification:")
    print(f"  All Y values are 1: {(interactions_df['Y'] == 1).all()}")
    print(f"  Y value counts: {interactions_df['Y'].value_counts()}")

Sample interactions:
   id predator_scientific_name prey_scientific_name  Y
0   1         Thomisus onustus             Eupeodes  1
1   2         Thomisus onustus           Anthophila  1
2   3         Thomisus onustus           Lasiommata  1
3   4         Thomisus onustus   Cacyreus marshalli  1
4   5         Thomisus onustus            Dialictus  1
5   6         Thomisus onustus           Oestroidea  1
6   7         Thomisus onustus  Celastrina argiolus  1
7   8    Stagmomantis carolina       Phoebis sennae  1
8   9    Stagmomantis carolina            Acrididea  1
9  10    Stagmomantis carolina   Melanoplus keeleri  1

Random sample of interactions:
        id   predator_scientific_name           prey_scientific_name  Y
1171  1172        Halcyon albiventris                Ophiusa tirhaca  1
2262  2263             Ardea herodias                Siren lacertina  1
1947  1948  Panthera leo melanochaita  Connochaetes taurinus mearnsi  1
523    524    Tamiasciurus hudsonicus                 

In [6]:
# Analyze interaction patterns
if interactions_df.empty:
    print("No interactions data to analyze")
else:
    print("Interaction Pattern Analysis:")
    print("="*50)

    # Rows per predator name = number of outgoing interactions
    predator_counts = interactions_df['predator_scientific_name'].value_counts()
    print(f"\nTop 10 predators (most prey):")
    print(predator_counts.head(10))

    # Rows per prey name = number of incoming interactions
    prey_counts = interactions_df['prey_scientific_name'].value_counts()
    print(f"\nTop 10 prey (most predators):")
    print(prey_counts.head(10))

    # Partition species names by the role(s) they appear in
    predators_set = set(interactions_df['predator_scientific_name'])
    prey_set = set(interactions_df['prey_scientific_name'])

    predator_prey_species = predators_set & prey_set
    only_predators = predators_set.difference(prey_set)
    only_prey = prey_set.difference(predators_set)

    print(f"\nSpecies role analysis:")
    print(f"  Species that are both predator and prey: {len(predator_prey_species)}")
    print(f"  Species that are only predators: {len(only_predators)}")
    print(f"  Species that are only prey: {len(only_prey)}")

    if predator_prey_species:
        print(f"\nSample species that are both predator and prey:")
        # Up to five names, in whatever order the set yields them
        sample_both = list(predator_prey_species)[:5]
        for species in sample_both:
            pred_count = predator_counts.get(species, 0)
            prey_count = prey_counts.get(species, 0)
            print(f"  - {species}: predates {pred_count} species, prey to {prey_count} predators")

Interaction Pattern Analysis:

Top 10 predators (most prey):
predator_scientific_name
Ardea herodias                      46
Apis mellifera                      34
Lepidoptera                         23
Halcyon albiventris albiventris     19
Phidippus audax                     18
Pandion haliaetus                   18
Platycryptus undatus                16
Larus glaucescens × occidentalis    16
Larus glaucescens                   16
Pterygota                           15
Name: count, dtype: int64

Top 10 prey (most predators):
prey_scientific_name
Lepidoptera       47
Actinopterygii    22
Diptera           21
Magnoliopsida     17
Pterygota         17
Apis mellifera    17
Araneae           16
Ficus burkei      15
Coleoptera        14
Insecta           12
Name: count, dtype: int64

Species role analysis:
  Species that are both predator and prey: 258
  Species that are only predators: 705
  Species that are only prey: 936

Sample species that are both predator and prey:
  - Baeolophus bi

In [8]:
# Rich display of the first ten interaction records
interactions_df.head(10)

Unnamed: 0,id,predator_scientific_name,prey_scientific_name,Y
0,1,Thomisus onustus,Eupeodes,1
1,2,Thomisus onustus,Anthophila,1
2,3,Thomisus onustus,Lasiommata,1
3,4,Thomisus onustus,Cacyreus marshalli,1
4,5,Thomisus onustus,Dialictus,1
5,6,Thomisus onustus,Oestroidea,1
6,7,Thomisus onustus,Celastrina argiolus,1
7,8,Stagmomantis carolina,Phoebis sennae,1
8,9,Stagmomantis carolina,Acrididea,1
9,10,Stagmomantis carolina,Melanoplus keeleri,1


In [None]:
# Save the datasets for future use
# NOTE(review): X_train, train_data, val_data, test_data and full_dataset are
# only assigned by cells that appear further down in this notebook, so on a
# fresh "Restart & Run All" this cell raises NameError - TODO move it after
# the cells that create those variables.
if X_train is None:
    print("Cannot save datasets - no data available")
else:
    print("Saving GNN datasets...")
    print("="*30)

    # One CSV per split, written without the index column
    train_data.to_csv('../data/gnn_train_data.csv', index=False)
    val_data.to_csv('../data/gnn_val_data.csv', index=False)
    test_data.to_csv('../data/gnn_test_data.csv', index=False)

    # The combined (unsplit) dataset
    full_dataset.to_csv('../data/gnn_full_dataset.csv', index=False)

    print(f"Datasets saved:")
    print(f"  Training data: ../data/gnn_train_data.csv ({len(train_data)} samples)")
    print(f"  Validation data: ../data/gnn_val_data.csv ({len(val_data)} samples)")
    print(f"  Test data: ../data/gnn_test_data.csv ({len(test_data)} samples)")
    print(f"  Full dataset: ../data/gnn_full_dataset.csv ({len(full_dataset)} samples)")

    # Closing summary of what is now available in the kernel
    print(f"\n✅ GNN Dataset Creation Complete!")
    print(f"   Ready for link prediction model training")
    print(f"   Variables available:")
    print(f"   - train_data, val_data, test_data (DataFrames)")
    print(f"   - X_train, y_train, X_val, y_val, X_test, y_test (arrays)")
    print(f"   - feature_names (list of feature column names)")
    print(f"   - Train/Val/Test split: {len(train_data)}/{len(val_data)}/{len(test_data)} samples")

In [None]:
# Prepare final datasets for GNN training
if len(train_data) == 0:
    print("Cannot prepare final datasets - no training data available")
    X_train = y_train = X_val = y_val = X_test = y_test = None
    train_metadata = val_metadata = test_metadata = pd.DataFrame()
    feature_names = []
else:
    print("Preparing final GNN datasets...")
    print("="*50)

    # Everything that is not an identifier or a label column is a feature
    metadata_cols = ['id', 'predator_scientific_name', 'prey_scientific_name', 'Y', 'link_exists']
    feature_cols = [col for col in train_data.columns if col not in metadata_cols]

    print(f"Dataset preparation:")
    print(f"  Total columns: {len(train_data.columns)}")
    print(f"  Metadata columns: {len(metadata_cols)} - {metadata_cols}")
    print(f"  Feature columns: {len(feature_cols)}")

    def prepare_split(data, split_name):
        """Split one DataFrame into features, labels and metadata.

        Args:
            data: DataFrame holding the `feature_cols`, the `metadata_cols`
                and a 'link_exists' column.
            split_name: Label used in the printed summary.

        Returns:
            Tuple `(features, labels, metadata)`: the feature columns as an
            array, the 'link_exists' values as an array, and the metadata
            columns as a DataFrame.
        """
        features = data[feature_cols].values
        labels = data['link_exists'].values
        metadata = data[metadata_cols]

        print(f"  {split_name}:")
        print(f"    Features shape: {features.shape}")
        print(f"    Labels shape: {labels.shape}")
        print(f"    Positive labels: {labels.sum()} ({labels.mean()*100:.1f}%)")

        return features, labels, metadata

    print(f"\nPreparing splits:")
    X_train, y_train, train_metadata = prepare_split(train_data, "Training")
    X_val, y_val, val_metadata = prepare_split(val_data, "Validation")
    X_test, y_test, test_metadata = prepare_split(test_data, "Test")

    print(f"\n✅ GNN datasets ready for training!")
    print(f"   Training: X_train {X_train.shape}, y_train {y_train.shape}")
    print(f"   Validation: X_val {X_val.shape}, y_val {y_val.shape}")
    print(f"   Test: X_test {X_test.shape}, y_test {y_test.shape}")

    # Independent copy of the feature column names for later use
    feature_names = list(feature_cols)
    print(f"   Feature names saved: {len(feature_names)} features")

    # Show the first few feature names on each side as a sanity check
    print(f"\nSample feature columns:")
    predator_features = [col for col in feature_cols if col.startswith('predator_')][:5]
    prey_features = [col for col in feature_cols if col.startswith('prey_')][:5]
    print(f"  Predator features (first 5): {predator_features}")
    print(f"  Prey features (first 5): {prey_features}")

In [None]:
# Split data into training, validation, and test sets
if len(full_dataset) > 0:
    print("Splitting data into train/validation/test sets...")
    print("="*50)

    # Shuffle with a fixed seed so the split is the same on every run
    shuffled_dataset = full_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

    # Define split ratios
    train_ratio = 0.7
    val_ratio = 0.15
    test_ratio = 0.15  # nominal: the test set receives whatever rows remain

    # Train and validation sizes are truncated to whole rows; the remainder
    # goes to the test set so that every row lands in exactly one split.
    total_samples = len(shuffled_dataset)
    train_size = int(total_samples * train_ratio)
    val_size = int(total_samples * val_ratio)
    test_size = total_samples - train_size - val_size

    # Positional slices of the shuffled frame
    train_data = shuffled_dataset.iloc[:train_size].copy()
    val_data = shuffled_dataset.iloc[train_size:train_size + val_size].copy()
    test_data = shuffled_dataset.iloc[train_size + val_size:].copy()

    print(f"Data split completed:")
    print(f"  Total samples: {total_samples}")
    print(f"  Training set: {len(train_data)} samples ({len(train_data)/total_samples*100:.1f}%)")
    print(f"  Validation set: {len(val_data)} samples ({len(val_data)/total_samples*100:.1f}%)")
    print(f"  Test set: {len(test_data)} samples ({len(test_data)/total_samples*100:.1f}%)")

    def check_label_distribution(data, split_name):
        """Print how many rows of `data` are links (1) and no-links (0).

        Args:
            data: DataFrame with a 'link_exists' column.
            split_name: Label used in the printed summary.

        An empty split is reported as such instead of dividing by zero;
        with very small datasets the truncated sizes above can be 0.
        """
        total = len(data)
        print(f"  {split_name}:")
        if total == 0:
            print(f"    (empty split)")
            return

        label_counts = data['link_exists'].value_counts().sort_index()
        pos_count = label_counts.get(1, 0)
        neg_count = label_counts.get(0, 0)

        print(f"    Positive (links): {pos_count} ({pos_count/total*100:.1f}%)")
        print(f"    Negative (no-links): {neg_count} ({neg_count/total*100:.1f}%)")

    print(f"\nLabel distribution by split:")
    check_label_distribution(train_data, "Training")
    check_label_distribution(val_data, "Validation")
    check_label_distribution(test_data, "Test")

else:
    print("Cannot split data - no full dataset available")
    train_data = val_data = test_data = pd.DataFrame()

In [None]:
# Combine positive and negative samples
if len(positive_samples) == 0 or len(negative_samples) == 0:
    print("Cannot combine samples - missing positive or negative data")
    full_dataset = pd.DataFrame()
else:
    print("Combining positive and negative samples...")
    print("="*50)

    # Stack positives on top of negatives with a fresh 0..n-1 index
    full_dataset = pd.concat([positive_samples, negative_samples], ignore_index=True)

    print(f"Combined dataset:")
    print(f"  Total samples: {len(full_dataset)}")
    print(f"  Positive samples: {len(positive_samples)} ({len(positive_samples)/len(full_dataset)*100:.1f}%)")
    print(f"  Negative samples: {len(negative_samples)} ({len(negative_samples)/len(full_dataset)*100:.1f}%)")
    print(f"  Shape: {full_dataset.shape}")

    # Row count per value of the 'link_exists' label
    label_counts = full_dataset['link_exists'].value_counts().sort_index()
    print(f"\nLabel distribution:")
    for label, count in label_counts.items():
        if label == 0:
            label_name = "No-link"
        else:
            label_name = "Link exists"
        print(f"  {label} ({label_name}): {count} samples")

    # Null count per column and in total
    missing_counts = full_dataset.isnull().sum()
    total_missing = missing_counts.sum()

    print(f"\nData quality check:")
    print(f"  Total missing values: {total_missing}")
    if total_missing == 0:
        print(f"  ✅ No missing values!")
    else:
        print(f"  Columns with missing values:")
        for col, count in missing_counts[missing_counts > 0].items():
            print(f"    {col}: {count} missing")

In [None]:
# Generate negative samples (no-links)
#
# Draws random (predator, prey) pairs from `valid_species` that are not
# observed interactions and attaches both species' features to each pair.
# Sampling is seeded and works on a sorted species list, each pair is used
# at most once, and ids continue after the largest positive id.
if len(positive_samples) > 0 and len(valid_species) > 1:
    print("Generating negative samples (no-links)...")
    print("="*50)

    # Fixed seed so the sampled negatives are identical on every run
    NEGATIVE_SAMPLING_SEED = 42
    negative_rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

    # Create set of existing interactions for fast lookup
    existing_links = set(zip(positive_samples['predator_scientific_name'],
                             positive_samples['prey_scientific_name']))

    print(f"Existing links to avoid: {len(existing_links)}")

    # Sorted rather than list(set): the iteration order of a set of strings
    # can change between interpreter runs, which would defeat the seed.
    # Assumes all species names are strings - TODO confirm.
    species_list = sorted(valid_species)
    num_species = len(species_list)

    # Generate negative samples (equal number to positive samples)
    num_negative_samples = len(positive_samples)
    negative_samples_data = []
    sampled_pairs = set()  # negative pairs already emitted, to avoid duplicates

    print(f"Generating {num_negative_samples} negative samples...")

    # species name -> {feature name: value}; 'scientific_name' becomes the key
    animals_features_dict = animals_features.set_index('scientific_name').to_dict('index')

    # Positive ids are not contiguous after filtering, so continue from the
    # largest one instead of from len(positive_samples) to keep ids unique.
    next_negative_id = int(positive_samples['id'].max()) + 1

    attempts = 0
    max_attempts = num_negative_samples * 10  # Avoid infinite loop

    while len(negative_samples_data) < num_negative_samples and attempts < max_attempts:
        attempts += 1

        # Randomly sample predator and prey
        predator = species_list[negative_rng.integers(num_species)]
        prey = species_list[negative_rng.integers(num_species)]
        pair = (predator, prey)

        # Skip self-loops, existing links and pairs already sampled
        if predator == prey or pair in existing_links or pair in sampled_pairs:
            continue

        # Skip pairs where either species has no feature row
        if predator not in animals_features_dict or prey not in animals_features_dict:
            continue

        sampled_pairs.add(pair)

        # Create negative sample with features
        negative_sample = {
            'id': next_negative_id + len(negative_samples_data),
            'predator_scientific_name': predator,
            'prey_scientific_name': prey,
            'Y': 0,  # Keep original Y column
            'link_exists': 0  # Label for no-link
        }

        # Add predator features
        predator_features = animals_features_dict[predator]
        for feature_name, feature_value in predator_features.items():
            negative_sample[f'predator_{feature_name}'] = feature_value

        # Add prey features
        prey_features = animals_features_dict[prey]
        for feature_name, feature_value in prey_features.items():
            negative_sample[f'prey_{feature_name}'] = feature_value

        negative_samples_data.append(negative_sample)

    # Create negative samples dataframe
    negative_samples = pd.DataFrame(negative_samples_data)

    print(f"Successfully generated {len(negative_samples)} negative samples")
    print(f"Attempts required: {attempts}")
    print(f"Shape: {negative_samples.shape}")

    if len(negative_samples) < num_negative_samples:
        print(f"Warning: only {len(negative_samples)} of {num_negative_samples} negative samples generated within {max_attempts} attempts")

    # Display sample negative data
    print(f"\nSample negative interactions:")
    sample_cols = ['id', 'predator_scientific_name', 'prey_scientific_name', 'link_exists']
    if len(negative_samples) > 0:
        print(negative_samples[sample_cols].head(3))

else:
    print("Cannot generate negative samples - insufficient positive data")
    negative_samples = pd.DataFrame()

In [None]:
# Prepare positive samples (existing interactions)
if interactions_with_all_features.empty:
    print("Cannot create positive samples - no interaction data available")
    positive_samples = pd.DataFrame()
    valid_species = set()
else:
    print("Preparing positive samples for link prediction...")
    print("="*50)

    # Keep only rows in which no column is missing
    complete_interactions = interactions_with_all_features.dropna()

    print(f"Original interactions: {len(interactions_with_all_features)}")
    print(f"Complete interactions (no missing features): {len(complete_interactions)}")

    # New frame with a constant 'link_exists' = 1 label for existing links
    positive_samples = complete_interactions.assign(link_exists=1)

    print(f"Positive samples created: {len(positive_samples)}")
    print(f"Shape: {positive_samples.shape}")

    # Every species name that occurs on either side of a complete interaction
    valid_species = set(complete_interactions['predator_scientific_name']).union(
        complete_interactions['prey_scientific_name']
    )
    print(f"Valid species for dataset: {len(valid_species)}")

    # Display sample positive data
    print(f"\nSample positive interactions:")
    sample_cols = ['id', 'predator_scientific_name', 'prey_scientific_name', 'link_exists']
    print(positive_samples[sample_cols].head(3))

## Creating GNN Dataset for Link Prediction

Prepare the dataset for training a GNN model to predict new animal interactions. This includes creating positive samples (existing links) and negative samples (no-links), then splitting into training, validation, and test sets.

## Combining Features with Interactions

Merge animal features from animals_transformed.csv with the interactions dataframe for both predator and prey species.

In [9]:
# Location of the transformed per-species feature table
features_file = '../data/animals_transformed.csv'
print(f"Loading animal features: {features_file}")


def _load_animal_features(path):
    """Read the feature CSV at `path` and print a short overview of it.

    Returns the loaded DataFrame, or an empty DataFrame if reading or
    reporting raises. The error is printed instead of re-raised.
    """
    try:
        # Load the transformed animal features
        animals_features = pd.read_csv(path)
        print(f"Successfully loaded animal features with shape: {animals_features.shape}")
        print(f"Columns: {list(animals_features.columns)}")

        # Display basic info
        print(f"\nAnimal features dataframe info:")
        print(f"  Number of animals: {len(animals_features)}")
        print(f"  Columns: {len(animals_features.columns)}")

        # Sample data
        print(f"\nSample animal features:")
        print(animals_features.head(3))
    except Exception as e:
        print(f"Error loading features file: {e}")
        return pd.DataFrame()
    return animals_features


animals_features = _load_animal_features(features_file)

Loading animal features: ../data/animals_transformed.csv
Successfully loaded animal features with shape: (1237, 26)
Columns: ['scientific_name', 'weight_normalized', 'size_normalized', 'life_span_normalized', 'diet_Carnivore', 'diet_Herbivore', 'diet_Insectivore', 'diet_Omnivore', 'habitat_Forest', 'habitat_Grassland', 'habitat_Garden', 'habitat_Urban', 'habitat_Woodland', 'habitat_Shrubland', 'habitat_Wetland', 'habitat_Savanna', 'habitat_Meadow', 'habitat_Gardens', 'habitat_other', 'continent_Europe', 'continent_Asia', 'continent_Africa', 'continent_North_America', 'continent_Central_America', 'continent_South_America', 'continent_Oceania']

Animal features dataframe info:
  Number of animals: 1237
  Columns: 26

Sample animal features:
         scientific_name  weight_normalized  size_normalized  \
0       Thomisus onustus       9.994803e-10         0.000000   
1                Nerodia       1.099450e-05         0.066667   
2  Stagmomantis carolina       3.997999e-08         0.00317

In [10]:
# Merge predator features with interactions dataframe
if animals_features.empty or interactions_df.empty:
    print("Cannot merge predator features - missing data")
    interactions_with_predator_features = pd.DataFrame()
else:
    print("Merging predator features...")
    print("="*40)

    # Left join keeps every interaction, including predators without features
    interactions_with_predator_features = interactions_df.merge(
        animals_features,
        left_on='predator_scientific_name',
        right_on='scientific_name',
        how='left',
        suffixes=('', '_predator')
    )

    # Feature columns are everything in the feature table except the join key
    predator_feature_columns = [col for col in animals_features.columns if col != 'scientific_name']

    # Prefix each feature column present after the merge with 'predator_'
    column_rename_map = {
        col: f'predator_{col}'
        for col in predator_feature_columns
        if col in interactions_with_predator_features.columns
    }

    # Apply the prefixes and drop the join key brought in by the merge
    interactions_with_predator_features = (
        interactions_with_predator_features
        .rename(columns=column_rename_map)
        .drop(columns='scientific_name', errors='ignore')
    )

    print(f"Shape after merging predator features: {interactions_with_predator_features.shape}")
    print(f"New predator feature columns: {[col for col in interactions_with_predator_features.columns if col.startswith('predator_') and col != 'predator_scientific_name']}")

    # Count rows where every predator feature column is present
    renamed_predator_cols = [
        col for col in interactions_with_predator_features.columns
        if col.startswith('predator_') and col != 'predator_scientific_name'
    ]
    predator_features_merged = interactions_with_predator_features[renamed_predator_cols].notna().all(axis=1).sum()

    print(f"Interactions with complete predator features: {predator_features_merged}/{len(interactions_with_predator_features)}")

Merging predator features...
Shape after merging predator features: (2313, 29)
New predator feature columns: ['predator_weight_normalized', 'predator_size_normalized', 'predator_life_span_normalized', 'predator_diet_Carnivore', 'predator_diet_Herbivore', 'predator_diet_Insectivore', 'predator_diet_Omnivore', 'predator_habitat_Forest', 'predator_habitat_Grassland', 'predator_habitat_Garden', 'predator_habitat_Urban', 'predator_habitat_Woodland', 'predator_habitat_Shrubland', 'predator_habitat_Wetland', 'predator_habitat_Savanna', 'predator_habitat_Meadow', 'predator_habitat_Gardens', 'predator_habitat_other', 'predator_continent_Europe', 'predator_continent_Asia', 'predator_continent_Africa', 'predator_continent_North_America', 'predator_continent_Central_America', 'predator_continent_South_America', 'predator_continent_Oceania']
Interactions with complete predator features: 1956/2313


In [11]:
# Merge prey features with interactions dataframe
if animals_features.empty or interactions_with_predator_features.empty:
    print("Cannot merge prey features - missing data")
    interactions_with_all_features = pd.DataFrame()
else:
    print("Merging prey features...")
    print("="*40)

    # Left join keeps every interaction, including prey without features
    interactions_with_all_features = interactions_with_predator_features.merge(
        animals_features,
        left_on='prey_scientific_name',
        right_on='scientific_name',
        how='left',
        suffixes=('', '_prey')
    )

    # Feature columns are everything in the feature table except the join key
    prey_feature_columns = [col for col in animals_features.columns if col != 'scientific_name']

    # Prefix each feature column present after the merge with 'prey_',
    # leaving anything already prefixed 'predator_' untouched
    column_rename_map = {
        col: f'prey_{col}'
        for col in prey_feature_columns
        if col in interactions_with_all_features.columns and not col.startswith('predator_')
    }

    # Apply the prefixes and drop the join key brought in by the merge
    interactions_with_all_features = (
        interactions_with_all_features
        .rename(columns=column_rename_map)
        .drop(columns='scientific_name', errors='ignore')
    )

    print(f"Shape after merging prey features: {interactions_with_all_features.shape}")
    print(f"New prey feature columns: {[col for col in interactions_with_all_features.columns if col.startswith('prey_') and col != 'prey_scientific_name']}")

    # Feature column names per side (the two name columns are excluded)
    predator_feature_cols = [col for col in interactions_with_all_features.columns if col.startswith('predator_') and col != 'predator_scientific_name']
    prey_feature_cols = [col for col in interactions_with_all_features.columns if col.startswith('prey_') and col != 'prey_scientific_name']

    # Count rows where every prey feature column is present
    prey_features_merged = interactions_with_all_features[prey_feature_cols].notna().all(axis=1).sum()

    print(f"Interactions with complete prey features: {prey_features_merged}/{len(interactions_with_all_features)}")

    # Final dataframe summary
    print(f"\nFinal combined dataframe:")
    print(f"  Shape: {interactions_with_all_features.shape}")
    print(f"  Columns: {len(interactions_with_all_features.columns)}")

    print(f"  Predator feature columns: {len(predator_feature_cols)}")
    print(f"  Prey feature columns: {len(prey_feature_cols)}")
    print(f"  Total feature columns: {len(predator_feature_cols) + len(prey_feature_cols)}")

Merging prey features...
Shape after merging prey features: (2313, 54)
New prey feature columns: ['prey_weight_normalized', 'prey_size_normalized', 'prey_life_span_normalized', 'prey_diet_Carnivore', 'prey_diet_Herbivore', 'prey_diet_Insectivore', 'prey_diet_Omnivore', 'prey_habitat_Forest', 'prey_habitat_Grassland', 'prey_habitat_Garden', 'prey_habitat_Urban', 'prey_habitat_Woodland', 'prey_habitat_Shrubland', 'prey_habitat_Wetland', 'prey_habitat_Savanna', 'prey_habitat_Meadow', 'prey_habitat_Gardens', 'prey_habitat_other', 'prey_continent_Europe', 'prey_continent_Asia', 'prey_continent_Africa', 'prey_continent_North_America', 'prey_continent_Central_America', 'prey_continent_South_America', 'prey_continent_Oceania']
Interactions with complete prey features: 1180/2313

Final combined dataframe:
  Shape: (2313, 54)
  Columns: 54
  Predator feature columns: 25
  Prey feature columns: 25
  Total feature columns: 50


In [None]:
# Display sample of the combined dataframe
#
# Prints the first rows, the column groups (basic / predator / prey) and the
# per-column missing-value counts of `interactions_with_all_features`.
if not interactions_with_all_features.empty:
    print("Sample of combined interactions with features:")
    print("="*60)

    # Display first few rows
    print("First 3 interactions with all features:")
    print(interactions_with_all_features.head(3))

    # Group columns by role; the two name columns count as basic columns,
    # not as features, even though they share the predator_/prey_ prefix
    all_columns = list(interactions_with_all_features.columns)
    basic_columns = ['id', 'predator_scientific_name', 'prey_scientific_name', 'Y']
    predator_columns = [col for col in all_columns if col.startswith('predator_') and col != 'predator_scientific_name']
    prey_columns = [col for col in all_columns if col.startswith('prey_') and col != 'prey_scientific_name']

    print(f"\nColumn organization:")
    print(f"  Basic columns ({len(basic_columns)}): {basic_columns}")
    print(f"  Predator features ({len(predator_columns)}): {predator_columns[:5]}{'...' if len(predator_columns) > 5 else ''}")
    print(f"  Prey features ({len(prey_columns)}): {prey_columns[:5]}{'...' if len(prey_columns) > 5 else ''}")

    # Null count per column, restricted to columns that have any
    missing_counts = interactions_with_all_features.isnull().sum()
    columns_with_missing = missing_counts[missing_counts > 0]

    if len(columns_with_missing) > 0:
        print(f"\nColumns with missing values:")
        for col, count in columns_with_missing.items():
            print(f"  {col}: {count} missing values ({count/len(interactions_with_all_features)*100:.1f}%)")
    else:
        print(f"\n✅ No missing values in combined dataframe!")

    print(f"\n✅ Combined dataframe ready for GNN!")
    print(f"   Variable: 'interactions_with_all_features'")
    print(f"   Shape: {interactions_with_all_features.shape}")
    print(f"   Total interactions: {len(interactions_with_all_features)}")

else:
    # A stray '~' after this call made the whole cell a SyntaxError
    print("No combined dataframe to display")

Sample of combined interactions with features:
First 3 interactions with all features:
   id predator_scientific_name prey_scientific_name  Y  \
0   1         Thomisus onustus             Eupeodes  1   
1   2         Thomisus onustus           Anthophila  1   
2   3         Thomisus onustus           Lasiommata  1   

   predator_weight_normalized  predator_size_normalized  \
0                9.994803e-10                       0.0   
1                9.994803e-10                       0.0   
2                9.994803e-10                       0.0   

   predator_life_span_normalized  predator_diet_Carnivore  \
0                       0.009066                      0.0   
1                       0.009066                      0.0   
2                       0.009066                      0.0   

   predator_diet_Herbivore  predator_diet_Insectivore  ...  \
0                      0.0                        1.0  ...   
1                      0.0                        1.0  ...   
2           

In [13]:
# Rich display of the first five rows of the combined interactions/features frame
interactions_with_all_features.head()

Unnamed: 0,id,predator_scientific_name,prey_scientific_name,Y,predator_weight_normalized,predator_size_normalized,predator_life_span_normalized,predator_diet_Carnivore,predator_diet_Herbivore,predator_diet_Insectivore,...,prey_habitat_Meadow,prey_habitat_Gardens,prey_habitat_other,prey_continent_Europe,prey_continent_Asia,prey_continent_Africa,prey_continent_North_America,prey_continent_Central_America,prey_continent_South_America,prey_continent_Oceania
0,1,Thomisus onustus,Eupeodes,1,9.994803e-10,0.0,0.009066,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,2,Thomisus onustus,Anthophila,1,9.994803e-10,0.0,0.009066,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
2,3,Thomisus onustus,Lasiommata,1,9.994803e-10,0.0,0.009066,0.0,0.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
3,4,Thomisus onustus,Cacyreus marshalli,1,9.994803e-10,0.0,0.009066,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,Thomisus onustus,Dialictus,1,9.994803e-10,0.0,0.009066,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
