# GEXF Network to DataFrame Conversion

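This notebook loads the `largest_component.gexf` network file with NetworkX, converts its nodes into a pandas DataFrame of node information, joins that table with the animal features dataset on scientific name, and prepares the merged data as GNN input.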

In [11]:
import ast

import networkx as nx
import numpy as np
import pandas as pd

In [12]:
# Location of the GEXF export that holds the network to analyse
gexf_file = '../data/largest_component.gexf'
print(f"Loading GEXF file: {gexf_file}")

# Parse the file with NetworkX. Any failure (including one raised while
# reporting the node/edge counts) is printed instead of propagated, and G is
# set to None so that cells guarded by `G is not None` skip themselves.
try:
    G = nx.read_gexf(gexf_file)
    print(f"Successfully loaded network with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
except Exception as load_error:
    print(f"Error loading GEXF file: {load_error}")
    G = None

Loading GEXF file: ../data/largest_component.gexf
Successfully loaded network with 1899 nodes and 2313 edges


In [13]:
# Summarise the loaded graph and discover which attributes its nodes carry
if G is not None:
    print("Network Information:")
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")
    print(f"Is directed: {G.is_directed()}")

    # First five (node_id, attribute dict) pairs as a quick sanity check
    sample_nodes = list(G.nodes(data=True))[:5]
    print("\nSample nodes with attributes:")
    for node_id, attributes in sample_nodes:
        # One call emits the ID line, the attribute line and a blank separator
        print(f"Node ID: {node_id}", f"Attributes: {attributes}", "", sep="\n")

    # Union of the attribute keys seen on any node of the graph
    all_attributes = set().union(
        *(node_attributes.keys() for _, node_attributes in G.nodes(data=True))
    )

    print(f"Available node attributes: {sorted(all_attributes)}")

Network Information:
Number of nodes: 1899
Number of edges: 2313
Is directed: True

Sample nodes with attributes:
Node ID: 526
Attributes: {'scientific_name': 'Asimina reticulata', 'common_name': 'netted pawpaw', 'category': 'Plantae', 'label': '526'}

Node ID: 4755
Attributes: {'scientific_name': 'Thomisus onustus', 'common_name': 'Heather crab spider', 'category': 'Arachnida', 'label': '4755'}

Node ID: 3161
Attributes: {'scientific_name': 'Nerodia', 'common_name': 'Watersnakes', 'category': 'Reptilia', 'label': '3161'}

Node ID: 4493
Attributes: {'scientific_name': 'Stagmomantis carolina', 'common_name': 'Carolina Mantis', 'category': 'Insecta', 'label': '4493'}

Node ID: 757
Attributes: {'scientific_name': 'Bridelia micrantha', 'common_name': 'Coastal Goldenleaf', 'category': 'Plantae', 'label': '757'}

Available node attributes: ['category', 'common_name', 'label', 'scientific_name']


In [14]:
# Convert network nodes to DataFrame
def network_to_dataframe(graph):
    """
    Convert NetworkX graph nodes to a pandas DataFrame.

    Parameters
    ----------
    graph : object or None
        Anything whose ``nodes(data=True)`` yields ``(node_id, attributes)``
        pairs where ``attributes`` is a dict, e.g. a NetworkX graph.
        ``None`` is accepted and treated as "no graph available".

    Returns
    -------
    pandas.DataFrame
        One row per node with the columns ``ID``, ``scientific_name``,
        ``common_name`` and ``category`` (in that order). An attribute that
        is absent on a node is stored as ``''``. The same four columns are
        present when ``graph`` is ``None`` or has no nodes, so the result is
        empty but keeps a stable schema.
    """
    attribute_columns = ['scientific_name', 'common_name', 'category']
    columns = ['ID'] + attribute_columns

    if graph is None:
        return pd.DataFrame(columns=columns)

    # One record per node; missing attributes default to the empty string
    node_data = [
        {'ID': node_id, **{col: attributes.get(col, '') for col in attribute_columns}}
        for node_id, attributes in graph.nodes(data=True)
    ]

    # Passing `columns` keeps the schema even when node_data is empty
    return pd.DataFrame(node_data, columns=columns)

# Convert to DataFrame
nodes_df = network_to_dataframe(G)

if not nodes_df.empty:
    print(f"Created DataFrame with {len(nodes_df)} records")
    print(f"Columns: {list(nodes_df.columns)}")
    print(f"\nDataFrame info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    nodes_df.info()
else:
    print("Failed to create DataFrame")

Created DataFrame with 1899 records
Columns: ['ID', 'scientific_name', 'common_name', 'category']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1899 entries, 0 to 1898
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               1899 non-null   object
 1   scientific_name  1899 non-null   object
 2   common_name      1899 non-null   object
 3   category         1899 non-null   object
dtypes: object(4)
memory usage: 59.5+ KB
None


In [15]:
# Preview both ends of the node table
if not nodes_df.empty:
    # sep="\n" puts each heading on its own line above the table it labels
    print("First 10 records:", nodes_df.head(10), sep="\n")
    print("\nLast 5 records:", nodes_df.tail(5), sep="\n")

First 10 records:
     ID          scientific_name                  common_name   category
0   526       Asimina reticulata                netted pawpaw    Plantae
1  4755         Thomisus onustus          Heather crab spider  Arachnida
2  3161                  Nerodia                  Watersnakes   Reptilia
3  4493    Stagmomantis carolina              Carolina Mantis    Insecta
4   757       Bridelia micrantha           Coastal Goldenleaf    Plantae
5  1253       Copsychus saularis        Oriental Magpie-Robin       Aves
6   702          Bombus pratorum             Early Bumble Bee    Insecta
7  4941        Tyrannus tyrannus             Eastern Kingbird       Aves
8  1361   Cucullia scrophulariae                 Water Betony    Insecta
9  2490  Lamprotornis chalybaeus  Greater Blue-eared Starling       Aves

Last 5 records:
        ID        scientific_name               common_name        category
1894  1791            Esox lucius             Northern Pike  Actinopterygii
1895  2940

In [16]:
# Data quality report for the node table: missing values, empty strings,
# category frequencies and duplicate keys
if not nodes_df.empty:
    print("Data Quality Analysis:")
    print("="*40)

    # Null cells per column, with their share of all rows
    print("Missing values:")
    missing_counts = nodes_df.isna().sum()
    for col, count in missing_counts.items():
        print(f"  {col}: {count} ({count/len(nodes_df)*100:.1f}%)")

    # Empty strings are reported separately because they are not null
    print("\nEmpty string values:")
    text_columns = [
        name for name in ('scientific_name', 'common_name', 'category')
        if name in nodes_df.columns
    ]
    for col in text_columns:
        empty_count = nodes_df[col].eq('').sum()
        print(f"  {col}: {empty_count} ({empty_count/len(nodes_df)*100:.1f}%)")

    # Frequency of each category value, when the column is present
    if 'category' in nodes_df.columns and not nodes_df['category'].empty:
        print(f"\nUnique categories ({nodes_df['category'].nunique()}):")
        category_counts = nodes_df['category'].value_counts()
        print(category_counts)

    # Rows whose ID repeats an earlier row's ID
    duplicate_ids = nodes_df['ID'].duplicated().sum()
    print(f"\nDuplicate IDs: {duplicate_ids}")

    # Same check for scientific names
    if 'scientific_name' in nodes_df.columns:
        duplicate_sci_names = nodes_df['scientific_name'].duplicated().sum()
        print(f"Duplicate scientific names: {duplicate_sci_names}")

Data Quality Analysis:
Missing values:
  ID: 0 (0.0%)
  scientific_name: 0 (0.0%)
  common_name: 0 (0.0%)
  category: 0 (0.0%)

Empty string values:
  scientific_name: 0 (0.0%)
  common_name: 0 (0.0%)
  category: 0 (0.0%)

Unique categories (11):
category
Insecta           627
Plantae           428
Aves              319
Arachnida         133
Actinopterygii    107
Mammalia           90
Animalia           70
Reptilia           64
Amphibia           28
Mollusca           23
Fungi              10
Name: count, dtype: int64

Duplicate IDs: 0
Duplicate scientific names: 0


In [7]:
# Clean the DataFrame
if not nodes_df.empty:
    print("Cleaning DataFrame...")

    # Work on a copy so nodes_df keeps the raw values. Empty strings become
    # NA first so they are filled by the same fillna as real missing values.
    nodes_df_clean = nodes_df.copy().replace('', pd.NA)

    # Fill missing text fields with an explicit placeholder
    for text_column in ('scientific_name', 'common_name', 'category'):
        nodes_df_clean[text_column] = nodes_df_clean[text_column].fillna('Unknown')

    # Ensure ID is string type
    nodes_df_clean['ID'] = nodes_df_clean['ID'].astype(str)

    print(f"Cleaned DataFrame shape: {nodes_df_clean.shape}")
    print("\nCleaned data sample:")
    print(nodes_df_clean.head())

    # Final data summary
    print("\nFinal data summary:")
    print(f"Total records: {len(nodes_df_clean)}")
    print(f"Records with 'Unknown' scientific_name: {(nodes_df_clean['scientific_name'] == 'Unknown').sum()}")
    print(f"Records with 'Unknown' common_name: {(nodes_df_clean['common_name'] == 'Unknown').sum()}")
    print(f"Records with 'Unknown' category: {(nodes_df_clean['category'] == 'Unknown').sum()}")
else:
    # The pre-join and join cells test `nodes_df_clean is not None`. Without
    # this binding they would raise NameError when there is nothing to clean.
    nodes_df_clean = None

Cleaning DataFrame...
Cleaned DataFrame shape: (1899, 4)

Cleaned data sample:
     ID        scientific_name          common_name   category
0   526     Asimina reticulata        netted pawpaw    Plantae
1  4755       Thomisus onustus  Heather crab spider  Arachnida
2  3161                Nerodia          Watersnakes   Reptilia
3  4493  Stagmomantis carolina      Carolina Mantis    Insecta
4   757     Bridelia micrantha   Coastal Goldenleaf    Plantae

Final data summary:
Total records: 1899
Records with 'Unknown' scientific_name: 0
Records with 'Unknown' common_name: 0
Records with 'Unknown' category: 0


In [17]:
# Optional: headline statistics for the graph and its best-connected nodes
if G is not None and not nodes_df.empty:
    print("Network Statistics Summary:")
    print("="*40)
    print(f"Total nodes in network: {G.number_of_nodes()}")
    print(f"Total edges in network: {G.number_of_edges()}")
    print(f"Records in DataFrame: {len(nodes_df)}")
    print(f"Network density: {nx.density(G):.4f}")

    # Materialise the degree view once; it feeds both the summary figures
    # and the ranking below
    degree_dict = dict(G.degree())
    degrees = list(degree_dict.values())
    print(f"\nNode degree statistics:")
    print(f"  Average degree: {np.mean(degrees):.2f}")
    print(f"  Max degree: {max(degrees)}")
    print(f"  Min degree: {min(degrees)}")

    # Highest degree first
    sorted_degrees = sorted(degree_dict.items(), key=lambda pair: pair[1], reverse=True)

    print(f"\nTop 5 most connected nodes:")
    for i, (node_id, degree) in enumerate(sorted_degrees[:5], 1):
        # Look the node up in the node table to print names instead of the ID
        node_info = nodes_df.loc[nodes_df['ID'] == node_id]
        if node_info.empty:
            print(f"  {i}. Node {node_id} - Degree: {degree}")
            continue
        sci_name = node_info.iloc[0]['scientific_name']
        common_name = node_info.iloc[0]['common_name']
        print(f"  {i}. {sci_name} ({common_name}) - Degree: {degree}")

Network Statistics Summary:
Total nodes in network: 1899
Total edges in network: 2313
Records in DataFrame: 1899
Network density: 0.0006

Node degree statistics:
  Average degree: 2.44
  Max degree: 70
  Min degree: 1

Top 5 most connected nodes:
  1. Lepidoptera (Butterflies and Moths) - Degree: 70
  2. Apis mellifera (Western Honey Bee) - Degree: 51
  3. Ardea herodias (Great Blue Heron) - Degree: 46
  4. Pterygota (Winged and Once-winged Insects) - Degree: 32
  5. Diptera (Flies) - Degree: 30


In [18]:
# Load the animal features dataset for joining
print("Loading animal features dataset...")

animal_features_file = '../data/combined_animal_features_final.csv'


def parse_list_cell(value):
    """
    Turn a CSV cell holding the text form of a list back into a list.

    Parameters
    ----------
    value : object
        A cell value. Only strings that start with ``'['`` are parsed.

    Returns
    -------
    object
        The parsed Python literal for bracketed strings; any other value is
        returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        If a bracketed string is not a valid Python literal.
    """
    if isinstance(value, str) and value.startswith('['):
        # literal_eval accepts Python literals only, so text coming from the
        # CSV cannot execute as code (which eval() would allow).
        return ast.literal_eval(value)
    return value


try:
    animal_features_df = pd.read_csv(animal_features_file)
    print(f"Successfully loaded animal features dataset with {len(animal_features_df)} records")
    print(f"Columns: {list(animal_features_df.columns)}")

    # Convert habitat and continent columns back to lists if they're strings
    for list_column in ('habitat', 'continent'):
        if list_column in animal_features_df.columns:
            animal_features_df[list_column] = animal_features_df[list_column].apply(parse_list_cell)

    print(f"\nAnimal features data sample:")
    print(animal_features_df[['scientific_name', 'common_name', 'diet', 'weight', 'size']].head())

except Exception as e:
    # Best-effort load: report the problem and leave a None sentinel that the
    # join cells check before using the dataset.
    print(f"Error loading animal features dataset: {e}")
    animal_features_df = None

Loading animal features dataset...
Successfully loaded animal features dataset with 3540 records
Columns: ['scientific_name', 'common_name', 'weight', 'size', 'diet', 'life_span', 'habitat', 'continent']

Animal features data sample:
            scientific_name              common_name       diet  weight  size
0  Abantennarius sanguineus          Bloody frogfish  Carnivore    50.0  10.0
1         Abantis paradisea         Paradise Skipper  Herbivore     1.0   3.0
2       Abbottina rivularis    Chinese false gudgeon   Omnivore    20.0  10.0
3     Abisares viridipennis  African darkling beetle  Herbivore     5.0   1.0
4             Abramis brama                    Bream   Omnivore  2000.0  50.0


In [19]:
# Compare the scientific names of both datasets before joining them
if nodes_df_clean is None or animal_features_df is None:
    print("Cannot perform analysis - one or both datasets are not available")
else:
    print("Pre-join Analysis:")
    print("="*50)

    # Size of each input
    print(f"Network nodes dataset: {len(nodes_df_clean)} records")
    print(f"Animal features dataset: {len(animal_features_df)} records")

    # Network side: ignore nulls and the 'Unknown' placeholder
    print(f"\nNetwork nodes - scientific_name analysis:")
    network_sci_names = (
        nodes_df_clean['scientific_name']
        .dropna()
        .loc[lambda names: names != 'Unknown']
    )
    print(f"  Valid scientific names: {len(network_sci_names)}")
    print(f"  Unique scientific names: {len(network_sci_names.unique())}")
    print(f"  Sample names: {list(network_sci_names.head())}")

    # Features side: only nulls are excluded
    print(f"\nAnimal features - scientific_name analysis:")
    features_sci_names = animal_features_df['scientific_name'].dropna()
    print(f"  Valid scientific names: {len(features_sci_names)}")
    print(f"  Unique scientific names: {len(features_sci_names.unique())}")
    print(f"  Sample names: {list(features_sci_names.head())}")

    # Names present in both datasets after trimming and lower-casing
    network_names_set = set(network_sci_names.str.strip().str.lower())
    features_names_set = set(features_sci_names.str.strip().str.lower())

    potential_matches = network_names_set.intersection(features_names_set)
    print(f"\nPotential matches (case-insensitive): {len(potential_matches)}")

    if not potential_matches:
        print("No obvious matches found. Checking for partial matches...")
        # Print examples from each side so they can be compared by eye
        print(f"\nNetwork scientific names sample: {list(network_sci_names.head(10))}")
        print(f"Features scientific names sample: {list(features_sci_names.head(10))}")
    else:
        print(f"Sample potential matches: {list(potential_matches)[:10]}")

Pre-join Analysis:
Network nodes dataset: 1899 records
Animal features dataset: 3540 records

Network nodes - scientific_name analysis:
  Valid scientific names: 1899
  Unique scientific names: 1899
  Sample names: ['Asimina reticulata', 'Thomisus onustus', 'Nerodia', 'Stagmomantis carolina', 'Bridelia micrantha']

Animal features - scientific_name analysis:
  Valid scientific names: 3540
  Unique scientific names: 3540
  Sample names: ['Abantennarius sanguineus', 'Abantis paradisea', 'Abbottina rivularis', 'Abisares viridipennis', 'Abramis brama']

Potential matches (case-insensitive): 1254
Sample potential matches: ['lycosidae', 'orphulella pelidna', 'pugettia producta', 'celastrina argiolus', 'taurulus bubalis', 'baeolophus bicolor', 'agulla', 'egretta garzetta', 'noctuoidea', 'tyto furcata']


In [20]:
# Inner-join the network nodes with the animal features on scientific name
if nodes_df_clean is None or animal_features_df is None:
    print("Cannot perform join - one or both datasets are not available")
    merged_df = pd.DataFrame()
else:
    print("Performing inner join on scientific_name...")
    print("="*50)

    # Trimmed, lower-cased join key, stored as a column on both frames
    for frame in (nodes_df_clean, animal_features_df):
        frame['scientific_name_clean'] = frame['scientific_name'].str.strip().str.lower()

    # Keep only network rows with a usable scientific name
    network_names = nodes_df_clean['scientific_name']
    has_usable_name = (
        (network_names != 'Unknown') &
        (network_names.notna()) &
        (network_names != '')
    )
    nodes_for_join = nodes_df_clean[has_usable_name].copy()

    print(f"Network nodes available for joining: {len(nodes_for_join)}")
    print(f"Animal features records: {len(animal_features_df)}")

    # Columns present in both inputs get the _network / _features suffixes
    merged_df = nodes_for_join.merge(
        animal_features_df,
        on='scientific_name_clean',
        how='inner',
        suffixes=('_network', '_features')
    )

    # The join key is not needed in the merged result
    merged_df = merged_df.drop(columns='scientific_name_clean')

    print(f"\nInner join completed!")
    print(f"Merged dataset shape: {merged_df.shape}")
    print(f"Records successfully joined: {len(merged_df)}")

    if len(merged_df) > 0:
        # List the merged columns with a 1-based position
        print(f"\nMerged dataset columns:")
        for i, col in enumerate(merged_df.columns, 1):
            print(f"  {i:2d}. {col}")

        # For the name columns that exist on both sides, keep the features
        # version under the plain name and drop both suffixed columns
        for base_name in ('scientific_name', 'common_name'):
            network_col = base_name + '_network'
            features_col = base_name + '_features'
            if network_col in merged_df.columns and features_col in merged_df.columns:
                merged_df[base_name] = merged_df[features_col]
                merged_df = merged_df.drop([network_col, features_col], axis=1)

        print(f"\nFinal merged dataset shape: {merged_df.shape}")
        print(f"Final columns: {list(merged_df.columns)}")

    else:
        print("\nWarning: No matches found between datasets!")
        merged_df = pd.DataFrame()

Performing inner join on scientific_name...
Network nodes available for joining: 1899
Animal features records: 3540

Inner join completed!
Merged dataset shape: (1254, 12)
Records successfully joined: 1254

Merged dataset columns:
   1. ID
   2. scientific_name_network
   3. common_name_network
   4. category
   5. scientific_name_features
   6. common_name_features
   7. weight
   8. size
   9. diet
  10. life_span
  11. habitat
  12. continent

Final merged dataset shape: (1254, 10)
Final columns: ['ID', 'category', 'weight', 'size', 'diet', 'life_span', 'habitat', 'continent', 'scientific_name', 'common_name']


In [21]:
# Describe the merged dataset: shape, dtypes, sample rows, missing values,
# value distributions and join coverage
if merged_df.empty:
    print("‚ùå Merged dataset is empty - no matches found between network and features data")
else:
    print("Merged Dataset Analysis:")
    print("="*50)

    print(f"Final merged dataset info:")
    print(f"  Shape: {merged_df.shape}")
    print(f"  Records: {len(merged_df)}")
    print(f"  Columns: {len(merged_df.columns)}")

    print(f"\nData types:")
    print(merged_df.dtypes)

    # Show only the preferred columns that actually exist in the merge result
    print(f"\nSample of merged data:")
    display_columns = ['ID', 'scientific_name', 'common_name', 'category', 'diet', 'weight', 'size', 'life_span']
    available_columns = [name for name in display_columns if name in merged_df.columns]
    print(merged_df[available_columns].head())

    # Report only the columns that contain at least one null
    print(f"\nMissing values in merged dataset:")
    missing_values = merged_df.isnull().sum()
    for col, count in missing_values[missing_values > 0].items():
        print(f"  {col}: {count} ({count/len(merged_df)*100:.1f}%)")

    # Value frequencies for the two categorical columns, diet first
    for column_name, heading in (
        ('diet', f"\nDiet distribution in merged dataset:"),
        ('category', f"\nCategory distribution in merged dataset:"),
    ):
        if column_name in merged_df.columns:
            print(heading)
            print(merged_df[column_name].value_counts())

    # Share of each input that made it into the merged result; the name
    # checks skip a figure when the corresponding input was never created
    print(f"\nDataset join statistics:")
    if 'nodes_for_join' in locals():
        coverage_network = len(merged_df) / len(nodes_for_join) * 100
        print(f"  Network nodes with features: {len(merged_df)}/{len(nodes_for_join)} ({coverage_network:.1f}%)")

    if 'animal_features_df' in locals():
        coverage_features = len(merged_df) / len(animal_features_df) * 100
        print(f"  Feature records in network: {len(merged_df)}/{len(animal_features_df)} ({coverage_features:.1f}%)")

    print(f"\n‚úÖ Merged dataset 'merged_df' is ready for use!")
    print(f"   This dataset combines network topology with animal features.")
    print(f"   Use this dataset for further analysis and visualization.")

Merged Dataset Analysis:
Final merged dataset info:
  Shape: (1254, 10)
  Records: 1254
  Columns: 10

Data types:
ID                  object
category            object
weight             float64
size               float64
diet                object
life_span          float64
habitat             object
continent           object
scientific_name     object
common_name         object
dtype: object

Sample of merged data:
     ID        scientific_name                          common_name  \
0  4755       Thomisus onustus                   Flower Crab Spider   
1  3161                Nerodia  North American Water Snakes (Genus)   
2  4493  Stagmomantis carolina                      Carolina Mantis   
3  1253     Copsychus saularis                Oriental Magpie-Robin   
4   702        Bombus pratorum                      Early Bumblebee   

    category         diet   weight   size  life_span  
0  Arachnida  Insectivore    0.050    0.0       1.00  
1   Reptilia    Carnivore  550.000  105.

In [22]:
# Optional: network statistics again, now with merged-data coverage added
if G is not None and not nodes_df.empty:
    print("Network Statistics Summary:")
    print("="*40)
    print(f"Total nodes in network: {G.number_of_nodes()}")
    print(f"Total edges in network: {G.number_of_edges()}")
    print(f"Network density: {nx.density(G):.4f}")

    # Materialise the degree view once for the summary and the rankings
    degree_dict = dict(G.degree())
    degrees = list(degree_dict.values())
    print(f"\nNode degree statistics:")
    print(f"  Average degree: {np.mean(degrees):.2f}")
    print(f"  Max degree: {max(degrees)}")
    print(f"  Min degree: {min(degrees)}")

    # Highest degree first
    sorted_degrees = sorted(degree_dict.items(), key=lambda pair: pair[1], reverse=True)

    print(f"\nTop 5 most connected nodes:")
    for i, (node_id, degree) in enumerate(sorted_degrees[:5], 1):
        # Resolve the node ID to its names via the node table
        node_info = nodes_df.loc[nodes_df['ID'] == node_id]
        if node_info.empty:
            print(f"  {i}. Node {node_id} - Degree: {degree}")
            continue
        sci_name = node_info.iloc[0]['scientific_name']
        common_name = node_info.iloc[0]['common_name']
        print(f"  {i}. {sci_name} ({common_name}) - Degree: {degree}")

    # Coverage section runs only when the join cell produced a non-empty result
    if 'merged_df' in locals() and not merged_df.empty:
        print(f"\nMerged data coverage:")
        print(f"  Nodes with both network and features data: {len(merged_df)}")
        print(f"  Coverage of total network: {len(merged_df)/G.number_of_nodes()*100:.1f}%")

        # IDs are compared as strings on both sides
        merged_node_ids = set(merged_df['ID'].astype(str))

        # First five nodes, in descending degree order, that have feature rows
        connected_with_features = [
            (candidate_id, candidate_degree)
            for candidate_id, candidate_degree in sorted_degrees
            if str(candidate_id) in merged_node_ids
        ][:5]

        print(f"\nTop 5 most connected nodes WITH animal features:")
        for i, (node_id, degree) in enumerate(connected_with_features, 1):
            node_info = merged_df.loc[merged_df['ID'] == str(node_id)]
            if node_info.empty:
                continue
            first_row = node_info.iloc[0]
            sci_name = first_row['scientific_name']
            common_name = first_row['common_name']
            diet = first_row.get('diet', 'Unknown')
            print(f"  {i}. {sci_name} ({common_name}) - Degree: {degree}, Diet: {diet}")

    print(f"\nüéØ Network analysis complete! You now have:")
    print(f"   - Network topology data (nodes_df_clean)")
    print(f"   - Animal features data (animal_features_df)")
    print(f"   - Combined data (merged_df) for network + features analysis")

Network Statistics Summary:
Total nodes in network: 1899
Total edges in network: 2313
Network density: 0.0006

Node degree statistics:
  Average degree: 2.44
  Max degree: 70
  Min degree: 1

Top 5 most connected nodes:
  1. Lepidoptera (Butterflies and Moths) - Degree: 70
  2. Apis mellifera (Western Honey Bee) - Degree: 51
  3. Ardea herodias (Great Blue Heron) - Degree: 46
  4. Pterygota (Winged and Once-winged Insects) - Degree: 32
  5. Diptera (Flies) - Degree: 30

Merged data coverage:
  Nodes with both network and features data: 1254
  Coverage of total network: 66.0%

Top 5 most connected nodes WITH animal features:
  1. Lepidoptera (Butterflies and Moths) - Degree: 70, Diet: Herbivore
  2. Apis mellifera (European honey bee) - Degree: 51, Diet: Herbivore
  3. Ardea herodias (Great Blue Heron) - Degree: 46, Diet: Carnivore
  4. Pterygota (Winged Insects) - Degree: 32, Diet: Omnivore
  5. Halcyon albiventris albiventris (Brown-hooded Kingfisher) - Degree: 22, Diet: Carnivore

ü

# GNN Input Preparation

Transform the merged dataset into a GNN-ready format with proper feature engineering and encoding.

In [36]:
# Build the working frame (gnn_df) for GNN feature engineering from merged_df.
import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

if merged_df.empty:
    # Nothing to process: fall back to an empty frame so the later cells skip their work.
    print("‚ùå No merged dataset available for GNN processing")
    gnn_df = pd.DataFrame()
else:
    print("Preparing GNN input features...")
    print("=" * 60)

    # Work on a copy so merged_df itself is left untouched.
    gnn_df = merged_df.copy()

    # Columns the GNN feature pipeline draws on.
    required_columns = ['scientific_name', 'weight', 'size', 'diet', 'life_span', 'habitat', 'continent']

    print(f"Starting with {len(gnn_df)} records")
    print(f"Required columns: {required_columns}")

    # Report the missing-value count (and share) of every required column that is present.
    print(f"\nData quality check:")
    print(f"Missing values per column:")
    total_rows = len(gnn_df)
    for col in (name for name in required_columns if name in gnn_df.columns):
        missing_count = gnn_df[col].isna().sum()
        missing_pct = missing_count / total_rows * 100
        print(f"  {col}: {missing_count} ({missing_pct:.1f}%)")

    # Rows lacking any of the three numerical values cannot be normalized, so drop them.
    print(f"\nRemoving records with missing critical values...")
    original_count = len(gnn_df)
    gnn_df = gnn_df.dropna(subset=['weight', 'size', 'life_span'])
    removed_count = original_count - len(gnn_df)
    print(f"Records after removing missing numerical values: {len(gnn_df)} (removed {removed_count})")

    # Units are only reported here, not checked; the messages refer to the JSON
    # processing notebook as the source of that confirmation.
    print(f"\nData units verification:")
    for label, unit in (('Weight', 'grams'), ('Size', 'centimeters'), ('Life span', 'years')):
        print(f"  {label}: {unit} (confirmed from JSON processing)")

    # Raw value ranges, shown before any scaling is applied.
    weights = gnn_df['weight']
    sizes = gnn_df['size']
    life_spans = gnn_df['life_span']
    print(f"\nNumerical data ranges before normalization:")
    print(f"  Weight: {weights.min():.2e} - {weights.max():.2e} grams")
    print(f"  Size: {sizes.min():.1f} - {sizes.max():.1f} cm")
    print(f"  Life span: {life_spans.min():.3f} - {life_spans.max():.1f} years")

Preparing GNN input features...
Starting with 1254 records
Required columns: ['scientific_name', 'weight', 'size', 'diet', 'life_span', 'habitat', 'continent']

Data quality check:
Missing values per column:
  scientific_name: 0 (0.0%)
  weight: 1 (0.1%)
  size: 0 (0.0%)
  diet: 0 (0.0%)
  life_span: 16 (1.3%)
  habitat: 0 (0.0%)
  continent: 0 (0.0%)

Removing records with missing critical values...
Records after removing missing numerical values: 1237 (removed 17)

Data units verification:
  Weight: grams (confirmed from JSON processing)
  Size: centimeters (confirmed from JSON processing)
  Life span: years (confirmed from JSON processing)

Numerical data ranges before normalization:
  Weight: 1.00e-06 - 5.00e+07 grams
  Size: 0.0 - 1575.0 cm
  Life span: 0.003 - 110.0 years


In [37]:
# Min-max scale the numerical features (weight, size, life_span) into [0, 1].
if gnn_df.empty:
    print("‚ùå No data available for normalization")
else:
    print("Normalizing numerical features to [0,1]...")
    print("=" * 50)

    # One independent scaler per feature so each can be inverted on its own later.
    weight_scaler = MinMaxScaler()
    size_scaler = MinMaxScaler()
    lifespan_scaler = MinMaxScaler()

    # NOTE(review): the recorded ranges show weight spanning roughly 1e-06 to 5e+07 grams,
    # so linear min-max scaling maps most rows to values very close to 0 (see the sample
    # comparison output). Consider a log transform before scaling - confirm with the
    # modelling requirements before changing.
    for raw_col, scaler in (
        ('weight', weight_scaler),
        ('size', size_scaler),
        ('life_span', lifespan_scaler),
    ):
        # fit_transform expects a 2-D input and returns an (n, 1) array; flatten it to a column.
        scaled_values = scaler.fit_transform(gnn_df[[raw_col]])
        gnn_df[f'{raw_col}_normalized'] = scaled_values.flatten()

    print(f"Normalization completed!")
    print(f"\nNormalized ranges (should all be 0.0 - 1.0):")
    for label, normalized_col in (
        ('Weight', 'weight_normalized'),
        ('Size', 'size_normalized'),
        ('Life span', 'life_span_normalized'),
    ):
        normalized = gnn_df[normalized_col]
        print(f"  {label}: {normalized.min():.3f} - {normalized.max():.3f}")

    # Keep the fitted scalers around for a potential inverse transform later.
    scalers = {
        'weight_scaler': weight_scaler,
        'size_scaler': size_scaler,
        'lifespan_scaler': lifespan_scaler,
    }

    print(f"\n‚úÖ Scalers stored for potential inverse transformation")

    # Side-by-side view: each raw column followed by its normalized counterpart.
    print(f"\nSample comparison (original vs normalized):")
    comparison_cols = [
        column
        for raw_col in ('weight', 'size', 'life_span')
        for column in (raw_col, f'{raw_col}_normalized')
    ]
    print(gnn_df[comparison_cols].head())

Normalizing numerical features to [0,1]...
Normalization completed!

Normalized ranges (should all be 0.0 - 1.0):
  Weight: 0.000 - 1.000
  Size: 0.000 - 1.000
  Life span: 0.000 - 1.000

‚úÖ Scalers stored for potential inverse transformation

Sample comparison (original vs normalized):
    weight  weight_normalized   size  size_normalized  life_span  \
0    0.050       9.994803e-10    0.0         0.000000       1.00   
1  550.000       1.099450e-05  105.0         0.066667       7.50   
2    2.000       3.997999e-08    5.0         0.003175       0.75   
3   35.000       6.996502e-07   21.0         0.013333      11.50   
4    0.175       3.498231e-09    1.0         0.000635       1.00   

   life_span_normalized  
0              0.009066  
1              0.068159  
2              0.006793  
3              0.104523  
4              0.009066  


In [38]:
# One-hot encode diet features
#
# Adds one integer 0/1 column per diet category (prefix 'diet_') to gnn_df.
# The cell is safe to re-run: columns from a previous run are replaced, not duplicated.
if not gnn_df.empty:
    print("One-hot encoding diet features...")
    print("="*50)

    # Check diet distribution
    print(f"Diet distribution:")
    diet_counts = gnn_df['diet'].value_counts()
    print(diet_counts)

    # Create one-hot encoding for diet (dtype=int yields 0/1 instead of booleans)
    diet_dummies = pd.get_dummies(gnn_df['diet'], prefix='diet', dtype=int)

    print(f"\nDiet one-hot encoded columns:")
    for col in diet_dummies.columns:
        print(f"  - {col}: {diet_dummies[col].sum()} records")

    # Verify data types are integers
    print(f"\nDiet feature data types:")
    for col in diet_dummies.columns:
        print(f"  - {col}: {diet_dummies[col].dtype}")

    # Add diet one-hot columns to the dataframe.
    # Drop any same-named columns first: without this, re-running the cell appends a
    # second copy of every diet_* column, and the prefix-based column selection in the
    # final GNN cell would then pick up the duplicates.
    gnn_df = pd.concat(
        [gnn_df.drop(columns=diet_dummies.columns, errors='ignore'), diet_dummies],
        axis=1,
    )

    print(f"\n‚úÖ Diet one-hot encoding completed!")
    print(f"Added {len(diet_dummies.columns)} diet feature columns (as integers)")

    # Show sample values to confirm they're 0s and 1s
    print(f"\nSample diet encoding values:")
    print(diet_dummies.head())

else:
    print("‚ùå No data available for diet encoding")

One-hot encoding diet features...
Diet distribution:
diet
Herbivore      408
Carnivore      299
Omnivore       299
Insectivore    231
Name: count, dtype: int64

Diet one-hot encoded columns:
  - diet_Carnivore: 299 records
  - diet_Herbivore: 408 records
  - diet_Insectivore: 231 records
  - diet_Omnivore: 299 records

Diet feature data types:
  - diet_Carnivore: int64
  - diet_Herbivore: int64
  - diet_Insectivore: int64
  - diet_Omnivore: int64

‚úÖ Diet one-hot encoding completed!
Added 4 diet feature columns (as integers)

Sample diet encoding values:
   diet_Carnivore  diet_Herbivore  diet_Insectivore  diet_Omnivore
0               0               0                 1              0
1               1               0                 0              0
2               0               0                 1              0
3               0               0                 1              0
4               0               1                 0              0


In [39]:
# Process habitat features - one-hot encode top 10 habitats + "other"
#
# Each 'habitat' value is treated as a list of habitat names; values that are not
# lists are ignored when counting and are encoded as all zeros.
# The cell is safe to re-run: columns from a previous run are replaced, not duplicated.
if not gnn_df.empty:
    print("Processing habitat features for one-hot encoding...")
    print("="*60)

    # Count all habitat occurrences across the list-valued rows
    habitat_counts = {}
    for habitats in gnn_df['habitat']:
        if isinstance(habitats, list):
            for habitat in habitats:
                habitat_counts[habitat] = habitat_counts.get(habitat, 0) + 1

    # Sort habitats by frequency (ties keep first-seen order) and split off the top 10
    sorted_habitats = sorted(habitat_counts.items(), key=lambda x: x[1], reverse=True)
    top_10_habitats = [habitat for habitat, count in sorted_habitats[:10]]
    remaining_habitats = sorted_habitats[10:]

    # NOTE(review): the recorded output lists both 'Garden' and 'Gardens' in the top 10,
    # so near-duplicate labels occupy separate feature columns; consider normalising
    # habitat names upstream.
    print(f"Total unique habitats: {len(habitat_counts)}")
    print(f"\nTop 10 most frequent habitats:")
    for i, (habitat, count) in enumerate(sorted_habitats[:10], 1):
        print(f"  {i:2d}. {habitat}: {count} records")

    # Derived from the slice so the count cannot go negative when fewer than
    # 10 distinct habitats exist.
    print(f"\nRemaining habitats: {len(remaining_habitats)}")
    other_count = sum(count for habitat, count in remaining_habitats)
    print(f"Total 'other' habitat occurrences: {other_count}")

    # Create habitat encoding function
    def encode_habitat_features(habitat_list):
        """Encode one row's habitats as 0/1 flags for the top 10 habitats plus 'other'.

        Args:
            habitat_list: The row's 'habitat' value. Only list values are encoded;
                anything else produces all zeros.

        Returns:
            An integer pd.Series indexed by 'habitat_<name>' for each top-10 habitat
            followed by 'habitat_other', which is 1 when the list contains at least
            one habitat outside the top 10.
        """
        features = {f'habitat_{habitat}': 0 for habitat in top_10_habitats}
        features['habitat_other'] = 0

        if isinstance(habitat_list, list):
            for habitat in habitat_list:
                if habitat in top_10_habitats:
                    features[f'habitat_{habitat}'] = 1
                else:
                    features['habitat_other'] = 1

        return pd.Series(features, dtype=int)  # Ensure integer type

    # Apply habitat encoding (one Series per row -> DataFrame of flags)
    habitat_features = gnn_df['habitat'].apply(encode_habitat_features)

    print(f"\nHabitat feature encoding summary:")
    for col in habitat_features.columns:
        count = habitat_features[col].sum()
        print(f"  - {col}: {count} records")

    # Verify data types are integers
    print(f"\nHabitat feature data types:")
    for col in habitat_features.columns:
        print(f"  - {col}: {habitat_features[col].dtype}")

    # Add habitat features to the dataframe.
    # Drop any same-named columns first: without this, re-running the cell appends a
    # second copy of every habitat_* column, and the prefix-based column selection in
    # the final GNN cell would then pick up the duplicates.
    gnn_df = pd.concat(
        [gnn_df.drop(columns=habitat_features.columns, errors='ignore'), habitat_features],
        axis=1,
    )

    print(f"\n‚úÖ Habitat one-hot encoding completed!")
    print(f"Added {len(habitat_features.columns)} habitat feature columns (10 top + 1 other, as integers)")

    # Show sample values to confirm they're 0s and 1s
    print(f"\nSample habitat encoding values:")
    print(habitat_features.head())

else:
    print("‚ùå No data available for habitat encoding")

Processing habitat features for one-hot encoding...
Total unique habitats: 311

Top 10 most frequent habitats:
   1. Forest: 610 records
   2. Grassland: 400 records
   3. Garden: 369 records
   4. Urban: 345 records
   5. Woodland: 326 records
   6. Shrubland: 230 records
   7. Wetland: 142 records
   8. Savanna: 128 records
   9. Meadow: 121 records
  10. Gardens: 115 records

Remaining habitats: 301
Total 'other' habitat occurrences: 3023

Habitat feature encoding summary:
  - habitat_Forest: 610 records
  - habitat_Grassland: 400 records
  - habitat_Garden: 369 records
  - habitat_Urban: 345 records
  - habitat_Woodland: 326 records
  - habitat_Shrubland: 230 records
  - habitat_Wetland: 142 records
  - habitat_Savanna: 128 records
  - habitat_Meadow: 121 records
  - habitat_Gardens: 115 records
  - habitat_other: 1072 records

Habitat feature data types:
  - habitat_Forest: int64
  - habitat_Grassland: int64
  - habitat_Garden: int64
  - habitat_Urban: int64
  - habitat_Woodland: 

In [40]:
# One-hot encode continent features
#
# Each 'continent' value is treated as a list of continent names; values that are not
# lists are ignored when counting and are encoded as all zeros.
# The cell is safe to re-run: columns from a previous run are replaced, not duplicated.
if not gnn_df.empty:
    print("One-hot encoding continent features...")
    print("="*50)

    # Count continent occurrences across the list-valued rows
    continent_counts = {}
    for continents in gnn_df['continent']:
        if isinstance(continents, list):
            for continent in continents:
                continent_counts[continent] = continent_counts.get(continent, 0) + 1

    print(f"Continent distribution:")
    sorted_continents = sorted(continent_counts.items(), key=lambda x: x[1], reverse=True)
    for continent, count in sorted_continents:
        print(f"  - {continent}: {count} records")

    # Get all unique continents for encoding (first-seen order defines column order)
    all_continents = list(continent_counts.keys())

    # Column name for every continent, computed once instead of on every row;
    # spaces become underscores (e.g. 'North America' -> 'continent_North_America').
    continent_column_names = {
        continent: f'continent_{continent.replace(" ", "_")}' for continent in all_continents
    }

    # Create continent encoding function
    def encode_continent_features(continent_list):
        """Encode one row's continents as 0/1 flags, one per known continent.

        Args:
            continent_list: The row's 'continent' value. Only list values are encoded;
                anything else produces all zeros.

        Returns:
            An integer pd.Series indexed by 'continent_<Name>' (spaces replaced with
            underscores) for every continent seen in the data.
        """
        features = dict.fromkeys(continent_column_names.values(), 0)

        if isinstance(continent_list, list):
            for continent in continent_list:
                column = continent_column_names.get(continent)
                if column is not None:
                    features[column] = 1

        return pd.Series(features, dtype=int)  # Ensure integer type

    # Apply continent encoding (one Series per row -> DataFrame of flags)
    continent_features = gnn_df['continent'].apply(encode_continent_features)

    print(f"\nContinent feature encoding summary:")
    for col in continent_features.columns:
        count = continent_features[col].sum()
        print(f"  - {col}: {count} records")

    # Verify data types are integers
    print(f"\nContinent feature data types:")
    for col in continent_features.columns:
        print(f"  - {col}: {continent_features[col].dtype}")

    # Add continent features to the dataframe.
    # Drop any same-named columns first: without this, re-running the cell appends a
    # second copy of every continent_* column, and the prefix-based column selection in
    # the final GNN cell would then pick up the duplicates.
    gnn_df = pd.concat(
        [gnn_df.drop(columns=continent_features.columns, errors='ignore'), continent_features],
        axis=1,
    )

    print(f"\n‚úÖ Continent one-hot encoding completed!")
    print(f"Added {len(continent_features.columns)} continent feature columns (as integers)")

    # Show sample values to confirm they're 0s and 1s
    print(f"\nSample continent encoding values:")
    print(continent_features.head())

else:
    print("‚ùå No data available for continent encoding")

One-hot encoding continent features...
Continent distribution:
  - North America: 791 records
  - Africa: 595 records
  - Asia: 537 records
  - Europe: 471 records
  - South America: 310 records
  - Oceania: 276 records
  - Central America: 161 records

Continent feature encoding summary:
  - continent_Europe: 471 records
  - continent_Asia: 537 records
  - continent_Africa: 595 records
  - continent_North_America: 791 records
  - continent_Central_America: 161 records
  - continent_South_America: 310 records
  - continent_Oceania: 276 records

Continent feature data types:
  - continent_Europe: int64
  - continent_Asia: int64
  - continent_Africa: int64
  - continent_North_America: int64
  - continent_Central_America: int64
  - continent_South_America: int64
  - continent_Oceania: int64

‚úÖ Continent one-hot encoding completed!
Added 7 continent feature columns (as integers)

Sample continent encoding values:
   continent_Europe  continent_Asia  continent_Africa  \
0                 

In [41]:
# Assemble the final GNN input frame: identifier column + all engineered feature columns.
if gnn_df.empty:
    print("‚ùå Cannot create GNN input - no processed data available")
    gnn_input_df = pd.DataFrame()
else:
    print("Creating final GNN input dataframe...")
    print("=" * 60)

    # Identifier carried alongside the features.
    # NOTE(review): only scientific_name is kept; the node 'ID' column that earlier cells
    # use to look up rows in merged_df is not carried over - confirm the downstream join key.
    base_columns = ['scientific_name']

    # Normalized numerical features
    numerical_columns = [f'{name}_normalized' for name in ('weight', 'size', 'life_span')]

    # One-hot encoded categorical features, picked up by their column-name prefix
    diet_columns, habitat_columns, continent_columns = (
        [col for col in gnn_df.columns if col.startswith(prefix)]
        for prefix in ('diet_', 'habitat_', 'continent_')
    )

    # Feature order: numerical, diet, habitat, continent
    feature_columns = [*numerical_columns, *diet_columns, *habitat_columns, *continent_columns]
    final_columns = [*base_columns, *feature_columns]

    # Independent copy restricted to the selected columns
    gnn_input_df = gnn_df.loc[:, final_columns].copy()

    print(f"Final GNN input dataframe created!")
    print(f"Shape: {gnn_input_df.shape}")
    print(f"Records: {len(gnn_input_df)}")
    print(f"Features: {len(feature_columns)}")

    print(f"\nFeature breakdown:")
    print(f"  - Numerical features (normalized): {len(numerical_columns)}")
    print(f"    * {', '.join(numerical_columns)}")
    print(f"  - Diet features (one-hot): {len(diet_columns)}")
    print(f"    * {', '.join(diet_columns)}")
    print(f"  - Habitat features (one-hot): {len(habitat_columns)}")
    print(f"    * Top 10 + other habitat categories")
    print(f"  - Continent features (one-hot): {len(continent_columns)}")
    print(f"    * {', '.join(continent_columns)}")

    # Data quality check: list only the columns that still contain missing values
    print(f"\nData quality check:")
    print(f"Missing values in final dataframe:")
    missing_check = gnn_input_df.isna().sum()
    for col, missing_count in missing_check.items():
        if missing_count <= 0:
            continue
        print(f"  - {col}: {missing_count}")

    if not missing_check.any():
        print(f"  ‚úÖ No missing values in final dataframe!")

    # Preview: all column names, then the first rows of (at most) the first 10 columns
    print(f"\nSample of final GNN input dataframe:")
    print("Columns:", list(gnn_input_df.columns))
    print("\nFirst 3 rows (showing first 10 columns):")
    sample_cols = final_columns[:10]
    print(gnn_input_df[sample_cols].head(3))

    print(f"\nüéØ GNN input dataframe is ready!")
    print(f"   Variable: 'gnn_input_df'")
    print(f"   Shape: {gnn_input_df.shape}")
    print(f"   Features: {len(feature_columns)} (ready for GNN input)")

Creating final GNN input dataframe...
Final GNN input dataframe created!
Shape: (1237, 26)
Records: 1237
Features: 25

Feature breakdown:
  - Numerical features (normalized): 3
    * weight_normalized, size_normalized, life_span_normalized
  - Diet features (one-hot): 4
    * diet_Carnivore, diet_Herbivore, diet_Insectivore, diet_Omnivore
  - Habitat features (one-hot): 11
    * Top 10 + other habitat categories
  - Continent features (one-hot): 7
    * continent_Europe, continent_Asia, continent_Africa, continent_North_America, continent_Central_America, continent_South_America, continent_Oceania

Data quality check:
Missing values in final dataframe:
  ‚úÖ No missing values in final dataframe!

Sample of final GNN input dataframe:
Columns: ['scientific_name', 'weight_normalized', 'size_normalized', 'life_span_normalized', 'diet_Carnivore', 'diet_Herbivore', 'diet_Insectivore', 'diet_Omnivore', 'habitat_Forest', 'habitat_Grassland', 'habitat_Garden', 'habitat_Urban', 'habitat_Woodland

In [45]:
# Save the GNN input dataframe to a CSV file (index omitted).
# The previous cell falls back to an empty gnn_input_df when no processed data is
# available; skip the export in that case so an existing file is not overwritten
# with an empty one.
if not gnn_input_df.empty:
    gnn_input_df.to_csv("../data/animals_transformed.csv", index=False)
else:
    print("No GNN input data available - CSV export skipped")

gnn_input_df.head()

Unnamed: 0,scientific_name,weight_normalized,size_normalized,life_span_normalized,diet_Carnivore,diet_Herbivore,diet_Insectivore,diet_Omnivore,habitat_Forest,habitat_Grassland,...,habitat_Meadow,habitat_Gardens,habitat_other,continent_Europe,continent_Asia,continent_Africa,continent_North_America,continent_Central_America,continent_South_America,continent_Oceania
0,Thomisus onustus,9.994803e-10,0.0,0.009066,0,0,1,0,0,0,...,1,0,0,1,1,1,0,0,0,0
1,Nerodia,1.09945e-05,0.066667,0.068159,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,Stagmomantis carolina,3.997999e-08,0.003175,0.006793,0,0,1,0,1,1,...,0,0,0,0,0,0,1,1,0,0
3,Copsychus saularis,6.996502e-07,0.013333,0.104523,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,Bombus pratorum,3.498231e-09,0.000635,0.009066,0,1,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
