# JSON Animal Data Processing

This notebook processes multiple JSON files containing animal data and combines them into a single standardized DataFrame.

In [74]:
import pandas as pd
import json
import glob
import re
import numpy as np
from typing import List, Union, Any

In [75]:
def parse_weight(weight_str: str) -> float:
    """
    Parse weight string and convert to grams.
    Handles ranges, different units, and various formats.
    """
    try:
        if weight_str is None or weight_str == '' or str(weight_str).lower() == 'nan': 
            return np.nan
    except:
        if weight_str is None: 
            return np.nan
    
    weight_str = str(weight_str).strip().lower()
    
    # Extract numeric values and handle ranges
    numbers = re.findall(r'\d+\.?\d*', weight_str)
    if not numbers:
        return np.nan
    
    # Calculate average if range
    if len(numbers) > 1:
        avg_value = sum(float(num) for num in numbers) / len(numbers)
    else:
        avg_value = float(numbers[0])
    
    # Convert to grams based on unit
    if 'kg' in weight_str:
        return avg_value * 1000  # kg to g
    elif 'g' in weight_str and 'kg' not in weight_str:
        return avg_value  # already in grams
    elif 'lb' in weight_str or 'pound' in weight_str:
        return avg_value * 453.592  # lb to g
    else:
        # Assume kg if no unit specified
        return avg_value * 1000

In [76]:
def parse_size(size_str: str) -> int:
    """
    Parse size string and convert to centimeters.
    Handles ranges, different units, and various formats.
    """
    try:
        if size_str is None or size_str == '' or str(size_str).lower() == 'nan': 
            return np.nan
    except:
        if size_str is None: 
            return np.nan
    
    size_str = str(size_str).strip().lower()
    
    # Extract numeric values and handle ranges
    numbers = re.findall(r'\d+\.?\d*', size_str)
    if not numbers:
        return np.nan
    
    # Calculate average if range
    if len(numbers) > 1:
        avg_value = sum(float(num) for num in numbers) / len(numbers)
    else:
        avg_value = float(numbers[0])
    
    # Convert to cm based on unit
    if 'm' in size_str and 'mm' not in size_str and 'cm' not in size_str:
        return int(avg_value * 100)  # m to cm
    elif 'cm' in size_str:
        return int(avg_value)  # already in cm
    elif 'mm' in size_str:
        return int(avg_value / 10)  # mm to cm
    elif 'in' in size_str or 'inch' in size_str:
        return int(avg_value * 2.54)  # inches to cm
    elif 'ft' in size_str or 'foot' in size_str or 'feet' in size_str:
        return int(avg_value * 30.48)  # feet to cm
    else:
        # Assume meters if no unit specified
        return int(avg_value * 100)

In [77]:
def parse_lifespan(lifespan_str: str) -> float:
    """
    Parse lifespan string and convert to years.
    Handles ranges, different units, and various formats.
    """
    try:
        if lifespan_str is None or lifespan_str == '' or str(lifespan_str).lower() == 'nan': 
            return np.nan
    except:
        if lifespan_str is None: 
            return np.nan
    
    lifespan_str = str(lifespan_str).strip().lower()
    
    # Extract numeric values and handle ranges
    numbers = re.findall(r'\d+\.?\d*', lifespan_str)
    if not numbers:
        return np.nan
    
    # Calculate average if range
    if len(numbers) > 1:
        avg_value = sum(float(num) for num in numbers) / len(numbers)
    else:
        avg_value = float(numbers[0])
    
    # Convert to years based on unit
    if 'month' in lifespan_str:
        return avg_value / 12  # months to years
    elif 'day' in lifespan_str:
        return avg_value / 365  # days to years
    elif 'week' in lifespan_str:
        return avg_value / 52  # weeks to years
    else:
        # Assume years
        return avg_value

In [78]:
def parse_array_field(field_value) -> List[str]:
    """
    Parse field into array of cleaned strings.
    Handles strings, lists, numpy arrays, and other formats.
    """
    # Handle None or empty cases first
    if field_value is None or field_value == '':
        return []
    
    # Handle numpy arrays or lists
    if isinstance(field_value, (list, np.ndarray)):
        try:
            items = [str(item).strip().title() for item in field_value if item is not None and str(item).strip()]
            return [item for item in items if item and item != 'Nan']
        except:
            return []
    
    # Handle strings
    try:
        field_str = str(field_value).strip()
        if not field_str or field_str.lower() == 'nan':
            return []
        items = [item.strip().title() for item in field_str.split(',')]
        return [item for item in items if item and item != 'Nan']
    except:
        return []

In [79]:
def process_json_file(file_path: str) -> pd.DataFrame:
    """
    Process a single JSON file and return a cleaned DataFrame.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    print(f"  Processing {len(df)} records...")
    
    # Apply parsing functions
    df['weight'] = df['weight'].apply(parse_weight)
    df['size'] = df['size'].apply(parse_size)
    df['life_span'] = df['life_span'].apply(parse_lifespan)
    df['habitat'] = df['habitat'].apply(parse_array_field)
    df['continent'] = df['continent'].apply(parse_array_field)
    
    # Clean diet field with safe string conversion
    df['diet'] = df['diet'].apply(lambda x: str(x).strip().title() if x is not None and str(x).lower() != 'nan' else 'Unknown')
    
    return df

In [80]:
# Find all JSON files in the data directory
json_files = glob.glob('../data/gemini_animal_features_*.json')
print(f"Found {len(json_files)} JSON files:")
for file in sorted(json_files):
    print(f"  - {file}")

Found 6 JSON files:
  - ../data/gemini_animal_features_0_1000.json
  - ../data/gemini_animal_features_1000_2000.json
  - ../data/gemini_animal_features_2000_3000.json
  - ../data/gemini_animal_features_3000_4000.json
  - ../data/gemini_animal_features_4000_5000.json
  - ../data/gemini_animal_features_5000_6000.json


In [81]:
# Process all JSON files and combine them
all_dataframes = []

for file_path in sorted(json_files):
    print(f"Processing {file_path}...")
    try:
        df = process_json_file(file_path)
        all_dataframes.append(df)
        print(f"  - Processed {len(df)} records")
    except Exception as e:
        print(f"  - Error processing {file_path}: {e}")

# Combine all dataframes
if all_dataframes:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"\nCombined dataset shape: {combined_df.shape}")
else:
    print("No data to combine")
    combined_df = pd.DataFrame()

Processing ../data/gemini_animal_features_0_1000.json...
  Processing 774 records...
  - Processed 774 records
Processing ../data/gemini_animal_features_1000_2000.json...
  Processing 763 records...
  - Processed 763 records
Processing ../data/gemini_animal_features_2000_3000.json...
  Processing 785 records...
  - Processed 785 records
Processing ../data/gemini_animal_features_3000_4000.json...
  Processing 415 records...
  - Processed 415 records
Processing ../data/gemini_animal_features_4000_5000.json...
  Processing 737 records...
  - Processed 737 records
Processing ../data/gemini_animal_features_5000_6000.json...
  Processing 120 records...
  - Processed 120 records

Combined dataset shape: (3594, 8)


In [82]:
# Display basic information about the combined dataset
print("Dataset Info:")
print(f"Total records: {len(combined_df)}")
print(f"Columns: {list(combined_df.columns)}")
print("\nData types:")
print(combined_df.dtypes)
print("\nMissing values:")
print(combined_df.isnull().sum())

Dataset Info:
Total records: 3594
Columns: ['scientific_name', 'common_name', 'weight', 'size', 'diet', 'life_span', 'habitat', 'continent']

Data types:
scientific_name     object
common_name         object
weight             float64
size               float64
diet                object
life_span          float64
habitat             object
continent           object
dtype: object

Missing values:
scientific_name     0
common_name         0
weight             18
size                5
diet                0
life_span          56
habitat             0
continent           0
dtype: int64


In [83]:
# Display first few rows
print("First 5 rows:")
combined_df.head()

First 5 rows:


Unnamed: 0,scientific_name,common_name,weight,size,diet,life_span,habitat,continent
0,Abantennarius sanguineus,Bloody frogfish,50.0,10.0,Carnivore,6.0,"[Reefs, Sand, Rubble, Ocean, Coastal]","[Asia, Oceania, Africa, North America, South A..."
1,Abantis paradisea,Paradise Skipper,1.0,3.0,Herbivore,0.075,"[Forest, Woodland, Savannah, Garden]",[Africa]
2,Abbottina rivularis,Chinese false gudgeon,20.0,10.0,Omnivore,4.0,"[River, Stream, Lake, Pond, Canal]",[Asia]
3,Abisares viridipennis,African darkling beetle,5.0,1.0,Herbivore,1.5,"[Forest, Woodland, Desert, Savannah]",[Africa]
4,Abramis brama,Bream,2000.0,50.0,Omnivore,17.5,"[Lake, River, Pond, Canal, Reservoir]","[Europe, Asia]"


In [84]:
# Display summary statistics for numerical columns
print("Summary statistics:")
combined_df[['weight', 'size', 'life_span']].describe()

Summary statistics:


Unnamed: 0,weight,size,life_span
count,3576.0,3589.0,3538.0
mean,50524.1,23.672332,5.054717
std,1117806.0,77.008825,8.241806
min,1e-06,0.0,0.00274
25%,0.055,1.0,0.75
50%,1.0,4.0,1.5
75%,75.0,21.0,7.5
max,50025000.0,1575.0,110.0


In [85]:
# Check unique values in categorical columns
print("Unique diet types:")
print(combined_df['diet'].value_counts())

print("\nSample habitat arrays:")
print(combined_df['habitat'].head(10).tolist())

print("\nSample continent arrays:")
print(combined_df['continent'].head(10).tolist())

Unique diet types:
diet
Herbivore                                      1285
Carnivore                                       836
Omnivore                                        752
Insectivore                                     706
Insectivore, Herbivore                            7
Carnivore, Herbivore                              2
Fungivore                                         2
Detritivore                                       1
Unknown                                           1
Herbivore, Carnivore, Omnivore, Insectivore       1
Herbivore, Omnivore, Insectivore                  1
Name: count, dtype: int64

Sample habitat arrays:
[['Reefs', 'Sand', 'Rubble', 'Ocean', 'Coastal'], ['Forest', 'Woodland', 'Savannah', 'Garden'], ['River', 'Stream', 'Lake', 'Pond', 'Canal'], ['Forest', 'Woodland', 'Desert', 'Savannah'], ['Lake', 'River', 'Pond', 'Canal', 'Reservoir'], ['Forest', 'Woodland', 'Savannah'], ['Tree', 'Orchard', 'Garden', 'Forest'], ['Forest', 'Grassland', 'Urban', 'Wetlan

In [86]:
# Remove duplicates based on scientific name
print(f"Records before duplicate removal: {len(combined_df)}")
combined_df = combined_df.drop_duplicates(subset=['scientific_name'], keep='first')
print(f"Records after duplicate removal: {len(combined_df)}")

Records before duplicate removal: 3594
Records after duplicate removal: 3594


In [87]:
# Filter out records with multiple diets (containing commas)
print("Diet analysis:")
print("All diet types:")
print(combined_df['diet'].value_counts())

print(f"\nRecords with multiple diets (containing commas): {combined_df['diet'].str.contains(',').sum()}")
print("Multi-diet records:")
multi_diet_records = combined_df[combined_df['diet'].str.contains(',', na=False)]
print(multi_diet_records[['scientific_name', 'common_name', 'diet']])

print(f"\nFiltering out {len(multi_diet_records)} records with multiple diets...")
combined_df_single_diet = combined_df[~combined_df['diet'].str.contains(',', na=False)]
print(f"Records after filtering: {len(combined_df_single_diet)}")

print("\nDiet types after filtering:")
print(combined_df_single_diet['diet'].value_counts())

Diet analysis:
All diet types:
diet
Herbivore                                      1285
Carnivore                                       836
Omnivore                                        752
Insectivore                                     706
Insectivore, Herbivore                            7
Carnivore, Herbivore                              2
Fungivore                                         2
Detritivore                                       1
Unknown                                           1
Herbivore, Carnivore, Omnivore, Insectivore       1
Herbivore, Omnivore, Insectivore                  1
Name: count, dtype: int64

Records with multiple diets (containing commas): 11
Multi-diet records:
                      scientific_name                    common_name  \
25                              Acari                Mites and ticks   
26                        Acariformes                          Mites   
1032          Crithagra citrinipectus          Lemon-breasted Canary   
1033 

In [88]:
# Filter to keep only specific diet types: Herbivore, Carnivore, Omnivore, Insectivore
allowed_diets = ['Herbivore', 'Carnivore', 'Omnivore', 'Insectivore']

print("Filtering to keep only main diet types...")
print(f"Allowed diets: {allowed_diets}")

print(f"\nBefore diet filtering: {len(combined_df_single_diet)} records")
print("Diet distribution before filtering:")
print(combined_df_single_diet['diet'].value_counts())

# Filter to keep only records with allowed diet types
combined_df_filtered = combined_df_single_diet[combined_df_single_diet['diet'].isin(allowed_diets)]

print(f"\nAfter diet filtering: {len(combined_df_filtered)} records")
print(f"Removed {len(combined_df_single_diet) - len(combined_df_filtered)} records")

print("\nDiet distribution after filtering:")
print(combined_df_filtered['diet'].value_counts())

print("\nFinal diet percentages:")
diet_percentages = combined_df_filtered['diet'].value_counts(normalize=True) * 100
for diet, percentage in diet_percentages.items():
    print(f"{diet}: {percentage:.1f}%")

Filtering to keep only main diet types...
Allowed diets: ['Herbivore', 'Carnivore', 'Omnivore', 'Insectivore']

Before diet filtering: 3583 records
Diet distribution before filtering:
diet
Herbivore      1285
Carnivore       836
Omnivore        752
Insectivore     706
Fungivore         2
Detritivore       1
Unknown           1
Name: count, dtype: int64

After diet filtering: 3579 records
Removed 4 records

Diet distribution after filtering:
diet
Herbivore      1285
Carnivore       836
Omnivore        752
Insectivore     706
Name: count, dtype: int64

Final diet percentages:
Herbivore: 35.9%
Carnivore: 23.4%
Omnivore: 21.0%
Insectivore: 19.7%


In [89]:
# Clean and standardize continent values
allowed_continents = ['North America', 'Asia', 'Africa', 'Europe', 'South America', 'Oceania', 'Central America']
global_keywords = ['All', 'Global', 'Worldwide']

print("Cleaning continent values...")
print(f"Allowed continents: {allowed_continents}")
print(f"Global keywords (will be replaced with all continents): {global_keywords}")

# First, let's see what continent values we currently have
all_continent_values = set()
for continents in combined_df_filtered['continent']:
    if isinstance(continents, list):
        all_continent_values.update(continents)

print(f"\nCurrent unique continent values ({len(all_continent_values)}):")
for value in sorted(all_continent_values):
    print(f"  - {value}")

def clean_continent_array(continent_list):
    """Clean continent array according to specifications"""
    if not isinstance(continent_list, list):
        return []
    
    cleaned_continents = []
    
    for continent in continent_list:
        continent_str = str(continent).strip()
        
        # Check if it's a global keyword
        if continent_str in global_keywords:
            return allowed_continents.copy()  # Return all continents
        
        # Check if it's in our allowed list (exact match)
        if continent_str in allowed_continents:
            cleaned_continents.append(continent_str)
    
    # Remove duplicates while preserving order
    seen = set()
    result = []
    for continent in cleaned_continents:
        if continent not in seen:
            seen.add(continent)
            result.append(continent)
    
    return result

print(f"\nBefore continent cleaning: {len(combined_df_filtered)} records")

# Apply continent cleaning
combined_df_filtered['continent_cleaned'] = combined_df_filtered['continent'].apply(clean_continent_array)

# Check which records will have empty continent arrays
empty_continents = combined_df_filtered['continent_cleaned'].apply(len) == 0
print(f"Records with empty continent arrays after cleaning: {empty_continents.sum()}")

if empty_continents.sum() > 0:
    print("Sample records that will be removed (empty continents):")
    sample_empty = combined_df_filtered[empty_continents][['scientific_name', 'common_name', 'continent']].head()
    print(sample_empty.to_string(index=False))

# Remove records with empty continent arrays
combined_df_continent_cleaned = combined_df_filtered[~empty_continents].copy()

# Replace the original continent column with cleaned version
combined_df_continent_cleaned['continent'] = combined_df_continent_cleaned['continent_cleaned']
combined_df_continent_cleaned = combined_df_continent_cleaned.drop('continent_cleaned', axis=1)

print(f"\nAfter continent cleaning: {len(combined_df_continent_cleaned)} records")
print(f"Records removed due to invalid/empty continents: {len(combined_df_filtered) - len(combined_df_continent_cleaned)}")

# Show the cleaned continent values
cleaned_continent_values = set()
for continents in combined_df_continent_cleaned['continent']:
    if isinstance(continents, list):
        cleaned_continent_values.update(continents)

print(f"\nCleaned continent values ({len(cleaned_continent_values)}):")
for value in sorted(cleaned_continent_values):
    print(f"  - {value}")

Cleaning continent values...
Allowed continents: ['North America', 'Asia', 'Africa', 'Europe', 'South America', 'Oceania', 'Central America']
Global keywords (will be replaced with all continents): ['All', 'Global', 'Worldwide']

Current unique continent values (36):
  - Africa
  - All
  - All Continents
  - All Continents (Excluding Antarctica)
  - All Continents Except Antarctica
  - Antarctica
  - Arctic
  - Asia
  - Atlantic
  - Australia
  - Caribbean
  - Central America
  - Central_America
  - Cosmopolitan
  - Eurasia
  - Europe
  - Global
  - Global (Fossil)
  - Global (Introduced)
  - Globally
  - Indian
  - Madagascar
  - Middle East
  - Multiple Continents
  - North Africa
  - North America
  - North_Africa
  - North_America
  - Oceania
  - Pacific
  - South Africa
  - South America
  - South_America
  - Southern
  - Varies Widely
  - Worldwide

Before continent cleaning: 3579 records
Records with empty continent arrays after cleaning: 39
Sample records that will be removed (

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_filtered['continent_cleaned'] = combined_df_filtered['continent'].apply(clean_continent_array)


In [90]:
# Analyze records by habitat (using continent-cleaned dataset)
print("HABITAT ANALYSIS (Final Cleaned Dataset)")
print("=" * 50)

# Count records for each unique habitat type
habitat_counts = {}
for habitats in combined_df_continent_cleaned['habitat']:
    if isinstance(habitats, list):
        for habitat in habitats:
            habitat_counts[habitat] = habitat_counts.get(habitat, 0) + 1

print("Records per habitat type:")
habitat_df = pd.DataFrame(list(habitat_counts.items()), columns=['Habitat', 'Count']).sort_values('Count', ascending=False)
print(habitat_df.to_string(index=False))

print(f"\nTotal unique habitat types: {len(habitat_counts)}")
print(f"Most common habitat: {habitat_df.iloc[0]['Habitat']} ({habitat_df.iloc[0]['Count']} records)")
print(f"Least common habitats: {habitat_df[habitat_df['Count'] == habitat_df['Count'].min()]['Habitat'].tolist()}")

HABITAT ANALYSIS (Final Cleaned Dataset)
Records per habitat type:
               Habitat  Count
                Forest   1918
                Garden   1172
              Woodland   1114
             Grassland   1110
                 Urban    844
             Shrubland    774
               Gardens    381
                Meadow    371
               Forests    364
               Wetland    340
                 Field    329
               Savanna    286
               Coastal    250
                Desert    235
             Woodlands    231
          Agricultural    215
                  Lake    212
                 River    210
              Farmland    189
                  Pond    188
            Grasslands    172
              Savannah    148
                 Marsh    143
                Stream    135
                Fields    135
                  Park    133
              Mountain    119
               Estuary    112
                 Ocean    111
                 Lakes    109
   

In [91]:
# Analyze records by continent (using continent-cleaned dataset)
print("CONTINENT ANALYSIS (Final Cleaned Dataset)")
print("=" * 50)

# Count records for each unique continent
continent_counts = {}
for continents in combined_df_continent_cleaned['continent']:
    if isinstance(continents, list):
        for continent in continents:
            continent_counts[continent] = continent_counts.get(continent, 0) + 1

print("Records per continent:")
continent_df = pd.DataFrame(list(continent_counts.items()), columns=['Continent', 'Count']).sort_values('Count', ascending=False)
print(continent_df.to_string(index=False))

print(f"\nTotal unique continents: {len(continent_counts)}")
print(f"Most common continent: {continent_df.iloc[0]['Continent']} ({continent_df.iloc[0]['Count']} records)")
print(f"Least common continent: {continent_df.iloc[-1]['Continent']} ({continent_df.iloc[-1]['Count']} records)")

# Show animals that appear on multiple continents
print("\nAnimals found on multiple continents:")
multi_continent_animals = combined_df_continent_cleaned[combined_df_continent_cleaned['continent'].apply(lambda x: len(x) if isinstance(x, list) else 0) > 1]
print(f"Number of animals found on multiple continents: {len(multi_continent_animals)}")
if len(multi_continent_animals) > 0:
    print("Sample multi-continent animals:")
    print(multi_continent_animals[['scientific_name', 'common_name', 'continent']].head().to_string(index=False))

# Show continent distribution percentages
print(f"\nContinent distribution percentages:")
continent_percentages = continent_df.set_index('Continent')['Count'] / continent_df['Count'].sum() * 100
for continent, percentage in continent_percentages.items():
    print(f"{continent}: {percentage:.1f}%")

CONTINENT ANALYSIS (Final Cleaned Dataset)
Records per continent:
      Continent  Count
  North America   1852
           Asia   1812
         Africa   1454
         Europe   1270
  South America    826
        Oceania    789
Central America    388

Total unique continents: 7
Most common continent: North America (1852 records)
Least common continent: Central America (388 records)

Animals found on multiple continents:
Number of animals found on multiple continents: 1788
Sample multi-continent animals:
         scientific_name              common_name                                             continent
Abantennarius sanguineus          Bloody frogfish [Asia, Oceania, Africa, North America, South America]
           Abramis brama                    Bream                                        [Europe, Asia]
            Acalyptratae        Acalyptrate flies  [Asia, Africa, Europe, North America, South America]
              Acanalonia Acanalonian planthoppers       [North America, Cent

In [92]:
# Save the final cleaned dataset (with standardized continents)
print("Saving final cleaned dataset (main diets + standardized continents)...")

# Save final cleaned CSV
final_cleaned_csv = '../data/combined_animal_features_final.csv'
combined_df_continent_cleaned.to_csv(final_cleaned_csv, index=False)

# Save final cleaned JSON
final_cleaned_json = '../data/combined_animal_features_final.json'
combined_df_continent_cleaned.to_json(final_cleaned_json, orient='records', indent=2)

print(f"Final cleaned dataset saved:")
print(f"  - CSV: {final_cleaned_csv}")
print(f"  - JSON: {final_cleaned_json}")
print(f"  - Records: {len(combined_df_continent_cleaned)}")

# Complete processing summary
print(f"\nComplete dataset processing summary:")
print(f"1. Original records: {len(combined_df)}")
print(f"2. After removing multi-diets: {len(combined_df_single_diet)}")
print(f"3. After filtering to main diets: {len(combined_df_filtered)}")
print(f"4. After cleaning continents: {len(combined_df_continent_cleaned)}")
print(f"Total records removed: {len(combined_df) - len(combined_df_continent_cleaned)}")
print(f"Final retention rate: {(len(combined_df_continent_cleaned) / len(combined_df) * 100):.1f}%")

print(f"\nFinal dataset characteristics:")
print(f"Shape: {combined_df_continent_cleaned.shape}")
print(f"\nDiet distribution:")
print(combined_df_continent_cleaned['diet'].value_counts())

print(f"\nContinent values (all standardized):")
final_continents = set()
for continents in combined_df_continent_cleaned['continent']:
    if isinstance(continents, list):
        final_continents.update(continents)
print(sorted(final_continents))

print(f"\nSample of final cleaned data:")
print(combined_df_continent_cleaned[['scientific_name', 'common_name', 'diet', 'continent']].head().to_string(index=False))

Saving final cleaned dataset (main diets + standardized continents)...
Final cleaned dataset saved:
  - CSV: ../data/combined_animal_features_final.csv
  - JSON: ../data/combined_animal_features_final.json
  - Records: 3540

Complete dataset processing summary:
1. Original records: 3594
2. After removing multi-diets: 3583
3. After filtering to main diets: 3579
4. After cleaning continents: 3540
Total records removed: 54
Final retention rate: 98.5%

Final dataset characteristics:
Shape: (3540, 8)

Diet distribution:
diet
Herbivore      1276
Carnivore       826
Omnivore        741
Insectivore     697
Name: count, dtype: int64

Continent values (all standardized):
['Africa', 'Asia', 'Central America', 'Europe', 'North America', 'Oceania', 'South America']

Sample of final cleaned data:
         scientific_name             common_name      diet                                             continent
Abantennarius sanguineus         Bloody frogfish Carnivore [Asia, Oceania, Africa, North Amer

In [93]:
# Save the combined and cleaned dataset
output_file = '../data/combined_animal_features.csv'
combined_df.to_csv(output_file, index=False)
print(f"Dataset saved to: {output_file}")

# Also save as JSON for preservation of array fields
output_json = '../data/combined_animal_features.json'
combined_df.to_json(output_json, orient='records', indent=2)
print(f"Dataset also saved as JSON to: {output_json}")

Dataset saved to: ../data/combined_animal_features.csv
Dataset also saved as JSON to: ../data/combined_animal_features.json


In [95]:
# Final verification - display the structure of our final dataset
print("Final Dataset Structure:")
print(f"Shape: {combined_df.shape}")
print(f"Columns: {list(combined_df.columns)}")
print("\nSample records:")
combined_df.head()

Final Dataset Structure:
Shape: (3594, 8)
Columns: ['scientific_name', 'common_name', 'weight', 'size', 'diet', 'life_span', 'habitat', 'continent']

Sample records:


Unnamed: 0,scientific_name,common_name,weight,size,diet,life_span,habitat,continent
0,Abantennarius sanguineus,Bloody frogfish,50.0,10.0,Carnivore,6.0,"[Reefs, Sand, Rubble, Ocean, Coastal]","[Asia, Oceania, Africa, North America, South A..."
1,Abantis paradisea,Paradise Skipper,1.0,3.0,Herbivore,0.075,"[Forest, Woodland, Savannah, Garden]",[Africa]
2,Abbottina rivularis,Chinese false gudgeon,20.0,10.0,Omnivore,4.0,"[River, Stream, Lake, Pond, Canal]",[Asia]
3,Abisares viridipennis,African darkling beetle,5.0,1.0,Herbivore,1.5,"[Forest, Woodland, Desert, Savannah]",[Africa]
4,Abramis brama,Bream,2000.0,50.0,Omnivore,17.5,"[Lake, River, Pond, Canal, Reservoir]","[Europe, Asia]"
