In [1]:
import os

import pandas as pd

from prepare_data import *

# Registry of dataset name -> function that prepares that dataset.
# Iteration order follows the order of the pairs listed here.
prepare_functions = dict([
    ('NyalaData', prepare_nyala_data),
    ('ZindiTurtleRecall', prepare_zindi_turtle_recall),
    ('BelugaID', prepare_beluga_id),
    ('BirdIndividualID', prepare_bird_individual_id),
    ('SealID', prepare_seal_id),
    ('FriesianCattle2015', prepare_friesian_cattle_2015),
    ('ATRW', prepare_atrw),
    ('NDD20', prepare_ndd20),
    ('SMALST', prepare_smalst),
    ('SeaTurtleIDHeads', prepare_sea_turtle_id_heads),
    ('AAUZebraFish', prepare_zebra_fish),
    ('CZoo', prepare_czoo),
    ('CTai', prepare_ctai),
    ('Giraffes', prepare_giraffes),
    ('HyenaID2022', prepare_hyena_id_2022),
    ('MacaqueFaces', prepare_macaque_faces),
    ('OpenCows2020', prepare_open_cows_2020),
    ('StripeSpotter', prepare_stripe_spotter),
    ('AerialCattle2017', prepare_aerial_cattle_2017),
    ('GiraffeZebraID', prepare_giraffe_zebra_id),
    ('IPanda50', prepare_ipanda_50),
    ('WhaleSharkID', prepare_whaleshark_id),
    ('FriesianCattle2017', prepare_friesian_cattle_2017),
    ('Cows2021', prepare_cows2021),
    ('LeopardID2022', prepare_leopard_id_2022),
    ('NOAARightWhale', prepare_noaa_right_whale),
    ('HappyWhale', prepare_happy_whale),
    ('HumpbackWhaleID', prepare_humpback_whale_id),
    ('LionData', prepare_lion_data),
])

# 1. Process the datasets
**Processing includes:**
- Resize images
- Crop bounding boxes
- Crop black background of segmented images
- If multiple identities exist in one image, we crop them and split them into two images.


**We save two sets of images:**
- For inference with images resized to 518x518: CLIP, DINOv2, and MegaDescriptor-L-384
- For inference with images resized to 256x256: MegaDescriptor-T-224, MegaDescriptor-S-224, MegaDescriptor-B-224, MegaDescriptor-L-224


**Note**: Stored images were further transformed (e.g. resized to 224x224) depending on the model during inference. Inference with smaller models using the stored 518x518 images is possible, but it gives slightly different results than in the paper.

In [None]:
datasets_folder = '/mnt/data/turtles/datasets/datasets'  # Path to downloaded datasets

# Every dataset is exported once per image size, into images/size-<size>/<name>.
image_sizes = (256, 518)

for name, prepare in prepare_functions.items():
    print(name)
    for size in image_sizes:
        prepare(size=size, root=f'{datasets_folder}/{name}', new_root=f'images/size-{size}/{name}')

    # The annotations must not depend on the image size: later cells read only
    # the size-518 copy, so a mismatch here would go unnoticed downstream.
    metadata_256 = pd.read_csv(f'images/size-256/{name}/annotations.csv', index_col=0)
    metadata_518 = pd.read_csv(f'images/size-518/{name}/annotations.csv', index_col=0)
    assert metadata_256.equals(metadata_518), (
        f'{name}: annotations.csv differs between the size-256 and size-518 exports'
    )

# 2. Create split metadata for each dataset
**Split datasets:**
- Closed split, images with unknown identities are discarded
- Store the metadata for each dataset as CSV.
- Test set for each dataset is used for evaluation.

In [5]:
# Label each image of every dataset as 'train' or 'test' and store the result
from wildlife_datasets import splits

for name in prepare_functions:
    metadata = pd.read_csv(f'images/size-518/{name}/annotations.csv', index_col=0)

    # A fresh splitter with the same fixed seed is built for every dataset;
    # only the first split it returns is used.
    closed_set_splitter = splits.ClosedSetSplit(0.8, identity_skip='unknown', seed=666)
    first_split = closed_set_splitter.split(metadata)[0]

    # first_split is (train indices, test indices); rows in neither set keep
    # an empty 'split' value.
    for split_label, split_idx in zip(('train', 'test'), first_split):
        metadata.loc[metadata.index[split_idx], 'split'] = split_label

    os.makedirs(f'metadata/datasets/{name}/', exist_ok=True)
    metadata.to_csv(f'metadata/datasets/{name}/metadata.csv')


# 3. Create metadata for aggregated training dataset
- Combine the training sets from the metadata of all datasets into a single aggregated metadata file
- The aggregated training set is used for training MegaDescriptors.
    - Adds dataset name to identity to prevent identity name collisions
    - Adds dataset name to the image path to enable loading the aggregated dataset using `WildlifeDataset`

In [4]:
import pandas as pd

# Collect the training rows of every dataset, tagged with the dataset name.
results = []
for name in prepare_functions:
    metadata = pd.read_csv(f'metadata/datasets/{name}/metadata.csv', index_col=0)

    # .assign returns a new frame, so the filtered rows are never mutated in
    # place. Identities and paths are prefixed with the dataset name so they
    # stay distinct once all datasets are concatenated.
    df = metadata.query("split == 'train'").assign(
        dataset=name,
        identity=lambda rows: name + '_' + rows['identity'].astype(str),
        path=lambda rows: name + '/' + rows['path'],
    )
    results.append(df)

# Index labels are carried over from the per-dataset files, so they may repeat
# across datasets in the combined frame.
combined_all = pd.concat(results)

os.makedirs('metadata/combined/', exist_ok=True)
combined_all.to_csv('metadata/combined/combined_all.csv')