# NZ trailcams subset to Sentinel SQL
This notebook converts a LILA dataset in COCO format (a subset of NZ trailcams) to the Sentinel SQL format. It consists of three parts:
1. Loading data 
2. Species mapping
3. Uploading data 

The following environment is used to run this notebook:
```
conda create -n database python=3.11 pip -y
conda activate database
pip install pandas
pip install tqdm
pip install pyarrow
pip install SQLAlchemy
pip install --upgrade google-cloud-storage 
```

# 0. Set up

In [1]:
# import packages
import glob
import json
import os
import uuid

import pandas as pd
from tqdm import tqdm

In [2]:
# --- paths and variables ---
# Working directory: holds this notebook and the original data; all outputs are written below it.
workdir = os.getcwd()

# Input: the original data (just a subset of the NZ trailcam dataset for testing).
og_datapath = f"{workdir}/data"

# Output / intermediate folders.
datapath = f"{workdir}/downsized_data"
dataload_path = f"{workdir}/dataload"

# Metadata file (covers ALL data in the NZ trailcam dataset, not only the subset).
metadata_path = f'{og_datapath}/trail_camera_images_of_new_zealand_animals_1.00.json'

# 1. Loading data

In [3]:
# load megadetector output (RDE filtered)
md_file =  f'{workdir}/postprocessing/nz-trailcams-aac-aiv/nz-trailcams-aac-aiv-2024-jun-07-v5a.0.0/combined_api_outputs/nz-trailcams-aac-aiv-2024-jun-07-v5a.0.0_detections.filtered_rde_0.100_0.850_15_0.200.json'
with open(md_file, 'r') as f:
    md_data = json.load(f)
    print(f"md_data.keys(): {md_data.keys()}")

# extract the bbox detections for each image and convert them into our database format 
images_df = pd.DataFrame(md_data['images'])
fn, xmin, ymin, xmax, ymax, label, confidence = [], [], [], [], [], [], []

# set up progress bar counter
pbar = tqdm(total=len(images_df))
for i in range(len(images_df)):
    detections = images_df['detections'][i]
    try:
        for detection in detections:
            fn.append(images_df['file'][i])
            label.append(detection['category'])
            if detection['bbox'] is None:
                xmin.append(None)
                ymin.append(None)
                xmax.append(None)
                ymax.append(None)
                confidence.append(None)
            else:
                xmin.append(detection['bbox'][0])
                ymin.append(detection['bbox'][1])
                xmax.append(detection['bbox'][0]+detection['bbox'][2])
                ymax.append(detection['bbox'][1]+detection['bbox'][3])
                confidence.append(detection['conf'])  
    except Exception as e:
        print(e)

    pbar.update(1)

pbar.close()

md_df = pd.DataFrame({'filename': fn, 'voc_xmin': xmin, 'voc_ymin': ymin, 'voc_xmax': xmax, 'voc_ymax': ymax, 'label': label, 'confidence': confidence})
md_df['filename'] = md_df['filename'].str.replace('\\', '/')
md_df['image_id'] = md_df['filename'].str.split('/').str[-1].str.split('.').str[0]

# generate unique id for each bbox detection
%time uuid = [str(uuid.uuid4()) for i in range(len(md_df))] 
md_df['bb_id'] = uuid

del images_df
del fn, xmin, ymin, xmax, ymax, label, confidence

md_data.keys(): dict_keys(['info', 'detection_categories', 'images'])


100%|██████████| 38/38 [00:00<00:00, 32461.01it/s]

CPU times: user 327 μs, sys: 469 μs, total: 796 μs
Wall time: 702 μs





In [4]:
# preview the first two extracted bounding-box rows
md_df.head(n=2)

Unnamed: 0,filename,voc_xmin,voc_ymin,voc_xmax,voc_ymax,label,confidence,image_id,bb_id
0,ACC/banded_rail/0067A52A-FB22-4CB4-B54A-1894E7...,0.0,0.084,0.052,0.248,1,0.015,0067A52A-FB22-4CB4-B54A-1894E7F2B1A5,6968b1eb-b871-4818-af4d-493357124534
1,ACC/banded_rail/0067A52A-FB22-4CB4-B54A-1894E7...,0.742,0.318,0.966,0.569,1,0.851,0067A52A-FB22-4CB4-B54A-1894E7F2B1A5,bcf5388a-d366-436f-83c6-0be380cdfb9c


In [5]:
# distinct images covered by the detections vs. total number of boxes
image_ids = md_df['image_id']
print(f"There are {len(set(image_ids))} unique images and {len(image_ids)} bounding boxes detected.")

There are 38 unique images and 42 bounding boxes detected.


In [6]:
# load metadata 
with open(metadata_path, 'r') as f:
    metadata = json.load(f)
    print(f"metadata.keys(): {metadata.keys()}")

# retrieve and combine metadata about annotations and categories 
metadata_df = pd.merge(pd.DataFrame(metadata['annotations']), 
                    pd.DataFrame(metadata['categories']).rename(columns={'id': 'category_id'}), 
                    on='category_id')
metadata_df = metadata_df[['image_id','name']]

# convert the format of the image_id in metadata to match that of megadetector's
metadata_df['image_id'] = [os.path.basename(fn).split('.')[0] for fn in metadata_df['image_id']] 

# only include the metadata relevant to this data subset 
%time include_data = [True if image_id in list(md_df['image_id']) else False for image_id in metadata_df['image_id']]
metadata_df = metadata_df[include_data]

metadata.keys(): dict_keys(['images', 'categories', 'info', 'annotations'])
CPU times: user 13 s, sys: 2.58 ms, total: 13 s
Wall time: 13 s


In [7]:
# how many annotations there are per common name
metadata_df.loc[:, 'name'].value_counts()

name
banded_rail    34
morepork        4
Name: count, dtype: int64

In [8]:
# join the megadetector boxes with the species names on image_id, then preview the result
df = md_df.merge(metadata_df, how='outer', on='image_id')
df.head(n=2)

Unnamed: 0,filename,voc_xmin,voc_ymin,voc_xmax,voc_ymax,label,confidence,image_id,bb_id,name
0,ACC/banded_rail/0067A52A-FB22-4CB4-B54A-1894E7...,0.0,0.084,0.052,0.248,1,0.015,0067A52A-FB22-4CB4-B54A-1894E7F2B1A5,6968b1eb-b871-4818-af4d-493357124534,banded_rail
1,ACC/banded_rail/0067A52A-FB22-4CB4-B54A-1894E7...,0.742,0.318,0.966,0.569,1,0.851,0067A52A-FB22-4CB4-B54A-1894E7F2B1A5,bcf5388a-d366-436f-83c6-0be380cdfb9c,banded_rail


In [9]:
# save loaded data to file
# exist_ok=True makes the cell safe to re-run and removes the check-then-create gap
os.makedirs(dataload_path, exist_ok=True)

df.to_feather(f'{dataload_path}/nz-trailcams-test.feather')

# 2. Species mapping

In [10]:
# load data 
df = pd.read_feather(f'{dataload_path}/nz-trailcams-test.feather')
taxa_df = pd.read_csv(f'{og_datapath}/lila-taxonomy-mapping_release.csv')

# extract relevant data 
# set(taxa_df.dataset_name) # run this to check for dataset name 
taxa_df = taxa_df[taxa_df['dataset_name'] ==  'Trail Camera Images of New Zealand Animals']
taxa_df = taxa_df[['taxonomy_level','query','kingdom','phylum','class','order','family','genus','species','subspecies']]
taxa_df.rename(columns={'query':'name'}, inplace=True)

# only keep taxa info for species included in our dataset 
%time include_data = [True if name in set(df['name']) else False for name in taxa_df['name']]
taxa_df = taxa_df[include_data]

CPU times: user 5.82 ms, sys: 0 ns, total: 5.82 ms
Wall time: 5.87 ms


In [11]:
# attach the taxonomy columns to each row by name, renumber the rows, and write the result to disk
df = df.merge(taxa_df, how='outer', on='name').reset_index(drop=True)
df.to_feather(f'{dataload_path}/nz-trailcams-test_taxa.feather')

# 3. Uploading data