## Evaluate Megadetector v5(a) false negatives

The purpose of this notebook is to test the detection rate of Megadetector v5(a) on real data from Animl and evaluate (a) whether adjusting the confidence threshold can reduce false negatives, and (b) what the detection rate looks like at the sequence/burst level compared to the individual image level.

The first part of the notebook pulls a block of image records down from MongoDB within a date range and uses their timestamps to group them into bursts/sequences. 

The second part of the notebook queries MongoDB for all* image records that contain false negatives for a given class/species within that same date range, downloads those actual image files into memory, submits them to the SageMaker-hosted MDv5 endpoint, and filters the results at a lower confidence threshold to determine whether lowering the confidence threshold would have reduced false negatives.

Finally, we evaluate the other images in the bursts of the remaining false negative records to check whether there were successful detections elsewhere in their respective sequences. 

\* There are some caveats to be aware of with this query. See the "Find false negatives" section below for more info.


*NOTE: This notebook is intended to be run locally, and assumes the following:*
- you are currently running a virtual env with Python 3.9
- you have configured the awscli with an account called "animl" with the requisite permissions to read from S3 and invoke Sagemaker endpoints
- you have a MongoDB Atlas URL/connection string with read permissions stored in a .env file

*See README for assistance with any of the above*

## Setup

#### MongoDB Atlas Setup

In [None]:
%load_ext dotenv
%dotenv

import os
from pymongo import MongoClient

# Connection string comes from the environment (the %dotenv magic above loads
# it from the local .env file).
MONGODB_URL = os.getenv('MONGODB_URL')
if not MONGODB_URL:
    # MongoClient(None) would silently fall back to its default host
    # (localhost) and every later query would hang or return nothing,
    # so fail immediately with an actionable message instead.
    raise RuntimeError(
        'MONGODB_URL is not set; add the MongoDB Atlas connection string '
        'to your .env file (see README)'
    )

db_client = MongoClient(MONGODB_URL)
db = db_client['animl-prod']
# collection holding the image records queried throughout this notebook
images = db['images']

#### AWS Setup

In [None]:
import boto3, time, json
import sagemaker
import os

# Select the "animl" awscli profile and region through the environment
# before any boto3 session/client is created below.
os.environ.update({
    'AWS_PROFILE': 'animl',
    'AWS_DEFAULT_REGION': 'us-west-2',
})

# S3 bucket the image files are downloaded from
img_bucket = 'animl-images-serving-prod'
# detection class id -> human-readable name
class_map = {1: 'animal', 2: 'person', 3: 'vehicle'}

sess = boto3.Session()
region = sess.region_name
sm = sess.client('sagemaker')
account = boto3.client('sts').get_caller_identity().get('Account')

#### Check status of SageMaker endpoint

In [None]:
%%time
# Name of the SageMaker endpoint; also used by detect_objects() when invoking it
endpoint_name = 'megadetectorv5-torchserve-serverless-prod'
# Ask the SageMaker control plane for the endpoint description and report its status
resp = sm.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
print(f'Status: {status}')

#### Query variables

In [None]:
from datetime import datetime

# Parameters shared by both MongoDB queries in this notebook
project = 'sci_biosecurity'   # matched against the records' projectId
start = datetime(2022, 7, 16)  # exclusive lower bound on dateAdded ($gt)
end = datetime(2022, 11, 1)    # exclusive upper bound on dateAdded ($lt)
category = 'rodent'            # validated label category to look for

#### Functions

In [None]:
from io import BytesIO
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def get_image_records(q):
    """Fetch every record in the `images` collection matching query `q`.

    Prints the number of matching documents first, then returns the
    matching records as a list.
    """
    matched = images.count_documents(q)
    print(f'found {matched} image records')
    return list(images.find(q))

def download_image_files(img_rcrds):
    """Download the original image file of each record from S3 into memory.

    Args:
        img_rcrds: iterable of image records; only each record's '_id' is
            used, to build the S3 key "original/<_id>-original.jpg".

    Returns:
        A list of dicts { 'name': <record _id>, 'data': <raw file bytes> },
        in the same order as `img_rcrds`.
    """
    print('Downloading image files to memory...')
    # Build the client once and reuse it for every download rather than
    # constructing a new one per image.
    s3 = boto3.client('s3')
    ret = []
    for rec in img_rcrds:
        key = f"original/{rec['_id']}-original.jpg"
        body = s3.get_object(Bucket=img_bucket, Key=key)['Body']
        ret.append({ 'name': rec['_id'], 'data': body.read() })
    print(f'Downloaded {len(ret)} images to memory')
    return ret

def detect_objects(imgs):
    """Send each in-memory image to the SageMaker endpoint for detection.

    `imgs` is a list of dicts with 'name' and 'data' (raw image bytes).
    Returns a list of dicts { 'name': ..., 'objects': <decoded JSON response> }
    in the same order. Progress is printed for every fifth image.
    """
    print('Submitting images to endpoint for object detection...')
    runtime = boto3.client('runtime.sagemaker')
    total = len(imgs)
    results = []
    for idx, img in enumerate(imgs):
        raw = runtime.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType='application/x-image',
            Body=img['data'],
        )
        detections = json.loads(raw['Body'].read())
        results.append({ 'name': img['name'], 'objects': detections })
        if idx % 5 == 0:
            print(f'successfully detected objects in image {idx + 1}/{total}')
    return results

def filter_dets(imgs, conf, classes):
    """Attach a 'filtered_objects' list to every image in `imgs`.

    A detection is kept unless its 'confidence' is below `conf` or its
    'class' is not in `classes`. The original 'objects' list is left
    untouched; `imgs` is modified in place and also returned.
    """
    print(f'filtering detections below confidence threshold {conf}')

    def keep(det):
        # De Morgan form of "drop if too weak OR of an unwanted class"
        return not det['confidence'] < conf and det['class'] in classes

    for entry in imgs:
        entry['filtered_objects'] = [det for det in entry['objects'] if keep(det)]
    return imgs

def draw_bounding_box_on_image(image,ymin,xmin,ymax,xmax,classification):
    """Draw a 4px box outline on `image` (in place).

    The x/y bounds are multiplied by the image width/height, i.e. they are
    treated as fractions of the image size. The outline colour is chosen by
    `classification` (1 red, 2 blue, 3 yellow); any other value passes
    fill=None to the drawing call.
    """
    im_width, im_height = image.size
    left, right = xmin * im_width, xmax * im_width
    top, bottom = ymin * im_height, ymax * im_height
    # closed path: start and end on the top-left corner
    outline = [(left, top), (left, bottom), (right, bottom),
               (right, top), (left, top)]
    color = { 1: 'red', 2: 'blue', 3: 'yellow' }.get(classification)
    ImageDraw.Draw(image).line(outline, width=4, fill=color)

In [None]:
# functions for sequence grouping

import uuid

def stage_for_grouping(delta_index, index_array):
    """Stage both images that delta `delta_index` sits between.

    Appends `delta_index` and `delta_index + 1` to `index_array` (in place),
    skipping any index that is already present.
    """
    for position in (delta_index, delta_index + 1):
        if position in index_array:
            continue
        index_array.append(position)

def group_as_sequence(dep_img_indexes, dep_df, images_df):
    """Give all staged images of one deployment a shared, new burstId.

    `dep_img_indexes` are positional (iloc) indexes into `dep_df`. The '_id'
    values found there are looked up in `images_df`, whose 'burstId' column
    is updated in place with one freshly generated UUID.
    """
    burst_member_ids = dep_df.iloc[dep_img_indexes]['_id'].tolist()
    in_burst = images_df['_id'].isin(burst_member_ids)
    images_df.loc[in_burst, 'burstId'] = uuid.uuid4()

## Associate image records with burst Ids
 - pull all image records (for a specific project & within date range) into a DataFrame
 - split out by deployment
 - sort each deployment's image records chronologically
 - create array of time deltas between each image
 - iterate deltas, if the delta is <= some fixed delta limit (say, 2 seconds), treat them as being in the same burst
 - as a sanity check, print out a list of all the images in chronological order alongside an "image is in burst" or "image is not in burst" evaluation... the images IN bursts should be clustered together chronologically (assuming that setting could get turned on/off)
 - other interesting stats would be: avg number of images in bursts, count of outliers (e.g. bursts w/ 4+ images or 2 images)

The end goal is to be able to map an image to a burst, and get the rest of the images in that burst

In [None]:
# all image records of the project added inside the (exclusive) date window
query = {'projectId': project, 'dateAdded': {'$gt': start, '$lt': end}}

# pull the records down and load them into a DataFrame
raw_img_rcrds = get_image_records(query)
images_df = pd.DataFrame(raw_img_rcrds)

# parse capture timestamps, add an (initially empty) burstId column,
# then order everything chronologically
images_df['dateTimeOriginal'] = images_df['dateTimeOriginal'].apply(pd.to_datetime)
images_df['burstId'] = None
images_df = images_df.sort_values('dateTimeOriginal')

In [None]:
# Collect the distinct deploymentId values present in the image records;
# the cells below iterate over them to process one deployment at a time
deploymentIds = np.unique(images_df['deploymentId'].values)
print(f'identified {len(deploymentIds)} deployment(s)')

In [None]:
# Walk each deployment's images in chronological order and assign a shared
# burstId to every run of images taken at most `max_delta` seconds apart
max_delta = 2 # seconds

for deploymentId in deploymentIds:
    # rows of this deployment only (already sorted chronologically)
    dep_df = images_df.loc[images_df['deploymentId'] == deploymentId]

    # gap between consecutive images: timedelta64 values cast to float,
    # then scaled by 1e9 before being compared against max_delta (seconds)
    gaps_in_seconds = np.diff(dep_df['dateTimeOriginal']).astype('float64') / 1e9

    staged = []
    for pos, gap in enumerate(gaps_in_seconds):
        if gap <= max_delta:
            # images pos and pos + 1 belong to the same sequence
            stage_for_grouping(pos, staged)
            continue
        # a larger gap closes whatever sequence was being staged
        if staged:
            group_as_sequence(staged, dep_df, images_df)
            staged = []

    # the deployment may end in the middle of a sequence: flush it
    if staged:
        group_as_sequence(staged, dep_df, images_df)



In [None]:
# optional - write one CSV per deployment (handy for QA/QCing the burst Ids)
for deploymentId in deploymentIds:
    belongs_to_deployment = images_df['deploymentId'] == deploymentId
    csv_path = f'imgs_with_burst_ids-{deploymentId}.csv'
    images_df.loc[belongs_to_deployment].to_csv(csv_path, index=True)

## Find false negatives

#### MongoDB query
This query is an attempt to identify Megadetector v5a false negatives. For more info: https://docs.google.com/spreadsheets/d/1xaMsICF-e97Ndgm8A9hkrxNRQkJofPQSGOgO9ML8wHU/edit#gid=0

A few caveats to this approach:
- ideally we would measure label counts and false negatives at the object-level, but for now this is using images as a proxy (i.e., "Validated label count" does not mean the number of validated labels, it means the number of IMAGES that have at least one object with that particular label validated). We could correct for this & count up actual objects, but I'd have to write scripts, rather than DB queries, to do that. *This will result in a slight undercount.*
- the query below looks for all images for which MDv5 had predicted there was nothing in it (we give those an "empty" label), but then a user invalidated that empty label and added their own object to the image manually. This doesn't account for situations in which MDv5 correctly guessed that there was an object *somewhere else* in the image (thus it wasn't given an "empty" label), but it didn't correctly guess all of the objects in the image (it missed others). *This will result in a slight undercount.*
- because we're querying image records, the known label filtering bug (https://github.com/tnc-ca-geo/animl-api/issues/43) will slightly skew results. *This will result in a slight overcount*

In [None]:
# False-negative query: images in the project/date window that satisfy BOTH
# of the conditions below (the two conditions may match different objects).
query = {
  'projectId': project,
  'dateAdded': { '$gt': start, '$lt': end },
  # (1) at least one locked object carrying a megadetector "empty" label
  #     that was invalidated (validation.validated is False)
  'objects': {
      '$elemMatch': {
        '$and': [
          {'locked': True},
          {'labels': {
              '$elemMatch': {
                  '$and': [
                      {'type': 'ml'},
                      {'mlModel': 'megadetector'},
                      {'validation.validated': False},
                      {'category':'empty'}
                  ]
              }
          }}
        ]
      }
  },
  # (2) at least one manually added, validated label of the target category
  'objects.labels': {
      '$elemMatch': {
        '$and': [
            {'type': 'manual'},
            {'validation.validated': True},
            {'category': category}
        ]
      }
  }
}

#### Read image records & image files into memory, submit to MDv5

In [None]:
# 1. fetch the false-negative image records matching `query`
img_rcrds = get_image_records(query)
# 2. download the corresponding image files from S3 into memory
imgs = download_image_files(img_rcrds)
# 3. run every image through the MDv5 endpoint (results keep the same order)
img_detections = detect_objects(imgs)

#### Filter detections below confidence threshold

In [None]:
 # class schema we use is 1 for animal, 2 for person, 3 for vehicle
conf = 0.1
classes_to_include = [1, 2]  # suppress vehicles

imgs_with_filtered_detections = filter_dets(img_detections, conf, classes_to_include)

# list every surviving detection and remember which images have at least one
count = 0
imgs_with_dets_above_threshold = []
for i, img in enumerate(imgs_with_filtered_detections):
    kept = img['filtered_objects']
    if kept:
        imgs_with_dets_above_threshold.append(img['name'])
    for obj in kept:
        print(f"{i} --- {img['name']} --- {obj['class']} --- {obj['confidence']}")
        count += 1

print(f'found {count} objects with detections above {conf}')

## Check false negatives
for true positives in their respective bursts

In [None]:
def img_has_true_positive(img):
    """Return True if any object on `img` is a validated MegaDetector hit.

    An object qualifies when its labels contain BOTH
      - a label with type "ml" and mlModel "megadetector", and
      - a label whose category equals the notebook-level `category` and
        whose validation.validated flag is True.

    Args:
        img: an image record exposing its objects as the attribute
            `objects` (e.g. a row yielded by DataFrame.itertuples()).

    Returns:
        bool. Rows whose `objects` value is not a list (for example NaN when
        the field was missing from the record) yield False instead of
        raising.
    """
    objects = img.objects
    if not isinstance(objects, (list, tuple)):
        return False

    for obj in objects:
        labels = obj.get("labels") or []
        has_md_label = any(
            lbl.get("type") == "ml" and lbl.get("mlModel") == "megadetector"
            for lbl in labels
        )
        if not has_md_label:
            continue
        # `validation` may be absent or null on a label; treat both as
        # "not validated" rather than failing on the lookup
        has_validated_label = any(
            lbl.get("category") == category
            and (lbl.get("validation") or {}).get("validated") is True
            for lbl in labels
        )
        if has_validated_label:
            # one qualifying object is enough; no need to scan the rest
            return True
    return False

def burst_has_true_positive(img_rcrd):
    """Return True if another image in `img_rcrd`'s burst has a true positive.

    Looks up the burstId assigned to the image in `images_df`, collects the
    OTHER images sharing that burstId (the image itself is excluded, so its
    own labels can never count as a detection "elsewhere in the burst"), and
    checks each of them with img_has_true_positive().

    Args:
        img_rcrd: image record (mapping) with an '_id' key.

    Returns:
        bool. False when the image is not present in `images_df` or was
        never assigned to a burst.
    """
    img_id = img_rcrd['_id']

    # find img's burstId
    burst_ids = images_df.loc[images_df['_id'] == img_id, 'burstId'].tolist()
    if not burst_ids or pd.isna(burst_ids[0]):
        # unknown image, or an image that is not part of any burst
        return False

    # find the rest of the images in the burst, leaving out this img
    in_same_burst = images_df['burstId'] == burst_ids[0]
    is_other_img = images_df['_id'] != img_id
    others_in_burst = images_df.loc[in_same_burst & is_other_img]

    # stop at the first remaining image that has a true positive
    return any(
        img_has_true_positive(row) for row in others_in_burst.itertuples()
    )

def remove_true_positives(img):
    """Filter predicate: keep `img` only if it is still a false negative.

    Returns False for records whose '_id' is listed in
    `imgs_with_dets_above_threshold`, True otherwise.
    """
    already_detected = img['_id'] in imgs_with_dets_above_threshold
    return not already_detected


In [None]:
# check the bursts of all remaining false negatives
# (i.e., those that would have still been missed even with a lower conf. threshold)
# for true positives

# records that are still false negatives at the lowered threshold
imgs_to_check_bursts = [rec for rec in img_rcrds if remove_true_positives(rec)]

# how many of them have a true positive somewhere in their burst
detection_found_in_burst_count = sum(
    1 for rec in imgs_to_check_bursts if burst_has_true_positive(rec)
)

print(
    f'found {detection_found_in_burst_count} true positives in the bursts '
    f'associated with {len(imgs_to_check_bursts)} images that had false negatives')

#### Spot-check individual images & objects

In [None]:
# position of the image to inspect; `imgs` and
# `imgs_with_filtered_detections` are indexed with the same position
img_index = 3
img_to_draw = imgs_with_filtered_detections[img_index]
image = Image.open(BytesIO(imgs[img_index]['data']))

print(f"{img_index} --- {img_to_draw['name']}")
for obj in img_to_draw['filtered_objects']:
    cls = obj['class']
    print(f"object --- class: {cls} ({class_map[cls]}), confidence: {obj['confidence']}")
    draw_bounding_box_on_image(image, obj['y1'], obj['x1'], obj['y2'], obj['x2'], cls)
# last expression of the cell so the annotated image is displayed
image