This notebook contains all three stages of the training-free RPC product recognition pipeline: ViT-H object proposals + DINOv2 feature extraction + bounding-box label assignment.

Inference with the models used in the algorithm takes some time, and processing all
instances in the dataset might not fit into a single runtime. Therefore the whole algorithm is split into three stages, and each stage generates outputs for the following stages to use.

# 1st Stage: ViT-H Object Proposer

Inputs:  Images of the dataset

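Outputs: All predicted bounding boxes in the form of Prediction objects; if a prediction is matched with a ground-truth (gt) box, that gt box is also stored in the object. Gt boxes that no prediction was matched to are stored as Prediction objects without a predicted box.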



In [None]:
!git clone https://github.com/uonat/SS2023_DI-Lab_Precitaste.git

Cloning into 'SS2023_DI-Lab_Precitaste'...
remote: Enumerating objects: 334, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 334 (delta 27), reused 27 (delta 15), pack-reused 282[K
Receiving objects: 100% (334/334), 30.46 MiB | 21.54 MiB/s, done.
Resolving deltas: 100% (122/122), done.


In [None]:
%cd '/content/SS2023_DI-Lab_Precitaste'

/content/SS2023_DI-Lab_Precitaste
* [32mdinov2                            [m efba59f Adding mAP calculation to evaluation code.
  main                              [m 02b0215 generalized version of calculate_embeddings
  [31mremotes/origin/CLIP-implementation[m 134fe54 Updated CLIP Notebook
  [31mremotes/origin/HEAD               [m -> origin/main
  [31mremotes/origin/dinov2             [m efba59f Adding mAP calculation to evaluation code.
  [31mremotes/origin/main               [m 02b0215 generalized version of calculate_embeddings
  [31mremotes/origin/yushan             [m db636cc Updated RegionCLIP as baseline
Already on 'dinov2'
Your branch is up to date with 'origin/dinov2'.


In [None]:
!pip install . &> /dev/null

In [None]:
# distutils is only used here to read detectron2's install_requires from its setup.py.
# NOTE(review): distutils was removed from the standard library in Python 3.12 — confirm the runtime's Python version.
import distutils.core
import sys,os
# Note: This is a faster way to install detectron2 in Colab, but it does not include all functionalities.
# See https://detectron2.readthedocs.io/tutorials/install.html for full installation instructions
!git clone 'https://github.com/facebookresearch/detectron2'  &> /dev/null
dist = distutils.core.run_setup("./detectron2/setup.py")
# Install only detectron2's declared dependencies; output is discarded, so failures are silent.
!python -m pip install {' '.join([f"'{x}'" for x in dist.install_requires])} &> /dev/null
# Make the cloned checkout importable without installing the package itself.
sys.path.insert(0, os.path.abspath('./detectron2'))

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [None]:
!wget 'https://dl.fbaipublicfiles.com/detectron2/ViTDet/LVIS/mask_rcnn_vitdet_h/332434656/model_final_866730.pkl'

--2023-06-28 18:59:59--  https://dl.fbaipublicfiles.com/detectron2/ViTDet/LVIS/mask_rcnn_vitdet_h/332434656/model_final_866730.pkl
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 65.8.248.127, 65.8.248.107, 65.8.248.124, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|65.8.248.127|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2669063758 (2.5G) [binary/octet-stream]
Saving to: ‘model_final_866730.pkl’


2023-06-28 19:01:34 (26.9 MB/s) - ‘model_final_866730.pkl’ saved [2669063758/2669063758]



In [None]:
!bash '/content/SS2023_DI-Lab_Precitaste/scripts/download_rpc.sh'

Downloading dataset!
Downloading retail-product-checkout-dataset.zip to /content/SS2023_DI-Lab_Precitaste
100% 25.3G/25.3G [05:19<00:00, 40.2MB/s]
100% 25.3G/25.3G [05:19<00:00, 85.3MB/s]
Unzipping dataset...


In [None]:
import numpy as np
import cv2
import torch, detectron2
from detectron2.utils.visualizer import Visualizer
from detectron2.config import LazyConfig,instantiate
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.utils.logger import setup_logger
setup_logger()
import json
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import math
import torchvision.transforms as T
from PIL import Image
from sklearn.model_selection import train_test_split
from dataset.RPCDataset import RPCDataset
from notebooks.utils.dino_v2 import crop_object_with_bbox, calculate_iou, find_gt_bboxes_of_pred
from notebooks.utils.Prediction import Prediction, dump_pred_objects, get_pred_objects_per_image, read_pred_objects_json
from utilities.non_maximum_suppression import run_nms
from utilities.bbox_postprocess import eliminate_boxes

def load_model(model_path, config_path):
    """Build a detectron2 model from a lazy config and load checkpoint weights into it.

    Args:
        model_path: Path of the checkpoint handed to ``DetectionCheckpointer.load``.
        config_path: Path of the LazyConfig file (this notebook uses
            ``mask_rcnn_vitdet_h_100ep.py``).

    Returns:
        The instantiated model after the checkpoint has been loaded.
    """
    lazy_cfg = LazyConfig.load(config_path)
    detector = instantiate(lazy_cfg.model)
    checkpointer = DetectionCheckpointer(detector)
    checkpointer.load(model_path)
    return detector

In [None]:
# Dataset paths
rpc_main_path = '/content/SS2023_DI-Lab_Precitaste/retail_product_checkout'

# Fraction handed to train_test_split as test_size, for both the class split and the image-index split below
val_size = 0.2
# random_state for those train_test_split calls
random_seed = 12

In [None]:
# Only the 'val' portion of RPC is loaded; classes and images of this set are split into train/val below.
val_dataset = RPCDataset(rpc_main_path, 'val')

sub_classes = val_dataset.get_class_names()
device = "cuda" if torch.cuda.is_available() else "cpu"
# Class-level split: get_pred_objects flags each matched gt label as "train class" or not using train_sub_classes.
train_sub_classes, val_sub_classes = train_test_split(sub_classes, test_size=val_size, random_state=random_seed)


# Checkpoint downloaded by the wget cell and the matching ViTDet config from the detectron2 checkout.
model_path ="/content/SS2023_DI-Lab_Precitaste/model_final_866730.pkl"
config_path = "/content/SS2023_DI-Lab_Precitaste/detectron2/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_h_100ep.py"
object_proposer_model = load_model(model_path,config_path)
object_proposer_model.to(device)
object_proposer_model.eval()
# Presumably here so the cell's last expression is not the model repr returned by .eval().
print()

[06/28 19:26:29 d2.checkpoint.detection_checkpoint]: [DetectionCheckpointer] Loading from /content/SS2023_DI-Lab_Precitaste/model_final_866730.pkl ...



In [None]:
# Split image indices of the val dataset with the same test_size and seed as the class split.
img_idx = np.arange(0, val_dataset.get_num_imgs())
train_img_idx, val_img_idx = train_test_split(img_idx, test_size=val_size, random_state=random_seed)

In [None]:
# Persist both splits so later stages (possibly in a fresh runtime) can reuse them.
config = {
    'train_sub_classes': train_sub_classes,
    'val_sub_classes': val_sub_classes,
    # numpy arrays are not JSON serializable, hence .tolist()
    'train_img_idx': train_img_idx.tolist(),
    'val_img_idx': val_img_idx.tolist()
}
# Written relative to the current working directory; with the repo root as cwd (set by the %cd cell)
# this is /content/train_config.json, the path the second stage reads.
with open('../train_config.json', 'w') as jfile:
    json.dump(config, jfile)

In [None]:
def _build_prediction_objects(element, boxes, scores, matched_gt_indices, unmatched_gt_indices, min_box_size=56):
    """Wrap one image's detections and its unmatched gt annotations in Prediction objects.

    Args:
        element: Dataset element; its 'img_name', 'img_path' and 'annots' entries are read.
        boxes: Predicted boxes in xyxy format.
        scores: Detection score of each box, aligned with ``boxes``.
        matched_gt_indices: For each box, the index of its matched gt annotation, or -1.
        unmatched_gt_indices: Indices of gt annotations that no box was matched to.
        min_box_size: A box is discarded when its width AND height are both below this.

    Returns:
        List of Prediction objects: one per kept box, followed by one per unmatched
        gt annotation (created with no predicted box and no score).
    """
    gt_annots = element['annots']
    objects = []

    for box_idx, box in enumerate(boxes):
        # Discard very small objects.
        # NOTE(review): a discarded box that was matched to a gt annotation takes that
        # annotation with it (it is not reported as unmatched either) — confirm this is intended.
        if (box[2] - box[0]) < min_box_size and (box[3] - box[1]) < min_box_size:
            continue

        cur_object = Prediction(element['img_name'], element['img_path'], box, scores[box_idx])

        matched_gt_annot_idx = matched_gt_indices[box_idx]
        if matched_gt_annot_idx != -1:
            matched_gt_annot = gt_annots[matched_gt_annot_idx]
            is_train_class = matched_gt_annot[1] in train_sub_classes
            # The gt bbox is stored in its original (unconverted) annotation format.
            cur_object.add_gt_bbox(matched_gt_annot[0], matched_gt_annot[1], is_train_class)

        objects.append(cur_object)

    # Gt annotations the detector missed are kept as objects without a prediction.
    for gt_idx in unmatched_gt_indices:
        not_found_bbox = gt_annots[gt_idx]
        cur_object = Prediction(element['img_name'], element['img_path'], None, None)
        is_train_class = not_found_bbox[1] in train_sub_classes
        cur_object.add_gt_bbox(not_found_bbox[0], not_found_bbox[1], is_train_class)
        objects.append(cur_object)

    return objects


def get_pred_objects(img_indices):
    """Run the object proposer on the given images and collect Prediction objects.

    Each image is downscaled by a factor of 4, passed through ``object_proposer_model``,
    filtered with ``run_nms`` and the boxes are scaled back to the original resolution.
    Two result sets are built: one from the NMS output and one from the NMS output
    further filtered by ``eliminate_boxes``.

    Args:
        img_indices: Iterable of image ids understood by ``val_dataset.get_element_by_id``.

    Returns:
        Tuple ``(prediction_objects, post_processed_prediction_objects)``.
    """
    downscale = 4
    prediction_objects = []
    post_processed_prediction_objects = []
    with torch.no_grad():
        for img_idx in tqdm(img_indices):
            element = val_dataset.get_element_by_id(img_idx)
            pil_img = Image.open(element['img_path'])
            el_gt_annots = element['annots']
            # PIL reports size as (width, height); the original unpacked it as (h, w),
            # which swapped the axes handed to T.Resize for non-square images.
            w, h = pil_img.size

            img = T.PILToTensor()(pil_img.convert("RGB"))
            # T.Resize expects (height, width)
            resized_img = T.Resize((int(h/downscale), int(w/downscale)), antialias=None)(img)
            batch = [{'image':resized_img.to(device)}]
            model_result=object_proposer_model(batch)

            picked_boxes, picked_score = run_nms(
                model_result[0]['instances'].pred_boxes.tensor.to(torch.int).tolist(),
                model_result[0]['instances'].scores.tolist(),
                np.asarray(img),
                0.5
            )
            # Map boxes from the downscaled image back to original-image coordinates
            denormed_boxes = np.array(picked_boxes) * downscale

            # NOTE(review): assumes eliminate_boxes takes (boxes, image_height, image_width) — confirm.
            remaining_boxes, remaining_boxes_indices = eliminate_boxes(denormed_boxes, h, w, area_thres=0.6, eps=10, return_bbox_indices=True)
            remaining_scores = [picked_score[kept_idx] for kept_idx in remaining_boxes_indices]

            # Extract label and bounding box of annotations and convert bboxes to xyxy format
            el_gt_bboxes = np.array([annot[0] for annot in el_gt_annots])
            el_gt_bboxes[:, 2] = el_gt_bboxes[:, 0] + el_gt_bboxes[:, 2]
            el_gt_bboxes[:, 3] = el_gt_bboxes[:, 1] + el_gt_bboxes[:, 3]

            corresponding_bbox_index, unmatched_bbox_index = find_gt_bboxes_of_pred(denormed_boxes, picked_score, el_gt_bboxes)
            corresponding_post_bbox_index, unmatched_post_bbox_index = find_gt_bboxes_of_pred(remaining_boxes, remaining_scores, el_gt_bboxes)

            prediction_objects.extend(_build_prediction_objects(
                element, denormed_boxes, picked_score,
                corresponding_bbox_index, unmatched_bbox_index))
            post_processed_prediction_objects.extend(_build_prediction_objects(
                element, remaining_boxes, remaining_scores,
                corresponding_post_bbox_index, unmatched_post_bbox_index))

        return prediction_objects, post_processed_prediction_objects

In [None]:
# Smoke test: run the full proposal pipeline on two image ids before launching the long runs below.
print("Testing with some indicies")
pred_objects, processed_pred_objects = get_pred_objects([58, 59])

  0%|          | 0/5 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [None]:
# Train image split: dump both the NMS-only and the post-processed prediction objects.
# '../' is relative to the current working directory (the repo root after the %cd cell).
pred_objects, processed_pred_objects = get_pred_objects(train_img_idx)
dump_pred_objects(pred_objects, "../vith_res_train_pred_objects.json")
dump_pred_objects(processed_pred_objects, "../vith_res_train_processed_pred_objects.json")

In [None]:
# Val image split: same two dumps as for the train split.
pred_objects, processed_pred_objects = get_pred_objects(val_img_idx)
dump_pred_objects(pred_objects, "../vith_res_val_pred_objects.json")
dump_pred_objects(processed_pred_objects, "../vith_res_val_processed_pred_objects.json")

# 2nd Stage: DINOv2 Feature Extraction

This stage extracts features for all the bounding boxes found in the first stage.

Input:  Prediction objects generated in the first stage

Output: Prediction objects from the first stage, extended with the DINO features extracted for each predicted bounding box

In [None]:
import numpy as np
import cv2
import torch
import json
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import math
import torchvision.transforms as T
from PIL import Image

In [None]:
from notebooks.utils.dino_v2 import crop_object_with_bbox

In [None]:
# Dataset paths
rpc_main_path = '/content/SS2023_DI-Lab_Precitaste/retail_product_checkout'
# Dimension of the feature vector obtained from DINO
FEATURE_DIM = 384

In [None]:
# Re-created here so the second stage can run in a fresh runtime, independently of the first stage's cells.
from dataset.RPCDataset import RPCDataset
val_dataset = RPCDataset(rpc_main_path, 'val')
sub_classes = val_dataset.get_class_names()
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Split configuration; the first stage writes it as '../train_config.json' relative to the repo root.
with open('/content/train_config.json', 'r') as jfile:
    train_config = json.load(jfile)

In [None]:
from notebooks.utils.Prediction import Prediction, get_pred_objects_per_image, read_pred_objects_json, dump_pred_objects

In [None]:
from models.DINO import DINOFeatureExtractor
# Used by find_feature_vector_for_bbox through its predict() method.
feat_extractor = DINOFeatureExtractor()

Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


In [None]:
def find_feature_vector_for_bbox(pred_objects):
    """Attach a feature vector to every prediction object that has a usable bounding box.

    Objects are grouped per image so each image is read once. Objects without a predicted
    box are passed through unchanged; objects whose predicted box is narrower or shorter
    than ``min_side`` pixels are dropped.

    Args:
        pred_objects: Prediction objects produced by the first stage.

    Returns:
        List of the kept Prediction objects; those with a predicted box carry the
        feature vector returned by ``feat_extractor.predict`` (as a numpy array).
    """
    # The smallest gt object has a dimension of 50
    min_side = 50

    pred_objects_per_image = get_pred_objects_per_image(pred_objects)
    processed_pred_objects = []
    with torch.no_grad():
        for img_name in tqdm(pred_objects_per_image):

            img_pred_objects = pred_objects_per_image[img_name]
            img_path = img_pred_objects[0].img_path

            # Close the file deterministically and force three channels, as the first
            # stage does before inference (a no-op copy for images that are already RGB).
            with Image.open(img_path) as pil_img:
                np_img = np.asarray(pil_img.convert("RGB"))

            for pred_object in img_pred_objects:
                # Even there is no prediction for that still add the object
                if pred_object.pred_bbox is None:
                    processed_pred_objects.append(pred_object)
                    continue

                box_w = pred_object.pred_bbox[2] - pred_object.pred_bbox[0]
                box_h = pred_object.pred_bbox[3] - pred_object.pred_bbox[1]
                # NOTE(review): a dropped object also drops any gt box attached to it — confirm this is intended.
                if box_w < min_side or box_h < min_side:
                    continue

                cropped_object_np_img = crop_object_with_bbox(np_img, pred_object.pred_bbox)

                sample_feature = feat_extractor.predict(cropped_object_np_img)
                sample_feature = sample_feature.cpu().numpy()

                pred_object.add_feature_vector(sample_feature)
                processed_pred_objects.append(pred_object)
    return processed_pred_objects

In [None]:
# Run feature extraction over every first-stage dump, in the same order as before:
# train / train-processed / val / val-processed. Each result is written next to its
# input name with a "_2" suffix.
for dump_stem in (
    "vith_res_train_pred_objects",
    "vith_res_train_processed_pred_objects",
    "vith_res_val_pred_objects",
    "vith_res_val_processed_pred_objects",
):
    pred_objects = read_pred_objects_json("/content/{}.json".format(dump_stem))
    feature_extracted_pred_objects = find_feature_vector_for_bbox(pred_objects)
    dump_pred_objects(feature_extracted_pred_objects, "../{}_2.json".format(dump_stem))

# 2.1 Stage: Extracting Training Features

This stage extracts DINO features of the ground-truth bounding boxes of the training images; they are required for the next stage: bounding box label assignment.


In [None]:
!git clone https://github.com/uonat/SS2023_DI-Lab_Precitaste.git

In [None]:
%cd '/content/SS2023_DI-Lab_Precitaste'

In [1]:
import os
import json
import sys
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
import math
import numpy as np
import torchvision.transforms as T
from PIL import Image

from dataset.RPCDataset import RPCDataset
from notebooks.utils.dino_v2 import crop_object_with_bbox
from models.DINO import DINOFeatureExtractor

In [None]:
# NOTE(review): train_img_names is not used by the cells visible in this section — confirm whether it is still needed.
train_img_names = os.listdir('/kaggle/input/retail-product-checkout-dataset/train2019')
# This stage uses Kaggle input paths, unlike the /content paths of the earlier stages.
train_dataset = RPCDataset('/kaggle/input/retail-product-checkout-dataset', 'train')

In [None]:
# Validation portion of the same Kaggle dataset.
val_dataset = RPCDataset('/kaggle/input/retail-product-checkout-dataset', 'val')
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
feature_extractor = DINOFeatureExtractor()
# NOTE(review): the folder names mention "efficientnetv2" although the features come from
# DINOFeatureExtractor — the names look like leftovers; confirm before renaming.
feature_vector_parent_folder_path = 'rpc-train-efficientnetv2-feat'
object_img_parent_folder_path = 'rpc-train-efficientnetv2-imgs'

In [None]:
# For every training image: crop the first annotated object, extract its features and
# store them as <feature_vector_parent_folder_path>/<object_name>/<image stem>.npy
with torch.no_grad():
    for i in tqdm(range(train_dataset.get_num_imgs())):
        img_name = train_dataset.get_img_name_by_id(i)

        element = train_dataset.get_element_by_id(i)

        # Get object from bbox and make it compatible with dinov2
        # Only the first annotation of the image is used.
        bbox = element['annots'][0][0]
        object_name = element['annots'][0][1]
        np_cropped_object = crop_object_with_bbox(element['img'], bbox)

        feats = feature_extractor.predict(np_cropped_object)

        object_feature_folder_path = os.path.join(feature_vector_parent_folder_path, object_name)
        os.makedirs(object_feature_folder_path, exist_ok=True)
        # Strip only the extension. The previous replace('jpg', '') kept the dot (producing
        # "<stem>..npy") and would also remove a "jpg" occurring inside the stem.
        npy_name = os.path.splitext(img_name)[0]
        object_feature_file_path = os.path.join(object_feature_folder_path, '{}.npy'.format(npy_name))

        with open(object_feature_file_path, 'wb') as npfile:
            np.save(npfile, feats.cpu().numpy(), allow_pickle=True)

# 3rd Stage: Bounding Box Label Assignment

The final stage of the algorithm predicts a label for each bounding box found up to this point. Predictions are made by finding the label of the most similar item in the dataset. To find the most similar item, this stage requires training-object features extracted from the ground-truth boxes in the same way as in the second stage.


Along with the predictions, this stage also generates some diagnostics (e.g. similarity-distribution plots and side-by-side GT/prediction figures).


Input: Prediction objects from the previous stage. Dino features of the training objects.

Output: mAP score of the algorithm

In [None]:
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.patches as mpatches
import cv2

from dataset.utils import select_random, select_uniform, select_per_cam, load_npy_files
from notebooks.utils.Prediction import get_pred_objects_per_image, read_pred_objects_json, dump_pred_objects

In [None]:
# Folder with one sub-folder of .npy feature files per training class
training_features_path = '/kaggle/input/idp-repo/rpc-train-dino-feat-full-img/rpc-train-dino-feat-full-img/rpc-train-dino-feat-full-img'
rpc_val_set_path = '/kaggle/input/retail-product-checkout-dataset/val2019/'

# os.listdir returns entries in arbitrary order; sort them so the seeded class split
# built from this list is the same on every machine and run.
class_names = sorted(os.listdir(training_features_path))

In [None]:
# How many samples to select from each train class
N_SAMPLES = 1
# How to select training samples
# One of: 'n_per_cam', 'random', 'uniform' (the values build_datasets dispatches on)
SELECT_METHOD = 'random'
# random_state and test_size passed to train_test_split in make_split
RANDOM_SEED = 24
VAL_SIZE = 0.4
# How to divide validation classes
# base, vanilla
# base will divide main classes and assigns fine labels to parents
# vanilla will divide directly
# In base it is not possible to have 11_tissue in train and 12_tissue in val
SPLIT_METHOD = 'base'
# Dimension of the extracted features
FEATURE_DIM = 384

In [None]:
def make_split(split_method):
    """Split ``class_names`` into train and validation classes.

    Args:
        split_method: 'vanilla' splits the fine class names directly. 'base' splits the
            base names (fine name without its first '_'-separated token, remaining tokens
            concatenated) and assigns every fine class to the side of its base name.

    Returns:
        Tuple ``(train_classes, val_classes)`` of fine class names.

    Raises:
        ValueError: If ``split_method`` is neither 'vanilla' nor 'base'.
    """
    def base_name(cname):
        return ''.join(cname.split('_')[1:])

    if split_method == 'vanilla':
        train_classes, val_classes = train_test_split(class_names, test_size=VAL_SIZE, random_state=RANDOM_SEED)
    elif split_method == 'base':
        # Iteration order of a set of strings changes between interpreter runs (hash
        # randomization), which made the seeded split differ from run to run; sort first.
        base_class_names = sorted({base_name(cname) for cname in class_names})
        train_base_classes, val_base_classes = train_test_split(base_class_names, test_size=VAL_SIZE, random_state=RANDOM_SEED)
        train_base_classes = set(train_base_classes)
        val_base_classes = set(val_base_classes)
        train_classes = [cname for cname in class_names if base_name(cname) in train_base_classes]
        val_classes = [cname for cname in class_names if base_name(cname) in val_base_classes]
    else:
        raise ValueError("Unknown split_method {!r}; expected 'vanilla' or 'base'".format(split_method))
    return train_classes, val_classes

In [None]:
# BUILD_DATASETS
# Read feature vectors by the algorithm specified
def build_datasets(select_method, n_samples, tr_classes):
    """Load the selected feature files of every class and split them by class membership.

    Args:
        select_method: 'n_per_cam', 'random' or 'uniform'; chooses the helper that picks
            the files of each class folder.
        n_samples: Number passed to the selection helper.
        tr_classes: Class names that go to the train set; all other classes go to val.

    Returns:
        Tuple ``(train_features, train_labels, val_features, val_labels)`` of numpy arrays;
        the feature arrays have shape ``(-1, FEATURE_DIM)``.

    Raises:
        ValueError: If ``select_method`` is not one of the supported values.
    """
    selectors = {
        'n_per_cam': select_per_cam,
        'random': select_random,
        'uniform': select_uniform,
    }
    if select_method not in selectors:
        raise ValueError("Unknown select_method {!r}; expected one of {}".format(select_method, sorted(selectors)))
    select_fn = selectors[select_method]

    def stack(feature_groups):
        # Reshape per class before joining, so classes that yield different numbers of
        # samples do not produce a ragged array.
        if not feature_groups:
            return np.array([]).reshape(-1, FEATURE_DIM)
        return np.concatenate([np.asarray(group).reshape(-1, FEATURE_DIM) for group in feature_groups], axis=0)

    train_class_set = set(tr_classes)

    train_features = []
    val_features = []

    train_labels = []
    val_labels = []

    for class_name in tqdm(class_names):
        class_folder_path = os.path.join(training_features_path, class_name)
        img_names = [fname for fname in os.listdir(class_folder_path) if '.npy' in fname]

        selected_img_names = select_fn(img_names, n_samples)

        folder_features = load_npy_files(class_folder_path, selected_img_names)
        if class_name in train_class_set:
            train_features.append(folder_features)
            train_labels.extend([class_name] * len(folder_features))
        else:
            val_features.append(folder_features)
            val_labels.extend([class_name] * len(folder_features))

    np_train_features = stack(train_features)
    np_train_labels = np.array(train_labels)

    np_val_features = stack(val_features)
    np_val_labels = np.array(val_labels)
    return np_train_features, np_train_labels, np_val_features, np_val_labels

In [None]:
# Finds the most similar feature vector and similarity to that
# in the training set for a prediction object created in the previous steps
def predict_bboxes(pred_objects, tr_class_names, tr_features, tr_labels, val_features, val_labels):
    """Label each predicted box with its most similar reference feature and split by train/val.

    Objects with a predicted box are compared (cosine similarity) against the train
    references when their gt label is a train class or when they have no gt label at all,
    and against the val references otherwise. Objects without a predicted box are only
    routed to the train or val list according to their gt label.

    Returns:
        Tuple ``(train_pred_objects, val_pred_objects)``.
    """
    def assign_nearest(obj, ref_features, ref_labels):
        # Store the label of the best-matching reference and the similarity to it.
        query = np.array([obj.pred_features]).reshape(1,-1)
        sims = cosine_similarity(query, ref_features)
        nearest = np.argmax(sims)
        obj.add_classification_res(ref_labels[nearest], sims[0, nearest])

    train_pred_objects, val_pred_objects = [], []

    for obj in tqdm(pred_objects):
        # Missed prediction: nothing to classify, only route it
        if obj.pred_bbox is None:
            if obj.gt_label in tr_class_names:
                train_pred_objects.append(obj)
            else:
                val_pred_objects.append(obj)
            continue

        # No gt label (false positive) or a train-class gt label -> match against train
        if obj.gt_label is None or obj.gt_label in tr_class_names:
            assign_nearest(obj, tr_features, tr_labels)
            train_pred_objects.append(obj)
        else:
            assign_nearest(obj, val_features, val_labels)
            val_pred_objects.append(obj)

    return train_pred_objects, val_pred_objects

In [None]:
# Finds similarity distribution between elements of the same class and
# different class for the train split
def find_similarity_dist_train(tr_labels, tr_features):
    """Collect cosine similarities within each train class and between each class and the rest.

    Returns:
        Tuple ``(same_class_sims, cross_class_sims)`` of flat Python lists.
    """
    labels_arr = np.array(tr_labels)
    same_class_sims = []
    cross_class_sims = []

    for current_label in np.unique(tr_labels):
        is_current = labels_arr == current_label

        members = tr_features[is_current]
        within = cosine_similarity(members, members)
        across = cosine_similarity(members, tr_features[~is_current])
        # Self-similarities on the diagonal are overwritten with -1.
        # NOTE(review): these -1 entries stay in the returned list — confirm callers filter them.
        np.fill_diagonal(within, -1)

        same_class_sims.extend(within.flatten().tolist())
        cross_class_sims.extend(across.flatten().tolist())
    return same_class_sims, cross_class_sims

In [None]:
def find_sim_dist_pred_object(pred_objects, tr_features, tr_labels):
    """Collect cosine similarities between predicted-object features and the train references.

    For objects whose gt label occurs in ``tr_labels``, similarities to references of the
    same label and of other labels are collected separately. For objects with features but
    no gt label, similarities to all references are collected as false-positive similarities.

    Returns:
        Tuple ``(same_class_sims, cross_class_sims, fp_sims)`` of flat Python lists.
    """
    def as_query(obj):
        return np.array(obj.pred_features).reshape(1, -1)

    labels_arr = np.array(tr_labels)
    same_class_sims, cross_class_sims, fp_sims = [], [], []

    for obj in tqdm(pred_objects):
        # Objects without features contribute nothing
        if obj.pred_features is None:
            continue

        if obj.gt_label is None:
            fp_sims.extend(cosine_similarity(as_query(obj), tr_features).flatten().tolist())
        elif obj.gt_label in tr_labels:
            is_same = labels_arr == obj.gt_label
            query = as_query(obj)

            within = cosine_similarity(query, tr_features[is_same])
            across = cosine_similarity(query, tr_features[~is_same])

            same_class_sims.extend(within.flatten().tolist())
            cross_class_sims.extend(across.flatten().tolist())

    return same_class_sims, cross_class_sims, fp_sims

In [None]:
def plot_dist(title, dist1, label1, dist2=None, label2=None):
    """Show one or two distributions as violin plots on a single figure with a legend.

    Args:
        title: Figure title.
        dist1, label1: First distribution and its legend label.
        dist2, label2: Optional second distribution and its legend label.
    """
    plt.figure(figsize=(9,6))

    series = [(dist1, label1)]
    if dist2 is not None:
        series.append((dist2, label2))

    legend_handles, legend_names = [], []
    for values, name in series:
        parts = plt.violinplot(values)
        # Violin plots carry no legend entry of their own; build a patch in the body's colour
        face_color = parts["bodies"][0].get_facecolor().flatten()
        legend_handles.append(mpatches.Patch(color=face_color))
        legend_names.append(name)

    plt.title(title)
    plt.legend(legend_handles, legend_names, loc=2)
    plt.show()

In [None]:
def eval_pred_objects(per_image_objects, th):
    """Count TP / FP / FN per image at one class-score threshold.

    A prediction counts when it has a box, a predicted label and a class score above ``th``:
    it is a TP when it has a gt box with the same label, otherwise an FP. Every other object
    that carries a gt label counts as an FN.

    Args:
        per_image_objects: Mapping of image name to its list of prediction objects.
        th: Class-score threshold.

    Returns:
        List with one dict per image holding 'img_name', 'TH', 'FP', 'TP', 'FN',
        'Precision' and 'Recall'; precision/recall are None when their denominator is 0.
    """
    all_image_stats = []
    for img_name in per_image_objects:
        image_stats = {'img_name': img_name, 'TH': th, 'FP': 0, 'TP': 0, 'FN': 0, 'Precision': 0.0, 'Recall': 0.0}

        for pred_object in per_image_objects[img_name]:
            # Check for None before comparing: an unclassified object (class_score None)
            # used to raise TypeError here; it is now handled like a below-threshold one.
            is_accepted = (
                pred_object.pred_bbox is not None
                and pred_object.pred_label is not None
                and pred_object.class_score is not None
                and pred_object.class_score > th
            )

            if is_accepted:
                # Bounding box is found extra by the detection network
                if pred_object.gt_bbox is None:
                    image_stats['FP'] += 1
                # Bounding box misclassified by the knn
                elif pred_object.gt_label != pred_object.pred_label:
                    image_stats['FP'] += 1
                # True detection
                else:
                    image_stats['TP'] += 1
            # Bounding box is missed (or rejected by the threshold)
            elif pred_object.gt_label is not None:
                image_stats['FN'] += 1

        n_predicted = image_stats['TP'] + image_stats['FP']
        image_stats['Precision'] = image_stats['TP'] / n_predicted if n_predicted > 0 else None

        n_gt = image_stats['TP'] + image_stats['FN']
        image_stats['Recall'] = image_stats['TP'] / n_gt if n_gt > 0 else None

        all_image_stats.append(image_stats)
    return all_image_stats

def eval_with_diff_th(per_image_objects):
    """Evaluate the objects at 20 evenly spaced thresholds from 0 to 0.95.

    Returns:
        The per-image stats of all thresholds concatenated into one list.
    """
    thresholds = np.linspace(0, 0.95, 20)
    return [
        image_stats
        for th in thresholds
        for image_stats in eval_pred_objects(per_image_objects, th)
    ]

In [None]:
def export_per_img_pred_objects_with_th(per_image_pred_objects, gt_path, pred_path, bounding_box_th, class_score_th, use_base_classes=False):
    """Write one prediction txt file and one gt txt file per image.

    Prediction lines are ``label class_score x y w h`` (the xyxy predicted box converted to
    xywh) and only include objects whose box score and class score exceed the thresholds.
    Gt lines are ``label x y w h``.

    Args:
        per_image_pred_objects: Mapping of image name to its prediction objects.
        gt_path: Output folder of the gt files (created if missing).
        pred_path: Output folder of the prediction files (created if missing).
        bounding_box_th: Threshold on ``pred_score_bbox``.
        class_score_th: Threshold on ``class_score``.
        use_base_classes: When True, labels are written without their first
            '_'-separated token (remaining tokens concatenated).
    """
    def output_label(raw_label):
        if use_base_classes:
            return ''.join(raw_label.split('_')[1:])
        return raw_label

    for out_dir in (gt_path, pred_path):
        os.makedirs(out_dir, exist_ok=True)

    for img_name in per_image_pred_objects:
        img_objects = per_image_pred_objects[img_name]
        txt_name = img_name.split('.')[0] + '.txt'

        with open(os.path.join(pred_path, txt_name), "w") as pred_file:
            for obj in img_objects:
                if obj.pred_bbox is None:
                    continue
                if not (obj.pred_score_bbox > bounding_box_th):
                    continue
                if obj.class_score is None or not (obj.class_score > class_score_th):
                    continue

                x1, y1, x2, y2 = obj.pred_bbox
                pred_file.write("{} {} {} {} {} {}\n".format(
                    output_label(obj.pred_label), obj.class_score, x1, y1, x2-x1, y2-y1))

        with open(os.path.join(gt_path, txt_name), "w") as gt_file:
            for obj in img_objects:
                if obj.gt_bbox is None:
                    continue

                # GT bbox is in the form of xywh
                x1, y1, w, h = obj.gt_bbox
                gt_file.write("{} {} {} {} {}\n".format(output_label(obj.gt_label), x1, y1, w, h))

In [None]:
def run_eval_script(gt_folder_path, pred_folder_path, save_folder_path):
    """Run the object-detection-metrics evaluation script on exported gt/prediction folders.

    The command is passed as an argument list instead of a shell string, so paths containing
    spaces or shell metacharacters reach the script unchanged. A failing run is reported
    instead of being silently ignored.

    Args:
        gt_folder_path: Folder with the gt txt files (``-gt``).
        pred_folder_path: Folder with the prediction txt files (``-det``).
        save_folder_path: Folder the script writes its results to (``-sp``).
    """
    import subprocess

    command = [
        "python", "evaluation/object-detection-metrics/evaluate.py",
        "-gt", str(gt_folder_path),
        "-det", str(pred_folder_path),
        "-sp", str(save_folder_path),
    ]
    script = "python evaluation/object-detection-metrics/evaluate.py -gt {} -det {} -sp {}".format(gt_folder_path, pred_folder_path, save_folder_path)
    print(script)
    try:
        completed = subprocess.run(command)
    except OSError as err:
        # e.g. no "python" executable on PATH
        print("Could not start evaluation script: {}".format(err))
        return
    if completed.returncode != 0:
        print("Evaluation script exited with status {}".format(completed.returncode))

def print_mAP(eval_res_path):
    """Print the mean over classes of the per-class means of the evaluation results.

    Reads ``eval_results_per_image.csv`` from ``eval_res_path``, averages the numeric columns
    per 'class', then averages those per-class values and prints the resulting Series.
    """
    per_image_results = pd.read_csv(os.path.join(eval_res_path, 'eval_results_per_image.csv'))
    per_class_means = per_image_results.groupby(['class']).mean(numeric_only=True)
    overall_means = per_class_means.mean()
    print(overall_means)

In [None]:
def get_bboxes_from_txt(txtfile_path):
    """Read an exported annotation txt file.

    Each line is split on single spaces; the last four fields are parsed as floats and the
    first field (the label) is appended after them.

    Returns:
        List of ``[v1, v2, v3, v4, label]`` lists, one per line.
    """
    with open(txtfile_path, 'r') as handle:
        rows = [raw_line.split(" ") for raw_line in handle]
    return [[float(token) for token in fields[-4:]] + [fields[0]] for fields in rows]

def draw_bboxes(img_path, gt_boxes, pred_boxes, train_classes):
    """Draw gt and predicted boxes on two separate copies of an image.

    Args:
        img_path: Path of the image read with ``cv2.imread``.
        gt_boxes, pred_boxes: Lists of ``[x, y, w, h, label]`` entries.
        train_classes: Labels in this collection get the train colour, all others the val colour.

    Returns:
        Tuple ``(gt_drawn_img, pred_drawn_img)``.
    """
    # Draw parameters
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2
    thickness = 2
    text_color = (113, 24, 200)
    tr_bbox_color = (12, 154, 242)
    val_bbox_color = (164, 12, 242)

    def annotate(canvas, boxes):
        # Draws in place on canvas and returns it
        for entry in boxes:
            start_x, start_y, bbox_w, bbox_h = (int(value) for value in entry[:4])
            label = entry[-1]
            bbox_color = tr_bbox_color if label in train_classes else val_bbox_color
            cv2.putText(canvas, label, (start_x, start_y-10), font, font_scale, text_color, thickness)
            cv2.rectangle(canvas, (start_x, start_y), (start_x + bbox_w, start_y+bbox_h), bbox_color, 4)
        return canvas

    org_img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)

    gt_drawn_img = annotate(org_img.copy(), gt_boxes)
    pred_drawn_img = annotate(org_img.copy(), pred_boxes)
    return gt_drawn_img, pred_drawn_img

def plot_and_save_fig(gt_img, pred_img, fig_save_path):
    """Save a side-by-side figure of the gt image ('GT') and the prediction image ('Pred')."""
    fig, axis = plt.subplots(1, 2, figsize=(12, 5))

    panels = ((gt_img, 'GT'), (pred_img, 'Pred'))
    for panel_axis, (picture, caption) in zip(axis, panels):
        panel_axis.imshow(picture)
        panel_axis.set_title(caption)
        # Hide the tick marks; only the image matters
        panel_axis.set_xticks([])
        panel_axis.set_yticks([])

    plt.savefig(fig_save_path)
    plt.close()

def visualize_samples(gt_annots_path, pred_annots_path, eval_res_path, fig_save_path, train_classes,
                      img_dir='/kaggle/input/retail-product-checkout-dataset/val2019/', window_size=10):
    """Save GT-vs-prediction figures for the lowest, middle and highest ranked images.

    Reads ``eval_results_per_image.csv`` from ``eval_res_path``, drops rows
    whose 'AP' is NaN, sorts the rest by ('Precision', 'Recall') ascending and
    renders up to ``window_size`` images from the start, the middle and the
    end of that ranking.

    Args:
        gt_annots_path: Directory with one ``<img_name>.txt`` ground-truth file per image.
        pred_annots_path: Directory with one ``<img_name>.txt`` prediction file per image.
        eval_res_path: Directory containing ``eval_results_per_image.csv``.
        fig_save_path: Output directory for the figures; created if missing.
        train_classes: Container of labels, forwarded to ``draw_bboxes``.
        img_dir: Directory holding the ``<img_name>.jpg`` images. Defaults to
            the previously hard-coded Kaggle dataset location.
        window_size: Number of images taken from each of the three windows.
    """
    results_csv_path = os.path.join(eval_res_path, 'eval_results_per_image.csv')
    results_df = pd.read_csv(results_csv_path)

    scored_df = results_df[~results_df['AP'].isna()]
    ranked_df = scored_df.sort_values(by=['Precision', 'Recall'])
    os.makedirs(fig_save_path, exist_ok=True)

    # Window starts must be derived from the frame that is actually sliced
    # (the filtered one), and clamped so a short ranking never yields a
    # negative start.
    n_ranked = len(ranked_df)
    window_starts = [0, n_ranked // 2, max(n_ranked - window_size, 0)]

    # Windows overlap when there are few rows; figures are named by row index,
    # so drawing the same row twice would only rewrite the same file.
    drawn_rows = set()
    for start in window_starts:
        for i, row in ranked_df.iloc[start:start + window_size].iterrows():
            if i in drawn_rows:
                continue
            drawn_rows.add(i)

            img_name = row['img_name']
            txt_name = img_name + '.txt'

            pred_txt_path = os.path.join(pred_annots_path, txt_name)
            gt_txt_path = os.path.join(gt_annots_path, txt_name)

            pred_boxes = get_bboxes_from_txt(pred_txt_path)
            gt_boxes = get_bboxes_from_txt(gt_txt_path)

            img_path = os.path.join(img_dir, img_name + '.jpg')

            gt_img, pred_img = draw_bboxes(img_path, gt_boxes, pred_boxes, train_classes)
            fig_img_save_path = os.path.join(fig_save_path, "fig_{}.png".format(i))
            plot_and_save_fig(gt_img, pred_img, fig_img_save_path)

In [None]:
def _export_and_evaluate(per_image_objects, directory_prefix, split_tag, base_only):
    """Export one set of per-image objects, run the eval script on it and print its mAP.

    Output directories are created under ``/kaggle/working/<directory_prefix>/``
    and named ``<split_tag>_{gt_annot,pred_annot,pred_res,pred_figures}_<variant>/``
    where ``<variant>`` is 'base' when ``base_only`` is true and 'full' otherwise.

    Args:
        per_image_objects: Output of ``get_pred_objects_per_image``.
        directory_prefix: Sub-directory of ``/kaggle/working/`` for this run.
        split_tag: Directory name prefix ('tr', 'val' or 'all').
        base_only: Forwarded as the last argument of
            ``export_per_img_pred_objects_with_th``; presumably restricts the
            export to the base (train) classes -- TODO confirm.

    Returns:
        Tuple ``(gt_annot_dir, pred_annot_dir, res_save_dir, fig_save_dir)``.
        ``fig_save_dir`` is only a path; nothing is written to it here.
    """
    variant_tag = 'base' if base_only else 'full'
    out_root = os.path.join('/kaggle/working/', directory_prefix)

    gt_annot_dir = os.path.join(out_root, '{}_gt_annot_{}/'.format(split_tag, variant_tag))
    pred_annot_dir = os.path.join(out_root, '{}_pred_annot_{}/'.format(split_tag, variant_tag))
    res_save_dir = os.path.join(out_root, '{}_pred_res_{}/'.format(split_tag, variant_tag))
    fig_save_dir = os.path.join(out_root, '{}_pred_figures_{}/'.format(split_tag, variant_tag))

    # 0.2 and 0.4 are forwarded unchanged; their meaning is defined by
    # export_per_img_pred_objects_with_th.
    export_per_img_pred_objects_with_th(per_image_objects, gt_annot_dir, pred_annot_dir, 0.2, 0.4, base_only)
    run_eval_script(gt_annot_dir, pred_annot_dir, res_save_dir)
    print_mAP(res_save_dir)

    return gt_annot_dir, pred_annot_dir, res_save_dir, fig_save_dir


def run_algo(select_method, n_samples, directory_prefix):
    """Run label assignment, plot diagnostics, and export/evaluate all splits.

    Steps, in order:
      1. Split classes with ``make_split(SPLIT_METHOD)`` and build the feature
         datasets with ``build_datasets``.
      2. Load the pre-computed prediction objects (train + val JSON files) and
         label them with ``predict_bboxes``.
      3. Plot similarity distributions for the train-class prediction objects.
      4. Plot mean precision/recall of the train-class objects over a grid of
         thresholds.
      5. Evaluate the val-class objects at threshold 0.4 and print mean
         precision/recall.
      6. Export + evaluate 'tr', 'val' and 'all' objects in both 'full' and
         'base' variants; for 'all', also save visualisation figures.

    Args:
        select_method: Forwarded to ``build_datasets``.
        n_samples: Forwarded to ``build_datasets``.
        directory_prefix: Sub-directory of ``/kaggle/working/`` that receives
            every exported file of this run.
    """
    train_classes, _val_classes = make_split(SPLIT_METHOD)
    np_train_features, np_train_labels, np_val_features, np_val_labels = build_datasets(select_method,
                                                                                        n_samples,
                                                                                        train_classes)

    print("Training set with size: {} label size: {}".format(np_train_features.shape, np_train_labels.shape))
    print("Val set with size: {} label size: {}".format(np_val_features.shape, np_val_labels.shape))

    train_pred_objects = read_pred_objects_json("/kaggle/input/di-lab-idea2-artifacts/Dino-Full-Img/vith_res_train_processed_pred_objects_full_img_2.json")
    val_pred_objects = read_pred_objects_json("/kaggle/input/di-lab-idea2-artifacts/Dino-Full-Img/vith_res_val_processed_pred_objects_full_img_2.json")

    all_pred_objects = train_pred_objects + val_pred_objects

    print("Loaded prediction objects")
    # Make a prediction for all prediction objects, then divide them by classes
    train_pred_objects, val_pred_objects = predict_bboxes(all_pred_objects, train_classes, np_train_features,
                                                          np_train_labels, np_val_features, np_val_labels)
    print("Prediction completed")

    # For each train object in validation split, find similarity to train classes and fn
    tr_pred_object_inner_sim, tr_pred_object_inter_sim, fn_sim = find_sim_dist_pred_object(train_pred_objects, np_train_features, np_train_labels)

    # Subsample the larger distributions to the size of the same-class one so
    # the plotted histograms are comparable. NOTE: unseeded, so the plotted
    # subsets differ between runs.
    sample_to_vis = len(tr_pred_object_inner_sim)
    subset_dist_2 = np.random.choice(tr_pred_object_inter_sim, size=sample_to_vis)
    plot_dist("Train split similarity dist", tr_pred_object_inner_sim, "Same", subset_dist_2, "Diff")
    subset_dist3 = np.random.choice(fn_sim, size=sample_to_vis)
    plot_dist("Train split FN similarities", subset_dist3, "Similarities")

    # Group pred objects by image name for metric calculations
    train_per_image_objects = get_pred_objects_per_image(train_pred_objects)

    # Find metrics for changing th in the training subset
    tr_image_stats = eval_with_diff_th(train_per_image_objects)
    tr_metric_df = pd.DataFrame(tr_image_stats)

    # Calculate precision and recall on different thresholds and visualise the metrics
    thresholds = np.linspace(0.0, 0.95, 20)
    precisions = []
    recalls = []
    for th in thresholds:
        # Exact float comparison: matches only if eval_with_diff_th stored the
        # very same grid values in 'TH' -- TODO confirm.
        at_th_df = tr_metric_df[tr_metric_df['TH'] == th]
        precisions.append(at_th_df['Precision'].mean())
        recalls.append(at_th_df['Recall'].mean())

    plt.figure(figsize=(8, 4))
    plt.xticks(ticks=np.arange(len(thresholds)), labels=[np.round(i, 2) for i in thresholds])
    plt.plot(precisions, label='Precision')
    plt.plot(recalls, label='Recall')
    plt.title('Precision-Recall on bbox score')
    plt.legend()
    plt.show()

    # Group pred objects by image name for evaluating
    val_per_image_objects = get_pred_objects_per_image(val_pred_objects)
    val_image_stats = eval_pred_objects(val_per_image_objects, 0.4)

    val_metric_df = pd.DataFrame(val_image_stats)
    mean_precision = val_metric_df['Precision'].mean()
    mean_recall = val_metric_df['Recall'].mean()
    # '{:.2f}' (two decimals); the previous '{:2f}' only set a minimum width.
    print("Val Set: Precision: {:.2f}, Recall: {:.2f}".format(mean_precision, mean_recall))

    # Export training preds, then validation preds, each as 'full' then 'base'
    for split_tag, per_image_objects in (('tr', train_per_image_objects), ('val', val_per_image_objects)):
        for base_only in (False, True):
            _export_and_evaluate(per_image_objects, directory_prefix, split_tag, base_only)

    # Export again to visualize this time export all
    all_per_image_objects = get_pred_objects_per_image(val_pred_objects + train_pred_objects)

    for base_only in (False, True):
        gt_annot_dir, pred_annot_dir, res_save_dir, fig_save_dir = _export_and_evaluate(
            all_per_image_objects, directory_prefix, 'all', base_only)
        visualize_samples(gt_annot_dir, pred_annot_dir, res_save_dir, fig_save_dir, train_classes)

In [None]:
# Single end-to-end run: 'random' selection method, n_samples=1, outputs under the "setup_1" prefix.
run_algo(select_method='random', n_samples=1, directory_prefix="setup_1")