## Runs

#### Baseline using random noise:
```py
tbdc = 0.09724496333736742
rbdc = 0.04066706907997272
```

#### decoder_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_16
```py
tbdc = 0.26097236334297624
rbdc = 0.2478526008210725
```

#### decoder_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_16_v2 (trained on Shanghai)
```py
tbdc = 0.10937038211545225
rbdc = 0.1109098488822660
```

#### decoder_state_dict_adamw_40e_numblocks_2_ffdim_4096_heads_16_mseloss_0.05
```py
tbdc = 0.27224404650189515
rbdc = 0.2695939567044897
```

#### decoder_v2_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_1 (v3)
```py
tbdc = 0.28874330979762747
rbdc = 0.28412440159860
```

#### decoder_v2_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_32.pth
```py
tbdc = 0.2881355018110197
rbdc = 0.2746187377346972

Macro AUC: 0.695; Micro AUC: 0.668
```

#### decoder_v2_t8_preds_yolov8_adamw_numblocks_1_ffdim_4096_heads_16.pth with YOLOv8 preds
```py
tbdc = 0.3000759355497022
rbdc = 0.2961878808637868

Macro AUC:0.66?; Micro AUC:0.625

Baseline: 
tbdc = 0.000
rbdc = 0.000
```

#### autoencoder_ft_t8_decoder_v2_epoch_5 with DETR 101 DC5 preds, obj_dect_avenue_detr_resnet101_dc5 obj
```py
tbdc = 0.18709753619342853
rbdc = 0.20189405352643836

Macro AUC:0.660 ; Micro AUC0.664:
```

#### autoencoder_ft_t8_decoder_v2_trained_avenue_loss_0.003 with DETR 101 DC5 Preds, obj_dect_avenue_detr_resnet101_dc5 obj
```py
tbdc = 0.17969921324730181
rbdc = 0.18816193287500632

Macro AUC:0.653 ; Micro AUC 0.655:
`````

#### autoencoder_ft_t8_decoder_v2_yolov8_epoch_17 with YOLO v8 pre (no predict params)
```py
tbdc = 0.14231983104340193
rbdc = 0.15282977450316226
Macro AUC: 0.727; Micro AUC: 0.43

```

#### autoencoder_ft_t8_decoder_v2_yolov8_new_epoch_20_mseloss_0.007056034170091152
```py
tbdc = 0.18441444414961888
rbdc = 0.183827570099877
Macro AUC: 0.5826279864736656; Micro AUC: 0.59926857078780697
```


In [1]:
import sys
import glob
import pickle
import os
import random
from enum import Enum

import cv2 as cv
import scipy.io as sio
import pickle
import pdb
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import metrics
from PIL import Image

sys.path.append('./abnorm_event_detect/evaluation/')
os.chdir('abnorm_event_detect')

from evaluation.merge_tracks import ContinuousTrack

os.chdir('..')

!pwd

/mnt/c/Users/Sergiu/Desktop/AnomalyDetection


## Define Functions for RBDC, TBDC

In [2]:
class TrackState(Enum):
    CREATED = "created"
    UPDATED = "updated"
    CLOSED = "closed"

class Track:
    def __init__(self, start_idx=0, end_idx=None, mask=0, video_name=""):
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.bboxes = {}
        self.mask = mask
        self.state = TrackState.CREATED
        self.video_name = video_name

    def __str__(self):
        return str(self.__class__) + ": " + str(self.__dict__)

class AnomalyDetection:
    def __init__(self, frame_idx, bbox, score, video_name, track_id=-1):
        self.frame_idx = frame_idx
        self.bbox = bbox
        self.score = score
        self.video_name = video_name
        self.track_id = track_id

    def __str__(self):
        return str(self.__class__) + ": " + str(self.__dict__)

In [3]:
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the intersection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou

In [4]:
def get_loc_v3(video_info_path):
    file_names = os.listdir(os.path.join(video_info_path, "meta_0.800"))
    video_loc_v3 = []
    for file_name in file_names:
        loc_v3 = np.loadtxt(os.path.join(video_info_path, "meta_0.800", file_name))

        video_loc_v3.append(loc_v3[:5])
    return video_loc_v3

In [5]:
def get_predicted_anomalies_per_video(output_path, video_name, size):
    """
    :param output_path
    :param video_name
    :param size = H, W
    """

    # compute anomaly detection from predicted heat map
    loc_v3 = np.load(os.path.join(args.output_folder_base, args.database_name, "test",
                                  video_name, "loc_v3_%f.npy" % args.lambda_))
    # locv3 format [[frame_idx, x_min, y_min, x_max, y_max]]

    ab_event = np.load(os.path.join(args.output_folder_base, args.database_name, "test",
                                    video_name, "ab_event3_%f.npy" % args.lambda_))

    ab_event_resized = []
    for i in range(ab_event.shape[2]):
        res = cv.resize(ab_event[:, :, i], (size[1], size[0]))
        ab_event_resized.append(res)

    pred_anomalies_detected = []
    for idx in range(len(loc_v3)):
        frame_idx = int(loc_v3[idx][0])
        bbox = loc_v3[idx][1:]
        bbox = [int(b) for b in bbox]
        crop_frame = ab_event_resized[frame_idx][bbox[1]: bbox[3], bbox[0]: bbox[2]]
        pred_anomalies_detected.append(AnomalyDetection(frame_idx, bbox, crop_frame.max(), video_name))

    return pred_anomalies_detected

In [6]:
def get_all_predicted_anomalies(output_path, resolution=()):
    video_names = os.listdir(output_path)
    video_names.sort()
    pred_anomalies = []
    num_frames = 0
    for video_name in video_names:
        if os.path.isfile(os.path.join(output_path, video_name)):
            continue
        video_meta_data = pickle.load(open(os.path.join(output_path, video_name, "video_meta_data.pkl"), 'rb'))
        video_size = (video_meta_data['height'], video_meta_data['width'])  # h, w
        num_frames_video = video_meta_data['num_frames']
        pred = get_predicted_anomalies_per_video(output_path, video_name, video_size)
        # save_txt_predicted(pred, video_name)
        pred_anomalies += pred
        num_frames += num_frames_video

    return pred_anomalies, num_frames

In [7]:
def compute_iou(pred_anomaly, gt_anomalies_per_frame):
    max_iou = 0
    idx = -1
    for index, gt_anomaly in enumerate(gt_anomalies_per_frame):
        iou = bb_intersection_over_union(gt_anomaly.bbox, pred_anomaly.bbox)
        if max_iou < iou:
            max_iou = iou
            idx = index

    return max_iou, idx

In [8]:
def get_matching_gt_indices(pred_anomaly, gt_anomalies_per_frame, beta):
    indices = []
    for index, gt_anomaly in enumerate(gt_anomalies_per_frame):
        iou = bb_intersection_over_union(gt_anomaly.bbox, pred_anomaly.bbox)
        if iou >= beta:
            indices.append(index)

    return indices

In [9]:
def compute_tbdr(gt_tracks, num_matched_detections_per_track, alpha):
    percentages = np.array([x / len(y.bboxes) for x, y in zip(num_matched_detections_per_track, gt_tracks)])
    return np.sum(percentages >= alpha) / len(num_matched_detections_per_track)

## V2

In [10]:
import glob
from enum import Enum

import cv2 as cv
import scipy.io as sio
from sklearn import metrics
import pickle
import pdb
import numpy as np
import os
import matplotlib.pyplot as plt

from evaluation.merge_tracks import ContinuousTrack


class TrackState(Enum):
    CREATED = "created"
    UPDATED = "updated"
    CLOSED = "closed"


class Track:
    def __init__(self, start_idx=0, end_idx=None, mask=0, video_name=""):
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.bboxes = {}
        self.mask = mask
        self.state = TrackState.CREATED
        self.video_name = video_name

    def __str__(self):
        return str(self.__class__) + ": " + str(self.__dict__)

class AnomalyDetection:
    def __init__(self, frame_idx, bbox, score, video_name, track_id=-1):
        self.frame_idx = frame_idx
        self.bbox = bbox
        self.score = score
        self.video_name = video_name
        self.track_id = track_id

    def __str__(self):
        return str(self.__class__) + ": " + str(self.__dict__)


def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the intersection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou


def get_loc_v3(video_info_path):
    file_names = os.listdir(os.path.join(video_info_path, "meta_0.800"))
    video_loc_v3 = []
    for file_name in file_names:
        loc_v3 = np.loadtxt(os.path.join(video_info_path, "meta_0.800", file_name))

        video_loc_v3.append(loc_v3[:5])
    return video_loc_v3


def get_predicted_anomalies_per_video(output_path, video_name, size):
    """
    :param output_path
    :param video_name
    :param size = H, W
    """

    # compute anomaly detection from predicted heat map
    loc_v3 = np.load(os.path.join(args.output_folder_base, args.database_name, "test",
                                  video_name, "loc_v3_%f.npy" % args.lambda_))
    # locv3 format [[frame_idx, x_min, y_min, x_max, y_max]]

    ab_event = np.load(os.path.join(args.output_folder_base, args.database_name, "test",
                                    video_name, "ab_event3_%f.npy" % args.lambda_))

    ab_event_resized = []
    for i in range(ab_event.shape[2]):
        res = cv.resize(ab_event[:, :, i], (size[1], size[0]))
        ab_event_resized.append(res)

    pred_anomalies_detected = []
    for idx in range(len(loc_v3)):
        frame_idx = int(loc_v3[idx][0])
        bbox = loc_v3[idx][1:]
        bbox = [int(b) for b in bbox]
        crop_frame = ab_event_resized[frame_idx][bbox[1]: bbox[3], bbox[0]: bbox[2]]
        pred_anomalies_detected.append(AnomalyDetection(frame_idx, bbox, crop_frame.max(), video_name))

    return pred_anomalies_detected


def save_txt_predicted(preds, video_name):
    predictions = []
    for pred in preds:  # [frame_id, x_min, y_min, x_max, y_max, anomaly_score]
        predictions.append([pred.frame_idx] + pred.bbox + [pred.score])
    np.savetxt(f'avenue/det/{video_name}.txt', predictions, delimiter=',')


def get_all_predicted_anomalies(output_path):
    video_names = os.listdir(output_path)
    video_names.sort()
    pred_anomalies = []
    num_frames = 0
    for video_name in video_names:
        if os.path.isfile(os.path.join(output_path, video_name)):
            continue
        video_meta_data = pickle.load(open(os.path.join(output_path, video_name, "video_meta_data.pkl"), 'rb'))
        video_size = (video_meta_data['height'], video_meta_data['width'])  # h, w
        num_frames_video = video_meta_data['num_frames']
        pred = get_predicted_anomalies_per_video(output_path, video_name, video_size)
        # save_txt_predicted(pred, video_name)
        pred_anomalies += pred
        num_frames += num_frames_video

    return pred_anomalies, num_frames


def compute_iou(pred_anomaly, gt_anomalies_per_frame):
    max_iou = 0
    idx = -1
    for index, gt_anomaly in enumerate(gt_anomalies_per_frame):
        iou = bb_intersection_over_union(gt_anomaly.bbox, pred_anomaly.bbox)
        if max_iou < iou:
            max_iou = iou
            idx = index

    return max_iou, idx


def get_matching_gt_indices(pred_anomaly, gt_anomalies_per_frame, beta):
    indices = []
    for index, gt_anomaly in enumerate(gt_anomalies_per_frame):
        iou = bb_intersection_over_union(gt_anomaly.bbox, pred_anomaly.bbox)
        if iou >= beta:
            indices.append(index)

    return indices


def compute_tbdr(gt_tracks, num_matched_detections_per_track, alpha):
    percentages = np.array([x / len(y.bboxes) for x, y in zip(num_matched_detections_per_track, gt_tracks)])
    return np.sum(percentages >= alpha) / len(num_matched_detections_per_track)


def compute_fpr_rbdr(pred_anomalies_detected: [AnomalyDetection], gt_anomalies: [AnomalyDetection], all_gt_tracks,
                     num_frames, num_tracks, alpha=0.1, beta=0.1):
    num_matched_detections_per_track = [0] * num_tracks

    # TODO: add pixel level IOU
    num_detected_anomalies = len(pred_anomalies_detected)
    gt_anomaly_video_per_frame_dict = {}
    found_gt_anomaly_video_per_frame_dict = {}

    for anomaly in gt_anomalies:
        anomalies_per_frame = gt_anomaly_video_per_frame_dict.get((anomaly.video_name, anomaly.frame_idx), None)
        if anomalies_per_frame is None:
            gt_anomaly_video_per_frame_dict[(anomaly.video_name, anomaly.frame_idx)] = [anomaly]
            found_gt_anomaly_video_per_frame_dict[(anomaly.video_name, anomaly.frame_idx)] = [0]
        else:
            gt_anomaly_video_per_frame_dict[(anomaly.video_name, anomaly.frame_idx)].append(anomaly)
            found_gt_anomaly_video_per_frame_dict[(anomaly.video_name, anomaly.frame_idx)].append(0)

    tp = np.zeros(num_detected_anomalies)
    fp = np.zeros(num_detected_anomalies)
    tbdr = np.zeros(num_detected_anomalies)
    remove_idx = []
    pred_anomalies_detected.sort(key=lambda anomaly_detection: anomaly_detection.score, reverse=True)
    for idx, pred_anomaly in enumerate(pred_anomalies_detected):
        gt_anomalies_per_frame = gt_anomaly_video_per_frame_dict.get((pred_anomaly.video_name, pred_anomaly.frame_idx),
                                                                     None)

        if gt_anomalies_per_frame is None:
            fp[idx] = 1
        else:
            matching_gt_bboxes_indices = get_matching_gt_indices(pred_anomaly, gt_anomalies_per_frame, beta)
            if len(matching_gt_bboxes_indices) > 0:
                non_matched_indices = []
                for matched_ind in matching_gt_bboxes_indices:
                    if found_gt_anomaly_video_per_frame_dict.get((pred_anomaly.video_name,
                                                                  pred_anomaly.frame_idx))[matched_ind] == 0:
                        non_matched_indices.append(matched_ind)
                        found_gt_anomaly_video_per_frame_dict.get((pred_anomaly.video_name, pred_anomaly.frame_idx))[
                            matched_ind] = 1
                        num_matched_detections_per_track[gt_anomalies_per_frame[matched_ind].track_id] += 1

                tp[idx] = len(non_matched_indices)

            else:
                fp[idx] = 1

        tbdr[idx] = compute_tbdr(all_gt_tracks, num_matched_detections_per_track, alpha)

    cum_false_positive = np.cumsum(fp)
    cum_true_positive = np.cumsum(tp)
    # add the point (0, 0) for each vector
    cum_false_positive = np.concatenate(([0], cum_false_positive))
    cum_true_positive = np.concatenate(([0], cum_true_positive))
    tbdr = np.concatenate(([0], tbdr))

    rbdr = cum_true_positive / len(gt_anomalies)
    fpr = cum_false_positive / num_frames

    idx_1 = np.where(fpr <= 1)[0][-1] + 1

    if fpr[idx_1 - 1] != 1:
        print('fpr does not reach 1')
        rbdr = np.insert(rbdr, idx_1, rbdr[idx_1 - 1])
        tbdr = np.insert(tbdr, idx_1, tbdr[idx_1 - 1])
        fpr = np.insert(fpr, idx_1, 1)
        idx_1 += 1

    tbdc = metrics.auc(fpr[:idx_1], tbdr[:idx_1])
    rbdc = metrics.auc(fpr[:idx_1], rbdr[:idx_1])

    print('tbdc = ' + str(tbdc))
    print('rbdc = ' + str(rbdc))
    return rbdc, tbdc

    # print(tbdr[idx_1 - 1], rbdr[idx_1 - 1])
    # plt.plot(fpr, rbdr, '-')
    # plt.xlabel('FPR')
    # plt.ylabel('RBDR')
    # plt.show()


# def save_tracks_as_txt(tracks, video_name):
#     regions = []
#     for track_id, track in enumerate(tracks):
#         for frame_idx, bbox in track.bboxes.items():
#             regions.append([track_id] + [frame_idx] + bbox)
#     np.savetxt(f'avenue/tracks/{video_name}.txt', regions, delimiter=',')


def compute_rbdc_tbdc_func(predictions_dir, gt_dir, num_frames):
    video_names = [os.path.basename(x) for x in glob.glob(os.path.join(predictions_dir, "*")) if os.path.isdir(x)]

    # Get GT for RBDC and TBDC
    all_gt_tracks = []
    num_tracks = 0
    for vn in video_names:
        tracks = pickle.load(open(os.path.join(gt_dir, f"{vn}.pkl"), 'rb'))
        all_gt_tracks += tracks
        num_tracks += len(tracks)

    gt_anomalies = []
    for track_id, track in enumerate(all_gt_tracks):
        for frame_idx, bbox in track.bboxes.items():
            gt_anomalies.append(AnomalyDetection(frame_idx, bbox, 1, track.video_name, track_id=track_id))

    # Get Preds for RBDC and TBDC
    nr_thresholds = 10
    all_pred_ano = []
    for vn in video_names:
        frames = list(glob.glob(os.path.join(predictions_dir, vn, "*.jpg")))
        frames = sorted(frames, key=lambda filename: int(''.join(filter(str.isdigit, filename))))
        # for frame in frames:
        for f_idx, frame in enumerate(frames):
            # frame read
            f = np.array(np.load(frame))
            thresholds = np.linspace(f.min(), f.max(), nr_thresholds + 2)[1:-1]
            for thr in thresholds:
                th_f = f.copy()
                th_f[f < thr] = 0.0
                th_f[f >= thr] = 1.0

                num_labels, labels, bboxes, centroids = cv.connectedComponentsWithStats(th_f.astype(np.uint8), connectivity=8)
                for box in bboxes:
                    b = np.array([box[0], box[0], box[0] + box[2], box[1] + box[3]]) # x_min, y_min, x_max, y_max
                    score = f[b[1]:b[3], b[0]:b[2]].max()
                    all_pred_ano.append(AnomalyDetection(f_idx, b, score, vn, track_id=-1))

    return compute_fpr_rbdr(gt_anomalies, gt_anomalies, all_gt_tracks, num_frames, num_tracks)

In [11]:
predictions_dir = './datasets/Avenue Dataset/objects/test/'
gt_dir = './tracks/tracks_avenue/'
compute_rbdc_tbdc_func(predictions_dir=gt_dir, gt_dir=gt_dir, num_frames=1000)

fpr does not reach 1
tbdc = 0.0
rbdc = nan


  rbdr = cum_true_positive / len(gt_anomalies)


(nan, 0.0)

## Initialize Ground Truths for Anomaly Det

In [12]:
def list_files(directory, file_types=['*.pkl']):
    files = []
    for file_type in file_types:
        files.extend(glob.glob(os.path.join(directory, file_type)))
    return files

In [13]:
path = './tracks/tracks_avenue/'
video_names = [file.split('/')[-1].split('.')[0] for file in list_files(path) if 'cont' not in file]
pkl_files = [file for file in list_files(path) if 'cont' not in file]
print(video_names[:3])
print(pkl_files[:3])

['01', '02', '03']
['./tracks/tracks_avenue/01.pkl', './tracks/tracks_avenue/02.pkl', './tracks/tracks_avenue/03.pkl']


In [14]:
# Get GT for RBDC and TBDC
all_gt_tracks = []
num_tracks = 0

for vn, pkl_file in zip(video_names, pkl_files):
    tracks = pickle.load(open(os.path.join(pkl_file), 'rb'))
    all_gt_tracks += tracks

num_tracks = len(all_gt_tracks)
print("All GT Tracks: ", num_tracks, all_gt_tracks[:2])

gt_anomalies = []
for track_id, track in enumerate(all_gt_tracks):
    for frame_idx, bbox in track.bboxes.items():
        gt_anomalies.append(AnomalyDetection(frame_idx, bbox, 1, track.video_name, track_id=track_id))

print("GT Anomalies: ", len(gt_anomalies), gt_anomalies[:2])

All GT Tracks:  121 [<evaluation.track.Track object at 0x7fa5e07088e0>, <evaluation.track.Track object at 0x7fa5968cc790>]
GT Anomalies:  3914 [<__main__.AnomalyDetection object at 0x7fa5f37743a0>, <__main__.AnomalyDetection object at 0x7fa5968cd270>]


In [15]:
# %%time
# dataset_test_dir = './datasets/Avenue Dataset/predictions/test'

# # Get Preds for RBDC and TBDC
# nr_thresholds = 10
# all_pred_ano = []
# for vn in video_names:
#     frames = list(glob.glob(os.path.join(dataset_test_dir, vn, "*.jpg")))
#     frames = sorted(frames, key=lambda filename: int(''.join(filter(str.isdigit, filename))))
#     # for frame in frames:
#     for f_idx, frame in enumerate(frames):
#         # frame read
#         f = np.array(Image.open(frame))
#         thresholds = np.linspace(f.min(), f.max(), nr_thresholds + 2)[1:-1]
#         for thr in thresholds:
#             th_f = f.copy()
#             th_f[f < thr] = 0.0
#             th_f[f >= thr] = 1.0

#             src = cv.cvtColor(th_f, cv.COLOR_BGR2GRAY)
            
#             num_labels, labels, bboxes, centroids = cv.connectedComponentsWithStats(src.astype(np.uint8), connectivity=8)
#             for box in bboxes:
#                 b = np.array([box[0], box[0], box[0] + box[2], box[1] + box[3]]) # x_min, y_min, x_max, y_max
#                 try:
#                     score = f[b[1]:b[3], b[0]:b[2]].max()
#                 except:
#                     pass
#                 all_pred_ano.append(AnomalyDetection(f_idx, b, score, vn, track_id=-1))

In [16]:
# print(len(all_pred_ano), all_pred_ano[:2])

In [17]:
# all_pred_ano[1].frame_idx, all_pred_ano[1].bbox

In [18]:
gt_anomalies[0].frame_idx

77

- in loc de gt_anomalies o sa am all_pred_ano pe care le compun cum am discutat
- suma tuturor frame urilor peste tot pe train/test/de testat (num_frames) -> le adun in baza unui index

In [15]:
compute_fpr_rbdr(gt_anomalies, gt_anomalies, all_gt_tracks, num_frames=1000, num_tracks=num_tracks)

fpr does not reach 1
tbdc = 1.0
rbdc = 1.0


(1.0, 1.0)

1. Cum construiesc si ce am nevoie pt prediction tracks?
2. Cum 'detectez' o anomalie pentru a construi aceste anomaly tracks?
3. Ca si cod, cam care e ordinea rularilor? De ce mai am nevoie? Ruleaza deja bucata de `compute_rbdc_tbdc_func`, dar nu stiu exact cum sa apelez corect `compute_fpr_rbdr`.
4. Obtin all_pred_ano sa zicem - ar trebui salvat unedva pentru a fi apelat acel prediction path de catre `compute_fpr_rbdr`?

- am bounding boxes pt obiecte, le bag in AE si obtin mle loss (anomaly score).
- frame index, b - bounding box [xmin, ymin, xmax, ymax], score=score (mle), vn= video name, track_id=-1
- eventual saLVEZ TOTUL INTR-UN CSV PE CARE IL IMPORT CU PANDAS
- trebuie sa imi fac eu parsarea de frame uri, bounding boxes pe obj det, si creez tot
- all_pred_ano.append(AnomalyDetection(f_idx, b, score, vn, track_id=-1))
- sfat: sa imi creez pentru fiecare video, sa imi fac un pkl/obj pe care il salvez pe disk in care am (f_idx, boundingbox) -> ca asta scoate object detectorul si asta salvez pe disk.
- 

## Import AutoEncoder which will be used to compute Anomaly Score

In [16]:
import sys
sys.path.append('./ml-fastvit')

import timm
import torch
import torch.optim as optim
import torchvision.utils as vutils
import torchvision.transforms as T
import torch.nn as nn
import models
import math
import matplotlib.pyplot as plt
import numpy as np
import gc

from torch import Tensor
from torch.nn import functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from timm.models import create_model
from models.modules.mobileone import reparameterize_model
from PIL import Image
from urllib.request import urlopen

# from UpsampleTransformerDecoder import UpsampleTransformerDecoder
from UpsampleTransformerDecoderV2 import UpsampleTransformerDecoderV2

# Before starting the training, make sure to clear any residual memory
gc.collect()
torch.cuda.empty_cache()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

If for semantic segmentation, please install mmsegmentation first
If for detection, please install mmdetection first
cuda


In [17]:
autoencoder_transform = T.Compose([
        T.Resize((64, 64)),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

## AutoEncoder

In [18]:
encoder = create_model("fastvit_t8", fork_feat=True) # can turn fork_feat to False
encoder = encoder.to(device)

decoder = UpsampleTransformerDecoderV2(
    input_channels=384,
    num_upsamples=5,  # Adjusted to 5 upsampling steps
    num_blocks=1,
    num_heads=8,
    ff_dim=2048,
    output_channels=3
)
decoder = decoder.to(device)

In [19]:
autoencoder_path = 'autoencoder_ft_t8_decoder_v2_yolov8_new_epoch_20_mseloss_0.007056034170091152.pth'
encoder.load_state_dict(torch.load(autoencoder_path)['encoder_state_dict'])
decoder.load_state_dict(torch.load(autoencoder_path)['decoder_state_dict'])

<All keys matched successfully>

In [20]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x[-1])
        return x

autoencoder = Autoencoder()
autoencoder = autoencoder.to(device)

### Encoder

In [25]:
# # To Train from scratch/fine-tuning
# encoder = create_model("fastvit_t8", fork_feat=True) # can turn fork_feat to False

# checkpoint = torch.load('./pretrained/fastvit_t8.pth.tar')
# encoder.load_state_dict(checkpoint['state_dict'], strict=False)

# # For inference
# # model.eval()      
# encoder = reparameterize_model(encoder)
# encoder = encoder.to(device)
# encoder.eval()

### Decoder

In [26]:
# # Initialize the decoder
# decoder = UpsampleTransformerDecoderV2(
#     input_channels=384,
#     num_upsamples=5,  # Adjusted to 5 upsampling steps
#     num_blocks=1,
#     num_heads=16,
#     ff_dim=4096,
#     output_channels=3
# )

# # decoder_model_path = 'decoder_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_16.pth'
# # decoder_model_path = 'decoder_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_16_v2.pth'
# # decoder_model_path = './pretrained/decoder_state_dict_adamw_40e_numblocks_2_ffdim_4096_heads_16_mseloss_0.05.pth'
# # decoder_model_path = 'decoder_v2_t8_preds_detr101dc5_adamw_numblocks_1_ffdim_4096_heads_32.pth'
# decoder_model_path = 'decoder_v2_t8_preds_yolov8_adamw_numblocks_1_ffdim_4096_heads_16.pth'



# # Load the state dictionary
# decoder.load_state_dict(torch.load(decoder_model_path))

# # This should be turned on only for test time. If we want to retrain, comment this line
# # If you are using a GPU for the model, don't forget to move the decoder to the GPU
# decoder.to(device)
# decoder.eval()

In [27]:
# Freeze encoder weights
for param in autoencoder.encoder.parameters():
    param.requires_grad = False

# Freeze encoder weights
for param in autoencoder.decoder.parameters():
    param.requires_grad = False

# Set up loss function and optimizer
criterion = nn.MSELoss()

## Avenue File Setup

In [21]:
import torch
import torchvision.transforms as T
import matplotlib.pyplot as plt
import requests
import numpy as np

from PIL import Image

In [22]:
from object_detection_utils import (
    COLORS,
    preprocess, 
    box_cxcywh_to_xyxy, 
    rescale_bboxes,
    batch_detect,
    detect, 
    plot_results,
    plot_batch_detections,
    plot_batch_detections,
    plot_results_avenue,
    load_images_from_folder,
    list_image_files,
    save_cropped_images,
)

In [23]:
test_dir = "./datasets/Avenue Dataset/test__/"
test_video_dirs = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"]
test_video_paths = []
total_frames = 0
for dir in test_video_dirs:
    cur_dir = os.path.join(test_dir, dir)
    test_video_paths.append(cur_dir)
    jpg_files = [f for f in os.listdir(cur_dir) if f.endswith('.jpg')]
    total_frames += len(jpg_files)
    print(cur_dir, len(jpg_files))
print(f"Total Frames: {total_frames}")

./datasets/Avenue Dataset/test__/01 1439
./datasets/Avenue Dataset/test__/02 1211
./datasets/Avenue Dataset/test__/03 923
./datasets/Avenue Dataset/test__/04 947
./datasets/Avenue Dataset/test__/05 1007
./datasets/Avenue Dataset/test__/06 1283
./datasets/Avenue Dataset/test__/07 605
./datasets/Avenue Dataset/test__/08 36
./datasets/Avenue Dataset/test__/09 1175
./datasets/Avenue Dataset/test__/10 841
./datasets/Avenue Dataset/test__/11 472
./datasets/Avenue Dataset/test__/12 1271
./datasets/Avenue Dataset/test__/13 549
./datasets/Avenue Dataset/test__/14 507
./datasets/Avenue Dataset/test__/15 1001
./datasets/Avenue Dataset/test__/16 740
./datasets/Avenue Dataset/test__/17 426
./datasets/Avenue Dataset/test__/18 294
./datasets/Avenue Dataset/test__/19 248
./datasets/Avenue Dataset/test__/20 273
./datasets/Avenue Dataset/test__/21 76
Total Frames: 15324


In [24]:
image_names = [img.split('/')[-1] for img in list_image_files(test_video_paths[-1])]
print(image_names)

['00.jpg', '01.jpg', '02.jpg', '03.jpg', '04.jpg', '05.jpg', '06.jpg', '07.jpg', '08.jpg', '09.jpg', '10.jpg', '11.jpg', '12.jpg', '13.jpg', '14.jpg', '15.jpg', '16.jpg', '17.jpg', '18.jpg', '19.jpg', '20.jpg', '21.jpg', '22.jpg', '23.jpg', '24.jpg', '25.jpg', '26.jpg', '27.jpg', '28.jpg', '29.jpg', '30.jpg', '31.jpg', '32.jpg', '33.jpg', '34.jpg', '35.jpg', '36.jpg', '37.jpg', '38.jpg', '39.jpg', '40.jpg', '41.jpg', '42.jpg', '43.jpg', '44.jpg', '45.jpg', '46.jpg', '47.jpg', '48.jpg', '49.jpg', '50.jpg', '51.jpg', '52.jpg', '53.jpg', '54.jpg', '55.jpg', '56.jpg', '57.jpg', '58.jpg', '59.jpg', '60.jpg', '61.jpg', '62.jpg', '63.jpg', '64.jpg', '65.jpg', '66.jpg', '67.jpg', '68.jpg', '69.jpg', '70.jpg', '71.jpg', '72.jpg', '73.jpg', '74.jpg', '75.jpg']


In [25]:
# transform = T.Compose([
#     T.Resize((640, 360)),
#     T.ToTensor(),
#     T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
# ])

In [26]:
video_names = [video.split('/')[-1] for video in test_video_paths]
print(video_names)

['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21']


In [30]:
# Read obj_det_avenue_test
with open('obj_dect_avenue_yolov8', 'rb') as file:
    obj_dect_avenue = pickle.load(file)

In [31]:
%%time
all_pred_ano = []
anomaly_scores_dict = {}
for i, video_path in enumerate(test_video_paths):
    print(video_path)
    image_names = [img.split('/')[-1] for img in list_image_files(test_video_paths[i])]

    # Get dict containing {frame_idx: bounding boxes} for current video
    bbox_temp = obj_dect_avenue[video_names[i]]

    # For each video, all frames will have an associated anomaly score given by the max MLE loss on all objects in that frame
    anomaly_scores_dict[video_names[i]] = {}

    # Iterate through 
    for frame_idx, image_name in zip(bbox_temp, image_names):

        # Get full path to frame/image
        full_image_path = os.path.join(test_video_paths[i], image_name)

        image = Image.open(full_image_path)

        # Get list of bounding boxes
        boxes = bbox_temp[frame_idx]

        current_frame_max_anomaly_score = float('-inf')
              
        # Go through all bounding boxes of that frame, and crop the objects
        for bbox in boxes:
            cropped_obj = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
            
            # Plot the cropped object for debugging
            # cropped_obj.show()  # This will display the cropped image

            # Transform cropped obj to desired 64 * 64 shape
            transformed_obj = autoencoder_transform(cropped_obj)

            # Ensure the transformed object is in the right shape for the model
            transformed_obj = transformed_obj.unsqueeze(0)  # Add batch dimension

            with torch.no_grad():
                # Forward pass through the encoder and then the decoder
                # latent_representation = encoder(transformed_obj.to(device))[-1]  # Assuming the last output is the latent representation
                # reconstructed_img = decoder(latent_representation)
                reconstructed_img = autoencoder(transformed_obj.to(device))

            # Ensure cropped_obj is a tensor and in the correct shape for loss calculation
            cropped_obj_tensor = autoencoder_transform(cropped_obj)
            cropped_obj_tensor = cropped_obj_tensor.unsqueeze(0) # Add batch dimension

            # Compute the reconstruction loss
            score = F.mse_loss(reconstructed_img, cropped_obj_tensor.to(device))

            # We want to store the max anomaly score for this frame
            if score > current_frame_max_anomaly_score:
                current_frame_max_anomaly_score = score
                anomaly_bbox = bbox
            # current_frame_max_anomaly_score = max(score, current_frame_max_anomaly_score)

            # print(score)

            # We should now have bounding box, frame idx, maximum anomaly score, 
            # all_pred_ano.append(AnomalyDetection(frame_idx, bbox, score, video_names[i], track_id=-1))
        all_pred_ano.append(AnomalyDetection(frame_idx, anomaly_bbox, score, video_names[i], track_id=-1))
        
        anomaly_scores_dict[video_names[i]][frame_idx] = current_frame_max_anomaly_score

./datasets/Avenue Dataset/test__/01
./datasets/Avenue Dataset/test__/02
./datasets/Avenue Dataset/test__/03
./datasets/Avenue Dataset/test__/04
./datasets/Avenue Dataset/test__/05
./datasets/Avenue Dataset/test__/06
./datasets/Avenue Dataset/test__/07
./datasets/Avenue Dataset/test__/08
./datasets/Avenue Dataset/test__/09
./datasets/Avenue Dataset/test__/10
./datasets/Avenue Dataset/test__/11
./datasets/Avenue Dataset/test__/12
./datasets/Avenue Dataset/test__/13
./datasets/Avenue Dataset/test__/14
./datasets/Avenue Dataset/test__/15
./datasets/Avenue Dataset/test__/16
./datasets/Avenue Dataset/test__/17
./datasets/Avenue Dataset/test__/18
./datasets/Avenue Dataset/test__/19
./datasets/Avenue Dataset/test__/20
./datasets/Avenue Dataset/test__/21
CPU times: user 9min 11s, sys: 1min 23s, total: 10min 34s
Wall time: 10min 57s


In [32]:
compute_fpr_rbdr(all_pred_ano, gt_anomalies, all_gt_tracks, num_frames=15324, num_tracks=num_tracks)

fpr does not reach 1
tbdc = 0.2126141460162959
rbdc = 0.14000285037200888


(0.14000285037200888, 0.2126141460162959)

#### Dummy run with random score

In [None]:
%%time
all_pred_ano = []
anomaly_scores_dict = {}
for i, video_path in enumerate(test_video_paths):
    print(video_path)
    image_names = [img.split('/')[-1] for img in list_image_files(test_video_paths[i])]

    # Get dict containing {frame_idx: bounding boxes} for current video
    bbox_temp = obj_dect_avenue[video_names[i]]

    # For each video, all frames will have an associated anomaly score given by the max MLE loss on all objects in that frame
    anomaly_scores_dict[video_names[i]] = {}

    # Iterate through 
    for frame_idx, image_name in zip(bbox_temp, image_names):

        # Get full path to frame/image
        full_image_path = os.path.join(test_video_paths[i], image_name)

        image = Image.open(full_image_path)

        # Get list of bounding boxes
        boxes = bbox_temp[frame_idx]

        # current_frame_max_anomaly_score = float('-inf')
              
        # # Go through all bounding boxes of that frame, and crop the objects
        # for bbox in boxes:
        #     cropped_obj = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
            
        #     # Plot the cropped object for debugging
        #     # cropped_obj.show()  # This will display the cropped image

        #     # TODO: Transform cropped obj to desired 64 * 64 shape
        #     transformed_obj = autoencoder_transform(cropped_obj)

        #     # Ensure the transformed object is in the right shape for the model
        #     transformed_obj = transformed_obj.unsqueeze(0)  # Add batch dimension

        #     with torch.no_grad():
        #         # Forward pass through the encoder and then the decoder
        #         latent_representation = encoder(transformed_obj.to(device))[-1]  # Assuming the last output is the latent representation
        #         reconstructed_img = decoder(latent_representation)

        #     # Ensure cropped_obj is a tensor and in the correct shape for loss calculation
        #     cropped_obj_tensor = autoencoder_transform(cropped_obj)
        #     cropped_obj_tensor = cropped_obj_tensor.unsqueeze(0) # Add batch dimension

        #     # Compute the reconstruction loss
        #     # score = F.mse_loss(reconstructed_img, cropped_obj_tensor.to(device))
        #     score = random.random()

        #     # We want to store the max anomaly score for this frame
        #     current_frame_max_anomaly_score = max(score, current_frame_max_anomaly_score)

        #     # print(score)

        score = random.random()

        # We should now have bounding box, frame idx, maximum anomaly score, 
        all_pred_ano.append(AnomalyDetection(frame_idx, bbox, score, video_names[i], track_id=-1))

        anomaly_scores_dict[video_names[i]][frame_idx] = score

In [None]:
compute_fpr_rbdr(all_pred_ano, gt_anomalies, all_gt_tracks, num_frames=15324, num_tracks=num_tracks)

#### Save and load to pickle

In [None]:
# # Save the object to a file
# with open(pickle_file, 'wb') as file:
#     pickle.dump(all_pred_ano, file)

In [None]:
# # Load the object from the file
# pickle_file = 'all_pred_ano_avenue_v3.pkl'
# with open(pickle_file, 'rb') as file:
#     all_pred_ano = pickle.load(file)

### Micro / Macro AUC Score

- micro e cand concatenez toate video-urile si evaluez abilitatea de a detecta anomalii
- macro e cand fac AUC pe fiecare video si fac medie 

In [40]:
def gaussian_filter(support, sigma):
    mu = support[len(support) // 2 - 1]
    # mu = np.mean(support)
    filter = 1.0 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * ((support - mu) / sigma) ** 2)
    return filter

def filt(input, dim=9, range=302, mu=25):
    filter_3d = np.ones((dim, dim, dim)) / (dim ** 3)
    filter_2d = gaussian_filter(np.arange(1, range), mu)

    frame_scores = input # This works
    # frame_scores = convolve(input, filter_3d)
    # frame_scores = frame_scores.max((1, 2))

    padding_size = len(filter_2d) // 2
    in_ = np.concatenate((np.zeros(padding_size), frame_scores, np.zeros(padding_size)))
    frame_scores = np.correlate(in_, filter_2d, 'valid')
    return frame_scores

def process_current_vid_preds(pred: np.array): 
    pred = np.nan_to_num(pred, nan=0.)
    pred = filt(pred, range=302, mu=25)
    pred = (pred - np.min(pred)) / (np.max(pred) - np.min(pred))
    return pred

In [41]:
def read_txt_to_numpy_array(file_path):
    try:
        data = np.loadtxt(file_path)
        return data
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

labels_path = './datasets/Avenue Dataset/gt/'
labels_dict = {}

for vid_name in anomaly_scores_dict:
    labels_dict[vid_name] = read_txt_to_numpy_array(os.path.join(labels_path, f"{vid_name}.txt"))

print(labels_dict.keys())

dict_keys(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21'])


In [42]:
np.array(labels_dict['01'])

array([0., 0., 0., ..., 0., 0., 0.])

In [43]:
# !pip install scipy

In [44]:
aucs = []
filtered_preds = []
filtered_labels = []

for vid_name in anomaly_scores_dict:
    # print(len(anomaly_scores_dict[vid_name]))
    pred = np.array(list(tensor.cpu() for tensor in anomaly_scores_dict[vid_name].values()))
    pred = process_current_vid_preds(pred)
    filtered_preds.append(pred)

    lbl = labels_dict[vid_name]
    filtered_labels.append(lbl)

    lbl = np.array([0] + list(lbl) + [1])
    pred = np.array([0] + list(pred) + [1])

    fpr, tpr, _ = metrics.roc_curve(lbl, pred)
    res = metrics.auc(fpr, tpr)
    aucs.append(res)

macro_auc = np.nanmean(aucs)
print(macro_auc)

0.6002765828787121


In [45]:
# Micro-AUC
filtered_preds = np.concatenate(filtered_preds)
filtered_labels = np.concatenate(filtered_labels)

fpr, tpr, _ = metrics.roc_curve(filtered_labels, filtered_preds)
micro_auc = metrics.auc(fpr, tpr)
micro_auc = np.nan_to_num(micro_auc, nan=1.0)

In [46]:
print(micro_auc)

0.6007772307192685
