In [1]:
import os
import json
import numpy as np
from tqdm import tqdm
import cv2

In [2]:
def modified_nms(det, dets, thresh):
    """Suppress detections that temporally duplicate ``det``.

    A candidate in ``dets`` is kept when it either does not overlap ``det``
    at all, or its overlap ratio with ``det`` is below ``thresh``. All other
    candidates are treated as duplicates of ``det`` and dropped.

    Args:
        det: Reference detection; a dict with ``"timestamp"`` (``[start, end]``)
            and ``"sentence_score"``.
        dets: Sequence of candidate detections; each a dict with
            ``"timestamp"``, ``"sentence_score"`` and ``"sentence"``.
        thresh: Overlap-ratio threshold at or above which an overlapping
            candidate is considered the same event as ``det``.

    Returns:
        A ``(kept, better_cap)`` tuple. ``kept`` is the list of surviving
        candidates in their original order. ``better_cap`` is the sentence of
        the highest-scoring suppressed candidate when its ``sentence_score``
        is strictly greater than ``det``'s, otherwise ``None``.
    """
    # Nothing to compare against: a 1-D empty array cannot be column-indexed.
    if len(dets) == 0:
        return [], None

    start = det["timestamp"][0]
    end = det["timestamp"][1]
    sentence_score = det["sentence_score"]

    timestamp = np.asarray([elem["timestamp"] for elem in dets])
    cand_start = timestamp[:, 0]
    cand_end = timestamp[:, 1]

    # Signed length of the intersection; negative means the intervals are disjoint.
    overlap = np.minimum(end, cand_end) - np.maximum(start, cand_start)
    not_overlapped = overlap < 0

    # Length of the span covering both intervals (equals the union when they overlap).
    span = np.maximum(end, cand_end) - np.minimum(start, cand_start)
    iou = np.abs(overlap / (span + 1e-6))  # epsilon guards zero-length spans
    not_the_same_det = iou < thresh

    keep_mask = not_overlapped | not_the_same_det

    kept = [elem for elem, keep in zip(dets, keep_mask) if keep]
    suppressed = [elem for elem, keep in zip(dets, keep_mask) if not keep]

    better_cap = None
    if suppressed:
        # max() returns the first maximal element, matching a first-index lookup.
        best = max(suppressed, key=lambda elem: elem["sentence_score"])
        if best["sentence_score"] > sentence_score:
            better_cap = best["sentence"]
    return kept, better_cap

def get_video_duration(video_path):
    """Read the frame count and duration of a video through OpenCV.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        ``(frame_count, duration)`` where ``duration`` is ``frame_count / fps``,
        or ``None`` when the video cannot be opened or reports a non-positive
        frame rate.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A missing/unsupported FPS property comes back as 0 and would
        # otherwise raise ZeroDivisionError below; `not fps > 0` also rejects NaN.
        if not fps > 0:
            print("Error: Video reports a non-positive frame rate.")
            return None
        return frame_count, frame_count / fps
    finally:
        # Release the capture on every path, including early returns and errors.
        cap.release()

In [3]:
# Path of the JSON predictions file that is loaded with json.load below.
pred_file = "../captions_ann/gvl_pred.json"
# Directory holding the videos; each one is looked up as <video_dir>/<vid_name>.mp4.
video_dir = "../inputs/test_set"
# Overlap threshold passed to modified_nms when suppressing duplicate proposals.
thresh = 0.2

In [4]:
# Load the raw predictions; pred["results"] maps each video name to its list
# of proposal dicts.
with open(pred_file, "r") as f:
    pred = json.load(f)

post_ann = {}
for vid_name, predictions in pred["results"].items():
    if not predictions:
        # np.mean([]) is nan and int(nan) raises ValueError, so a video
        # without proposals simply yields no events.
        post_ann[vid_name] = []
        continue

    # Number of events to keep: the floored mean of the per-proposal counts.
    pred_event_count = np.mean([elem["pred_event_count"] for elem in predictions])
    # Highest-confidence proposals first (sorted in place, as before).
    predictions.sort(key=lambda x: x["proposal_score"], reverse=True)

    output = []
    for i in range(int(pred_event_count)):
        # NMS may have suppressed every remaining proposal on the previous
        # iteration; popping from an empty list would raise IndexError.
        if not predictions:
            break

        first = predictions.pop(0)
        first["event_id"] = i
        first["sentence"] = [first["sentence"]]
        output.append(first)

        if predictions:
            predictions, better_cap = modified_nms(first, predictions, thresh)
            if better_cap:
                # A suppressed duplicate had a higher sentence score: use its caption.
                output[-1]["sentence"][0] = better_cap

    post_ann[vid_name] = output

with open("post_predictions.json", "w") as f:
    json.dump(post_ann, f, indent=4)

In [5]:
# Convert the selected events into a flat list of per-event annotation
# records, with timestamps rescaled by frame_count / duration.
vast_ann = []
for vid_name, predictions in post_ann.items():
    video_path = os.path.join(video_dir, vid_name) + ".mp4"

    # get_video_duration returns None when the video cannot be read;
    # unpacking that directly would raise TypeError.
    video_info = get_video_duration(video_path)
    if video_info is None:
        print(f"Skipping {video_path}: could not read video metadata.")
        continue
    frame_count, duration = video_info
    if not duration > 0:
        # Dividing by a zero duration would yield inf/nan frame indices.
        print(f"Skipping {video_path}: video reports a non-positive duration.")
        continue

    for elem in predictions:
        ann_i = {
            "video_id": f"{video_path}@{elem['event_id']}",
            # NOTE(review): elem["sentence"] is already a one-element list (it is
            # wrapped when post_ann is built), so this is a nested list — confirm
            # that the consumer of vast_inference.json expects that shape.
            "caption": [elem["sentence"]],
            "timestamp": (np.array(elem["timestamp"]) / duration * frame_count).astype(int).tolist(),
            "event_id": elem["event_id"]
        }
        vast_ann.append(ann_i)

with open("vast_inference.json", "w") as f:
    json.dump(vast_ann, f, indent=4)