In [None]:
import torch
from training.summary.datamodule import SummaryDataset
from transformers import ViTImageProcessor
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import seaborn as sns
import numpy as np
from moviepy.editor import VideoFileClip, concatenate_videoclips
from tqdm import tqdm

from v2021 import SummaryModel

In [None]:
# Image processor that turns raw frames into ViT-ready `pixel_values` tensors.
# NOTE(review): `device` is forwarded as an extra keyword argument; whether the
# processor actually acts on it depends on the installed transformers version
# and processor class — confirm it is needed.
VIT_CHECKPOINT = "google/vit-base-patch16-224"
preprocessor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT, size=224, device='cuda')

In [None]:
# --- Sample one frame per SAMPLE_EVERY_SEC-second step and preprocess them ---
# Reads the video sequentially with OpenCV, keeps the first frame seen in every
# whole second that is a multiple of SAMPLE_EVERY_SEC, then converts the kept
# frames into a `pixel_values` tensor with the ViT image processor.
SAMPLE_EVERY_SEC = 2

video_path = 'videos/sample.mp4'

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail early with a clear message instead of a ZeroDivisionError on fps == 0.
    raise IOError(f'Could not open video: {video_path}')

frames = []
last_collected = -1

try:
    # OpenCV reports every property as a float; the frame count is integral.
    n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Some containers do not report an fps; avoid dividing by zero in that case.
    video_len = n_frames / fps if fps > 0 else float('nan')

    print(f'Video length {video_len:.2f} seconds!')

    # An unknown frame count (<= 0) is passed as None so tqdm shows a plain counter.
    with tqdm(total=n_frames if n_frames > 0 else None, desc="Processing frames") as pbar:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            timestamp = cap.get(cv2.CAP_PROP_POS_MSEC)
            second = timestamp // 1000

            # Keep only the first frame of each sampled second.
            if second % SAMPLE_EVERY_SEC == 0 and second != last_collected:
                last_collected = second
                frames.append(frame)
            pbar.update(1)
finally:
    # Always free the decoder / file handle, even if reading fails midway.
    cap.release()

if not frames:
    raise ValueError(f'No frames could be read from video: {video_path}')

# NOTE(review): OpenCV decodes frames in BGR channel order and they are passed
# to the processor unchanged — confirm this matches how the model was trained.
features = preprocessor(images=frames, return_tensors="pt")["pixel_values"]

print(features.shape)

In [None]:
# Preview the first preprocessed frame.
# `.cpu()` keeps this cell re-runnable after `features` has been moved to the GPU.
first_frame = features[0].cpu().numpy().transpose(1, 2, 0)  # channels-first -> channels-last

# The processor standardises pixels with its image_mean / image_std; imshow clips
# float RGB data to [0, 1], so undo the normalisation before displaying.
if getattr(preprocessor, "do_normalize", True):
    frame_mean = np.asarray(preprocessor.image_mean, dtype=first_frame.dtype)
    frame_std = np.asarray(preprocessor.image_std, dtype=first_frame.dtype)
    first_frame = first_frame * frame_std + frame_mean
first_frame = np.clip(first_frame, 0.0, 1.0)

plt.figure(figsize=(10, 10))
# Reverse the channel axis (frames come from OpenCV, i.e. BGR) so colours render correctly.
plt.imshow(first_frame[:, :, ::-1])

In [None]:
# Restore the summarisation model from the 'summary.ckpt' checkpoint file.
# NOTE(review): `load_from_checkpoint` looks like the PyTorch Lightning API —
# confirm that SummaryModel (imported from v2021) is a LightningModule.
model = SummaryModel.load_from_checkpoint('summary.ckpt')
# Move the model to the GPU; the inference cell feeds it CUDA tensors.
model.to('cuda')
# Presumably switches to evaluation mode (standard torch.nn.Module.eval semantics — confirm).
# As the last expression of the cell, its return value is rendered as the cell output.
model.eval()

In [None]:
# --- Score every sampled frame with the model ---
features = features.to('cuda')

y_pred = []

# Pure inference: disable autograd so no graph is built and kept per forward pass.
with torch.no_grad():
    for frame in tqdm(features):
        # The model is called on one frame at a time, with a leading batch dimension of 1.
        y_p = model(frame.unsqueeze(0))
        y_p = torch.sigmoid(y_p)  # squash the raw model output into (0, 1)

        y_pred.append(y_p.cpu().numpy().squeeze())

y_pred = np.array(y_pred)

# Distribution of the per-frame scores, useful for choosing the threshold.
sns.displot(y_pred)

In [None]:
# Minimum score for a sampled frame to be kept in the summary.
# NOTE(review): 0.205 appears to be hand-tuned — confirm how it was chosen.
THRESHOLD = 0.205

# Start time (in seconds) of every sampled step whose score passes the threshold.
selected_starts = [
    idx * SAMPLE_EVERY_SEC
    for idx, score in enumerate(y_pred)
    if score >= THRESHOLD
]

for start_sec in selected_starts:
    print(start_sec)

# Each selected frame stands for one SAMPLE_EVERY_SEC-second segment.
total_secs = len(selected_starts) * SAMPLE_EVERY_SEC

total_secs

In [None]:
# --- Cut the selected segments out of the source video and join them ---
clip = VideoFileClip(video_path)

subclips = []
total_duration = 0

try:
    for i, y_p in tqdm(enumerate(y_pred), total=len(y_pred)):
        sec = i * SAMPLE_EVERY_SEC

        # Skip low-scoring frames and any segment that would start past the end.
        if y_p >= THRESHOLD and sec < clip.duration:
            # Clamp the final segment so it never extends beyond the source video.
            segment_end = min(sec + SAMPLE_EVERY_SEC, clip.duration)
            subclips.append(clip.subclip(sec, segment_end))

    if not subclips:
        # Concatenating an empty list fails obscurely; report the real cause instead.
        raise ValueError(
            f'No frame scored >= {THRESHOLD}; lower THRESHOLD to produce a summary.'
        )

    result = concatenate_videoclips(subclips)
    # Use the concatenated clip's own duration so `maxduration` can never be exceeded.
    total_duration = result.duration

    result.write_videofile("videos/result.mp4")

    preview = result.ipython_display(width=720, maxduration=total_duration)
finally:
    # Release the reader(s) held by the source clip, even when an error occurred.
    clip.close()

# Last expression of the cell: renders the embedded video in the notebook.
preview