In [1]:
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import json
from tqdm import tqdm, trange
import os
import time

import wandb

from model.layers import AttentiveDiscriminator, AttentiveSummarizer
from utils import TensorboardWriter

In [2]:
# Dimensions shared by the layers constructed in the next cell.
input_size, hidden_size = 1024, 512  # in/out widths of the linear compression layer
nhead, num_layers = 4, 2             # forwarded as nhead / num_layers to the attentive modules
dim_feedforward = 2 * hidden_size    # feed-forward width is twice the hidden size

In [3]:
# Keyword arguments common to the summarizer and the discriminator.
attentive_kwargs = dict(
    d_model=hidden_size,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward,
)

# Linear layer mapping input_size-wide features down to hidden_size.
linear_compress = nn.Linear(input_size, hidden_size).cuda()

## Summarizer: attentive_selector + attentive_auto_encoder
summarizer = AttentiveSummarizer(**attentive_kwargs).cuda()

## Discriminator: attentive_discriminator
discriminator = AttentiveDiscriminator(**attentive_kwargs).cuda()

# Group the three modules (in this order) so one state_dict covers all of them.
model = nn.ModuleList([linear_compress, summarizer, discriminator])



In [4]:
# Path of the checkpoint file to restore (a single file, despite the "_dir" suffix).
pretrained_model_dir = "epoch-43.pt"

# Deserialize onto the CPU first so loading does not depend on the device the
# checkpoint was saved from; load_state_dict then copies every tensor into the
# existing parameters on whatever device they already live on.
state_dict = torch.load(pretrained_model_dir, map_location="cpu")

# Last expression of the cell: displays the key-matching report.
model.load_state_dict(state_dict)

<All keys matched successfully>

### Load GoogLeNet model

In [5]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import cv2
import numpy as np

# Pretrained torchvision GoogLeNet. Dropping the last two child modules leaves a
# network whose output is the pooled 1024-channel feature map (see the shape
# check at the end of this cell).
full_lenet_model = models.googlenet(pretrained=True)
pool5_extractor = nn.Sequential(*list(full_lenet_model.children())[:-2]).cuda()

# Switch to inference mode BEFORE the first forward pass: a freshly built module
# is in training mode, and running the random sanity input through it in that
# mode would update the BatchNorm running statistics with noise.
pool5_extractor.eval()

# NOTE(review): torchvision's GoogLeNet.forward appears to re-normalise its input
# when `transform_input` is set (believed to be the default for the pretrained
# weights). Calling the children through nn.Sequential bypasses forward(), so
# confirm the Normalize() statistics applied to the frames are the intended ones.

# Sanity check on a random input; no gradients are needed for this.
with torch.no_grad():
    sanity_output = pool5_extractor(torch.randn(1, 3, 224, 224).cuda())
sanity_output.shape



torch.Size([1, 1024, 1, 1])

In [6]:
# Open the video file
video_path = 'tvsum50_ver_1_1/ydata-tvsum50-v1_1/ydata-tvsum50-video/video/3eYKfiOEJNs.mp4'
cap = cv2.VideoCapture(video_path)

# Fail fast on a bad path or unsupported codec: otherwise the read loop below
# ends immediately and the empty frame lists only cause an error in a later cell.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

# Create a transform for preprocessing frames:
# array -> PIL image -> 224x224 -> tensor -> per-channel normalisation.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

original_frames = []       # frames exactly as decoded by OpenCV
prerpocessed_frames = []   # the same frames after `transform`

try:
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Process each frame in the video
    while True:
        ret, frame = cap.read()

        if not ret:
            break  # Break the loop if no more frames

        original_frames.append(frame)

        # Preprocess the frame: convert OpenCV's BGR layout to RGB, then transform.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = transform(frame)
        prerpocessed_frames.append(frame)
finally:
    # Release the video capture object even if decoding or preprocessing fails.
    cap.release()

In [7]:
# Make sure the extractor is on the GPU and in inference mode before feature
# extraction; the trailing semicolon hides the module repr in the cell output.
pool5_extractor.cuda()
pool5_extractor.eval();

In [8]:
# Run the extractor over the preprocessed frames in fixed-size chunks and keep
# each chunk's output on the CPU.
chunk_size = 32
features_array = []
with torch.no_grad():
    for start in range(0, len(prerpocessed_frames), chunk_size):
        frame_chunk = torch.stack(prerpocessed_frames[start:start + chunk_size]).cuda()
        features_array.append(pool5_extractor(frame_chunk).detach().cpu())

In [9]:
# Join the per-chunk tensors along the first axis and convert the result to NumPy.
features_array = torch.cat(features_array, dim=0).numpy()
print(features_array.shape)

# Persist the features so later cells can reload them without re-running extraction.
np.save('features.npy', features_array)
features_array.shape

(4853, 1024, 1, 1)


(4853, 1024, 1, 1)

In [10]:
# Reload the saved features and drop the two trailing singleton axes
# (the third and fourth dimensions) in a single call.
features_array = np.squeeze(np.load('features.npy'), axis=(2, 3))
features_array.shape

(4853, 1024)

In [None]:
# Scoring below runs on the CPU: move both modules there and put them in
# inference mode.
for module in (linear_compress, summarizer):
    module.cpu().eval()

In [None]:
out_dict = {}
with torch.no_grad():
    # as_tensor accepts either the NumPy array or an already converted tensor,
    # so this cell can be re-run without raising.
    features_array = torch.as_tensor(features_array).float().cpu()

    # [seq_len, input_size]
    # Keep this 2-D: the unsqueeze(1) below adds the single batch axis. Viewing
    # as (-1, 1, input_size) here AND unsqueezing afterwards produced a 4-D
    # tensor, contradicting the [seq_len, 1, hidden_size] layout stated below.
    video_tensor = features_array.view(-1, input_size)
    video_feature = video_tensor.cpu()

    # [seq_len, 1, hidden_size]
    video_feature = linear_compress(video_feature.detach()).unsqueeze(1)

    # Selector output with the singleton axis removed, converted to a plain list.
    scores = summarizer.attentive_selector(video_feature).squeeze(1)
    scores = scores.cpu().numpy().tolist()

In [11]:
import os
from os import listdir
import json
import numpy as np
import h5py
from evaluation.generate_summary import generate_summary
from evaluation.evaluation_metrics import evaluate_summary
import cv2


PATH_TVSum = "data/TVSum/eccv16_dataset_tvsum_google_pool5.h5"
model_predictions = "model/output/attentive/base/tvsum/results/split4/tvsum_40.json"

all_scores = []

# Read the predictions through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(model_predictions) as predictions_file:
    data = json.load(predictions_file)
keys = list(data.keys())

# Per-video score arrays, in the same order as `keys`.
for video_name in keys:
    scores = np.asarray(data[video_name])
    all_scores.append(scores)

# Per-video metadata from the HDF5 file, collected in the same order as `keys`.
all_user_summary, all_shot_bound, all_nframes, all_positions = [], [], [], []
with h5py.File(PATH_TVSum, "r") as hdf:
    for video_name in keys:
        # Everything after the first six characters of the key (the "video_" prefix).
        video_index = video_name[6:]

        user_summary = np.array(
            hdf.get("video_" + video_index + "/user_summary")
        )
        sb = np.array(hdf.get("video_" + video_index + "/change_points"))
        n_frames = np.array(hdf.get("video_" + video_index + "/n_frames"))
        positions = np.array(hdf.get("video_" + video_index + "/picks"))

        all_user_summary.append(user_summary)
        all_shot_bound.append(sb)
        all_nframes.append(n_frames)
        all_positions.append(positions)

# One summary per video, indexed like `keys`.
all_summaries = generate_summary(
    all_shot_bound, all_scores, all_nframes, all_positions
)
keys

['video_1',
 'video_11',
 'video_12',
 'video_15',
 'video_16',
 'video_36',
 'video_42',
 'video_5',
 'video_50',
 'video_6']

In [12]:
# Video to summarise, identified by its key in the predictions file.
video_name = 'video_1'
# Strip the "video_" prefix and subtract one to get a zero-based index.
video_idx = int(video_name[len("video_"):]) - 1

In [13]:
import pandas as pd

# Locations of the TVSum videos and of the tab-separated info table.
tvsum_video_dir = "tvsum50_ver_1_1/ydata-tvsum50-v1_1/ydata-tvsum50-video/video/"
tvsum_info_file = "tvsum50_ver_1_1/ydata-tvsum50-v1_1/ydata-tvsum50-data/data/ydata-tvsum50-info.tsv"

tvsum_info = pd.read_csv(tvsum_info_file, sep="\t")

# Label-based lookup of the "video_id" entry for the row labelled video_idx.
video_id = tvsum_info.loc[video_idx, "video_id"]
print(video_id)

AwmHb44_ouw


In [14]:
# all_summaries is ordered like `keys`, so find the chosen video's position first.
summary_position = keys.index(video_name)
video_summary = all_summaries[summary_position]
video_summary

array([1, 1, 1, ..., 0, 0, 0], dtype=int8)

In [15]:
# Full path of the selected video's .mp4 file inside the TVSum video directory.
video_path = os.path.join(tvsum_video_dir, f"{video_id}.mp4")

In [16]:
cap = cv2.VideoCapture(video_path)

# Fail fast on a bad path or unsupported codec: otherwise the read loop below
# ends immediately and later cells fail on an empty frame list.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

original_frames = []
try:
    # Frame geometry and frame rate of the source, read from the capture object.
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Process each frame in the video
    while True:
        ret, frame = cap.read()

        if not ret:
            break  # Break the loop if no more frames

        original_frames.append(frame)
finally:
    # Release the video capture object even if reading fails part-way.
    cap.release()

In [17]:
# Keep every decoded frame whose entry in the summary mask equals 1.
seleted_frames = [
    frame
    for i, frame in enumerate(original_frames)
    if video_summary[i] == 1
]

In [18]:
# Report how many frames were kept and what fraction of the video that is.
n_original = len(original_frames)
n_selected = len(seleted_frames)
selected_ratio = n_selected / n_original

print(f"Original video length: {n_original:,}")
print(f"Selected frames length: {n_selected:,}")
print(f'len(seleted_frames) / len(original_frames): {selected_ratio}')

Original video length: 10,597
Selected frames length: 1,587
len(seleted_frames) / len(original_frames): 0.14975936585826177


In [19]:
# Write the summary at the source video's frame rate (read when the frames were
# loaded) so it plays at the original speed; fall back to the previous
# hard-coded 60 only when the capture reported no usable value.
output_fps = fps if fps and fps > 0 else 60

video_writer = cv2.VideoWriter(
    f"summary_{video_id}.mp4",
    cv2.VideoWriter_fourcc(*"mp4v"),
    output_fps,
    (original_width, original_height),
)

# An unopened writer drops every frame silently, so surface the failure instead.
if not video_writer.isOpened():
    raise IOError(f"Could not open video writer for summary_{video_id}.mp4")

try:
    for frame in seleted_frames:
        video_writer.write(frame)
finally:
    # Finalise the output file even if writing a frame fails.
    video_writer.release()