In [43]:
import cv2
import torch
import numpy as np
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
from sklearn.decomposition import PCA

In [45]:
model = models.resnet50(pretrained=True)
model.eval()

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def get_frame_embedding(frame):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_pil = Image.fromarray(frame_rgb)
    input_tensor = preprocess(frame_pil).unsqueeze(0)
    with torch.no_grad():
        embedding = model(input_tensor)
    return embedding.squeeze().numpy()



In [46]:
def get_video_embedding(video_path, num_frames=30, method = 'mean'):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)

    embeddings = []
    for idx in tqdm(frame_indices, desc="Extracting frame embeddings"):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if not ret:
            continue
        embedding = get_frame_embedding(frame)
        embeddings.append(embedding)
    
    cap.release()
    
    # Tính trung bình các embedding để tạo thành một vector duy nhất
    if method == 'mean':
        video_embedding = np.mean(embeddings, axis=0)
    elif method == 'stacked':
        video_embedding = np.concatenate(embeddings, axis = 0)
    return video_embedding

# Sử dụng hàm để tính embedding cho video
video_path = './data/sample_tv360/gt/chunk_1_gt.mp4'
video_embedding = get_video_embedding(video_path)
print("Video Embedding Shape:", video_embedding.shape)

Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.65it/s]

Video Embedding Shape: (1000,)





In [47]:
video_embedding

array([-5.91770709e-01,  3.08126777e-01, -1.78604043e+00, -1.80084765e+00,
       -1.73786354e+00,  1.46364748e+00, -2.94299054e+00, -8.40123832e-01,
       -2.69071851e-02, -1.32566822e+00, -7.24764287e-01, -1.47137690e+00,
       -1.18426383e+00, -1.95144451e+00, -1.11683810e+00, -1.19100440e+00,
       -1.00986540e+00, -1.40102732e+00, -2.33818555e+00, -4.67798424e+00,
       -5.58488727e-01,  1.86033204e-01, -4.48700577e-01, -8.22493374e-01,
       -1.30206954e+00, -1.24364994e-01,  8.72364283e-01, -4.91729289e-01,
       -3.41637850e-01, -7.16596425e-01, -1.18146598e+00, -1.90524518e+00,
       -9.59793031e-02, -1.02171206e+00, -2.15303779e-01,  7.52449930e-01,
        9.54805076e-01, -8.97598565e-02,  5.59030712e-01, -6.16167068e-01,
       -2.24065161e+00, -1.93396911e-01, -2.21654606e+00, -1.50536382e+00,
       -5.92676938e-01, -4.32685405e-01, -1.80536401e+00, -8.94385159e-01,
       -1.47438824e+00, -1.30757070e+00, -6.08435392e-01, -1.39549688e-01,
        4.67116952e-01, -

In [53]:
def cosine_similarity(v1, v2):
    return 1 - cosine(v1, v2)

# Hàm tính khoảng cách giữa 2 video qua DTW
def calculate_dtw_distance(video1_chunks, video2_chunks):
    # Giả sử mỗi video_chunk là một embedding của từng đoạn
    distance, path = fastdtw(video1_chunks, video2_chunks, dist=cosine)
    return distance, path

def evaluate_summary_metrics(video1, video2, path, threshold=0.8):
    matched_pairs = []
    coverage_count = 0

    # Đo lường các đoạn có mức độ tương đồng cao
    for (i, j) in path:
        similarity = cosine_similarity(video1[i], video2[j])
        if similarity >= threshold:
            matched_pairs.append((i, j))
            coverage_count += 1

    precision = coverage_count / len(video1)
    recall = coverage_count / len(video2)
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score, coverage_count

# Hàm tính Kendall's Tau
def calculate_kendall_tau(video1_order, video2_order):
    tau, _ = kendalltau(video1_order, video2_order)
    return tau

In [87]:
import os
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau
from fastdtw import fastdtw
from tqdm import tqdm

root_gt = "./data/sample_tv360/gt"
root_data = "./data/sample_tv360/do_merge"

sorted_gt = sorted(os.listdir('./data/sample_tv360/gt'), key = lambda x : int(x.split("_")[1]))
sorted_data = sorted(os.listdir("./data/sample_tv360/do_merge"), key = lambda x: int(x.split("_")[-1].split(".")[0]))

data_embeddings, gt_embeddings = [], []
for i, j in zip(sorted_data, sorted_gt):
    print("ĐÂy là ",i)
    data_embedding = get_video_embedding(f"{root_data}/{i}", num_frames=30)
    gt_embedding = get_video_embedding(f"{root_gt}/{j}", num_frames=30)
    data_embeddings.append(data_embedding)
    gt_embeddings.append(gt_embedding)

distance, path = calculate_dtw_distance(data_embeddings, gt_embeddings)
    

ĐÂy là  merged_video_processed_1.mp4


Extracting frame embeddings:   0%|          | 0/30 [00:00<?, ?it/s]

Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.84it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 13.86it/s]


ĐÂy là  merged_video_processed_2.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.87it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.98it/s]


ĐÂy là  merged_video_processed_3.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.16it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.22it/s]


ĐÂy là  merged_video_processed_4.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.40it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.56it/s]


ĐÂy là  merged_video_processed_5.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.59it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.48it/s]


ĐÂy là  merged_video_processed_6.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.53it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 13.06it/s]


ĐÂy là  merged_video_processed_7.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.79it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.79it/s]


ĐÂy là  merged_video_processed_8.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.81it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.50it/s]


ĐÂy là  merged_video_processed_9.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.04it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.68it/s]


ĐÂy là  merged_video_processed_10.mp4


Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 11.78it/s]
Extracting frame embeddings: 100%|██████████| 30/30 [00:02<00:00, 12.20it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [95]:
len(data_embeddings[0])

30000

In [41]:
import cv2

# Đường dẫn đến video
video_path = '/home/thiendc/projects/video_summarization/data/sample_tv360/final/complete_video.mp4'

# Mở video
cap = cv2.VideoCapture(video_path)

# Tính toán số frame và FPS
fps = cap.get(cv2.CAP_PROP_FPS)
print(fps)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
duration = frame_count / fps  # Thời lượng video tính theo giây

cap.release()

# In ra độ dài video
print(f"Video length: {duration} seconds")


30.0
Video length: 270.06666666666666 seconds


In [71]:
video1_embeddings = [np.random.rand(512) for _ in range(5)]  # Video tóm tắt
video2_embeddings = [np.random.rand(512) for _ in range(5)]  # Video ground truth

# Tính DTW để tìm các đoạn tương tự
distance1, path1 = calculate_dtw_distance(video1_embeddings, video2_embeddings)

In [77]:
video1_embeddings[2]

array([0.0823345 , 0.73197522, 0.1183429 , 0.37314301, 0.27359758,
       0.51966801, 0.20951843, 0.19970385, 0.37358778, 0.97332946,
       0.0639759 , 0.19082878, 0.80952405, 0.53757529, 0.16513354,
       0.99443947, 0.97347087, 0.06978456, 0.37790472, 0.24074146,
       0.19197544, 0.73209076, 0.32528459, 0.17932311, 0.33560054,
       0.08930026, 0.52408235, 0.28318204, 0.39725103, 0.7491654 ,
       0.6843843 , 0.35163996, 0.21413031, 0.37139775, 0.53823728,
       0.0613051 , 0.1163295 , 0.44751819, 0.97563355, 0.15099258,
       0.85737509, 0.45703083, 0.84365546, 0.23465767, 0.05971626,
       0.35526556, 0.26027632, 0.62486857, 0.10691214, 0.43599374,
       0.81202796, 0.94240534, 0.29791604, 0.39289709, 0.0687656 ,
       0.87154011, 0.5464458 , 0.33329789, 0.74366802, 0.26869032,
       0.97655375, 0.07021355, 0.20671585, 0.95751496, 0.62885183,
       0.0388798 , 0.38603552, 0.56603191, 0.23891597, 0.98779491,
       0.39656415, 0.34102569, 0.76963451, 0.2420208 , 0.20514