In [24]:
import os
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import cv2
from sklearn.cluster import DBSCAN
from PIL import Image


In [25]:
# Load the pre-trained 'clip-ViT-L-14' checkpoint through sentence-transformers.
# `model` is a notebook-level global: extract_frame_features reads it directly
# rather than taking it as a parameter, so this cell must run before that function is called.
model = SentenceTransformer('clip-ViT-L-14')




In [26]:
def extract_frame_features(video_path):
    """
    Extract a unit-normalized embedding from each frame of the video.

    Frames are read sequentially from start to end. Each frame is converted
    from OpenCV's BGR layout to RGB, wrapped in a PIL image, and encoded with
    the notebook-level ``model``. The embedding is divided by its L2 norm
    before being stored.

    Parameters:
        video_path (str): Path to the input video.

    Returns:
        frame_indices (list): Zero-based index of every frame that was read,
            in reading order.
        features (np.ndarray): The stacked per-frame embeddings (one entry per
            frame index). An empty array is returned when the video cannot be
            opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open video: " + video_path)
        return [], np.array([])

    frame_indices = []
    features = []
    frame_count = 0

    # try/finally guarantees the capture handle is released even if colour
    # conversion or model inference raises part-way through the video.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a read error): stop collecting frames.
                break

            # Convert the frame from BGR to RGB and then to PIL Image
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_pil = Image.fromarray(frame_rgb)

            # NOTE(review): frames are encoded one at a time; encoding several
            # frames per call would likely be much faster — confirm and batch
            # if throughput matters.
            with torch.no_grad():
                # Compute the embedding and normalize it to unit length
                image_embedding = model.encode(
                    [image_pil],
                    convert_to_tensor=True,
                    show_progress_bar=False
                )[0]
                image_embedding = image_embedding / image_embedding.norm()

            # Append the embedding and frame index
            features.append(image_embedding.cpu().numpy())
            frame_indices.append(frame_count)

            frame_count += 1
    finally:
        cap.release()

    return frame_indices, np.array(features)


In [27]:
from sklearn.metrics.pairwise import cosine_distances


def cluster_frames(features, eps=0.3, min_samples=5):
    """
    Cluster feature vectors with DBSCAN using cosine distance.

    Parameters:
        features: Array-like of feature vectors, one per sample, as accepted
            by ``cosine_distances``.
        eps (float): DBSCAN ``eps`` parameter, applied to cosine distances.
        min_samples (int): DBSCAN ``min_samples`` parameter.

    Returns:
        The label array produced by ``DBSCAN.fit_predict``, one label per
        input sample (scikit-learn marks noise samples with -1).
    """
    # The full pairwise cosine-distance matrix is built up front and handed to
    # DBSCAN as a precomputed metric.
    pairwise = cosine_distances(features)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    return clusterer.fit_predict(pairwise)

In [28]:
def extract_video_slots(video_path, frame_indices, labels, output_dir):
    """
    Write one AVI (XVID) clip per cluster into ``output_dir``.

    For every cluster label other than -1, the clip ``slot_<label>.avi``
    contains every frame of the source video from the cluster's lowest frame
    index to its highest frame index, inclusive.

    NOTE(review): because the clip covers the whole first-to-last span, it also
    includes any frames in between that were assigned to another cluster or to
    noise. If clusters are not contiguous in time, the clips overlap — confirm
    this is the intended definition of a "slot".

    Parameters:
        video_path (str): Path to the input video.
        frame_indices (iterable of int): Frame indices, paired positionally
            with ``labels``.
        labels (iterable): Cluster label for each entry of ``frame_indices``;
            entries labelled -1 are skipped.
        output_dir (str): Directory for the output clips; created if missing.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open video: " + video_path)
        return

    # try/finally guarantees the capture handle is released on any error.
    try:
        os.makedirs(output_dir, exist_ok=True)

        # Group frame indices by cluster label, dropping label -1.
        clusters = {}
        for idx, label in zip(frame_indices, labels):
            if label == -1:
                continue
            clusters.setdefault(label, []).append(idx)

        # Stream properties and codec do not depend on the cluster, so they
        # are queried once instead of once per slot.
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        ext = 'avi'

        # Extract video slots
        for label, frames in clusters.items():
            start_frame = min(frames)
            end_frame = max(frames)

            output_path = os.path.join(output_dir, f"slot_{label}.{ext}")
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
            try:
                if not out.isOpened():
                    # Previously a failed writer was ignored and the slot was
                    # still reported as extracted although nothing was written.
                    print("Cannot open video writer: " + output_path)
                    continue

                # Seek once to the start of the slot; each read() then advances
                # to the next frame, so no per-frame seek is needed.
                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                for _ in range(start_frame, end_frame + 1):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    out.write(frame)
            finally:
                out.release()

            print(f"Extracted slot {label}: Frames {start_frame} to {end_frame}")
    finally:
        cap.release()


In [29]:
def process_video(video_path, output_dir, eps=0.5, min_samples=5):
    """
    Run the full pipeline on one video: embed frames, cluster them, write slots.

    Parameters:
        video_path (str): Path to the input video.
        output_dir (str): Directory to save the extracted video slots.
        eps (float): DBSCAN eps parameter, forwarded to cluster_frames.
        min_samples (int): DBSCAN min_samples parameter, forwarded to
            cluster_frames.

    Returns:
        None. Stops early, after printing a message, when no features could
        be extracted.
    """
    # Stage 1: one embedding per frame.
    print("Extracting features from video frames...")
    indices, embeddings = extract_frame_features(video_path)

    # Nothing to cluster (unreadable or empty video): bail out.
    if not embeddings.size:
        print("No features extracted. Exiting.")
        return

    # Stage 2: group frames by embedding similarity.
    print("Clustering frames with DBSCAN...")
    cluster_labels = cluster_frames(embeddings, eps=eps, min_samples=min_samples)

    # Stage 3: write one clip per cluster.
    print("Extracting video slots based on clustering...")
    extract_video_slots(video_path, indices, cluster_labels, output_dir)

    print("Processing complete.")


# Path to the input video.
# NOTE: this is an absolute, machine-specific path; adjust it for your environment.
video_path = '/media/daoan/T7 Shield2/AI_Challenge_2024_DATA/video_with_audio/Videos_L23/video/L23_V001.mp4'

# Directory to save extracted video slots (also an absolute, machine-specific path).
# extract_video_slots creates it if it does not exist.
output_dir = '/home/daoan/Projects/AI_Challenge_HCMC_2024/data_extraction/frame_split/video'

# Parameters for DBSCAN. These are passed explicitly below and therefore
# override process_video's own default of eps=0.5.
eps = 0.3  # Adjusted for cosine distance (range 0 to 2)
min_samples = 5

# Process the video: embed frames, cluster them, and write one clip per cluster.
process_video(video_path, output_dir, eps=eps, min_samples=min_samples)

Extracting features from video frames...
Clustering frames with DBSCAN...
Extracting video slots based on clustering...
Extracted slot 0: Frames 0 to 4398
Processing complete.
