1. Segment each video into shots
2. Extract features from each shot, including low-level features (such as color and motion) and high-level semantic features (such as object detection and scene recognition)
3. Cluster the shots based on their feature representations to group similar shots together
4. Identify key frames within each cluster by selecting frames that best represent the cluster
5. Order the key frames based on their importance score, which is calculated using a combination of visual and semantic information
6. Generate a summary by selecting the top-ranked key frames

## Step 1

In [None]:
def extract_shots(video_path, shot_length=30):
    """Split a video into consecutive fixed-length shots.

    No scene-change detection is performed: the frame range reported by
    OpenCV is simply cut into windows of ``shot_length`` frames.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        shot_length: Number of frames per shot (default 30).

    Returns:
        A list of ``(start_frame, end_frame)`` tuples with inclusive bounds.
        The final shot is always dropped, so a video holding at most
        ``shot_length`` frames yields an empty list.

    Raises:
        ValueError: If ``shot_length`` is smaller than 1.
    """
    if shot_length < 1:
        raise ValueError("shot_length must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    try:
        # Query the frame count once; it does not change while we iterate.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()

    shots = [
        (start_frame, min(start_frame + shot_length - 1, total_frames - 1))
        for start_frame in range(0, total_frames, shot_length)
    ]

    # The trailing shot is discarded.
    # NOTE(review): presumably because it may be shorter than the others or
    # because the reported frame count can be unreliable - confirm.
    return shots[:-1]


## step 2

In [None]:
import cv2
import numpy as np

def compute_optical_flow(prev_gray, curr_gray, prev_pts):
    """Return the mean Lucas-Kanade displacement between two frames.

    Args:
        prev_gray: Previous frame, passed as the first image to
            ``cv2.calcOpticalFlowPyrLK``.
        curr_gray: Current frame, passed as the second image.
        prev_pts: Points to track in ``prev_gray`` (for example the output of
            ``cv2.goodFeaturesToTrack``). May be ``None`` or empty.

    Returns:
        The average of ``new_position - old_position`` over all points whose
        tracking status is 1. A zero vector of length 2 is returned when
        there is nothing to track or no point could be followed, instead of
        raising or producing NaN.
    """
    no_motion = np.zeros(2, dtype=np.float32)

    # goodFeaturesToTrack yields None when it finds no corners.
    if prev_pts is None or len(prev_pts) == 0:
        return no_motion

    curr_pts, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, prev_pts, None)
    if curr_pts is None or status is None:
        return no_motion

    # Keep only the points for which the flow was found
    tracked = status == 1
    if not np.any(tracked):
        # Averaging an empty selection would give NaN.
        return no_motion

    good_pts = prev_pts[tracked]
    new_pts = curr_pts[tracked]

    # Average flow vector over the successfully tracked points
    return np.mean(new_pts - good_pts, axis=0)

In [None]:
import cv2
import numpy as np

def extract_features(video_path, shots):
    """Compute per-frame colour and motion features for every shot.

    For each shot, every frame pair ``(k, k + 1)`` contributes one row of
    five values: the mean of each of the three colour channels of frame
    ``k`` followed by the magnitude and the angle of the average optical
    flow from frame ``k`` to frame ``k + 1``. Both motion columns are scaled
    per shot by their largest absolute value.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        shots: Iterable of ``(start_frame, end_frame)`` pairs with inclusive
            bounds.

    Returns:
        A list with one float32 array of shape ``(pairs, 5)`` per usable
        shot. Shots with fewer than two frames, or for which fewer than two
        frames could be read, are skipped, so the list can be shorter than
        ``shots``.
    """

    def _detect_corners(gray):
        # Corner set tracked by the Lucas-Kanade step; None when no corner is found.
        return cv2.goodFeaturesToTrack(gray, maxCorners=100, qualityLevel=0.01, minDistance=10, blockSize=3)

    cap = cv2.VideoCapture(video_path)
    features = []

    try:
        for start_frame, end_frame in shots:
            frame_count = end_frame - start_frame + 1

            # Skip shots with less than two frames
            if frame_count < 2:
                continue

            # Initialize feature vectors
            color_features = np.zeros((frame_count, 3), dtype=np.float32)
            motion_features = np.zeros((frame_count - 1, 2), dtype=np.float32)

            # Read first frame of shot
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
            ret, prev_frame = cap.read()
            if not ret:
                # Nothing could be decoded at this position.
                continue
            prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

            # Compute color feature for first frame
            color_features[0] = cv2.mean(prev_frame)[:3]

            # Initialize feature points for Lucas-Kanade method
            prev_pts = _detect_corners(prev_gray)

            frames_read = 1
            for i in range(1, frame_count):
                # Read current frame
                ret, curr_frame = cap.read()
                if not ret:
                    break
                curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)

                # Compute color feature
                color_features[i] = cv2.mean(curr_frame)[:3]

                # Compute motion feature using Lucas-Kanade method; treat
                # "no corners" or an undefined average as zero motion.
                flow_vec = np.zeros(2, dtype=np.float32)
                if prev_pts is not None and len(prev_pts) > 0:
                    candidate = compute_optical_flow(prev_gray, curr_gray, prev_pts)
                    if np.all(np.isfinite(candidate)):
                        flow_vec = candidate
                motion_features[i - 1, 0] = np.linalg.norm(flow_vec)
                motion_features[i - 1, 1] = np.arctan2(flow_vec[1], flow_vec[0])

                prev_gray = curr_gray
                prev_pts = _detect_corners(prev_gray)
                frames_read += 1

            # Only keep rows backed by frames that were actually read, so a
            # truncated shot does not contribute zero-filled rows.
            pair_count = frames_read - 1
            if pair_count < 1:
                continue
            color_features = color_features[:pair_count]
            motion_features = motion_features[:pair_count]

            # Normalize motion features. Dividing by the largest absolute
            # value keeps the signed angle column within [-1, 1]; columns
            # that are entirely zero are left untouched instead of becoming NaN.
            scale = np.max(np.abs(motion_features), axis=0)
            scale[scale == 0] = 1.0
            motion_features = motion_features / scale

            # Combine color and motion features
            features.append(np.hstack((color_features, motion_features)))
    finally:
        cap.release()

    return features


## step 3

In [None]:
from sklearn.cluster import KMeans

def cluster_shots(features, num_clusters):
    """Group shots into clusters with K-means on their per-frame features.

    All feature rows of all shots are stacked and clustered together. Each
    shot is then assigned to the cluster that most of its rows fell into
    (ties go to the lowest cluster label).

    Args:
        features: Sequence with one 2-D array per shot; all arrays must have
            the same number of columns so they can be stacked.
        num_clusters: Number of K-means clusters.

    Returns:
        A list of ``num_clusters`` lists. Entry ``c`` holds the indices (into
        ``features``) of the shots assigned to cluster ``c``; it may be empty.
    """
    stacked_features = np.vstack(features)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(stacked_features)
    labels = kmeans.labels_

    clusters = [[] for _ in range(num_clusters)]

    # Walk the stacked rows shot by shot: rows [row_start, row_end) belong to
    # the shot at position shot_index.
    row_start = 0
    for shot_index, shot_features in enumerate(features):
        row_end = row_start + len(shot_features)
        shot_labels = labels[row_start:row_end]
        row_start = row_end
        if len(shot_labels) == 0:
            continue
        majority_label = int(np.bincount(shot_labels, minlength=num_clusters).argmax())
        clusters[majority_label].append(shot_index)

    return clusters


In [None]:
from sklearn.cluster import MiniBatchKMeans

def cluster_shots(features, num_clusters):
    """Group shots into clusters with mini-batch K-means.

    Variant of the shot clustering step that uses ``MiniBatchKMeans``. All
    feature rows of all shots are stacked and clustered together; each shot
    is then assigned to the cluster that most of its rows fell into (ties go
    to the lowest cluster label).

    Args:
        features: Sequence with one 2-D array per shot; all arrays must have
            the same number of columns so they can be stacked.
        num_clusters: Number of clusters.

    Returns:
        A list of ``num_clusters`` lists. Entry ``c`` holds the indices (into
        ``features``) of the shots assigned to cluster ``c``; it may be empty.
    """
    stacked_features = np.vstack(features)
    kmeans = MiniBatchKMeans(n_clusters=num_clusters)
    kmeans.fit(stacked_features)
    labels = kmeans.labels_

    clusters = [[] for _ in range(num_clusters)]

    # Walk the stacked rows shot by shot: rows [row_start, row_end) belong to
    # the shot at position shot_index.
    row_start = 0
    for shot_index, shot_features in enumerate(features):
        row_end = row_start + len(shot_features)
        shot_labels = labels[row_start:row_end]
        row_start = row_end
        if len(shot_labels) == 0:
            continue
        majority_label = int(np.bincount(shot_labels, minlength=num_clusters).argmax())
        clusters[majority_label].append(shot_index)

    return clusters


In [2]:
import numpy as np

# 3x3 matrix holding the values 1..9, filled row by row
a = np.arange(1, 10).reshape(3, 3)

# column-wise flattening: reading the transpose row by row walks down each
# column of the original matrix in turn
b = a.T.flatten()

print(a, b, sep="\n")

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[1 4 7 2 5 8 3 6 9]


In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

def cluster_shots(features, num_clusters):
    """Group shots into clusters with K-means on PCA-reduced features.

    All feature rows of all shots are stacked, projected onto their first
    two principal components and clustered. Each shot is then assigned to
    the cluster that most of its rows fell into (ties go to the lowest
    cluster label).

    Args:
        features: Sequence with one 2-D array per shot; all arrays must have
            the same number of columns so they can be stacked.
        num_clusters: Number of K-means clusters.

    Returns:
        A list of ``num_clusters`` lists. Entry ``c`` holds the indices (into
        ``features``) of the shots assigned to cluster ``c``; it may be empty.
    """
    # Reduce the dimensionality of the feature vectors using PCA
    stacked_features = np.vstack(features)
    pca = PCA(n_components=2)
    features_pca = pca.fit_transform(stacked_features)

    # Perform clustering on the reduced feature vectors
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(features_pca)
    labels = kmeans.labels_

    clusters = [[] for _ in range(num_clusters)]

    # Walk the stacked rows shot by shot: rows [row_start, row_end) belong to
    # the shot at position shot_index.
    row_start = 0
    for shot_index, shot_features in enumerate(features):
        row_end = row_start + len(shot_features)
        shot_labels = labels[row_start:row_end]
        row_start = row_end
        if len(shot_labels) == 0:
            continue
        majority_label = int(np.bincount(shot_labels, minlength=num_clusters).argmax())
        clusters[majority_label].append(shot_index)

    return clusters

## step 4

In [None]:
def identify_keyframes(clusters, features, shots=None):
    """Pick one representative shot per cluster.

    Each shot is summarised by the mean of its feature rows, so shots with
    different frame counts can be compared. Within a cluster, the shot whose
    summary lies closest (Euclidean distance) to the cluster centroid is the
    representative. Empty clusters contribute nothing.

    Args:
        clusters: Iterable of clusters, each a sequence of indices into
            ``features``.
        features: Indexable collection of per-shot feature arrays (2-D, one
            row per frame) or per-shot feature vectors (1-D).
        shots: Optional sequence of ``(start_frame, end_frame)`` pairs indexed
            like ``features``. When given, the first frame of each
            representative shot is returned instead of the shot index.

    Returns:
        A list of ints in cluster order: representative shot indices, or
        their first frame indices when ``shots`` is supplied.
    """
    keyframes = []

    for cluster in clusters:
        if len(cluster) == 0:
            # No member to represent this cluster.
            continue

        # One fixed-length descriptor per shot: the mean over its rows.
        shot_descriptors = np.array(
            [np.atleast_2d(np.asarray(features[i], dtype=np.float64)).mean(axis=0) for i in cluster]
        )
        centroid = shot_descriptors.mean(axis=0)
        distances = np.linalg.norm(shot_descriptors - centroid, axis=1)
        representative_shot = int(cluster[int(np.argmin(distances))])

        if shots is None:
            keyframes.append(representative_shot)
        else:
            # Keyframe is the first frame of the representative shot.
            keyframes.append(int(shots[representative_shot][0]))

    return keyframes


## step 5

In [None]:
def calculate_importance_score(keyframes, features, labels, num_clusters, semantic_scores):
    """Rank keyframes by a blend of visual and semantic scores.

    The visual score of a keyframe is the norm of the first three entries of
    ``features[k]`` plus the norm of the remaining entries. The semantic
    score is a cluster-size-weighted sum of ``semantic_scores``, min-max
    normalised to [0, 1]. Both parts are averaged with equal weight.

    Args:
        keyframes: Sequence of indices into ``features``.
        features: Indexable collection of 1-D feature vectors
            (assumes three colour values followed by motion values - TODO
            confirm against the caller).
        labels: Iterable of clusters; each element must support ``len`` and
            be usable as an index into ``semantic_scores``.
            NOTE(review): despite its name this is not a flat label array.
        num_clusters: Unused; kept for interface compatibility.
        semantic_scores: Scores indexed by the elements of ``labels``
            (presumably a NumPy array - verify against the caller).

    Returns:
        The entries of ``keyframes`` ordered from highest to lowest
        importance score.
    """
    # Compute visual scores (based on color and motion features)
    color_features = np.array([features[i][:3] for i in keyframes])
    motion_features = np.array([features[i][3:] for i in keyframes])
    color_scores = np.linalg.norm(color_features, axis=1)
    motion_scores = np.linalg.norm(motion_features, axis=1)
    visual_scores = color_scores + motion_scores

    # Compute semantic scores (based on cluster labels and semantic scores)
    cluster_sizes = [len(cluster) for cluster in labels]
    cluster_weights = np.array(cluster_sizes) / np.sum(cluster_sizes)
    semantic_scores = np.array([semantic_scores[i] for i in labels])
    semantic_scores_weighted = np.sum(semantic_scores * cluster_weights, axis=1)

    lowest = np.min(semantic_scores_weighted)
    score_range = np.max(semantic_scores_weighted) - lowest
    if score_range == 0:
        # All semantic scores are equal: they carry no ranking information.
        # Dividing by the zero range would turn every score into NaN.
        semantic_scores_normalized = np.zeros_like(semantic_scores_weighted, dtype=float)
    else:
        semantic_scores_normalized = (semantic_scores_weighted - lowest) / score_range
    semantic_scores_normalized = np.clip(semantic_scores_normalized, 0, 1)

    # Combine visual and semantic scores to get importance scores
    importance_scores = 0.5 * visual_scores + 0.5 * semantic_scores_normalized

    # Sort keyframes by importance score, best first
    sorted_indices = np.argsort(importance_scores)[::-1]
    sorted_keyframes = [keyframes[i] for i in sorted_indices]

    return sorted_keyframes

## step 6

In [None]:
def generate_summary(keyframes, num_frames):
    """Select the top-ranked keyframes and return them in temporal order.

    Args:
        keyframes: Keyframe indices ordered from most to least important.
        num_frames: Maximum number of keyframes to keep.

    Returns:
        A new list with the first ``num_frames`` entries of ``keyframes``,
        sorted in ascending order.
    """
    # Truncate before sorting: sorting first would keep the earliest frames
    # rather than the highest-ranked ones.
    top_ranked = list(keyframes)[:num_frames]

    # Restore temporal order for playback
    return sorted(top_ranked)

## implementation

In [None]:
import cv2
import numpy as np

# End-to-end run of the summarisation pipeline on a single video file.

# Step 1: Segment the video into shots
video_file = r'Air_Force_One.mp4'
shots = extract_shots(video_file)

# Step 2: Extract features from each shot (color and motion)
features = extract_features(video_file,shots)

# Step 3: Cluster shots based on their feature representations
num_clusters = 100
clusters = cluster_shots(features, num_clusters)

# Step 4: Identify keyframes within each cluster
keyframes = identify_keyframes(clusters, features)

# NOTE: Step 5 (importance ranking) is not applied in this run; the keyframes
# go straight from step 4 to step 6.

# Step 6: Generate summary by selecting top-ranked keyframes
num_frames = 150
summary = generate_summary(keyframes, num_frames)

# Load video using OpenCV
cap = cv2.VideoCapture(video_file)

# Extract frames corresponding to summary frames and save as a new video
out_file = 'summary.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
frame_size = (640, 360)  # (width, height) - change resolution as needed
out = cv2.VideoWriter(out_file, fourcc, 30.0, frame_size)

try:
    # NOTE(review): each summary entry is used directly as a frame position;
    # confirm that identify_keyframes yields frame indices, not shot indices.
    for frame_idx in summary:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if not ret:
            # Frame could not be decoded; writing None would fail.
            continue
        # Resize frames that do not already match the size the writer was
        # opened with.
        if (frame.shape[1], frame.shape[0]) != frame_size:
            frame = cv2.resize(frame, frame_size)
        out.write(frame)
finally:
    # Release both handles even if reading or writing fails part-way.
    cap.release()
    out.release()



KeyboardInterrupt: ignored

In [None]:
# Debug aid: dimensions of the matrix obtained by stacking all per-shot feature arrays
print(np.vstack(features).shape)