In [1]:
!pip install av
import av
import torch
import numpy as np
import cv2
import os
import time

from transformers import AutoImageProcessor, TimesformerForVideoClassification
from torch.quantization import quantize_dynamic

Collecting av
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.1/33.1 MB 50.3 MB/s eta 0:00:00
Installing collected packages: av
Successfully installed av-13.1.0


In [2]:
# Seed every RNG-bearing library this notebook imports, not just NumPy, so any
# stochastic op (in NumPy or in torch) is repeatable across Restart & Run All.
np.random.seed(0)
torch.manual_seed(0)

In [3]:
def read_video_opencv(video_path, indices):
    """Decode the frames at the given positions from a video file.

    Frames are decoded sequentially from the start of the file; each frame
    whose zero-based position appears in ``indices`` is converted from
    OpenCV's BGR channel order to RGB and kept. Decoding stops as soon as the
    largest requested index has been handled or the stream ends.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        indices: Iterable (list or NumPy array) of zero-based frame positions.
            Duplicate positions yield a single frame, and frames are returned
            in video order regardless of the order of ``indices``.

    Returns:
        The selected RGB frames stacked along a new leading axis with
        ``np.stack``.

    Raises:
        ValueError: If ``indices`` is empty, or if no requested frame could be
            decoded (e.g. the file is missing/unreadable or shorter than the
            smallest requested index).
    """
    # A set gives O(1) membership per decoded frame instead of a linear scan.
    wanted = set(np.asarray(indices).ravel().tolist())
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    # Hoisted out of the loop: the stopping point never changes while decoding.
    last_wanted = max(wanted)

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        index = 0
        # Checking the bound first avoids decoding a frame that would be dropped.
        while index <= last_wanted and cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if index in wanted:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert to RGB
            index += 1
    finally:
        # Always free the decoder handle, even if conversion raised.
        cap.release()

    if not frames:
        raise ValueError(
            f"no requested frames could be read from {video_path!r}; "
            "the file may be missing, unreadable, or too short"
        )
    return np.stack(frames)

In [4]:
# Frame sampling
def adaptive_frame_sampling(clip_len, seg_len):
    """Pick ``clip_len`` evenly spaced frame positions from a video.

    Positions are spread with ``np.linspace`` from the first frame (0) to the
    last frame (``seg_len - 1``) and truncated to integers.

    Args:
        clip_len: Number of positions to return.
        seg_len: Total number of frames in the video; must be at least 1.

    Returns:
        Integer NumPy array of length ``clip_len``. When ``clip_len`` exceeds
        ``seg_len`` some positions repeat.

    Raises:
        ValueError: If ``seg_len`` is smaller than 1. Without this check the
            range would end at a negative position, which is not a valid
            frame index.
    """
    if seg_len < 1:
        raise ValueError(f"seg_len must be at least 1, got {seg_len}")
    return np.linspace(0, seg_len - 1, num=clip_len).astype(int)

In [5]:
# Frame preprocessor. It is loaded from a VideoMAE checkpoint while the
# classifier below is a TimeSformer checkpoint.
# NOTE(review): confirm this processor's resize/normalisation settings are the
# ones the TimeSformer weights expect.
image_processor = AutoImageProcessor.from_pretrained(
    "MCG-NJU/videomae-base-finetuned-kinetics"
)

# TimeSformer video classifier, wrapped in dynamic quantization: every
# torch.nn.Linear layer is converted to use qint8 weights.
model = quantize_dynamic(
    TimesformerForVideoClassification.from_pretrained(
        "facebook/timesformer-base-finetuned-k400"
    ),
    {torch.nn.Linear},
    dtype=torch.qint8,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [6]:
# Classify every .avi clip in the folder and collect
# (file name, predicted label) pairs for the comparison cell further down.
predictions = []

folder_path = '/content/sample_data/Video'
# os.listdir returns entries in arbitrary order, so sort for reproducible runs.
# The extension is matched case-insensitively so e.g. "CLIP.AVI" is not skipped.
video_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.avi'))

for video_file in video_files:
    video_path = os.path.join(folder_path, video_file)

    # Video length (frame count as reported by the container metadata)
    cap = cv2.VideoCapture(video_path)
    try:
        seg_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release the handle even if the property query fails.
        cap.release()

    # Sample 8 keyframes spread evenly over the clip
    indices = adaptive_frame_sampling(clip_len=8, seg_len=seg_len)
    video = read_video_opencv(video_path, indices)

    # The processor takes a list of frames and returns PyTorch tensors.
    inputs = image_processor(list(video), return_tensors="pt")

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    predicted_label = logits.argmax(-1).item()
    # Resolve the class name once and reuse it for both the log and the record.
    predicted_action = model.config.id2label[predicted_label]
    print(f"Video: {video_file}, Predicted Action: {predicted_action}")
    predictions.append((video_file, predicted_action))

Video: v_HighJump_g05_c05.avi, Predicted Action: high jump
Video: v_GolfSwing_g07_c03.avi, Predicted Action: golf driving
Video: v_Punch_g01_c04.avi, Predicted Action: punching person (boxing)
Video: v_Knitting_g02_c05.avi, Predicted Action: knitting
Video: v_Archery_g12_c04.avi, Predicted Action: archery
Video: v_MilitaryParade_g17_c02.avi, Predicted Action: singing
Video: v_BrushingTeeth_g17_c03.avi, Predicted Action: brushing teeth
Video: v_Swing_g21_c02.avi, Predicted Action: swinging on something
Video: v_PushUps_g07_c04.avi, Predicted Action: push up
Video: v_HorseRace_g07_c02.avi, Predicted Action: riding or walking with horse
Video: v_Bowling_g05_c07.avi, Predicted Action: bowling
Video: v_BlowDryHair_g01_c03.avi, Predicted Action: curling hair
Video: v_HeadMassage_g20_c04.avi, Predicted Action: massaging person's head
Video: v_PlayingViolin_g07_c04.avi, Predicted Action: playing violin
Video: v_SkyDiving_g15_c01.avi, Predicted Action: skydiving
Video: v_MoppingFloor_g05_c01.av

In [7]:
import pandas as pd

excel_file_path = "/content/sample_data/Video Dataset.xlsx"
df = pd.read_excel(excel_file_path)

video_column = "Video Name"
action_column = "Action Performed"

# Build a one-off lookup from video name to its labelled action instead of
# filtering the whole DataFrame once per prediction. setdefault keeps the
# first row when a video name appears more than once.
ground_truth = {}
for name, action in zip(df[video_column], df[action_column]):
    ground_truth.setdefault(name, action)

matches = 0

# Compare predictions with dataset
for video_name, predicted_action in predictions:
    if video_name not in ground_truth:
        # Video is not listed in the spreadsheet: it cannot count as a match.
        continue

    actual_action = ground_truth[video_name]
    if pd.isna(actual_action):
        # Empty cell: there is no label to compare against (and no .lower()).
        continue

    # Case-insensitive comparison that also ignores stray surrounding
    # whitespace; str() guards against non-string cell values.
    if str(actual_action).strip().lower() == predicted_action.strip().lower():
        matches += 1

In [8]:
# Accuracy: percentage of predictions whose label matched the spreadsheet.
# Every prediction counts in the denominator, including videos that were not
# found in the spreadsheet.
if predictions:
    accuracy = (matches / len(predictions)) * 100
    print(f"Accuracy: {accuracy:.2f}%")
else:
    # No videos were processed; avoid dividing by zero.
    accuracy = float("nan")
    print("Accuracy: n/a (no predictions were made)")

Accuracy: 75.00%
