In [None]:
from google.colab import files
files.upload()  
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


# Note: The Dataset is not included in this folder
# Note: This is a jupyter notebook built with google colab, some of the code might require uploading the notebook on colab

Saving kaggle.json to kaggle.json


In [3]:
!kaggle datasets download -d pypiahmad/realistic-action-recognition-ucf50-dataset

Dataset URL: https://www.kaggle.com/datasets/pypiahmad/realistic-action-recognition-ucf50-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading realistic-action-recognition-ucf50-dataset.zip to /content
 92% 921M/0.98G [00:07<00:01, 45.8MB/s]
100% 0.98G/0.98G [00:07<00:00, 138MB/s]


In [4]:
import os
import random
import glob
import math
import json
from pathlib import Path
from typing import List, Tuple


import numpy as np
import pandas as pd
import cv2


import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, callbacks
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing import image

In [5]:
!unzip /content/realistic-action-recognition-ucf50-dataset.zip

Archive:  /content/realistic-action-recognition-ucf50-dataset.zip
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_01.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_02.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_03.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_04.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_05.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_06.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_01_07.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_02_01.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_02_02.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_02_03.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_02_04.xgtf  
  inflating: UCF11_updated_mpg/basketball/Annotation/v_shooting_02_05.xgtf  
  inflatin

In [None]:
DATA_DIR = '/content/UCF11_updated_mpg' # Google Colab path to UCF11 dataset
SEQ_LEN = 16 # Number of frames per video sequence
IMG_SIZE = 224 # Height and width to resize frames for model input
BATCH_SIZE = 8  # Number of samples per batch during training
EPOCHS = 15 # Number of training epochs
RANDOM_SEED = 42 # Seed for reproducibility


random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Check if dataset directory exists and extract class names
if os.path.exists(DATA_DIR):
  # List all subdirectories in DATA_DIR as classes
  CLASSES = sorted([d.name for d in Path(DATA_DIR).iterdir() if d.is_dir()])
else:
  CLASSES = []


NUM_CLASSES = len(CLASSES)
# Print classes and total number
print('Found classes:', CLASSES)
print('Num classes:', NUM_CLASSES)

Found classes: ['basketball', 'biking', 'diving', 'golf_swing', 'horse_riding', 'soccer_juggling', 'swing', 'tennis_swing', 'trampoline_jumping', 'volleyball_spiking', 'walking']
Num classes: 11


In [None]:
def make_csv_from_ucf11(root_dir: str, out_csv='ucf11_metadata.csv') -> pd.DataFrame:
  """Walks the UCF11 folder structure and creates a CSV with video paths and labels.
  It expects structure: root_dir/<class>/Group_xx/*.mpg
  """
  rows = []
  root = Path(root_dir)
  # Iterate through each class folder
  for cls in sorted([d for d in root.iterdir() if d.is_dir()]):
    # Iterate through each group subfolder inside the class
    for group in sorted([g for g in cls.iterdir() if g.is_dir()]):
      for v in sorted(group.glob('*')):
        # Only include valid video file extensions
        if v.suffix.lower() in ['.mpg', '.mp4', '.avi', '.mov', '.m4v']:
          rows.append({'video_path': str(v), 'label': cls.name})
  # Create DataFrame and save to CSV
  df = pd.DataFrame(rows)
  df.to_csv(out_csv, index=False)
  print(f'Wrote {len(df)} rows to', out_csv)
  return df

# Generate or load metadata CSV if classes exist
if CLASSES:
  metadata_csv = 'ucf11_metadata.csv'
  # Create CSV if it doesn't exist
  if not os.path.exists(metadata_csv):
    df_meta = make_csv_from_ucf11(DATA_DIR, metadata_csv)
  else:
    # Load existing CSV
    df_meta = pd.read_csv(metadata_csv)
else:
  print('DATA_DIR not found or empty. Skip CSV generation.')
  df_meta = pd.DataFrame() # Empty DataFrame if dataset not found

Wrote 1600 rows to ucf11_metadata.csv


In [None]:
def create_subset(metadata_df: pd.DataFrame, per_class: int = 30, out_csv='ucf11_small.csv') -> pd.DataFrame:
  """Create a small balanced subset with `per_class` videos per class.
  Useful for limited Colab resources.
  """
  rows = []
  # Iterate through each unique class label
  for lbl in metadata_df['label'].unique():
    # Get all video paths for the current class
    vids = metadata_df[metadata_df['label'] == lbl]['video_path'].tolist()
    if len(vids) <= per_class:
      # If there are fewer videos than `per_class`, take all; otherwise, sample randomly
      chosen = vids
    else:
      chosen = random.sample(vids, per_class)
    rows += [{'video_path': v, 'label': lbl} for v in chosen]
  # Create DataFrame and save to CSV
  df = pd.DataFrame(rows)
  df.to_csv(out_csv, index=False)
  print('Created subset with', len(df), 'videos ->', out_csv)
  return df

# Create a small subset only if the dataset is large enough
if not df_meta.empty and len(df_meta) > 400:
  df_small = create_subset(df_meta, per_class=30, out_csv='ucf11_small.csv')
else:
  df_small = df_meta.copy() # Use full dataset if small

Created subset with 330 videos -> ucf11_small.csv


In [None]:
def sample_frames_from_video(video_path: str, seq_len: int = SEQ_LEN, img_size: int = IMG_SIZE) -> np.ndarray:
  """Read `seq_len` frames from the video evenly spaced and return as a numpy array
  shape: (seq_len, img_size, img_size, 3) with float32 values in [0,1]
  If video shorter than seq_len, we will loop frames to fill.
  """
  cap = cv2.VideoCapture(video_path)
  if not cap.isOpened():
    raise IOError(f'Cannot open video {video_path}')

  # Get total number of frames in the video
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
  if total_frames <= 0:
    frames = []
    # Fallback: read all frames sequentially if frame count is unavailable
    while True:
      ret, f = cap.read()
      if not ret:
        break
    frames.append(f)
  else:
    # Generate evenly spaced frame indices
    indices = np.linspace(0, max(total_frames - 1, 0), num=seq_len, dtype=np.int32)
    frames = []
    for idx in indices:
      cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
      ret, f = cap.read()
      if not ret:
        break
      frames.append(f)


  cap.release()
    # If no frames were read, return a zero array
  if len(frames) == 0:
    out = np.zeros((seq_len, img_size, img_size, 3), dtype=np.float32)
    return out

  # Repeat last frame if video shorter than seq_len
  while len(frames) < seq_len:
    frames.append(frames[-1])

  
  frames = frames[:seq_len]
  # Process frames: convert BGR->RGB, resize, normalize
  processed = []
  for f in frames:
    f = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
    f = cv2.resize(f, (img_size, img_size))
    f = f.astype('float32') / 255.0
    processed.append(f)

  # Stack frames into a single numpy array
  arr = np.stack(processed, axis=0)
  return arr

In [None]:
def build_frame_feature_extractor(img_size: int = IMG_SIZE):
  base = MobileNetV2(weights='imagenet', include_top=False, input_shape=(img_size, img_size, 3))
  inp = base.input
  x = base.output
  x = layers.GlobalAveragePooling2D()(x) # Convert feature maps to a single vector
  model = models.Model(inputs=inp, outputs=x)
  return model

# Instantiate the feature extractor
feature_extractor = build_frame_feature_extractor()
# Get the dimensionality of the feature vector for each frame
feature_dim = feature_extractor.output_shape[-1]
print('Feature dim per frame:', feature_dim)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Feature dim per frame: 1280


In [None]:
# Directory to save extracted features
FEATURE_DIR = 'features_ucf11'
os.makedirs(FEATURE_DIR, exist_ok=True)

def video_to_feature(video_path: str, feature_extractor_model, seq_len: int = SEQ_LEN, img_size: int = IMG_SIZE) -> np.ndarray:
  frames = sample_frames_from_video(video_path, seq_len=seq_len, img_size=img_size)
  feats = feature_extractor_model.predict(frames, verbose=0)
   # Pad features if video shorter than seq_len
  if feats.shape[0] < seq_len:
    last = feats[-1:]
    pad = np.repeat(last, seq_len - feats.shape[0], axis=0)
    feats = np.vstack([feats, pad])
  return feats.astype('float32')


def cache_features_for_df(df: pd.DataFrame, feature_dir: str = FEATURE_DIR, overwrite=False):
  rows = []
  for idx, row in df.iterrows():
    vpath = row['video_path']
    label = row['label']
    fname = os.path.splitext(os.path.basename(vpath))[0]
    key = f"{label}__{idx}.npy" # Unique filename for caching
    out_path = os.path.join(feature_dir, key)
    # Extract and save features if not already cached or overwrite is True
    if overwrite or (not os.path.exists(out_path)):
      try:
        feats = video_to_feature(vpath, feature_extractor, seq_len=SEQ_LEN, img_size=IMG_SIZE)
        np.save(out_path, feats)
      except Exception as e:
        print('Failed for', vpath, e)
        continue
    rows.append({'feature_path': out_path, 'label': label})
  return pd.DataFrame(rows)


# Extract and cache features for the small dataset
if not df_small.empty:
  df_features = cache_features_for_df(df_small, FEATURE_DIR)
else:
  df_features = pd.DataFrame() # Empty if no data

In [None]:
def make_dataset_from_feature_df(df_feat: pd.DataFrame, batch_size: int = BATCH_SIZE, shuffle: bool = True):
  file_paths = df_feat['feature_path'].values
  labels = df_feat['label'].values
  label2idx = {c: i for i, c in enumerate(sorted(set(labels)))}
  y = np.array([label2idx[l] for l in labels], dtype=np.int32)


  def load_npy(path, label):
    # Load feature array and ensure correct sequence length
    arr = np.load(path.decode('utf-8'))
    if arr.shape[0] != SEQ_LEN:
      if arr.shape[0] < SEQ_LEN:
        pad = np.repeat(arr[-1:], SEQ_LEN - arr.shape[0], axis=0)
        arr = np.vstack([arr, pad])
      else:
        arr = arr[:SEQ_LEN]
    return arr, label
  # Create TensorFlow dataset from file paths and labels
  paths_ds = tf.data.Dataset.from_tensor_slices((file_paths, y))


  def _py_loader(path, label):
    # Load features and convert to tensors
    feats, lab = tf.numpy_function(load_npy, [path, label], [tf.float32, tf.int32])
    feats.set_shape((SEQ_LEN, feature_dim)) # Set fixed shape for TF graph
    lab.set_shape(())
    return feats, tf.one_hot(lab, depth=len(label2idx)) # Convert label to one-hot


  ds = paths_ds.map(_py_loader, num_parallel_calls=tf.data.AUTOTUNE)
  if shuffle:
    ds = ds.shuffle(512, seed=RANDOM_SEED)
  ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
  return ds, label2idx


# Prepare training and validation datasets
if not df_features.empty:
  # Shuffle and split dataset
  df_features = df_features.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
  split = int(0.8 * len(df_features))
  df_train = df_features.iloc[:split]
  df_val = df_features.iloc[split:]


  train_ds, label2idx = make_dataset_from_feature_df(df_train)
  val_ds, _ = make_dataset_from_feature_df(df_val, shuffle=False)
  print('Train samples:', len(df_train), 'Val samples:', len(df_val))
else:
  train_ds = None
  val_ds = None
  label2idx = {} # Empty if no features

Train samples: 264 Val samples: 66


In [None]:
def build_lstm_classifier(seq_len: int = SEQ_LEN, feature_dim: int = feature_dim, num_classes: int = None):
  """
    Builds an LSTM-based classifier for video action recognition.
    
    Architecture:
    - Input: sequence of frame features (seq_len x feature_dim)
    - Masking layer: ignores padded frames if any
    - LSTM layer: captures temporal dependencies across frames
    - Dropout layers: reduce overfitting
    - Dense layers: project to final class probabilities using softmax
    """
  
  if num_classes is None:
    num_classes = NUM_CLASSES
  inp = layers.Input(shape=(seq_len, feature_dim), name='frame_features')
  x = layers.Masking()(inp)
  x = layers.LSTM(128, return_sequences=False)(x)
  x = layers.Dropout(0.4)(x)
  x = layers.Dense(64, activation='relu')(x)
  x = layers.Dropout(0.3)(x)
  out = layers.Dense(num_classes, activation='softmax')(x)
  model = models.Model(inputs=inp, outputs=out)
  return model

In [20]:
model = build_lstm_classifier(num_classes=len(label2idx))
model.compile(optimizer=optimizers.Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Training
if train_ds is not None:
  checkpoint_cb = callbacks.ModelCheckpoint('best_action_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
  early_cb = callbacks.EarlyStopping(monitor='val_accuracy', patience=4, restore_best_weights=True, verbose=1)


  history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=[checkpoint_cb, early_cb])
  model.save('final_action_model.keras')
else:
  print('No training data available. Ensure you have created features and df_features is not empty.')

Epoch 1/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9386 - loss: 0.4502
Epoch 1: val_accuracy improved from -inf to 0.77273, saving model to best_action_model.h5




[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9379 - loss: 0.4520 - val_accuracy: 0.7727 - val_loss: 0.7910
Epoch 2/15
[1m29/33[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 10ms/step - accuracy: 0.8912 - loss: 0.4769
Epoch 2: val_accuracy improved from 0.77273 to 0.80303, saving model to best_action_model.h5




[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8961 - loss: 0.4708 - val_accuracy: 0.8030 - val_loss: 0.7578
Epoch 3/15
[1m30/33[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 10ms/step - accuracy: 0.9240 - loss: 0.4558
Epoch 3: val_accuracy improved from 0.80303 to 0.81818, saving model to best_action_model.h5




[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9240 - loss: 0.4542 - val_accuracy: 0.8182 - val_loss: 0.6840
Epoch 4/15
[1m31/33[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 10ms/step - accuracy: 0.9093 - loss: 0.3663
Epoch 4: val_accuracy improved from 0.81818 to 0.84848, saving model to best_action_model.h5




[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9096 - loss: 0.3683 - val_accuracy: 0.8485 - val_loss: 0.6695
Epoch 5/15
[1m28/33[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 10ms/step - accuracy: 0.9069 - loss: 0.3400
Epoch 5: val_accuracy did not improve from 0.84848
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9108 - loss: 0.3369 - val_accuracy: 0.8333 - val_loss: 0.6381
Epoch 6/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9578 - loss: 0.2701
Epoch 6: val_accuracy did not improve from 0.84848
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9572 - loss: 0.2714 - val_accuracy: 0.8182 - val_loss: 0.6184
Epoch 7/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9558 - loss: 0.3156
Epoch 7: val_accurac

In [None]:
# End