In [2]:
#Importing the necessary modules
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_docs.vis import embed

logging.set_verbosity(logging.ERROR)

# Some modules to help with reading the UCF101 dataset.
import random
import re
import os
import tempfile
import ssl
import cv2
import numpy as np

import imageio
from IPython import display

from urllib import request




In [3]:
# These are some constants to set up that will be used in feature extraction and RNN.
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 10

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

In [4]:
#@title Helper functions for the UCF101 dataset

# Utilities to fetch videos from UCF101 dataset
UCF_ROOT = "https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/"
_VIDEO_LIST = None
# Setting the cache directory to store the videos not to keep downloading them everytime
_CACHE_DIR = "C://Users//most2//AppData//Local//Temp//tmprw3n5sec"
# As of July 2020, crcv.ucf.edu doesn't use a certificate accepted by the
# default Colab environment anymore.
unverified_context = ssl._create_unverified_context()

def list_ucf_videos():
  """Lists videos available in UCF101 dataset."""
  global _VIDEO_LIST
  if not _VIDEO_LIST:
    index = request.urlopen(UCF_ROOT, context=unverified_context).read().decode("utf-8")
    videos = re.findall("(v_[\w_]+\.avi)", index)
    _VIDEO_LIST = sorted(set(videos))
  return list(_VIDEO_LIST)

def fetch_ucf_video(video):
  """Fetches a video and cache into local filesystem."""
  cache_path = os.path.join(_CACHE_DIR, video)
  if not os.path.exists(cache_path):
    urlpath = request.urljoin(UCF_ROOT, video)
    print("Fetching %s => %s" % (urlpath, cache_path))
    data = request.urlopen(urlpath, context=unverified_context).read()
    open(cache_path, "wb").write(data)
  return cache_path

# Utilities to open video files using CV2
def crop_center_square(frame):
  y, x = frame.shape[0:2]
  min_dim = min(y, x)
  start_x = (x // 2) - (min_dim // 2)
  start_y = (y // 2) - (min_dim // 2)
  return frame[start_y:start_y+min_dim,start_x:start_x+min_dim]

def load_video(path, max_frames=0, resize=(224, 224)):
  cap = cv2.VideoCapture(path)
  frames = []
  i = 0
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      if (i%5 == 0):
        frame = crop_center_square(frame)
        frame = cv2.resize(frame, resize)
        frame = frame[:, :, [2, 1, 0]]
        frames.append(frame)
      i += 1
      if len(frames) == max_frames:
        break
  finally:
    cap.release()
  return np.array(frames) / 255.0

def to_gif(images):
  converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)
  imageio.mimsave('./animation.gif', converted_images, duration=40)
  return embed.embed_file('./animation.gif')

In [6]:
# Get the list of videos in the dataset.
ucf_videos = list_ucf_videos()
  
categories = {}
for video in ucf_videos:
  category = video[2:-12]
  if category not in categories:
    categories[category] = []
  categories[category].append(video)
print("Found %d videos in %d categories." % (len(ucf_videos), len(categories)))

for category, sequences in categories.items():
  summary = ", ".join(sequences[:2])
  print("%-20s %4d videos (%s, ...)" % (category, len(sequences), summary))

Found 13320 videos in 101 categories.
ApplyEyeMakeup        145 videos (v_ApplyEyeMakeup_g01_c01.avi, v_ApplyEyeMakeup_g01_c02.avi, ...)
ApplyLipstick         114 videos (v_ApplyLipstick_g01_c01.avi, v_ApplyLipstick_g01_c02.avi, ...)
Archery               145 videos (v_Archery_g01_c01.avi, v_Archery_g01_c02.avi, ...)
BabyCrawling          132 videos (v_BabyCrawling_g01_c01.avi, v_BabyCrawling_g01_c02.avi, ...)
BalanceBeam           108 videos (v_BalanceBeam_g01_c01.avi, v_BalanceBeam_g01_c02.avi, ...)
BandMarching          155 videos (v_BandMarching_g01_c01.avi, v_BandMarching_g01_c02.avi, ...)
BaseballPitch         150 videos (v_BaseballPitch_g01_c01.avi, v_BaseballPitch_g01_c02.avi, ...)
BasketballDunk        131 videos (v_BasketballDunk_g01_c01.avi, v_BasketballDunk_g01_c02.avi, ...)
Basketball            134 videos (v_Basketball_g01_c01.avi, v_Basketball_g01_c02.avi, ...)
BenchPress            160 videos (v_BenchPress_g01_c01.avi, v_BenchPress_g01_c02.avi, ...)
Biking              

## 1) Dataset 

In [7]:
#imports

import pandas as pd

In [8]:
data = [(category, item) for category, items in categories.items() for item in items]
df = pd.DataFrame(data, columns=['Category', 'VideoPath'])

# Taking only 100 videos from ApplyEyeMakeup, 100 from CliffDiving, 100 from Archery and 100 from Bowling
df = df[(df['Category'] == 'Bowling') | (df['Category'] == 'ApplyEyeMakeup') | (df['Category'] == 'CliffDiving') | (df['Category'] == 'Archery')].groupby('Category').head(100).reset_index(drop=True)

print(df)

           Category                     VideoPath
0    ApplyEyeMakeup  v_ApplyEyeMakeup_g01_c01.avi
1    ApplyEyeMakeup  v_ApplyEyeMakeup_g01_c02.avi
2    ApplyEyeMakeup  v_ApplyEyeMakeup_g01_c03.avi
3    ApplyEyeMakeup  v_ApplyEyeMakeup_g01_c04.avi
4    ApplyEyeMakeup  v_ApplyEyeMakeup_g01_c05.avi
..              ...                           ...
395     CliffDiving     v_CliffDiving_g18_c03.avi
396     CliffDiving     v_CliffDiving_g18_c04.avi
397     CliffDiving     v_CliffDiving_g18_c05.avi
398     CliffDiving     v_CliffDiving_g18_c06.avi
399     CliffDiving     v_CliffDiving_g18_c07.avi

[400 rows x 2 columns]
Category
ApplyEyeMakeup    100
Archery           100
Bowling           100
CliffDiving       100
Name: count, dtype: int64
Videos Shape:  (33, 224, 224, 3)


## 2) Data Preprocessing

In [9]:
#imports

from IPython.display import Video
import matplotlib.pyplot as plt # for plotting
from sklearn.model_selection import train_test_split

import os

import keras

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cv2
from IPython.display import Image

#### a) Define helper method to extract frames from videos, the method should take video path as input and return an array of video frames as output

In [10]:
def extractFrames(path):
    video = load_video(fetch_ucf_video(path), max_frames=20)
    return video


#### b) Apply preprocessing steps on each frame ex: (resize to a fixed size to normalize frame dimensions)

In [19]:
# Already done in the load_video function

#### c) Split the dataset into training, validation, and testing sets (optional, can be done during data loading).

In [11]:
# Splitting the data into train and test

train_df = df.sample(frac=0.7)
test_df = df.drop(train_df.index).reset_index(drop=True)

print(df['Category'].value_counts())
print("Videos Shape: ", load_video(fetch_ucf_video(df['VideoPath'][0])).shape)

# The Validation set is created from the training set during the training process

## 3 Feature Extraction with CNN

#### a) Implement a feature extractor that could extract features form Image

In [12]:
import numpy as np
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image

In [13]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

# Extracting features from the videos using this feature extractor model
feature_extractor = build_feature_extractor()

In [14]:
# Label processor for the categories in the dataset to be numeric just like label encoding (to be used in the RNN)
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["Category"])
)
print(label_processor.get_vocabulary())

['ApplyEyeMakeup', 'Archery', 'Bowling', 'CliffDiving']


In [37]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["VideoPath"].values.tolist()
    labels = df["Category"].values
    labels = keras.ops.convert_to_numpy(label_processor(labels[..., None]))

    # 'frame_masks' and 'frame_features' are what we will feed to our sequence model.
    # 'frame_masks' will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = extractFrames(path)
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(
            shape=(
                1,
                MAX_SEQ_LENGTH,
            ),
            dtype="bool",
        )
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # A video has been reduced to 'MAX_SEQ_LENGTH' features.
        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            # The batch is a frame in the video.
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :], verbose=0,
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels

# Getting the features for both training and testing sets and their corresponding labels
train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (280, 20, 2048)
Frame masks in train set: (280, 20)


In [15]:
import pickle

In [48]:
# saving the features locally not to run the CNN model again

with open('all_videos_dataset.pkl', 'wb') as f:
    pickle.dump([train_data, train_labels, test_data, test_labels], f)
    

In [16]:
loadedFeatures = None

# loading the features
with open('all_videos_dataset.pkl', 'rb') as f:
    train_data, train_labels, test_data, test_labels = pickle.load(f)

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (280, 20, 2048)
Frame masks in train set: (280, 20)


## 4 Sequence Modeling with RNN

In [42]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")
    # We made a mask input because we don't want to condition on padded values (If any).
    
    # Our RNN model layers were
    # Input -> LSTM -> LSTM -> Dense -> Output
    x = keras.layers.LSTM(32, return_sequences=True)(
        frame_features_input, mask=mask_input
    )
    x = keras.layers.LSTM(32)(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    rnn_model.summary()
    return rnn_model

seq_model = get_sequence_model()

## 5 Model Training


#### a) Combine Features and RNN: Concatenate or feed the extracted features from the CNN into the first layer of the RNN.


In [38]:
def run_experiment():
    # Training the model on the training data and validating on the validation set and saving the best model with the best weights
    filepath = "model.weights.h5"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)

    return history, seq_model


_, sequence_model = run_experiment()

Epoch 1/10
[1m5/7[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 13ms/step - accuracy: 0.6875 - loss: 0.8200
Epoch 1: val_loss improved from inf to 0.64782, saving model to model.weights.h5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.7026 - loss: 0.7914 - val_accuracy: 0.7500 - val_loss: 0.6478
Epoch 2/10
[1m6/7[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 11ms/step - accuracy: 0.7296 - loss: 0.6501
Epoch 2: val_loss did not improve from 0.64782
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.7270 - loss: 0.6527 - val_accuracy: 0.6905 - val_loss: 0.8051
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6953 - loss: 0.7457 
Epoch 3: val_loss did not improve from 0.64782
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6958 - loss: 0.7467 - val_accuracy: 0.7143 - val_loss: 0.7559
Epoch 4/10
[1m5/7[0m [32m━━

## 6 Evaluation

#### a) Evaluate the model’s performance on the testing set using metrics like accuracy, precision, recall, and F1-score

In [41]:
# Testing the model on the test data
y_pred = sequence_model.predict([test_data[0], test_data[1]])

print(y_pred)
# Getting the class with the highest probability
class_pred = np.argmax(y_pred, axis=1)
print(class_pred)
print(test_labels.flatten())

# Calculating the accuracy
print("Training Accuracy: ", np.mean(class_pred == test_labels.flatten()))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[[0.6427876  0.03286105 0.21471083 0.10964059]
 [0.40319863 0.03571411 0.21990497 0.34118226]
 [0.4524123  0.16228    0.02898185 0.3563259 ]
 [0.15450868 0.41500416 0.02099449 0.4094926 ]
 [0.6587893  0.01383094 0.29921398 0.02816585]
 [0.79062736 0.01050435 0.1692647  0.02960349]
 [0.9258201  0.00897757 0.04457782 0.02062441]
 [0.87713164 0.0076946  0.08350866 0.03166518]
 [0.9274963  0.00932378 0.0386866  0.02449337]
 [0.93470687 0.00741483 0.03586477 0.02201357]
 [0.93393886 0.01323683 0.02606211 0.02676221]
 [0.4818963  0.0740191  0.1228486  0.32123604]
 [0.38306051 0.05443369 0.17626247 0.38624334]
 [0.20569709 0.0748185  0.6163762  0.1031082 ]
 [0.3714479  0.18807846 0.13766693 0.30280682]
 [0.8910512  0.00780288 0.07191209 0.02923385]
 [0.75600284 0.01439272 0.15535691 0.07424758]
 [0.9109697  0.00823644 0.05042314 0.03037071]
 [0.8326947  0.0095828  0.11590599 0.04181653]
 [0.9053996  0.00789706 0.05640641 