# Video Classifier using CNN and RNN :

In [15]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

### Data Collection :  

In [16]:
# Load the train/test manifests; later cells read their "label" and
# "video_name" columns.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [17]:
# Report how many rows each split contains.
print(f"Total videos for training : {len(train_df)}")
print(f"Total videos for testing  : {len(test_df)}")

# Show 10 sampled rows per split. No random_state is passed to `sample`, so
# the selection is not pinned.
print("Training Dataframe : ")
print(train_df.sample(10))

print("Testing Dataset    : ")
print(test_df.sample(10))

Total videos for training : 152
Total videos for testing  : 38
Training Dataframe : 
     Unnamed: 0          label                                    video_name
144         132  roadaccidents  data/roadaccidents/RoadAccidents033_x264.mp4
90           60          arson                  data/arson/Arson012_x264.mp4
62           21         arrest                data/arrest/Arrest022_x264.mp4
33          152  roadaccidents  data/roadaccidents/RoadAccidents054_x264.mp4
109         148  roadaccidents  data/roadaccidents/RoadAccidents050_x264.mp4
115          37         arrest                data/arrest/Arrest038_x264.mp4
145         117  roadaccidents  data/roadaccidents/RoadAccidents018_x264.mp4
56           62          arson                  data/arson/Arson014_x264.mp4
130         172  roadaccidents  data/roadaccidents/RoadAccidents074_x264.mp4
111          23         arrest                data/arrest/Arrest024_x264.mp4
Testing Dataset    : 
    Unnamed: 0          label                 

### Feed the videos to a Network : 

In [18]:
IMG_SIZE = 256  # side length (in pixels) that frames are resized to


def crop_center_square(frame):
    """Return the centered square region of ``frame``.

    The side of the square is the smaller of the first two axes of ``frame``;
    any trailing axes are passed through untouched.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    # Offsets use integer halves of each extent (dim // 2 - side // 2).
    top = height // 2 - side // 2
    left = width // 2 - side // 2
    return frame[top:top + side, left:left + side]

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video into an array of square, resized frames.

    Args:
        path: Location of the video, handed to ``cv2.VideoCapture``.
        max_frames: Stop once this many frames have been collected. With the
            default of 0 the count never matches, so reading continues until
            the capture stops returning frames.
        resize: Target size passed to ``cv2.resize`` for every frame.

    Returns:
        ``np.array`` built from the collected frames. Each frame is
        center-cropped, resized, and has its last-axis channels reordered
        with index ``[2, 1, 0]`` (presumably BGR -> RGB).
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        keep_reading = True
        while keep_reading:
            ok, raw_frame = capture.read()
            if not ok:
                break
            resized = cv2.resize(crop_center_square(raw_frame), resize)
            collected.append(resized[:, :, [2, 1, 0]])
            # Stop as soon as the requested number of frames is reached.
            keep_reading = len(collected) != max_frames
    finally:
        # Always free the capture handle, even if decoding raised.
        capture.release()
    return np.array(collected)

### Transfer Learning & Feature Extraction :

In [19]:
def build_feature_extractor():
    """Wrap an ImageNet-weighted InceptionV3 as a per-frame feature extractor.

    The returned model takes ``(IMG_SIZE, IMG_SIZE, 3)`` images, applies the
    InceptionV3 ``preprocess_input`` transform, and outputs the backbone's
    average-pooled features (the classification head is excluded).
    """
    image_shape = (IMG_SIZE, IMG_SIZE, 3)
    backbone = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=image_shape,
    )

    frame_input = keras.Input(image_shape)
    scaled = keras.applications.inception_v3.preprocess_input(frame_input)
    return keras.Model(frame_input, backbone(scaled), name="feature_extractor")


feature_extractor = build_feature_extractor()

### Label Encoding
The `StringLookup` layer encodes the class labels as integers.

In [20]:
# Build the label encoder from the distinct class names in the training
# split; num_oov_indices=0 reserves no slot for unknown labels.
label_processor = keras.layers.StringLookup(
    num_oov_indices=0,
    vocabulary=np.unique(train_df["label"]),
)
print(label_processor.get_vocabulary())

# Encode the training labels as a column of integer ids and display them.
labels = label_processor(train_df["label"].values[..., None]).numpy()
labels

['arrest', 'arson', 'roadaccidents']


array([[1],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [0],
       [2],
       [2],
       [2],
       [0],
       [2],
       [2],
       [2],
       [0],
       [2],
       [1],
       [1],
       [2],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [0],
       [2],
       [0],
       [2],
       [0],
       [2],
       [1],
       [0],
       [2],
       [0],
       [2],
       [0],
       [2],
       [1],
       [0],
       [2],
       [2],
       [1],
       [0],
       [2],
       [0],
       [1],
       [2],
       [1],
       [0],
       [2],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [1],
       [2],
       [0],
       [0],
       [2],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [0],
       [1],
       [2],
       [2],
       [2],
       [2],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
    

<b>Finally, we can put all the pieces together to create our data processing utility.</b>

In [21]:
# Hyperparameters shared by the feature-extraction and training cells.

IMG_SIZE = 256  # same value that was assigned before `crop_center_square` was defined
BATCH_SIZE = 64  # NOTE(review): not referenced by any cell visible here; confirm it is still needed
EPOCHS = 100  # reassigned to 30 right before `run_experiment` is defined, so 100 is never used for training

MAX_SEQ_LENGTH = 20  # number of frames kept per video; shorter videos stay zero-padded and masked
NUM_FEATURES = 2048  # width of the per-frame feature vector stored for each timestep

In [22]:
def prepare_all_videos(df):
    """Turn every video listed in ``df`` into a fixed-length feature sequence.

    Args:
        df: DataFrame with a ``"video_name"`` column (paths passed to
            ``load_video``) and a ``"label"`` column (class names known to
            ``label_processor``).

    Returns:
        ``((frame_features, frame_masks), labels)`` where ``frame_features``
        is float32 with shape ``(len(df), MAX_SEQ_LENGTH, NUM_FEATURES)``,
        ``frame_masks`` is bool with shape ``(len(df), MAX_SEQ_LENGTH)``
        (True = real frame, False = padding), and ``labels`` holds the
        integer-encoded class of each row.
    """
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()

    # Integer-encode the class names; the added trailing axis gives one row
    # per video.
    labels = label_processor(df["label"].values[..., None]).numpy()

    # `frame_masks` and `frame_features` are what the sequence model consumes.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, path in enumerate(video_paths):
        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
        # there instead of holding the whole video in memory.
        frames = load_video(path, max_frames=MAX_SEQ_LENGTH)
        length = min(MAX_SEQ_LENGTH, len(frames))
        if length == 0:
            # No frame could be read: leave this row fully padded and masked.
            continue

        # One batched forward pass per video instead of one predict() call
        # per frame.
        frame_features[idx, :length] = feature_extractor.predict(frames[:length])
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    return (frame_features, frame_masks), labels


# Run feature extraction over every video listed in each split.
train_data, train_labels = prepare_all_videos(train_df)
test_data, test_labels = prepare_all_videos(test_df)

# The prints below emit section headers only; the statements that would dump
# the arrays themselves are left disabled.
print("train_video_data")
# print(train_data)
print("train_video_labels")
# print(train_labels)

print("test_video_data")
# print(test_data)
print("test_video_labels")
# print(test_labels)

# print(f"Frame features in train set: {train_data[0].shape}")
# print(f"Frame masks in train set: {train_data[1].shape}")
# print(f"train_labels in train set: {train_labels.shape}")
# print(f"test_labels in train set: {test_labels.shape}")




: 

: 

In [None]:
# Sanity-check the shapes returned by `prepare_all_videos` for both splits.
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")
print(f"train_labels in train set: {train_labels.shape}")
# `test_labels` comes from the test split, so label it as such.
print(f"test_labels in test set: {test_labels.shape}")


Frame features in train set: (152, 20, 2048)
Frame masks in train set: (152, 20)
train_labels in train set: (152, 1)
test_labels in train set: (38, 1)


### The sequence model
Now, we can feed this data to a sequence model consisting of recurrent layers like GRU.

In [None]:
# Builds the recurrent classifier that consumes the per-frame features.
def get_sequence_model():
    """Build and compile the GRU classifier over frame-feature sequences.

    The model takes two inputs, features of shape
    ``(MAX_SEQ_LENGTH, NUM_FEATURES)`` and a boolean mask of shape
    ``(MAX_SEQ_LENGTH,)``, and outputs one softmax probability per entry of
    the ``label_processor`` vocabulary.
    """
    num_classes = len(label_processor.get_vocabulary())

    features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_in = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # The mask is handed to the first GRU alongside the features; for the
    # meaning of `mask` see https://keras.io/api/layers/recurrent_layers/gru/
    hidden = keras.layers.GRU(16, return_sequences=True)(features_in, mask=mask_in)
    hidden = keras.layers.GRU(8)(hidden)
    hidden = keras.layers.Dropout(0.4)(hidden)
    hidden = keras.layers.Dense(8, activation="relu")(hidden)
    class_probs = keras.layers.Dense(num_classes, activation="softmax")(hidden)

    rnn_model = keras.Model(inputs=[features_in, mask_in], outputs=class_probs)
    rnn_model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return rnn_model

EPOCHS = 30


# Trains the sequence model and reports its accuracy on the test split.
def run_experiment():
    """Train a fresh sequence model and evaluate its best checkpoint.

    Weights are checkpointed to ``./tmp/video_classifier`` whenever the
    monitored validation metric improves (30% of the training data is held
    out via ``validation_split``). After training, the checkpointed weights
    are reloaded and evaluated on the test data.

    Returns:
        ``(history, model)``: the object returned by ``fit`` and the trained
        model with the checkpointed weights loaded.
    """
    weights_path = "./tmp/video_classifier"
    best_weights_saver = keras.callbacks.ModelCheckpoint(
        weights_path, save_weights_only=True, save_best_only=True, verbose=1
    )

    model = get_sequence_model()
    train_features, train_masks = train_data
    training_log = model.fit(
        [train_features, train_masks],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[best_weights_saver],
    )

    # Evaluate the checkpointed weights rather than the last-epoch weights.
    model.load_weights(weights_path)
    test_features, test_masks = test_data
    _, accuracy = model.evaluate([test_features, test_masks], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return training_log, model


_, sequence_model = run_experiment()

Epoch 1/30
Epoch 1: val_loss improved from inf to 1.08130, saving model to ./tmp\video_classifier
Epoch 2/30
Epoch 2: val_loss improved from 1.08130 to 1.07999, saving model to ./tmp\video_classifier
Epoch 3/30
Epoch 3: val_loss improved from 1.07999 to 1.07335, saving model to ./tmp\video_classifier
Epoch 4/30
Epoch 4: val_loss improved from 1.07335 to 1.06364, saving model to ./tmp\video_classifier
Epoch 5/30
Epoch 5: val_loss improved from 1.06364 to 1.06073, saving model to ./tmp\video_classifier
Epoch 6/30
Epoch 6: val_loss improved from 1.06073 to 1.05244, saving model to ./tmp\video_classifier
Epoch 7/30
Epoch 7: val_loss improved from 1.05244 to 1.03951, saving model to ./tmp\video_classifier
Epoch 8/30
Epoch 8: val_loss improved from 1.03951 to 1.02655, saving model to ./tmp\video_classifier
Epoch 9/30
Epoch 9: val_loss improved from 1.02655 to 1.01889, saving model to ./tmp\video_classifier
Epoch 10/30
Epoch 10: val_loss improved from 1.01889 to 1.00974, saving model to ./tmp

### Inference

In [None]:
def prepare_single_video(frames):
    """Convert the frames of one video into model-ready features and mask.

    Args:
        frames: Array of decoded frames as returned by ``load_video``. Only
            the first ``MAX_SEQ_LENGTH`` frames are used.

    Returns:
        ``(frame_features, frame_mask)``: float32 features of shape
        ``(1, MAX_SEQ_LENGTH, NUM_FEATURES)`` and a bool mask of shape
        ``(1, MAX_SEQ_LENGTH)`` (True = real frame, False = padding).
    """
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    length = min(MAX_SEQ_LENGTH, len(frames))
    if length > 0:
        # One batched forward pass instead of one predict() call per frame.
        frame_features[0, :length] = feature_extractor.predict(frames[:length])
        frame_mask[0, :length] = True  # True = not masked, False = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    """Classify one video and print its class probabilities, highest first.

    Args:
        path: Location of the video to load with ``load_video``.

    Returns:
        The frames array that ``load_video`` produced for ``path``.
    """
    frames = load_video(os.path.join(path))
    features, mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([features, mask])[0]

    class_vocab = label_processor.get_vocabulary()
    # argsort is ascending, so walk it backwards for most-likely-first order.
    for i in reversed(np.argsort(probabilities)):
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# Pick one test video at random (no seed is set) and classify it.
test_video = np.random.choice(test_df["video_name"].values.tolist())
print(test_video)
test_frames = sequence_prediction(test_video)

data/arrest/Arrest051_x264.mp4
  roadaccidents: 59.73%
  arrest: 25.97%
  arson: 14.30%
