In [1]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install imutils



In [5]:
from imutils import paths
from tqdm import tqdm
import pandas as pd 
import numpy as np
import shutil
import cv2
import os

In [6]:
# Open the .txt file which have names of training videos
f = open("sgsl_data/train/trainlist01.txt", "r")
temp = f.read()
videos = temp.split('\n')

# Create a dataframe having video names
train = pd.DataFrame()
train['video_name'] = videos
train = train[:-1]
train.head()

Unnamed: 0,video_name
0,Bedok-Bedok_1.MOV
1,Bedok-Bedok_2.mov
2,Bedok-Bedok_3.mov
3,Bedok-Bedok_4.MOV
4,Bedok-Bedok_5.mov


In [7]:
# Open the .txt file which have names of test videos
with open("sgsl_data/test/testlist01.txt", "r") as f:
    temp = f.read()
videos = temp.split("\n")

# Create a dataframe having video names
test = pd.DataFrame()
test["video_name"] = videos
test = test[:-1]
test.head()

Unnamed: 0,video_name
0,Bedok-Bedok_6.MOV
1,Bedok-Bedok_7.mov
2,Bishan-Bishan_6.MOV
3,Bishan-Bishan_7.mov
4,BoonLay-BoonLay_6.MOV


In [8]:
def extract_tag(video_path):
    return video_path.split("-")[0]

def separate_video_name(video_name):
    return video_name.split("-")[1]

def rectify_video_name(video_name):
    return video_name.split(" ")[0]

def move_videos(df, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for i in tqdm(range(df.shape[0])):
        videoFile = df['video_name'][i].split("-")[-1]
        videoPath = os.path.join("data", videoFile)
        shutil.copy2(videoPath, output_dir)
    print()
    print(f"Total videos: {len(os.listdir(output_dir))}")

In [9]:
train["tag"] = train["video_name"].apply(extract_tag)
train["video_name"] = train["video_name"].apply(separate_video_name)
train.head()

Unnamed: 0,video_name,tag
0,Bedok_1.MOV,Bedok
1,Bedok_2.mov,Bedok
2,Bedok_3.mov,Bedok
3,Bedok_4.MOV,Bedok
4,Bedok_5.mov,Bedok


In [10]:
train["video_name"] = train["video_name"].apply(rectify_video_name)
train.head()

Unnamed: 0,video_name,tag
0,Bedok_1.MOV,Bedok
1,Bedok_2.mov,Bedok
2,Bedok_3.mov,Bedok
3,Bedok_4.MOV,Bedok
4,Bedok_5.mov,Bedok


In [11]:
test["tag"] = test["video_name"].apply(extract_tag)
test["video_name"] = test["video_name"].apply(separate_video_name)
test.head()

Unnamed: 0,video_name,tag
0,Bedok_6.MOV,Bedok
1,Bedok_7.mov,Bedok
2,Bishan_6.MOV,Bishan
3,Bishan_7.mov,Bishan
4,BoonLay_6.MOV,BoonLay


In [12]:
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

In [13]:
!pip install -q git+https://github.com/tensorflow/docs

In [14]:
from tensorflow_docs.vis import embed
from tensorflow import keras


import matplotlib.pyplot as plt
import tensorflow as tf
import imageio

In [15]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 25

MAX_SEQ_LENGTH = 100
NUM_FEATURES = 2048

In [16]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 96
Total videos for testing: 37


Unnamed: 0,video_name,tag
91,Orchard_5.mov,Orchard
81,Newton_5.mov,Newton
89,Orchard_3.mov,Orchard
5,Bishan_1.MOV,Bishan
57,Eunos_5.mov,Eunos
62,Kallang_5.mov,Kallang
61,Kallang_4.MOV,Kallang
37,CityHall_4.MOV,CityHall
28,BukitGombak_5.mov,BukitGombak
50,DhobyGhaut_3.mov,DhobyGhaut


In [17]:
# The following two methods are taken from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub


def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [18]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

In [19]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
print(label_processor.get_vocabulary())

['Bedok', 'Bishan', 'BoonLay', 'Braddell', 'BukitBatok', 'BukitGombak', 'BuonaVista', 'CityHall', 'Clementi', 'Commonwealth', 'DhobyGhaut', 'Eunos', 'Kallang', 'Khatib', 'MarinaSouthPier', 'MoneyB', 'Newton', 'Novena', 'Orchard', 'OutramPark']


In [20]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (96, 100, 2048)
Frame masks in train set: (96, 100)


In [21]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = "./video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()

Epoch 1/25
Epoch 1: val_loss improved from inf to 2.99854, saving model to .\video_classifier
Epoch 2/25
Epoch 2: val_loss did not improve from 2.99854
Epoch 3/25
Epoch 3: val_loss did not improve from 2.99854
Epoch 4/25
Epoch 4: val_loss did not improve from 2.99854
Epoch 5/25
Epoch 5: val_loss did not improve from 2.99854
Epoch 6/25
Epoch 6: val_loss did not improve from 2.99854
Epoch 7/25
Epoch 7: val_loss did not improve from 2.99854
Epoch 8/25
Epoch 8: val_loss did not improve from 2.99854
Epoch 9/25
Epoch 9: val_loss did not improve from 2.99854
Epoch 10/25
Epoch 10: val_loss did not improve from 2.99854
Epoch 11/25
Epoch 11: val_loss did not improve from 2.99854
Epoch 12/25
Epoch 12: val_loss did not improve from 2.99854
Epoch 13/25
Epoch 13: val_loss did not improve from 2.99854
Epoch 14/25
Epoch 14: val_loss did not improve from 2.99854
Epoch 15/25
Epoch 15: val_loss did not improve from 2.99854
Epoch 16/25
Epoch 16: val_loss did not improve from 2.99854
Epoch 17/25
Epoch 17: 

In [22]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")


test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)
# to_gif(test_frames[:MAX_SEQ_LENGTH])

Test video path: BukitBatok_6.MOV
  Eunos:  5.01%
  CityHall:  5.01%
  Bishan:  5.01%
  BoonLay:  5.01%
  BukitBatok:  5.01%
  Khatib:  5.01%
  Commonwealth:  5.01%
  BuonaVista:  5.01%
  Kallang:  5.01%
  Braddell:  5.00%
  Clementi:  5.00%
  BukitGombak:  5.00%
  Bedok:  5.00%
  DhobyGhaut:  5.00%
  Orchard:  4.99%
  MarinaSouthPier:  4.99%
  MoneyB:  4.99%
  Newton:  4.99%
  Novena:  4.99%
  OutramPark:  4.99%
