In [None]:
#Creating a new RNN model but keeping the features and masking the same

In [18]:
#from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths
from tensorflow.keras.preprocessing import image
from tensorflow_docs.vis import embed

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [2]:
IMG_W = 224
IMG_H = 224
BATCH_SIZE = 64
EPOCHS = 100

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

In [3]:
df1 = pd.read_csv("../Data/mirror-data2.csv")
df1 = df1[df1.Action != ("Talking" or "talking")]

In [4]:

def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(frames,  resize=(IMG_H, IMG_W)):
    
    frames = []
    for frame in frames:

        frame = crop_center_square(frame)
        frame = cv2.resize(frame, resize)
        frame = frame[:, :, [2, 1, 0]]
        frames.append(frame)

        
    
    return np.array(frames)


In [11]:

def build_feature_extractor():
    feature_extractor = keras.applications.VGG16(
        weights='imagenet',
        include_top=False,
        pooling="avg",
        input_shape=(IMG_H, IMG_W, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_H, IMG_W, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [12]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(df1['Action'])
)
print(label_processor.get_vocabulary())

['Normal', 'Talking&Yawning', 'Yawning', 'talking']


In [13]:
i = 0
dfTrain = pd.DataFrame()
dfTest = pd.DataFrame()

while i<len(df1):
    if i%5==0:
        dfTest = dfTest.append(df1.iloc[[i]])
    else :
        dfTrain = dfTrain.append(df1.iloc[[i]])

    i+=1

In [14]:

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video-name"].values.tolist()
    start_nums = df["yawn-start"].values.tolist()
    # print(video_paths)
    labels = df["Action"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        videoName = path
        number = start_nums[idx]
        frames = []

        while(len(frames)<=MAX_SEQ_LENGTH):
            path = "../Data/VideoFrames/"+videoName+"/"+videoName+"_"+f"{number:03}"+".jpg"
            frames.append(image.load_img(path, target_size=(224, 224, 3)))
            number+=1
        print(path)


        frames = load_video(frames)
        frames = frames[None, ...]
        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )
        # Extract features from the frames of the current video.
        
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


train_data, train_labels = prepare_all_videos(dfTrain, "../Data/Mirror")
test_data, test_labels = prepare_all_videos(dfTest, "../Data/Mirror")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

../Data/VideoFrames/1-FemaleNoGlasses-Yawning.avi/1-FemaleNoGlasses-Yawning.avi_070.jpg
../Data/VideoFrames/2-FemaleNoGlasses-Normal.avi/2-FemaleNoGlasses-Normal.avi_020.jpg
../Data/VideoFrames/2-FemaleNoGlasses-Yawning.avi/2-FemaleNoGlasses-Yawning.avi_395.jpg
../Data/VideoFrames/3-FemaleGlasses-Normal.avi/3-FemaleGlasses-Normal.avi_020.jpg
../Data/VideoFrames/4-FemaleGlasses-Normal.avi/4-FemaleGlasses-Normal.avi_020.jpg
../Data/VideoFrames/4-FemaleGlasses-Yawning.avi/4-FemaleGlasses-Yawning.avi_030.jpg
../Data/VideoFrames/5-FemaleGlasses-Normal.avi/5-FemaleGlasses-Normal.avi_020.jpg
../Data/VideoFrames/5-FemaleGlasses-Yawning.avi/5-FemaleGlasses-Yawning.avi_160.jpg
../Data/VideoFrames/6-FemaleNoGlasses-Yawning.avi/6-FemaleNoGlasses-Yawning.avi_140.jpg
../Data/VideoFrames/7-FemaleGlasses-Normal.avi/7-FemaleGlasses-Normal.avi_020.jpg
../Data/VideoFrames/7-FemaleGlasses-Yawning.avi/7-FemaleGlasses-Yawning.avi_105.jpg
../Data/VideoFrames/8-FemaleGlasses-Normal.avi/8-FemaleGlasses-Normal.

In [None]:
print(f"Frame features in train set: {test_data[0].shape}")
print(f"Frame masks in train set: {test_data[1].shape}")

Frame features in train set: (44, 20, 2048)
Frame masks in train set: (44, 20)


In [15]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    x = keras.layers.GRU(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.Adam(epsilon=1, learning_rate=5), metrics=["accuracy"]
    )
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = "./video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()

Epoch 1/100
Epoch 1: val_loss improved from inf to 1.22618, saving model to .\video_classifier
Epoch 2/100
Epoch 2: val_loss improved from 1.22618 to 1.07359, saving model to .\video_classifier
Epoch 3/100
Epoch 3: val_loss improved from 1.07359 to 0.97663, saving model to .\video_classifier
Epoch 4/100
Epoch 4: val_loss improved from 0.97663 to 0.93079, saving model to .\video_classifier
Epoch 5/100
Epoch 5: val_loss improved from 0.93079 to 0.91211, saving model to .\video_classifier
Epoch 6/100
Epoch 6: val_loss improved from 0.91211 to 0.90420, saving model to .\video_classifier
Epoch 7/100
Epoch 7: val_loss improved from 0.90420 to 0.89809, saving model to .\video_classifier
Epoch 8/100
Epoch 8: val_loss improved from 0.89809 to 0.89471, saving model to .\video_classifier
Epoch 9/100
Epoch 9: val_loss improved from 0.89471 to 0.89054, saving model to .\video_classifier
Epoch 10/100
Epoch 10: val_loss improved from 0.89054 to 0.88823, saving model to .\video_classifier
Epoch 11/100

In [19]:

def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        print(length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("../Data/Mirror/", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames





def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")


test_video = np.random.choice(dfTest["video-name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)
to_gif(test_frames[:MAX_SEQ_LENGTH])

Test video path: 36-FemaleNoGlasses-Talking&Yawning.avi
0
  Yawning: 49.06%
  Normal: 44.10%
  Talking&Yawning:  6.04%
  talking:  0.80%


ValueError: Image data must be a sequence of ndimages.