In [None]:
!pip install -q keras-cv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m613.1/613.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q kaggle

In [None]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the RAVDESS speech audio/video archive into the current working
# directory. The logged run below shows a ~6.4 GB zip landing in /content.
!kaggle datasets download -d saniuzzamanrobin/ravdess-speech-all-audio-and-video

Downloading ravdess-speech-all-audio-and-video.zip to /content
100% 6.39G/6.39G [01:07<00:00, 97.1MB/s]
100% 6.39G/6.39G [01:07<00:00, 102MB/s] 


In [None]:
!unzip -qq /content/ravdess-speech-all-audio-and-video.zip

In [None]:
import os

import keras
from imutils import paths

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import imageio
import cv2
from IPython.display import Image

In [None]:
# Side length, in pixels, that every frame is resized to; also the spatial
# input size of the feature extractor.
IMG_SIZE = 224
# Intended training mini-batch size -- verify that the `fit` call actually passes it.
BATCH_SIZE = 32
# Number of training epochs passed to `fit`.
EPOCHS = 10

# Number of frames kept per video; shorter videos are zero-padded and masked.
MAX_SEQ_LENGTH = 20
# Width of the feature vector stored for each frame.
NUM_FEATURES = 2048

In [None]:
def crop_center_square(frame):
    """Return the largest centered square region of ``frame``.

    Args:
        frame: Array whose first two axes are (height, width); any further
            axes are kept untouched.

    Returns:
        A slice of ``frame`` whose height and width both equal the smaller of
        the two original dimensions.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    top = height // 2 - side // 2
    left = width // 2 - side // 2
    return frame[top : top + side, left : left + side]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video file into an array of square, resized frames.

    Args:
        path: Location of the video file handed to ``cv2.VideoCapture``.
        max_frames: Stop once this many frames have been collected. With the
            default of 0 the count never matches, so the whole video is read.
        resize: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        ``np.array`` built from the collected frames; each frame is
        center-cropped to a square, resized, and has its first three channels
        reordered as ``[2, 1, 0]``. If no frame could be read the result is an
        empty array.
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        grabbed, image = capture.read()
        while grabbed:
            image = cv2.resize(crop_center_square(image), resize)
            # Swap the first and third channels (OpenCV decodes as BGR).
            collected.append(image[:, :, [2, 1, 0]])
            if len(collected) == max_frames:
                break
            grabbed, image = capture.read()
    finally:
        # Always free the decoder handle, even if a frame operation raised.
        capture.release()
    return np.array(collected)

In [None]:
def build_feature_extractor():
    """Build the per-frame feature extractor.

    Wraps an ImageNet-pretrained InceptionV3 (classification head removed,
    global average pooling applied) behind the matching ``preprocess_input``
    step, so callers can feed frames of shape ``(IMG_SIZE, IMG_SIZE, 3)``
    directly.

    Returns:
        A ``keras.Model`` named ``"feature_extractor"`` mapping a batch of
        frames to one pooled feature vector per frame.
    """
    frame_shape = (IMG_SIZE, IMG_SIZE, 3)
    backbone = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=frame_shape,
    )

    raw_frames = keras.Input(frame_shape)
    scaled_frames = keras.applications.inception_v3.preprocess_input(raw_frames)
    return keras.Model(raw_frames, backbone(scaled_frames), name="feature_extractor")


# Build the extractor once at notebook level; `prepare_all_videos` reads this
# global. The pretrained weights are downloaded on first use (see output below).
feature_extractor = build_feature_extractor()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Class names indexed by label id. Labels are computed elsewhere in this
# notebook as (third '-'-separated field of the file name) - 1, so position k
# here corresponds to file-name emotion code k + 1.
# NOTE(review): order presumably follows the RAVDESS emotion codes 01..08 -- confirm
# against the dataset documentation.
emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear','disgust','surprise']

In [None]:
# File names (not full paths) of everything in the video directory.
# os.listdir returns entries in arbitrary order; the sample index used for
# features, masks and labels follows the order of this list.
video_paths = os.listdir('/content/All Videos')

In [None]:
from tqdm import tqdm
def prepare_all_videos(root_dir):
    """Extract per-frame CNN features, padding masks and labels for every video.

    Iterates over the module-level ``video_paths`` (file names relative to
    ``root_dir``) and runs the first ``MAX_SEQ_LENGTH`` frames of each video
    through the module-level ``feature_extractor``.

    Args:
        root_dir: Directory that contains the files named in ``video_paths``.

    Returns:
        ``((frame_features, frame_masks), labels)`` where

        * ``frame_features`` is float32 with shape
          ``(num_videos, MAX_SEQ_LENGTH, NUM_FEATURES)``; rows past the end of
          a short video stay zero,
        * ``frame_masks`` is bool with shape ``(num_videos, MAX_SEQ_LENGTH)``
          (True = real frame, False = padding),
        * ``labels`` is int with shape ``(num_videos, 1)`` holding the
          zero-based class id parsed from the file name.

    Raises:
        IndexError, ValueError: If a file name does not carry an integer in
            its third ``'-'``-separated field.
    """
    num_samples = len(video_paths)
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    labels = np.zeros(shape=(num_samples, 1), dtype="int")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show a real progress bar with an ETA.
    for idx, path in enumerate(tqdm(video_paths)):
        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
        # there instead of reading and resizing the entire video.
        frames = load_video(os.path.join(root_dir, path), max_frames=MAX_SEQ_LENGTH)

        # The third '-'-separated field of the file name is the 1-based class code.
        labels[idx] = int(path.split("-")[2]) - 1

        length = min(MAX_SEQ_LENGTH, len(frames))
        if length == 0:
            # Nothing could be decoded: keep the all-zero, fully masked row.
            continue

        # One batched forward pass per video instead of one call per frame.
        frame_features[idx, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    return (frame_features, frame_masks), labels


# Run feature extraction over every listed video. This is the expensive step of
# the notebook: each video is decoded and passed through the CNN.
# `train_data` is the tuple (frame_features, frame_masks); despite the name it
# holds all videos -- the validation part is split off later by `fit`.
train_data, train_labels = prepare_all_videos("/content/All Videos")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

In [None]:
# One-hot encode the integer labels.
# The width comes from the class list rather than from the largest label seen
# in the data, so a class that happens to be absent cannot shrink the encoding
# below the model's output size (and an out-of-range label fails loudly with an
# IndexError instead of being reshaped into the wrong rows).
num_classes = len(emotions)
# Flatten the (num_samples, 1) label column so the result is (num_samples, num_classes).
train_labels_oh = np.eye(num_classes)[train_labels.reshape(-1)]
train_labels_oh.shape

In [None]:
def get_sequence_model():
    """Build and compile the recurrent classifier that consumes frame features.

    The model takes two inputs -- a ``(MAX_SEQ_LENGTH, NUM_FEATURES)`` feature
    sequence and a boolean ``(MAX_SEQ_LENGTH,)`` mask -- and outputs a softmax
    distribution over ``len(emotions)`` classes.

    Returns:
        A compiled ``keras.Model`` (Adam optimizer, categorical cross-entropy
        loss, accuracy metric).
    """
    features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_in = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # The mask is handed to the first recurrent layer so padded timesteps are skipped.
    hidden = keras.layers.LSTM(32, return_sequences=True)(features_in, mask=mask_in)
    hidden = keras.layers.LSTM(16)(hidden)
    hidden = keras.layers.Dropout(0.4)(hidden)
    hidden = keras.layers.Dense(16, activation="relu")(hidden)
    class_probs = keras.layers.Dense(len(emotions), activation="softmax")(hidden)

    model = keras.Model([features_in, mask_in], class_probs)
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# Build the sequence model and train it on the extracted features.
# BATCH_SIZE is passed explicitly so that the value in the config cell is the
# one actually used for training.
# Note: Keras takes the validation split from the *last* 30% of the samples,
# before any shuffling, so the split depends on the order of the input arrays.
seq_model = get_sequence_model()
history = seq_model.fit(
    [train_data[0], train_data[1]],
    train_labels_oh,
    batch_size=BATCH_SIZE,
    validation_split=0.3,
    epochs=EPOCHS,
)


In [None]:
# Preview the first listed video inline. Features are extracted in the order of
# `video_paths`, so this is the same clip as sample index 0 of `train_data`.
import moviepy.editor
moviepy.editor.ipython_display('/content/All Videos/'+video_paths[0])

In [None]:
# Feature matrix of sample 0: (frames per video, features per frame).
train_data[0][0].shape

(20, 2048)

In [None]:
# Predict class probabilities for sample 0 only. Indexing with np.newaxis
# restores the leading batch axis that the model expects on both inputs.
pred = seq_model.predict(
    [
        train_data[0][0][np.newaxis],
        train_data[1][0][np.newaxis],
    ]
)



In [None]:
# Map the index of the highest predicted probability back to its emotion name.
emotions[pred.argmax()]

'happy'