# 0) Necessary Imports & Installations

In [None]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Import the necessary libraries

import tensorflow as tf
import cv2
import os
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio

import moviepy.editor

from pydub import AudioSegment

from google.colab.patches import cv2_imshow

# 1) Loading the Dataset

In [1]:
# Mount Google Drive at /content/drive so that the dataset archives and the
# saved models read by later cells (under /content/drive/MyDrive) are reachable.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unzip the datasets and extract their content into the current working directory

import zipfile

DATASET_ARCHIVES = [
    "/content/drive/MyDrive/NNFL Project/Datasets/Video_Speech_Actor_01.zip",
    "/content/drive/MyDrive/NNFL Project/Datasets/facial_emotion_dataset.zip",
]

for archive_path in DATASET_ARCHIVES:
    # The context manager closes the archive even when extraction fails part-way,
    # which the previous open / extractall / close sequence did not guarantee.
    with zipfile.ZipFile(archive_path) as zip_ref:
        zip_ref.extractall()

In [None]:
# Emotion class names. The position of a name in this list is used as its
# integer class id when the training labels are built.
class_labels = [
    "angry",
    "disgusted",
    "fearful",
    "happy",
    "neutral",
    "sad",
    "surprised",
]

# Number of distinct emotion classes.
num_classes = len(class_labels)

In [None]:
def load_and_preprocess_image(file_path, target_size):
    """Read an image from disk, convert it to RGB and resize it.

    Args:
        file_path: Path of the image file to read.
        target_size: Output size handed to ``cv2.resize``
            (OpenCV interprets it as ``(width, height)``).

    Returns:
        The resized RGB image produced by ``cv2.resize``.

    Raises:
        ValueError: If OpenCV cannot read ``file_path`` (missing file,
            unsupported format, or a non-image file in the dataset folder).
    """
    image = cv2.imread(file_path)

    # cv2.imread signals failure by returning None instead of raising; fail here
    # with the offending path rather than with an opaque error inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image file: {file_path}")

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR
    return cv2.resize(image, target_size)


# Size every training image is resized to (adjust as needed).
target_size = (64, 64)

# Root folder of the training images; it is expected to hold one sub-folder per class.
data_dir = '/content/emotion_dataset/train'

In [None]:

# Build the training set: one preprocessed image and one integer class id per file.
train_data = []
train_labels = []

# os.listdir returns entries in arbitrary order; sorting keeps the sample order reproducible.
for class_name in sorted(os.listdir(data_dir)):
    class_dir = os.path.join(data_dir, class_name)

    # Skip stray entries (hidden folders such as .ipynb_checkpoints, loose files)
    # instead of crashing in class_labels.index() or os.listdir().
    if not os.path.isdir(class_dir) or class_name not in class_labels:
        print(f"Skipping unexpected entry in {data_dir}: {class_name}")
        continue

    # The class id is the position of the folder name in class_labels.
    class_id = class_labels.index(class_name)

    for image_file in sorted(os.listdir(class_dir)):
        image_path = os.path.join(class_dir, image_file)
        train_data.append(load_and_preprocess_image(image_path, target_size))
        train_labels.append(class_id)

train_data = np.array(train_data)
train_labels = np.array(train_labels)


# 2) Extracting Video and Audio from the samples

In [None]:
# Function to capture individual frames present in a video

def FrameCapture(path):
  """Read every frame of a video and return them as model-ready arrays.

  Each frame is converted to grayscale, resized to 64x64, replicated into
  three identical channels and given a leading axis of length 1, so every
  returned element has shape (1, 64, 64, 3).

  Args:
    path: Path of the video file to read.

  Returns:
    A list with one array per frame, in playback order. The list is empty
    when no frame could be read from ``path``.
  """
  frames = []

  vidObj = cv2.VideoCapture(path)

  try:
    while True:
      success, image = vidObj.read()

      # read() reports the end of the stream (or a read failure) through `success`
      if not success:
        break

      # Converts the image frame into grayscale
      image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

      # Resizes the image to the dimensions as required by the pre-trained model
      image = cv2.resize(image, (64, 64))

      # Replicate the single gray channel three times and add the leading axis
      image = np.dstack([image] * 3)
      image = np.expand_dims(image, axis=0)

      frames.append(image)
  finally:
    # Always give the decoder / file handle back, even if a frame fails to convert
    vidObj.release()

  return frames

In [None]:
# Function to extract audio from a given audio-visual file

import moviepy.editor as mp

def ExtractAudio(path):
    """Extract the audio track of a video file into an .mp3 and a .wav file.

    Both files are written next to the input. Their name is everything before
    the first "." in ``path`` plus the new extension; the cell that builds the
    ``speech`` dataframe column derives the .wav paths the same way, so the two
    must stay in sync.

    Args:
        path: Path of the audio-visual file to read.
    """
    # Paths for mp3 and wav files
    stem = path.split(".")[0]
    mp3_file = stem + ".mp3"
    wav_file = stem + ".wav"

    clip = mp.VideoFileClip(path)
    try:
        clip.audio.write_audiofile(mp3_file)
    finally:
        # Release the clip's reader resources, even when writing the audio fails;
        # otherwise one reader stays open per processed video.
        clip.close()

    # convert mp3 file to wav file
    sound = AudioSegment.from_mp3(mp3_file)
    sound.export(wav_file, format="wav")

# Extract the audio track from each video file
# NOTE(review): `paths` is only assigned in a later cell of this notebook, so on a
# fresh kernel this loop fails unless that cell has been run first.

for video_path in paths:
    ExtractAudio(video_path)

In [None]:
# Function to extract mfcc features from audio samples

def extract_mfcc(filename):
    """Return the per-coefficient mean of 40 MFCCs for one audio file.

    Loads at most 3 seconds of audio starting 0.5 seconds into the file,
    computes 40 MFCCs and averages every coefficient over all frames.
    """
    signal, sample_rate = librosa.load(filename, duration=3, offset=0.5)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)

    # Transposing puts frames on axis 0, so the mean collapses the time dimension.
    per_frame = coefficients.T
    return np.mean(per_frame, axis=0)

In [None]:
# Iterate through each directory and store the path and label of each file

def collect_video_paths(root, per_dir_limit=60, skip_exts=('.mp3', '.wav')):
    """Collect sample paths and labels below ``root``.

    The label of a file is the text after the last "_" in its name, cut at the
    first "." and lower-cased.

    Args:
        root: Directory that is walked recursively.
        per_dir_limit: Maximum number of files kept per directory.
        skip_exts: File extensions (lower-case) that are ignored. The defaults
            are the .mp3/.wav files ExtractAudio writes next to the videos, so
            re-running this cell after audio extraction yields the same samples.

    Returns:
        A ``(paths, labels)`` tuple of two equally long lists.
    """
    found_paths = []
    found_labels = []

    for dirname, dirnames, filenames in os.walk(root):
        # os.walk yields entries in arbitrary order; sorting makes both the traversal
        # and the choice of the first `per_dir_limit` files reproducible.
        dirnames.sort()
        candidates = [name for name in sorted(filenames)
                      if not name.lower().endswith(tuple(skip_exts))]

        for filename in candidates[:per_dir_limit]:
            found_paths.append(os.path.join(dirname, filename))
            label = filename.split('_')[-1].split('.')[0]
            found_labels.append(label.lower())

    return found_paths, found_labels


paths, labels = collect_video_paths('/content/Actor_01')
print('Dataset is Loaded')

Dataset is Loaded


In [None]:
# Capture the frames of every video in `paths`; frames_dir[k] is the frame list of paths[k]

frames_dir = [FrameCapture(video_path) for video_path in paths]

len(frames_dir)

60

In [None]:
# Frame count of the longest video; shorter videos are padded up to this length later on.
max_frames = max(map(len, frames_dir))
max_frames

149

In [None]:
# Sanity check: shape of the first video's stacked frames, i.e. (n_frames, 1, 64, 64, 3).
# Padding the shorter videos with empty frames is done in the following cell.
first_video_frames = frames_dir[0]
np.array(first_video_frames).shape

(130, 1, 64, 64, 3)

In [None]:
# Zero-pad every video's frame stack along the first axis so that all videos
# end up with exactly `max_frames` frames.
# NOTE(review): np.zeros defaults to float64, so the concatenated result is presumably
# float64 as well (8x the memory of uint8 frames) - confirm this is intended.
frames_dir_new = []

for video_frames in frames_dir:
  stacked = np.array(video_frames)
  n_missing = max_frames - len(video_frames)
  padding = np.zeros((n_missing, 1, 64, 64, 3))

  frames_dir_new.append(np.concatenate((stacked, padding)))

frames_dir = frames_dir_new
frames_dir[0].shape

(149, 1, 64, 64, 3)

In [None]:
# Build a dataframe with one row per sample, holding the path of its .wav file

import pandas as pd

# Each .wav path is the text before the first "." of the sample path plus ".wav",
# which is how ExtractAudio names the files it writes.
Y = [sample_path.split(".")[0] + ".wav" for sample_path in paths]

df = pd.DataFrame()
df['speech'] = Y

In [None]:
# Extract the mfcc feature vector of every audio file path stored in df['speech']

X_mfcc = df['speech'].apply(extract_mfcc)

# Stack the per-file vectors into one numpy array and append a trailing axis of
# length 1, which is the input layout the speech model is fed with.

X = np.expand_dims(np.array(list(X_mfcc)), -1)

## Uncomment this when testing out a single sound sample
# X = np.expand_dims(X, axis=0)

X.shape

# 3) Model Definition

In [None]:
# Load the two pre-trained Keras models from Google Drive

IMAGE_MODEL_PATH = '/content/drive/MyDrive/NNFL Project/Models/final_model_Custom.h5'
SPEECH_MODEL_PATH = '/content/drive/MyDrive/NNFL Project/Models/final_model_speechRecognition.h5'

load_model = tf.keras.models.load_model

# Model applied to the video frames
model_images = load_model(IMAGE_MODEL_PATH)

# Model applied to the mfcc features of the audio
model_speech = load_model(SPEECH_MODEL_PATH)

In [None]:
# Display the loaded image model object (shows only its repr, not its architecture)
model_images


<keras.src.engine.sequential.Sequential at 0x7bed35287f40>

In [None]:
# None of these Keras names is imported by the import cell at the top of the
# notebook, so on a fresh kernel this cell used to fail with a NameError.
from tensorflow.keras import layers, models
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# Model for images: three convolution stages on 64x64x3 frames,
# flattened into a 64-unit dense output
model1 = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu')
])

# Model for audio: an LSTM over the 40 mfcc values (one feature per step),
# followed by dense layers ending in a 64-unit output like the image model
model2 = Sequential([
    LSTM(256, return_sequences=False, input_shape=(40,1)),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu')
])