<h1>Importing the model</h1>

In [None]:
import os
import numpy as np
import sklearn
import tensorflow


# Directory that holds the pre-processed arrays. It can be overridden without
# editing the notebook by setting the PROCESSED_DATA_PATH environment variable;
# the original location is kept as the fallback so existing setups still work.
PROCESSED_DATA_PATH = os.environ.get(
    "PROCESSED_DATA_PATH",
    "D:/emotion_aware_speaker_identification/code/processed_data",
)

# Load the saved feature array and the two label arrays as NumPy arrays.
X = np.load(os.path.join(PROCESSED_DATA_PATH, 'X_data.npy'))
y_emotion = np.load(os.path.join(PROCESSED_DATA_PATH, 'y_emotion.npy'))
y_speaker = np.load(os.path.join(PROCESSED_DATA_PATH, 'y_speaker.npy'))

# Every sample needs exactly one speaker label and one emotion label. Fail here
# with a clear message instead of later, inside the train/test split.
if not (len(X) == len(y_speaker) == len(y_emotion)):
    raise ValueError(
        "Sample count mismatch: "
        f"X has {len(X)}, y_speaker has {len(y_speaker)}, "
        f"y_emotion has {len(y_emotion)}"
    )

print(f"X data shape: {X.shape}")
print(f"Speaker labels shape: {y_speaker.shape}")
print(f"Emotion labels shape: {y_emotion.shape}")

X data shape: (1440, 224, 224, 3)
Speaker labels shape: (1440,)
Emotion labels shape: (1440,)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# --- Speaker labels ---
# Fit an encoder that assigns an integer id to every distinct speaker label,
# convert the labels to those ids, then expand the ids into one-hot rows
# (one column per speaker, with a single 1 marking the sample's speaker).
speaker_encoder = LabelEncoder().fit(y_speaker)
y_speaker_encoded = speaker_encoder.transform(y_speaker)
y_speaker_categorical = to_categorical(y_speaker_encoded)

# --- Emotion labels ---
# Same two steps for the emotion labels, using a separate encoder.
emotion_encoder = LabelEncoder().fit(y_emotion)
y_emotion_encoded = emotion_encoder.transform(y_emotion)
y_emotion_categorical = to_categorical(y_emotion_encoded)

# Hold out 20% of the samples for testing (test_size=0.2). The three arrays are
# split together so each image keeps its own speaker and emotion labels.
# random_state=42 fixes the shuffling seed, so the same split is produced on
# every run.
(
    X_train, X_test,
    y_speaker_train, y_speaker_test,
    y_emotion_train, y_emotion_test,
) = train_test_split(
    X,
    y_speaker_categorical,
    y_emotion_categorical,
    test_size=0.2,
    random_state=42,
)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: (1152, 224, 224, 3)
Test data shape: (288, 224, 224, 3)


In [8]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras import layers, Model

def create_vgg16_multi_head_model(input_shape, num_speakers, num_emotions,
                                  hidden_units=128, freeze_base=True):
    """Build a VGG16-based model with separate speaker and emotion heads.

    A VGG16 convolutional base (ImageNet weights, no top classifier) feeds a
    shared flattened feature vector into two independent classification heads.
    Each head is one ReLU hidden layer followed by a softmax output layer.

    Args:
        input_shape: Per-sample input shape passed to VGG16, e.g.
            (224, 224, 3) for (height, width, channels).
        num_speakers: Number of units in the 'speaker_output' softmax layer.
        num_emotions: Number of units in the 'emotion_output' softmax layer.
        hidden_units: Width of the hidden Dense layer in each head.
            Defaults to 128, the value that used to be hard-coded.
        freeze_base: When True (default, the previous behaviour) the VGG16
            weights are not updated during training; pass False to fine-tune.

    Returns:
        A Keras Model taking the VGG16 input and producing two outputs, in the
        order [speaker probabilities, emotion probabilities].
    """
    # NOTE(review): the ImageNet VGG16 weights are normally paired with
    # tensorflow.keras.applications.vgg16.preprocess_input; confirm the saved
    # input arrays were prepared accordingly.
    vgg_base = VGG16(
        include_top=False,        # drop VGG16's fully-connected classifier layers
        weights='imagenet',       # start from weights pre-trained on ImageNet
        input_shape=input_shape,
    )

    # A frozen base keeps its pre-trained weights fixed, so only the heads learn.
    vgg_base.trainable = not freeze_base

    # Dense layers expect one feature vector per sample, so collapse the
    # convolutional feature map. For example a (25, 25, 3) map becomes
    # 25 * 25 * 3 = 1875 values.
    features = layers.Flatten()(vgg_base.output)

    # Speaker head: the hidden ReLU layer (negative activations become 0) is
    # followed by a softmax over the speakers. The output layer is named so
    # losses and metrics can be keyed by 'speaker_output'.
    speaker_head = layers.Dense(hidden_units, activation='relu')(features)
    speaker_head = layers.Dense(num_speakers, activation='softmax', name='speaker_output')(speaker_head)

    # Emotion head: same structure, reading the same shared features.
    emotion_head = layers.Dense(hidden_units, activation='relu')(features)
    emotion_head = layers.Dense(num_emotions, activation='softmax', name='emotion_output')(emotion_head)

    # One input, two outputs.
    model = Model(inputs=vgg_base.input, outputs=[speaker_head, emotion_head])

    return model

# Per-sample input shape: drop the leading sample axis of X_train and keep the
# remaining (height, width, channels) dimensions.
input_shape = X_train.shape[1:]

# Each output layer gets one unit per class found by its fitted label encoder.
num_speakers = len(speaker_encoder.classes_)
num_emotions = len(emotion_encoder.classes_)

model = create_vgg16_multi_head_model(
    input_shape=input_shape,
    num_speakers=num_speakers,
    num_emotions=num_emotions,
)
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 0us/step


In [None]:
# NOTE: everything in this cell is wrapped in a triple-quoted string, so none of
# the code below is executed; the cell only contains a string expression.
# The disabled code would: compile the model with the Adam optimizer and
# categorical cross-entropy / accuracy for both named outputs, fit for 20 epochs
# with batch size 32, and then evaluate on the test split.
# NOTE(review): X_test is passed both as validation_data and to the final
# evaluate call, so the "test" scores come from data already monitored during
# training. Consider a separate validation split before re-enabling.
# NOTE(review): the five-way unpacking of model.evaluate(...) assumes a specific
# length and order of the returned values. Confirm this for the installed Keras
# version (or use return_dict=True) before re-enabling.
"""
# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'speaker_output': 'categorical_crossentropy',
        'emotion_output': 'categorical_crossentropy'
    },
    metrics={
        'speaker_output': 'accuracy',
        'emotion_output': 'accuracy'
    }
)

# Train the model
history = model.fit(
    X_train,
    {'speaker_output': y_speaker_train, 'emotion_output': y_emotion_train},
    epochs=20,  # Training fewer epochs is often enough for transfer learning
    batch_size=32,
    validation_data=(X_test, {'speaker_output': y_speaker_test, 'emotion_output': y_emotion_test})
)

# Evaluate the model
loss, speaker_loss, emotion_loss, speaker_accuracy, emotion_accuracy = model.evaluate(
    X_test,
    {'speaker_output': y_speaker_test, 'emotion_output': y_emotion_test},
    verbose=0
)

print(f"Overall Test Loss: {loss:.4f}")
print(f"Speaker Head Test Accuracy: {speaker_accuracy:.4f}")
print(f"Emotion Head Test Accuracy: {emotion_accuracy:.4f}")
"""