In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import tensorflow as tf
import os
from PIL import Image
import kaggle
from sklearn.model_selection import train_test_split
import pickle # to save training history
#import tensorflow_hub as hub




In [3]:
data_dir = './human-face-emotions/data'

In [89]:
full_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    label_mode="int",
    image_size=(48, 48),
    color_mode="grayscale",
    batch_size=None,
    shuffle=True,
    seed=42
)

class_names = full_ds.class_names
num_classes = len(class_names)

print("Classes:", class_names)

Found 57756 files belonging to 5 classes.
Classes: ['Angry', 'Fear', 'Happy', 'Sad', 'Suprise']


In [5]:
full_data = list(full_ds.as_numpy_iterator())

images = [x[0] for x in full_data]  # list of arrays
labels = [x[1] for x in full_data]  # list of ints

In [None]:
x_train, x_temp, y_train, y_temp = train_test_split(
    images, labels,
    test_size=0.30,         # 30% val+test
    random_state=42,
    stratify=labels 
)

x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp,
    test_size=0.50,         # 15% val, 15% test
    random_state=42,
    stratify=y_temp
)

In [30]:
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
val_ds   = tf.data.Dataset.from_tensor_slices((x_val, y_val))
test_ds  = tf.data.Dataset.from_tensor_slices((x_test, y_test))

In [None]:
# no doing this for transformer
#batch_size = 128

#train_ds = train_ds.shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
#val_ds   = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
#test_ds  = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [90]:
# ViT required size is 224x224

def preprocess_vit(image, label):
    image = tf.image.grayscale_to_rgb(image)
    image = tf.image.resize(image, (224, 224))

    # normalize pixel values
    image = image / 255.0
    return image, label

In [91]:
## originally created for vit but same works for ef

vit_train_ds = train_ds.map(preprocess_vit).shuffle(10000).batch(32).prefetch(tf.data.AUTOTUNE)
vit_val_ds   = val_ds.map(preprocess_vit).batch(32).prefetch(tf.data.AUTOTUNE)
vit_test_ds  = test_ds.map(preprocess_vit).batch(32).prefetch(tf.data.AUTOTUNE)

## prefetch(tf.data.AUTOTUNE) - makes preloading dynamic --> faster training

### self trained VIT

In [92]:
class PatchEmbedding(tf.keras.layers.Layer):
    def __init__(self, patch_size=4, embed_dim=64):
        super().__init__()
        self.projection = tf.keras.layers.Conv2D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding="valid"
        )
        self.flatten = tf.keras.layers.Reshape((-1, embed_dim))

    def call(self, x):
        x = self.projection(x)
        x = self.flatten(x)
        return x


In [93]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, max_len=1024, embed_dim=64):
        super().__init__()
        self.pos_embedding = tf.keras.layers.Embedding(
            input_dim=max_len,
            output_dim=embed_dim
        )

    def call(self, x):
        seq_len = tf.shape(x)[1]             
        positions = tf.range(seq_len)         
        pos_embed = self.pos_embedding(positions)
        return x + pos_embed


In [59]:
class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.attn = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            dropout=dropout
        )

        self.norm2 = tf.keras.layers.LayerNormalization()
        self.mlp = tf.keras.Sequential([
            tf.keras.layers.Dense(mlp_dim, activation="gelu"),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(embed_dim),
            tf.keras.layers.Dropout(dropout)
        ])

    def call(self, x):
        x = x + self.attn(x, x)
        x = self.norm1(x)

        x = x + self.mlp(x)
        x = self.norm2(x)

        return x


In [94]:
def build_small_vit(input_shape=(48, 48, 1), num_classes=5):
    inputs = tf.keras.Input(shape=input_shape)

    # grayscale → RGB
    x = tf.keras.layers.Lambda(lambda t: tf.concat([t, t, t], axis=-1))(inputs)

    # resize to something divisible by 4
    x = tf.keras.layers.Resizing(64, 64)(x)

    # patches
    x = PatchEmbedding(patch_size=4, embed_dim=64)(x)

    # positional embeddings
    x = PositionalEmbedding(max_len=1024, embed_dim=64)(x)

    # transformer blocks
    for _ in range(3):
        x = TransformerEncoder(
            embed_dim=64,
            num_heads=2,
            mlp_dim=128,
            dropout=0.1
        )(x)

    # classifier head
    x = tf.keras.layers.LayerNormalization()(x)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dense(128, activation="gelu")(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)

    return tf.keras.Model(inputs, outputs)


In [95]:
model = build_small_vit()

model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


In [None]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

In [97]:
history_1 = model.fit(
    train_ds.batch(32),
    validation_data=val_ds.batch(32),
    epochs=20,
    callbacks=[callback]
)

Epoch 1/20
[1m 564/1264[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m2:12[0m 190ms/step - accuracy: 0.2938 - loss: 1.5916

KeyboardInterrupt: 

In [88]:
test_loss, test_acc = model.evaluate(test_ds)
print("Final Test Accuracy:", test_acc)

ValueError: Input 0 of layer "conv2d_4" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (224, 224, 1)

In [None]:
model.save('models/small_ViT.keras')

  return saving_lib.save_model(model, filepath)


In [None]:
with open("training_history/history_small_ViT.pkl", "wb") as f:
    pickle.dump(history_1.history, f)

### EfficientNetB0 - lightweight transformer-like CNN

In [80]:
image_size = (96, 96)

def preprocess_ef(image, label):
    image = tf.image.grayscale_to_rgb(image)
    image = tf.image.resize(image, image_size)
    image = image / 255.0
    return image, label

train_ef = train_ds.map(preprocess_ef).shuffle(10000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ef   = val_ds.map(preprocess_ef).batch(32).prefetch(tf.data.AUTOTUNE)
test_ef  = test_ds.map(preprocess_ef).batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:

base = tf.keras.applications.EfficientNetB0(
    include_top=False,
    weights="imagenet",
    input_shape=(96, 96, 3),
    pooling="avg"
)

base.trainable = False

model_ef = tf.keras.Sequential([
    tf.keras.layers.Resizing(96, 96),
    tf.keras.layers.Conv2D(3, (1,1)),  # grayscale --> RGB

    base,

    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation="softmax")
])

model_ef.summary()

In [82]:
model_ef.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [83]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=4,
    restore_best_weights=True
)

In [84]:
history_ef = model_ef.fit(
    train_ef,
    validation_data=val_ef,
    epochs=30,
    callbacks=[callback]
)

Epoch 1/30
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 258ms/step - accuracy: 0.3037 - loss: 1.5790 - val_accuracy: 0.3113 - val_loss: 1.5687
Epoch 2/30
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 375ms/step - accuracy: 0.3088 - loss: 1.5718 - val_accuracy: 0.3113 - val_loss: 1.5675
Epoch 3/30
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m665s[0m 526ms/step - accuracy: 0.3133 - loss: 1.5689 - val_accuracy: 0.3113 - val_loss: 1.5676
Epoch 4/30
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10710s[0m 8s/step - accuracy: 0.3079 - loss: 1.5716 - val_accuracy: 0.3113 - val_loss: 1.5695
Epoch 5/30
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m720s[0m 570ms/step - accuracy: 0.3113 - loss: 1.5704 - val_accuracy: 0.3113 - val_loss: 1.5688
Epoch 6/30
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4621s[0m 4s/step - accuracy: 0.3072 - loss: 1.5719 - val_accuracy: 0.3113 - val_loss: 1.