In [3]:
import os
import json
from PIL import Image
import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder, ImageNet
from torchvision import transforms, models
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
from torch.cuda.amp import GradScaler
import random, shutil

In [4]:
if torch.cuda.is_available():
    print("CUDA is available. PyTorch can access the GPU.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. PyTorch is using the CPU.")


CUDA is available. PyTorch can access the GPU.
Number of GPUs available: 1
Current GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [5]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

train_dataset = ImageFolder(
    root="images/train",
    transform=transform
)

val_dataset = ImageFolder(
    root="images/validation",
    transform=transform
)

In [6]:
class_names = train_dataset.classes
num_classes = len(class_names)
print(class_names)
print(f"Number of classes: {num_classes}") 

['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
Number of classes: 7


In [7]:
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    pin_memory = True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4
)

train_dataset.classes == val_dataset.classes

print(len(train_dataset), len(val_dataset))

28821 7066


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=weights)

for param in model.features.parameters():
    param.requires_grad = False
#freezing the feature extractor

model.classifier[1] = nn.Linear(
    model.last_channel,
    num_classes 
    
    #Linear(1280 â†’ 36)
)

model = model.to(device)

In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.classifier.parameters(),
    lr=1e-3
)


scaler = GradScaler()

In [11]:
epochs = 25

for epoch in range(epochs):
    # TRAIN
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda'):
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = running_loss / len(train_loader)
    train_acc = 100 * correct / total

    # VALIDATION
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with torch.amp.autocast('cuda'):
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100 * val_correct / val_total

    print(
        f"Epoch [{epoch+1}/{epochs}] "
        f"- Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}% "
        f"- Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%"
    )


Epoch [1/25] - Train Loss: 1.6483, Train Acc: 35.42% - Val Loss: 1.5586, Val Acc: 40.11%
Epoch [2/25] - Train Loss: 1.5402, Train Acc: 40.50% - Val Loss: 1.5231, Val Acc: 41.88%
Epoch [3/25] - Train Loss: 1.5091, Train Acc: 41.85% - Val Loss: 1.5123, Val Acc: 41.92%
Epoch [4/25] - Train Loss: 1.4894, Train Acc: 42.57% - Val Loss: 1.4965, Val Acc: 42.41%
Epoch [5/25] - Train Loss: 1.4760, Train Acc: 43.08% - Val Loss: 1.4946, Val Acc: 42.56%
Epoch [6/25] - Train Loss: 1.4680, Train Acc: 43.46% - Val Loss: 1.4938, Val Acc: 42.60%
Epoch [7/25] - Train Loss: 1.4655, Train Acc: 43.45% - Val Loss: 1.4820, Val Acc: 43.46%
Epoch [8/25] - Train Loss: 1.4559, Train Acc: 44.10% - Val Loss: 1.4880, Val Acc: 42.43%
Epoch [9/25] - Train Loss: 1.4524, Train Acc: 44.28% - Val Loss: 1.4806, Val Acc: 42.95%
Epoch [10/25] - Train Loss: 1.4493, Train Acc: 44.27% - Val Loss: 1.4793, Val Acc: 43.52%
Epoch [11/25] - Train Loss: 1.4465, Train Acc: 44.60% - Val Loss: 1.4847, Val Acc: 42.58%
Epoch [12/25] - Tra

In [9]:
torch.save(model.state_dict(), "mobilnet_emotion.pth")
print("Model saved.")

Model saved.


In [9]:
import json

with open("class_names.json", "w") as f:
    json.dump(class_names, f)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=None)  # no pretrained at inference

model.classifier[1] = torch.nn.Linear(
    model.last_channel,
    len(class_names)
)

model.load_state_dict(torch.load("mobilnet_emotion.pth", map_location=device))
model.to(device)
model.eval()

transform = weights.transforms()

In [11]:
def preprocess(img):
    if img is None or img.size == 0:
        return None

    img = cv2.resize(img, (224, 224))

    # Convert to GRAYSCALE first (to match training)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Convert grayscale to 3 channels
    img = np.stack([img, img, img], axis=-1)

    img = img.astype("float32") / 255.0

    mean = np.array([0.485, 0.456, 0.406])
    std  = np.array([0.229, 0.224, 0.225])
    img = (img - mean) / std

    img = np.transpose(img, (2, 0, 1))  # CHW
    img = torch.from_numpy(img).unsqueeze(0).float()
    return img


In [12]:
cap = cv2.VideoCapture(0)
mp_face = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils

with mp_face.FaceDetection(model_selection=0, min_detection_confidence=0.6) as face_detection:

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)
        h, w, _ = frame.shape

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_detection.process(rgb)

        if results.detections:
            detection = results.detections[0]
            bbox = detection.location_data.relative_bounding_box

            xmin = int(bbox.xmin * w)
            ymin = int(bbox.ymin * h)
            bw = int(bbox.width * w)
            bh = int(bbox.height * h)

            xmax = xmin + bw
            ymax = ymin + bh

            # Make square and add margin
            box_size = max(bw, bh)
            cx = xmin + bw // 2
            cy = ymin + bh // 2

            scale = 1.3
            half = int(box_size * scale / 2)

            xmin = max(0, cx - half)
            ymin = max(0, cy - half)
            xmax = min(w, cx + half)
            ymax = min(h, cy + half)

            face_crop = frame[ymin:ymax, xmin:xmax]

            if face_crop.size != 0:
                face_rgb = cv2.cvtColor(face_crop, cv2.COLOR_BGR2RGB)
                face_pil = Image.fromarray(face_rgb)

                input_tensor = transform(face_pil).unsqueeze(0).to(device)

                with torch.no_grad():
                    output = model(input_tensor)
                    probs = torch.softmax(output, dim=1)
                    conf, pred = torch.max(probs, dim=1)

                label = class_names[pred.item()]
                confidence = conf.item()

                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
                cv2.putText(
                    frame,
                    f"{label} ({confidence:.2f})",
                    (xmin, ymin - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.8,
                    (0, 255, 0),
                    2
                )

                cv2.imshow("FACE_CROP_DEBUG", face_crop)

        cv2.imshow("Emotion Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

In [None]:
print(class_names)

In [None]:
model.classifier[1].out_features == len(class_names)

In [None]:
train_dataset.classes == val_dataset.classes