In [1]:
import json

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import (
    GoogLeNet_Weights,
    Inception_V3_Weights,
    ResNet50_Weights,
    googlenet,
    inception_v3,
    resnet50,
)
from tqdm import tqdm

In [2]:
# --- Experiment configuration ------------------------------------------------
# Annotation file handed to EmotionImageDataset for every split.
json_path = "final_annotations.json"

# Optimisation hyper-parameters shared by all backbones.
batch_size, epochs = 32, 10
lr, weight_decay = 1e-4, 1e-2

# Width of the classifier's output layer (one score per class).
num_classes = 7

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class EmotionImageDataset(Dataset):
    """Dataset yielding ``(image, face_embedding, pose_embedding, label)`` tuples.

    The annotation file is parsed as JSON and indexed by ``split``; the value
    stored under that key is kept as the list of samples. Each sample is
    expected to provide the keys ``image_path``, ``face_embedding``,
    ``pose_embedding`` and ``multi_hot``.

    Args:
        json_path: Path to the annotation JSON file.
        split: Top-level key of the JSON object to use (e.g. ``'train'``).
        img_size: Images are resized to ``(img_size, img_size)``.

    Raises:
        KeyError: If ``split`` is not a key of the annotation file.
    """

    def __init__(self, json_path, split='train', img_size=224):
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.samples = data[split]
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485,0.456,0.406],
                                 [0.229,0.224,0.225]),
        ])

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(img, face, pose, label)``.

        ``img`` is the transformed RGB image; ``face``, ``pose`` and ``label``
        are float32 tensors built from the sample's ``face_embedding``,
        ``pose_embedding`` and ``multi_hot`` entries.
        """
        item = self.samples[idx]
        # Image.open() keeps the underlying file open until the image is
        # closed. convert() decodes the pixels into a new image, so the file
        # can be released immediately instead of waiting for garbage collection.
        with Image.open(item['image_path']) as raw:
            img = raw.convert('RGB')
        img = self.transform(img)
        face = torch.tensor(item['face_embedding'], dtype=torch.float32)
        pose = torch.tensor(item['pose_embedding'], dtype=torch.float32)
        label = torch.tensor(item['multi_hot'], dtype=torch.float32)
        return img, face, pose, label

In [4]:
def get_backbone(name):
    """Build an ImageNet-pretrained torchvision backbone without its classifier.

    The model's final ``fc`` layer is replaced by ``nn.Identity`` so that the
    backbone outputs the features that used to feed ``fc``.

    Args:
        name: One of ``'resnet50'``, ``'inception_v3'`` or ``'googlenet'``.

    Returns:
        ``(model, feat_dim)`` where ``feat_dim`` is the ``in_features`` of the
        removed ``fc`` layer.

    Raises:
        ValueError: If ``name`` is not a supported backbone.
    """
    if name == 'resnet50':
        # Explicit weight enum instead of the deprecated ``pretrained=True``;
        # IMAGENET1K_V1 is the checkpoint the legacy flag resolved to.
        model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        feat_dim = model.fc.in_features
        model.fc = nn.Identity()
        return model, feat_dim

    if name == 'inception_v3':
        model = inception_v3(weights=Inception_V3_Weights.IMAGENET1K_V1)
        feat_dim = model.fc.in_features
        model.fc = nn.Identity()
        # Disable the auxiliary classifier so forward() returns a plain tensor
        # in training mode as well.
        model.aux_logits = False
        model.AuxLogits = nn.Identity()
        return model, feat_dim

    if name == 'googlenet':
        weights = GoogLeNet_Weights.IMAGENET1K_V1
        model = googlenet(weights=weights, aux_logits=True)
        # NOTE(review): the aux1/aux2 heads are still constructed because of
        # aux_logits=True; only their use in the output is switched off here.
        # Dropping them would change the checkpoint's state_dict keys, so that
        # is left as-is; confirm whether they are wanted.
        model.aux_logits = False
        feat_dim = model.fc.in_features
        model.fc = nn.Identity()
        return model, feat_dim

    raise ValueError(f"Unknown backbone: {name}")

In [5]:
class MultiModalNet(nn.Module):
    """Late-fusion classifier over an image, a face embedding and a pose embedding.

    The image goes through a backbone from ``get_backbone``; the face and pose
    vectors are each linearly projected to ``hidden_dim``. The three feature
    vectors are concatenated along dim 1 and passed to an MLP head that ends
    in a sigmoid, so the output holds ``num_classes`` values in ``[0, 1]``
    per sample.

    Args:
        backbone_name: Name forwarded to ``get_backbone``.
        face_dim: Size of the face embedding.
        pose_dim: Size of the pose embedding.
        hidden_dim: Width of both projections and of the head's hidden layer.
    """

    def __init__(self, backbone_name, face_dim=512, pose_dim=34, hidden_dim=256):
        super().__init__()
        # Submodule creation order matters: it fixes both the state_dict key
        # order and the order in which random initialisation is drawn.
        self.backbone, img_feat = get_backbone(backbone_name)
        self.face_proj = nn.Linear(in_features=face_dim, out_features=hidden_dim)
        self.pose_proj = nn.Linear(in_features=pose_dim, out_features=hidden_dim)

        fused_dim = img_feat + hidden_dim*2
        head_layers = [
            nn.Linear(fused_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden_dim, num_classes),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, img, face, pose):
        """Return the classifier output for one batch of the three inputs."""
        branches = (
            self.backbone(img),      # image features
            self.face_proj(face),    # projected face embedding
            self.pose_proj(pose),    # projected pose embedding
        )
        fused = torch.cat(branches, dim=1)
        scores = self.classifier(fused)
        return scores

In [6]:
def compute_metrics(y_true, y_pred, thr=0.5):
    """Binarise scores at ``thr`` and compute classification metrics.

    Args:
        y_true: Ground-truth labels, passed unchanged to the sklearn scorers.
        y_pred: Array of predicted scores; entries strictly greater than
            ``thr`` become 1, all others 0.
        thr: Decision threshold.

    Returns:
        Dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``. Precision, recall and F1 are micro-averaged with
        ``zero_division=0``; accuracy is the plain ``accuracy_score``.
    """
    hard_preds = (y_pred > thr).astype(int)
    micro_kwargs = {'average': 'micro', 'zero_division': 0}

    metrics = {'accuracy': accuracy_score(y_true, hard_preds)}
    micro_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in micro_scorers:
        metrics[key] = scorer(y_true, hard_preds, **micro_kwargs)
    return metrics

def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(img, face, pose)``.
        dataloader: Iterable of ``(img, face, pose, label)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(out, label)``. It is assumed to
            return the mean over the batch (the default reduction of
            ``nn.BCELoss``); the epoch loss is weighted accordingly.

    Returns:
        ``(epoch_loss, metrics)`` where ``epoch_loss`` is the per-sample mean
        loss over the epoch and ``metrics`` is the ``compute_metrics`` dict for
        all batches stacked together. The metrics are computed from outputs
        produced in training mode, while the weights were still being updated.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    loss_sum, n_seen = 0.0, 0
    Ys, Ps = [], []
    for img, face, pose, label in tqdm(dataloader, desc="Train", leave=False):
        img, face, pose, label = img.to(device), face.to(device), pose.to(device), label.to(device)
        optimizer.zero_grad()
        out = model(img, face, pose)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size: a plain mean of per-batch losses
        # over-weights a short final batch.
        n_batch = label.size(0)
        loss_sum += loss.item() * n_batch
        n_seen += n_batch
        Ys.append(label.detach().cpu().numpy())
        Ps.append(out.detach().cpu().numpy())
    if n_seen == 0:
        raise ValueError("train_epoch received an empty dataloader")
    y_true = np.vstack(Ys)
    y_pred = np.vstack(Ps)
    return loss_sum / n_seen, compute_metrics(y_true, y_pred)

def eval_epoch(model, dataloader, criterion, desc="Val"):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Module called as ``model(img, face, pose)``; put in eval mode.
        dataloader: Iterable of ``(img, face, pose, label)`` batches.
        criterion: Loss called as ``criterion(out, label)``. It is assumed to
            return the mean over the batch (the default reduction of
            ``nn.BCELoss``); the epoch loss is weighted accordingly.
        desc: Label shown on the progress bar.

    Returns:
        ``(epoch_loss, metrics)`` where ``epoch_loss`` is the per-sample mean
        loss and ``metrics`` is the ``compute_metrics`` dict for all batches
        stacked together.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    loss_sum, n_seen = 0.0, 0
    Ys, Ps = [], []
    with torch.no_grad():
        for img, face, pose, label in tqdm(dataloader, desc=desc, leave=False):
            img, face, pose, label = img.to(device), face.to(device), pose.to(device), label.to(device)
            out = model(img, face, pose)
            # Weight each batch by its size: a plain mean of per-batch losses
            # over-weights a short final batch.
            n_batch = label.size(0)
            loss_sum += criterion(out, label).item() * n_batch
            n_seen += n_batch
            Ys.append(label.cpu().numpy())
            Ps.append(out.cpu().numpy())
    if n_seen == 0:
        raise ValueError("eval_epoch received an empty dataloader")
    y_true = np.vstack(Ys)
    y_pred = np.vstack(Ps)
    return loss_sum / n_seen, compute_metrics(y_true, y_pred)

In [7]:
# One dataset per split, all read from the same annotation file and resized
# to 224x224.
train_ds, val_ds, test_ds = (
    EmotionImageDataset(json_path, split=split_name, img_size=224)
    for split_name in ('train', 'val', 'test')
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader, val_loader, test_loader = (
    DataLoader(split_ds, batch_size=batch_size, shuffle=do_shuffle)
    for split_ds, do_shuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
)

In [None]:
def _fmt_metrics(loss, metrics):
    """Render a loss value and a compute_metrics() dict as one log fragment."""
    return (f"loss: {loss:.4f}, acc: {metrics['accuracy']:.4f}, "
            f"prec: {metrics['precision']:.4f}, rec: {metrics['recall']:.4f}, "
            f"f1: {metrics['f1']:.4f}")


# Train and evaluate one model per backbone. For each backbone the weights
# with the lowest validation loss are checkpointed and then scored on the
# test split.
for backbone_name in ('resnet50','inception_v3','googlenet'):
    print(f"\n Backbone: {backbone_name.upper()}")
    model     = MultiModalNet(backbone_name).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCELoss()
    # `verbose=True` is deprecated on LR schedulers; learning-rate changes are
    # logged explicitly after each scheduler step instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5,
                                                     patience=2)
    ckpt_path = f"best_{backbone_name}.pt"

    best_val_loss = float('inf')
    for epoch in range(1, epochs+1):
        print(f"Epoch {epoch}/{epochs}")
        tr_loss, tr_metrics = train_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_metrics = eval_epoch(model, val_loader, criterion, desc="Val")
        print(f"  Train → {_fmt_metrics(tr_loss, tr_metrics)}")
        print(f"  Val   → {_fmt_metrics(val_loss, val_metrics)}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), ckpt_path)
            print("Saved best model.")

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f"  LR reduced: {lr_before:.4e} → {lr_after:.4e}")

    # Restore the best checkpoint onto the active device before testing, so a
    # checkpoint written on one device type loads on another.
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    test_loss, test_metrics = eval_epoch(model, test_loader, criterion, desc="Test")
    print(f"→ Test → {_fmt_metrics(test_loss, test_metrics)}")


 Backbone: RESNET50




Epoch 1/10


                                                        

  Train → loss: 0.4470, acc: 0.4613, prec: 0.7048, rec: 0.6197, f1: 0.6595
  Val   → loss: 0.2761, acc: 0.5633, prec: 0.8127, rec: 0.6142, f1: 0.6997
Saved best model.
Epoch 2/10


                                                        

  Train → loss: 0.2875, acc: 0.5409, prec: 0.7812, rec: 0.6319, f1: 0.6987
  Val   → loss: 0.2658, acc: 0.5656, prec: 0.8137, rec: 0.6170, f1: 0.7018
Saved best model.
Epoch 3/10


                                                        

  Train → loss: 0.2740, acc: 0.5571, prec: 0.7943, rec: 0.6361, f1: 0.7064
  Val   → loss: 0.2666, acc: 0.5647, prec: 0.8021, rec: 0.6335, f1: 0.7079
Epoch 4/10


                                                        

  Train → loss: 0.2570, acc: 0.5634, prec: 0.7952, rec: 0.6448, f1: 0.7121
  Val   → loss: 0.2706, acc: 0.5627, prec: 0.8011, rec: 0.6340, f1: 0.7078
Epoch 5/10


                                                        

  Train → loss: 0.2400, acc: 0.5705, prec: 0.7998, rec: 0.6520, f1: 0.7184
  Val   → loss: 0.2818, acc: 0.5576, prec: 0.7941, rec: 0.6361, f1: 0.7064
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.
Epoch 6/10


                                                        

  Train → loss: 0.2094, acc: 0.5838, prec: 0.8035, rec: 0.6777, f1: 0.7353
  Val   → loss: 0.3036, acc: 0.5450, prec: 0.7760, rec: 0.6518, f1: 0.7085
Epoch 7/10


                                                        

  Train → loss: 0.1893, acc: 0.6060, prec: 0.8185, rec: 0.7116, f1: 0.7613
  Val   → loss: 0.3111, acc: 0.5356, prec: 0.7722, rec: 0.6488, f1: 0.7052
Epoch 8/10


                                                        

  Train → loss: 0.1735, acc: 0.6378, prec: 0.8328, rec: 0.7426, f1: 0.7851
  Val   → loss: 0.3383, acc: 0.5156, prec: 0.7491, rec: 0.6613, f1: 0.7025
Epoch 00008: reducing learning rate of group 0 to 2.5000e-05.
Epoch 9/10


                                                        

  Train → loss: 0.1552, acc: 0.6654, prec: 0.8437, rec: 0.7662, f1: 0.8031
  Val   → loss: 0.3725, acc: 0.5010, prec: 0.7384, rec: 0.6557, f1: 0.6946
Epoch 10/10


                                                        

  Train → loss: 0.1441, acc: 0.6858, prec: 0.8536, rec: 0.7864, f1: 0.8186
  Val   → loss: 0.3814, acc: 0.4844, prec: 0.7215, rec: 0.6570, f1: 0.6878




→ Test → loss: 0.2625, acc: 0.5672, prec: 0.8260, rec: 0.6237, f1: 0.7107

 Backbone: INCEPTION_V3
Epoch 1/10


                                                        

  Train → loss: 0.4555, acc: 0.4726, prec: 0.7101, rec: 0.6190, f1: 0.6614
  Val   → loss: 0.2726, acc: 0.5636, prec: 0.8130, rec: 0.6144, f1: 0.6999
Saved best model.
Epoch 2/10


                                                        

  Train → loss: 0.2805, acc: 0.5526, prec: 0.7910, rec: 0.6305, f1: 0.7017
  Val   → loss: 0.2709, acc: 0.5607, prec: 0.8040, rec: 0.6237, f1: 0.7024
Saved best model.
Epoch 3/10


                                                        

  Train → loss: 0.2629, acc: 0.5637, prec: 0.7987, rec: 0.6336, f1: 0.7067
  Val   → loss: 0.2840, acc: 0.5650, prec: 0.8027, rec: 0.6335, f1: 0.7082
Epoch 4/10


                                                        

  Train → loss: 0.2449, acc: 0.5653, prec: 0.7980, rec: 0.6406, f1: 0.7107
  Val   → loss: 0.2805, acc: 0.5616, prec: 0.8008, rec: 0.6327, f1: 0.7069
Epoch 5/10


                                                        

  Train → loss: 0.2290, acc: 0.5772, prec: 0.8009, rec: 0.6510, f1: 0.7182
  Val   → loss: 0.2858, acc: 0.5530, prec: 0.7836, rec: 0.6477, f1: 0.7092
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.
Epoch 6/10


                                                        

  Train → loss: 0.2064, acc: 0.5938, prec: 0.8053, rec: 0.6658, f1: 0.7290
  Val   → loss: 0.3076, acc: 0.5584, prec: 0.7894, rec: 0.6441, f1: 0.7094
Epoch 7/10


                                                        

  Train → loss: 0.1915, acc: 0.6103, prec: 0.8100, rec: 0.6827, f1: 0.7409
  Val   → loss: 0.3279, acc: 0.5301, prec: 0.7512, rec: 0.6714, f1: 0.7091
Epoch 8/10


                                                        

  Train → loss: 0.1807, acc: 0.6270, prec: 0.8136, rec: 0.7013, f1: 0.7533
  Val   → loss: 0.3429, acc: 0.5147, prec: 0.7442, rec: 0.6682, f1: 0.7041
Epoch 00008: reducing learning rate of group 0 to 2.5000e-05.
Epoch 9/10


                                                        

  Train → loss: 0.1662, acc: 0.6388, prec: 0.8180, rec: 0.7234, f1: 0.7678
  Val   → loss: 0.3554, acc: 0.5298, prec: 0.7607, rec: 0.6514, f1: 0.7018
Epoch 10/10


                                                        

  Train → loss: 0.1571, acc: 0.6523, prec: 0.8192, rec: 0.7518, f1: 0.7840
  Val   → loss: 0.3842, acc: 0.5013, prec: 0.7281, rec: 0.6761, f1: 0.7012




→ Test → loss: 0.2652, acc: 0.5680, prec: 0.8159, rec: 0.6339, f1: 0.7134

 Backbone: GOOGLENET
Epoch 1/10


                                                        

  Train → loss: 0.6413, acc: 0.4407, prec: 0.6784, rec: 0.6074, f1: 0.6409
  Val   → loss: 0.2766, acc: 0.5630, prec: 0.8144, rec: 0.6133, f1: 0.6997
Saved best model.
Epoch 2/10


                                                        

  Train → loss: 0.2863, acc: 0.5479, prec: 0.7881, rec: 0.6271, f1: 0.6984
  Val   → loss: 0.2658, acc: 0.5636, prec: 0.8120, rec: 0.6168, f1: 0.7011
Saved best model.
Epoch 3/10


                                                        

  Train → loss: 0.2598, acc: 0.5608, prec: 0.7980, rec: 0.6318, f1: 0.7053
  Val   → loss: 0.2723, acc: 0.5627, prec: 0.8091, rec: 0.6200, f1: 0.7021
Epoch 4/10


                                                        

  Train → loss: 0.2419, acc: 0.5673, prec: 0.8021, rec: 0.6345, f1: 0.7085
  Val   → loss: 0.2775, acc: 0.5624, prec: 0.8020, rec: 0.6297, f1: 0.7055
Epoch 5/10


                                                        

  Train → loss: 0.2259, acc: 0.5720, prec: 0.8020, rec: 0.6415, f1: 0.7128
  Val   → loss: 0.2901, acc: 0.5658, prec: 0.8070, rec: 0.6277, f1: 0.7062
Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.
Epoch 6/10


                                                        

  Train → loss: 0.2064, acc: 0.5822, prec: 0.8058, rec: 0.6493, f1: 0.7191
  Val   → loss: 0.2877, acc: 0.5567, prec: 0.7893, rec: 0.6443, f1: 0.7094
Epoch 7/10


                                                        

  Train → loss: 0.1957, acc: 0.5918, prec: 0.8068, rec: 0.6599, f1: 0.7260
  Val   → loss: 0.3001, acc: 0.5598, prec: 0.7940, rec: 0.6389, f1: 0.7081
Epoch 8/10


                                                        

  Train → loss: 0.1899, acc: 0.6022, prec: 0.8078, rec: 0.6720, f1: 0.7337
  Val   → loss: 0.3247, acc: 0.5590, prec: 0.7936, rec: 0.6424, f1: 0.7100
Epoch 00008: reducing learning rate of group 0 to 2.5000e-05.
Epoch 9/10


                                                        

  Train → loss: 0.1788, acc: 0.6193, prec: 0.8117, rec: 0.6904, f1: 0.7462
  Val   → loss: 0.3238, acc: 0.5541, prec: 0.7842, rec: 0.6480, f1: 0.7096
Epoch 10/10


                                                        

  Train → loss: 0.1738, acc: 0.6255, prec: 0.8135, rec: 0.6979, f1: 0.7512
  Val   → loss: 0.3491, acc: 0.5444, prec: 0.7760, rec: 0.6510, f1: 0.7080


Test:  48%|████▊     | 53/111 [00:14<00:13,  4.35it/s]