# E-UAML: Multimodal Kinship Verification (Python Skeleton)

This notebook provides a **skeleton implementation** of the E-UAML framework in Python using PyTorch. The goal is to demonstrate the architecture described in the manuscript, including:
- Modality-specific encoders (face, voice, ear, gait)
- L2-normalized embeddings
- Adversarial alignment (Gradient Reversal Layer + Modality Discriminator)
- Multi-head modality attention
- Transformer-based fusion
- Composite loss (contrastive + alignment + attention regularization)

**Note:** For simplicity, lightweight placeholder models are used. Replace them with real pretrained models (ResNet, VGG, TimeSformer, etc.) for practical training.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Output width of each modality encoder.
D_FACE, D_VOICE, D_EAR, D_GAIT = 512, 2048, 1024, 512
# Common width that every modality is projected to before attention/fusion.
D_FUSED = 1024


## 1. Modality-Specific Encoders
Each modality is encoded into a latent embedding. In the paper, these are:
- Face → ResNet-34
- Voice → ResNet-50
- Ear → VGG-16
- Gait → TimeSformer

In this notebook, simplified placeholder models are used to demonstrate the architecture.

In [None]:
class SimpleMLPEncoder(nn.Module):
    """Small MLP used as a placeholder encoder for vector-like modalities (voice, gait).

    Args:
        input_dim: number of features in each input vector.
        output_dim: number of features in the produced embedding.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_width = 512
        stages = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Map ``x`` (last dimension ``input_dim``) to an ``output_dim`` embedding."""
        embedding = self.net(x)
        return embedding

class SimpleConvEncoder(nn.Module):
    """Lightweight Conv encoder for image modalities (face, ear).

    Two 3x3 convolutions (32 then 64 channels) with ReLU, a 2x2 max-pool
    between them, and adaptive average pooling down to 1x1, followed by a
    linear layer that maps the 64 pooled features to the embedding.

    Args:
        in_channels: number of channels of the input images.
        output_dim: number of features in the produced embedding.
    """

    def __init__(self, in_channels, output_dim):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.fc = nn.Linear(64, output_dim)

    def forward(self, x):
        """Encode a batch of images ``x`` into embeddings of width ``output_dim``."""
        # flatten(1) rather than view(batch, -1): with a zero-sized batch the
        # -1 cannot be inferred and view() raises, whereas flatten keeps the
        # (0, 64) shape and lets an empty batch pass through.
        pooled = self.features(x).flatten(1)
        return self.fc(pooled)

class ModalityEncoders(nn.Module):
    """Container holding one placeholder encoder per biometric modality."""

    def __init__(self):
        super().__init__()
        # Image-like inputs: 3-channel face and 1-channel ear.
        self.face_encoder = SimpleConvEncoder(in_channels=3, output_dim=D_FACE)
        self.ear_encoder = SimpleConvEncoder(in_channels=1, output_dim=D_EAR)
        # Vector-like inputs: 128-feature voice and gait descriptors.
        self.voice_encoder = SimpleMLPEncoder(input_dim=128, output_dim=D_VOICE)
        self.gait_encoder = SimpleMLPEncoder(input_dim=128, output_dim=D_GAIT)

    def forward(self, xf, xv, xa, xb):
        """Encode face (``xf``), voice (``xv``), ear (``xa``) and gait (``xb``).

        Returns:
            A 4-tuple of embeddings in the order (face, voice, ear, gait).
        """
        face = self.face_encoder(xf)
        voice = self.voice_encoder(xv)
        ear = self.ear_encoder(xa)
        gait = self.gait_encoder(xb)
        return face, voice, ear, gait


## 2. Adversarial Alignment
To enforce modality-invariant embeddings, we use:
- Gradient Reversal Layer (GRL)
- Multi-class modality discriminator

This aligns the distributions of embeddings from different modalities.

In [None]:
class GradientReversalFn(torch.autograd.Function):
    """Autograd function that copies its input forward and flips the gradient backward."""

    @staticmethod
    def forward(ctx, x, λ):
        # Stash the scale on the context so backward() can read it.
        ctx.λ = λ
        return x.clone()

    @staticmethod
    def backward(ctx, g):
        # Negate and scale the incoming gradient; the scale argument itself
        # receives no gradient, hence the trailing None.
        flipped = g.neg() * ctx.λ
        return flipped, None

def grad_reverse(x, λ=1):
    """Return a copy of ``x`` whose gradient is negated and scaled by ``λ`` on the backward pass."""
    reverse = GradientReversalFn.apply
    return reverse(x, λ)

class ModalityDiscriminator(nn.Module):
    """Two-layer MLP producing one score per modality for each input embedding.

    Args:
        dim: width of the input embeddings.
        num_mods: number of modality classes to score (default 4).
    """

    def __init__(self, dim, num_mods=4):
        super().__init__()
        hidden_width = 256
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, num_mods),
        )

    def forward(self, x):
        """Return unnormalized modality scores with last dimension ``num_mods``."""
        scores = self.net(x)
        return scores


## 3. Multi-Head Modality Attention
This module learns how much each modality should contribute to the final fused representation.

In [None]:
class ModalityAttention(nn.Module):
    """Self-attention block applied across a sequence of modality tokens.

    Multi-head self-attention and a 4x-wide feed-forward network, each
    followed by a residual connection and a LayerNorm.

    Args:
        d_model: token width.
        heads: number of attention heads (default 4).
    """

    def __init__(self, d_model, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=d_model, num_heads=heads, batch_first=True)
        ffn_width = d_model * 4
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ffn_width),
            nn.ReLU(),
            nn.Linear(ffn_width, d_model),
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Attend over the tokens of ``x`` (batch-first).

        Returns:
            ``(tokens, weights)``: the refined tokens, same shape as ``x``, and
            the attention weights returned by ``nn.MultiheadAttention`` with its
            default settings (averaged over heads).
        """
        attended, weights = self.attn(query=x, key=x, value=x)
        after_attn = self.ln1(x + attended)
        refined = self.ln2(after_attn + self.ffn(after_attn))
        return refined, weights


## 4. Transformer Fusion Layer
A lightweight Transformer Encoder integrates the attended embeddings of two individuals (X and Y).

In [None]:
class FusionTransformer(nn.Module):
    """Transformer encoder run over the two-token sequence (person X, person Y).

    Args:
        d: token width (defaults to ``D_FUSED``).
        heads: attention heads per encoder layer.
        layers: number of stacked encoder layers.
    """

    def __init__(self, d=D_FUSED, heads=8, layers=2):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d,
            nhead=heads,
            dim_feedforward=d * 4,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=layers)

    def forward(self, zx, zy):
        """Fuse the two per-person vectors and return the updated pair."""
        pair = torch.stack((zx, zy), dim=1)  # (B, 2, D)
        fused = self.encoder(pair)
        fused_x, fused_y = fused.unbind(dim=1)
        return fused_x, fused_y


## 5. Complete E-UAML Model
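This integrates the encoders, projection layers, modality attention, and fusion into a full Siamese-style architecture: the same weights process both individuals (X and Y). The loss functions are defined separately in Section 6.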

In [None]:
def l2_norm(x, eps=1e-6):
    """Divide ``x`` by its Euclidean norm along the last dimension.

    ``eps`` is added to the norm before dividing, so an all-zero vector maps
    to zeros instead of producing a division by zero.
    """
    length = x.norm(p=2, dim=-1, keepdim=True)
    return x / (length + eps)

class EUAML(nn.Module):
    """Siamese-style model: encode, project, attend and fuse two individuals.

    The same encoders, projections and attention block are applied to person X
    and person Y; the fusion transformer then processes the pair jointly.
    """

    def __init__(self):
        super().__init__()
        self.enc = ModalityEncoders()
        self.attn = ModalityAttention(D_FUSED)
        self.fuse = FusionTransformer()
        # One linear map per modality, bringing each embedding to D_FUSED so
        # the four modalities can be stacked as tokens of equal width.
        self.pf = nn.Linear(D_FACE, D_FUSED)
        self.pv = nn.Linear(D_VOICE, D_FUSED)
        self.pa = nn.Linear(D_EAR, D_FUSED)
        self.pg = nn.Linear(D_GAIT, D_FUSED)

    def encode(self, xf, xv, xa, xb):
        """Encode the four modalities of one person and L2-normalize each embedding."""
        raw_embeddings = self.enc(xf, xv, xa, xb)
        return tuple(l2_norm(e) for e in raw_embeddings)

    def _project(self, embeddings):
        """Project (face, voice, ear, gait) embeddings and stack them along dim 1."""
        projections = (self.pf, self.pv, self.pa, self.pg)
        tokens = [proj(e) for proj, e in zip(projections, embeddings)]
        return torch.stack(tokens, dim=1)

    def forward(self, batch):
        """Run the full pipeline on a dict of inputs for persons X and Y.

        ``batch`` is read at keys ``'fX'``, ``'vX'``, ``'aX'``, ``'bX'`` and the
        matching ``'...Y'`` keys (face, voice, ear, gait).

        Returns:
            ``(fX, fY, ax, ay)``: the fused representations of X and Y and the
            attention weights produced for each of them.
        """
        embeds_x = self.encode(batch['fX'], batch['vX'], batch['aX'], batch['bX'])
        embeds_y = self.encode(batch['fY'], batch['vY'], batch['aY'], batch['bY'])

        tokens_x = self._project(embeds_x)
        tokens_y = self._project(embeds_y)

        attended_x, weights_x = self.attn(tokens_x)
        attended_y, weights_y = self.attn(tokens_y)

        # Average the modality tokens into one vector per person before fusion.
        fused_x, fused_y = self.fuse(attended_x.mean(dim=1), attended_y.mean(dim=1))
        return fused_x, fused_y, weights_x, weights_y


## 6. Loss Functions
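The contrastive loss and the attention regularizer are implemented here. The adversarial alignment term listed in the overview is not part of the loss in this skeleton: the gradient reversal layer and modality discriminator from Section 2 are defined but not used in the training step below.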

In [None]:
def contrastive_loss(x, y, labels, margin=1):
    """Contrastive loss on cosine distance, averaged over the batch.

    The distance is ``1 - cosine_similarity(x, y)``. Pairs with label 1 are
    penalized by the squared distance; pairs with label 0 are penalized by the
    squared amount by which the distance falls short of ``margin``.
    """
    distance = 1 - F.cosine_similarity(x, y)
    shortfall = F.relu(margin - distance)
    matched_term = labels * distance ** 2
    unmatched_term = (1 - labels) * shortfall ** 2
    return (matched_term + unmatched_term).mean()

def attn_reg(w):
    """Penalize attention that deviates from a uniform spread over modalities.

    Args:
        w: attention weights, either head-averaged with shape (B, M, M) — what
            ``nn.MultiheadAttention`` returns by default — or per-head with
            shape (B, H, M, M).

    Returns:
        Scalar MSE between the attention each modality receives (averaged over
        heads, if present, and over queries) and the uniform value ``1 / M``.

    Raises:
        ValueError: if ``w`` is neither 3-D nor 4-D.
    """
    if w.dim() == 4:
        # Per-head weights: average the head axis first -> (B, M, M).
        w = w.mean(1)
    if w.dim() != 3:
        raise ValueError(
            f"attn_reg expects (B, M, M) or (B, H, M, M) weights, got shape {tuple(w.shape)}"
        )
    # Average over queries: how much attention each modality receives -> (B, M).
    # Averaging twice on already head-averaged weights would collapse this to
    # (B,) and make the size(1) lookup below fail.
    w = w.mean(1)
    M = w.size(1)
    t = torch.full_like(w, 1 / M)
    return F.mse_loss(w, t)


## 7. Example Training Step (Dummy Data)
Random data is used only to demonstrate the forward/backward flow.

In [None]:
model = EUAML().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)

B = 4


def _dummy_image(channels):
    """Random image batch of shape (B, channels, 64, 64) moved to ``device``."""
    return torch.randn(B, channels, 64, 64).to(device)


def _dummy_vector():
    """Random feature batch of shape (B, 128) moved to ``device``."""
    return torch.randn(B, 128).to(device)


# Entries are evaluated top to bottom: face, voice, ear, gait for person X,
# then the same for person Y, then the binary pair labels.
batch = {
    'fX': _dummy_image(3),
    'vX': _dummy_vector(),
    'aX': _dummy_image(1),
    'bX': _dummy_vector(),
    'fY': _dummy_image(3),
    'vY': _dummy_vector(),
    'aY': _dummy_image(1),
    'bY': _dummy_vector(),
    'labels': torch.randint(0, 2, (B,)).float().to(device),
}

# One optimization step: forward, composite loss, backward, parameter update.
opt.zero_grad()
fX, fY, aX, aY = model(batch)
pair_term = contrastive_loss(fX, fY, batch['labels'])
attention_term = attn_reg(aX) + attn_reg(aY)
L = pair_term + 0.01 * attention_term
L.backward()
opt.step()
print('Loss:', L.item())
