In [None]:
# Colab installs: third-party packages imported by the cells below.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed against — consider pinning.
!pip install -q timm mediapipe facenet-pytorch librosa xgboost tqdm
# torchvision/torch are usually preinstalled in Colab; if not:
# !pip install -q torch torchvision


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Force-reinstall a specific Pillow build.
# NOTE(review): the recorded pip output for this cell shows pillow 10.2.0 was
# already installed and that facenet-pytorch 2.6.0 requires Pillow>=10.2.0,<10.3.0,
# so this pin introduces a dependency conflict — confirm it is really needed.
!pip install --upgrade --force-reinstall "pillow==10.3.0"


Collecting pillow==10.3.0
  Downloading pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Downloading pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow
  Attempting uninstall: pillow
    Found existing installation: pillow 10.2.0
    Uninstalling pillow-10.2.0:
      Successfully uninstalled pillow-10.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
facenet-pytorch 2.6.0 requires Pillow<10.3.0,>=10.2.0, but you have pillow 10.3.0 which is incompatible.[0m[31m
[0mSuccessfully installed pillow-10.3.0


In [None]:
import os, glob, json, math, numpy as np, cv2, librosa, time
from tqdm import tqdm
import torch, torch.nn as nn
from torchvision import transforms, models
import timm
from facenet_pytorch import InceptionResnetV1  # face embeddings
from scipy.stats import skew, kurtosis, entropy
from scipy.signal import butter, filtfilt, periodogram
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In [None]:
def load_frames_from_dir(frames_dir):
    """Return the ``*.jpg`` and ``*.png`` paths found directly inside ``frames_dir``.

    Paths are returned in natural (numeric-aware) filename order, so frames
    written with unpadded counters such as ``clip_f20.jpg`` / ``clip_f100.jpg``
    keep their temporal order. A plain lexicographic sort would put ``f100``
    before ``f20``. Zero-padded names sort exactly as before.

    Returns an empty list when the directory is missing or holds no images.
    """
    import re  # local import: only this helper needs it

    def _natural_key(path):
        # re.split with a capturing group alternates text / digit-run tokens, so
        # odd positions are always digit runs and list comparison never mixes types.
        tokens = re.split(r"(\d+)", os.path.basename(path))
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)], path

    found = glob.glob(os.path.join(frames_dir, "*.jpg")) + glob.glob(os.path.join(frames_dir, "*.png"))
    return sorted(found, key=_natural_key)

def sample_frames(paths, n=16):
    """Pick at most ``n`` entries of ``paths`` at evenly spaced positions.

    Sequences with ``n`` or fewer entries are returned unchanged; an empty
    input yields an empty list.
    """
    total = len(paths)
    if total == 0:
        return []
    if total <= n:
        return paths
    stride = total / n
    # int() truncation maps each of the n slots onto a source index.
    return [paths[int(slot * stride)] for slot in range(n)]

def read_img(path):
    """Read an image file with OpenCV and return it in RGB channel order.

    Returns None when ``cv2.imread`` cannot load the file.
    """
    bgr = cv2.imread(path)
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


In [None]:
import mediapipe as mp
# Module-level MediaPipe FaceMesh instance reused by detect_face_and_landmarks:
# static-image mode, at most one face, refined landmarks enabled.
mp_face = mp.solutions.face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True)
def detect_face_and_landmarks(rgb_img):
    """Run FaceMesh on an RGB frame and return ``(face_crop, landmarks)``.

    ``landmarks`` is a list of ``(x_px, y_px, z)`` tuples for the first detected
    face, with x/y scaled to pixel coordinates of ``rgb_img``. ``face_crop`` is a
    copy of the landmark bounding box padded by 8 px and clipped to the frame.

    Returns ``(None, None)`` when no face is found, or when the clipped box is
    empty (mesh lying outside the frame). Previously such a box produced either
    an empty crop, which crashes ``cv2.resize`` downstream, or a wrong crop,
    because a negative upper bound wraps around in NumPy slicing.
    """
    h, w, _ = rgb_img.shape
    results = mp_face.process(rgb_img)
    if not results.multi_face_landmarks:
        return None, None
    lm = results.multi_face_landmarks[0]
    pts = [(int(p.x * w), int(p.y * h), float(p.z)) for p in lm.landmark]
    xs = [p[0] for p in pts]
    ys = [p[1] for p in pts]
    margin = 8
    # Clamp every bound into the frame so slicing can never wrap or overshoot.
    x1 = min(w, max(0, min(xs) - margin))
    y1 = min(h, max(0, min(ys) - margin))
    x2 = max(0, min(w, max(xs) + margin))
    y2 = max(0, min(h, max(ys) + margin))
    if x2 <= x1 or y2 <= y1:
        return None, None
    face_crop = rgb_img[y1:y2, x1:x2].copy()
    return face_crop, pts

def crop_and_align(face_crop, size=224):
    """Resize a face crop to ``size`` x ``size``; a missing crop (None) is passed through.

    Despite the name, no geometric alignment is performed, only a resize.
    """
    if face_crop is not None:
        return cv2.resize(face_crop, (size, size))
    return None


In [None]:
# Run on GPU when one is available, otherwise on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Shared image pipeline: ndarray -> PIL -> 224x224 -> tensor -> per-channel normalisation.
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])
# Fast triage model: MobileNetV3-small whose last classifier layer is replaced
# by a fresh 2-way linear head.
# NOTE(review): the new head is randomly initialised and nothing in the visible
# notebook trains it or loads a checkpoint, so its probabilities are not
# meaningful until fine-tuned — confirm weights are loaded elsewhere.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of `weights=`.
fast_model = models.mobilenet_v3_small(pretrained=True)
in_ft = fast_model.classifier[-1].in_features
fast_model.classifier[-1] = nn.Linear(in_ft, 2)
fast_model = fast_model.to(device).eval()

@torch.no_grad()
def run_fast_model(crops):
    """Score a list of RGB crops with ``fast_model``.

    Returns a 1-D NumPy array holding, for each crop, the softmax probability
    of class index 1 (used as the spoof probability), or an empty array when
    ``crops`` is empty.
    """
    if len(crops) == 0:
        return np.array([])
    tensors = [preprocess(crop) for crop in crops]
    scores = torch.softmax(fast_model(torch.stack(tensors).to(device)), dim=1)
    return scores[:, 1].cpu().numpy()


Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_small-047dcff4.pth
100%|██████████| 9.83M/9.83M [00:00<00:00, 35.8MB/s]


In [None]:
# Heavy backbone: try Xception via timm, else fall back to resnet50d.
# feat_dim is the channel count of the deepest feature map; it sizes the temporal head.
try:
    heavy_backbone = timm.create_model('xception', pretrained=True, features_only=True)
    feat_dim = heavy_backbone.feature_info[-1]['num_chs']
except Exception as e:
    print("Xception not available, using resnet50d fallback:", e)
    heavy_backbone = timm.create_model('resnet50d', pretrained=True, features_only=True)
    feat_dim = heavy_backbone.feature_info[-1]['num_chs']

# Fix: the backbone was left on the CPU in training mode. Batches are moved to
# `device` before being fed to it, so on a GPU runtime the forward pass failed
# with a device mismatch, and normalisation layers did not run in inference mode.
heavy_backbone = heavy_backbone.to(device).eval()

@torch.no_grad()
def extract_backbone_features(batch_tensor):
    """Return globally average-pooled features (B x C) from the deepest backbone stage.

    ``heavy_backbone`` returns a list of feature maps; the last one (B x C x h x w)
    is pooled to 1x1 and flattened.

    Runs under ``torch.no_grad()``: this is an inference-only helper, and without
    it the output carried autograd history, so the caller's ``.cpu().numpy()``
    raised "Can't call numpy() on Tensor that requires grad".
    """
    feats = heavy_backbone(batch_tensor)[-1]  # B x C x h x w
    pooled = torch.nn.functional.adaptive_avg_pool2d(feats, 1).reshape(feats.size(0), -1)
    return pooled  # B x C

class TemporalAggregator(nn.Module):
    """Transformer encoder over per-frame feature vectors with a 2-way linear head.

    A learned positional tensor of shape (1, max_len, feat_dim) is added to the
    input, the sequence passes through ``layers`` encoder layers, the outputs are
    averaged over the time axis and projected to two logits.
    """

    def __init__(self, feat_dim, nhead=4, layers=2, max_len=64):
        super().__init__()
        # Submodules are created in the same order as before so that seeded
        # initialisation yields the same parameters.
        self.pos = nn.Parameter(torch.randn(1, max_len, feat_dim))
        self.tr = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=feat_dim, nhead=nhead, dim_feedforward=feat_dim*4),
            num_layers=layers,
        )
        self.head = nn.Linear(feat_dim, 2)

    def forward(self, x):
        """Map ``x`` of shape (B, T, D) to logits of shape (B, 2)."""
        _, steps, _ = x.shape
        positional = self.pos[:, :steps, :].to(x.device)
        # The encoder is built sequence-first, so swap to (T, B, D).
        seq_first = (x + positional).transpose(0, 1)
        encoded = self.tr(seq_first)
        return self.head(encoded.mean(dim=0))

# Instantiate the temporal head on the selected device, in eval mode.
# NOTE(review): no trained weights are loaded for it in the visible notebook, so
# its logits come from random initialisation — confirm a checkpoint is loaded elsewhere.
temporal_model = TemporalAggregator(feat_dim).to(device).eval()


  model = create_fn(
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/xception-43020ad28.pth" to /root/.cache/torch/hub/checkpoints/xception-43020ad28.pth


In [None]:
# InceptionResnetV1 with VGGFace2 weights, in eval mode on `device`; used by
# face_embedding to produce per-frame face embeddings.
emb_model = InceptionResnetV1(pretrained='vggface2').eval().to(device)

@torch.no_grad()
def face_embedding(img):
    """Embed one RGB uint8 face image (H x W x 3) with ``emb_model``.

    The image is resized to 160x160, converted to a tensor and normalised with
    mean 0.5 / std 0.5 per channel. The embedding of the single-image batch is
    returned as a 1-D NumPy array.
    """
    pil_img = transforms.ToPILImage()(img)
    resized = transforms.Resize((160,160))(pil_img)
    normalised = transforms.Normalize([0.5]*3, [0.5]*3)(transforms.ToTensor()(resized))
    batch = normalised.unsqueeze(0).to(device)
    return emb_model(batch).cpu().numpy()[0]

def embedding_drift_from_list(embs):
    """Return ``(mean, std)`` of the L2 distance between consecutive embeddings.

    Fewer than two embeddings give ``(0.0, 0.0)``.
    """
    if len(embs) < 2:
        return 0.0, 0.0
    stacked = np.vstack(embs)
    step_sizes = np.linalg.norm(np.diff(stacked, axis=0), axis=1)
    return float(step_sizes.mean()), float(step_sizes.std())


  0%|          | 0.00/107M [00:00<?, ?B/s]

In [None]:
import numpy as np
def ear_from_landmarks(landmarks, eye_idx):
    """Eye aspect ratio (EAR) from the six landmarks selected by ``eye_idx``.

    With the selected points numbered 0..5, the result is
    ``(|p1 - p5| + |p2 - p4|) / (2 * |p0 - p3|)`` using only x/y coordinates;
    a tiny epsilon on the denominator avoids division by zero.
    """
    p = np.array([[landmarks[k][0], landmarks[k][1]] for k in eye_idx])
    vertical = np.linalg.norm(p[1] - p[5]) + np.linalg.norm(p[2] - p[4])
    horizontal = np.linalg.norm(p[0] - p[3]) + 1e-9
    return vertical / (2.0 * horizontal)

# MediaPipe FaceMesh indices for each eye, listed in the order ear_from_landmarks
# consumes them: positions 0 and 3 span the eye horizontally, (1, 5) and (2, 4)
# are the two vertical pairs.
# NOTE(review): the original author marks this mapping as an approximate example —
# verify the indices and the left/right naming against the FaceMesh landmark map.
LEFT_EYE = [33, 160, 158, 133, 153, 144]   # approximate mapping to mediapipe mesh indices
RIGHT_EYE = [263, 387, 385, 362, 380, 373]

def compute_ear_stats(landmarks_list, blink_thresh=0.18):
    """Summarise the eye aspect ratio (EAR) over a sequence of landmark sets.

    Args:
        landmarks_list: one landmark list per frame; falsy entries (no face
            detected) are ignored.
        blink_thresh: EAR value below which the eyes count as closed.

    Returns:
        ``(mean_ear, std_ear, blink_count)`` over frames with a usable EAR, or
        ``(0.0, 0.0, 0)`` when there is none. ``blink_count`` is the number of
        open-to-closed transitions, so several consecutive closed frames count
        as one blink. Previously every closed frame was counted separately.
        A sequence that starts closed counts as one blink.
    """
    ears = []
    for lm in landmarks_list:
        if not lm:
            continue
        try:
            left = ear_from_landmarks(lm, LEFT_EYE)
            right = ear_from_landmarks(lm, RIGHT_EYE)
        except Exception:
            # Best effort: a malformed landmark set just contributes no sample.
            continue
        ears.append((left + right) / 2.0)
    if not ears:
        return 0.0, 0.0, 0  # mean, std, blink_count
    ear_arr = np.asarray(ears, dtype=float)
    closed = ear_arr < blink_thresh
    # A blink starts where a closed frame follows an open frame (or opens the sequence).
    previously_closed = np.concatenate(([False], closed[:-1]))
    blink_count = int(np.sum(closed & ~previously_closed))
    return float(ear_arr.mean()), float(ear_arr.std()), blink_count

# head pose (approx) using solvePnP (requires mapping indices)
def estimate_head_pose_from_landmarks(landmarks, size):
    """Estimate a head rotation/translation vector pair from six face landmarks.

    Args:
        landmarks: indexable landmark list; entries 1, 33, 263, 61, 291 and 199
            are used (presumably nose tip, eye corners, mouth corners and chin —
            verify against the FaceMesh map).
        size: image shape; ``size[0]`` is the height and ``size[1]`` the width.
            The width doubles as the focal length and the image centre as the
            principal point (a rough pinhole approximation).

    Returns:
        ``(rvec, tvec)`` as flat arrays, or ``(None, None)`` when the landmarks
        are unusable or ``solvePnP`` reports failure. The success flag was
        previously ignored.
    """
    try:
        image_points = np.array([
            landmarks[1][:2], landmarks[33][:2], landmarks[263][:2],
            landmarks[61][:2], landmarks[291][:2], landmarks[199][:2]
        ], dtype='double')
        # Generic 3D reference points, paired in order with the landmarks above.
        model_points = np.array([
            (0.0, 0.0, 0.0),
            (-30.0, -30.0, -30.0),
            (30.0, -30.0, -30.0),
            (-25.0, 30.0, -30.0),
            (25.0, 30.0, -30.0),
            (0.0, 60.0, -30.0)
        ])
        focal_length = size[1]
        center = (size[1]/2, size[0]/2)
        camera_matrix = np.array([[focal_length,0,center[0]],[0,focal_length,center[1]],[0,0,1]])
        dist_coeffs = np.zeros((4,1))  # no lens distortion assumed
        success, rotation_vector, translation_vector = cv2.solvePnP(model_points, image_points, camera_matrix, dist_coeffs, flags=cv2.SOLVEPNP_ITERATIVE)
    except Exception:
        # Best effort: missing or malformed landmarks simply yield no pose.
        return None, None
    if not success:
        return None, None
    return rotation_vector.flatten(), translation_vector.flatten()

def head_pose_stats(landmarks_list, img_size):
    """Per-component mean and variance of head rotation vectors over a clip.

    Frames without landmarks, or for which no pose could be estimated, are
    skipped. Returns ``(mean, var)`` as two lists, or two ``[0, 0, 0]`` lists
    when no pose is available.
    """
    poses = (estimate_head_pose_from_landmarks(lm, img_size) for lm in landmarks_list if lm)
    rotations = [rvec for rvec, _ in poses if rvec is not None]
    if not rotations:
        return [0,0,0], [0,0,0]
    stacked = np.vstack(rotations)
    return list(stacked.mean(axis=0)), list(stacked.var(axis=0))


In [None]:
def optical_flow_stats(frame_list):
    """Mean and std of the average Farneback flow magnitude between consecutive frames.

    ``frame_list`` holds RGB images. Fewer than two frames give ``(0.0, 0.0)``.
    """
    magnitudes = []
    for prev_rgb, next_rgb in zip(frame_list, frame_list[1:]):
        prev_gray = cv2.cvtColor(prev_rgb, cv2.COLOR_RGB2GRAY)
        next_gray = cv2.cvtColor(next_rgb, cv2.COLOR_RGB2GRAY)
        flow = cv2.calcOpticalFlowFarneback(prev_gray, next_gray, None, pyr_scale=0.5, levels=3, winsize=15, iterations=3, poly_n=5, poly_sigma=1.2, flags=0)
        magnitude, _angle = cv2.cartToPolar(flow[...,0], flow[...,1])
        magnitudes.append(magnitude.mean())
    if not magnitudes:
        return 0.0, 0.0
    return float(np.mean(magnitudes)), float(np.std(magnitudes))

def micro_expression_stat(landmarks_list):
    """Mean and std of the frame-to-frame displacement of the eyebrow centroid.

    For each frame the centroid of four eyebrow landmarks (x/y only) is compared
    with the centroid of the previous frame that had landmarks. Frames without
    landmarks, and the first frame with landmarks, contribute a motion of 0.0.

    Returns ``(0.0, 0.0)`` for an empty sequence. Previously that case took the
    mean of an empty list and returned NaN with a NumPy warning.
    """
    eyebrow_idx = [70, 63, 105, 66]  # example (adjust)
    motions = []
    prev = None
    for lm in landmarks_list:
        if not lm:
            motions.append(0.0)
            continue
        pts = np.array([[lm[i][0], lm[i][1]] for i in eyebrow_idx])
        centroid = pts.mean(axis=0)
        motions.append(0.0 if prev is None else np.linalg.norm(centroid - prev))
        prev = centroid
    if not motions:
        return 0.0, 0.0
    return float(np.mean(motions)), float(np.std(motions))


In [None]:
def compute_rppg_power(crops_rgb, fps=25):
    """Spectral power of the mean green-channel signal in the 0.7-4 Hz band.

    Args:
        crops_rgb: sequence of RGB face crops (H x W x 3), one per sample.
        fps: rate in Hz at which ``crops_rgb`` were sampled. For a subsampled
            frame list this is the effective rate, not the source video rate.

    Returns:
        Summed periodogram power inside the pass band, or 0.0 when the signal is
        too short (< 4 samples), the band does not fit below Nyquist, or
        filtering fails.

    Fixes over the previous version:
      * the filter was designed outside the ``try`` block, so any ``fps <= 8``
        (upper edge at or above Nyquist) raised ValueError to the caller; the
        upper edge is now clipped just below Nyquist and the design is guarded;
      * ``filtfilt``'s default ``padlen`` (21 for this filter) exceeded the length
        of short clips, so every clip of <= 21 samples silently scored 0.0;
      * the length check now precedes the mean, avoiding the empty-slice warning.
    """
    n_samples = len(crops_rgb)
    if n_samples < 4:
        return 0.0
    greens = np.array([np.mean(c[:, :, 1]) for c in crops_rgb], dtype=float)
    greens = greens - greens.mean()
    nyquist = 0.5 * fps
    low_hz = 0.7
    high_hz = min(4.0, 0.99 * nyquist)  # butter needs 0 < Wn < 1
    if nyquist <= 0 or high_hz <= low_hz:
        return 0.0
    try:
        b, a = butter(3, [low_hz / nyquist, high_hz / nyquist], btype='band')
        padlen = min(3 * max(len(a), len(b)), n_samples - 1)
        filt = filtfilt(b, a, greens, padlen=padlen)
        f, Pxx = periodogram(filt, fs=fps)
        mask = (f >= low_hz) & (f <= high_hz)
        power = float(np.sum(Pxx[mask]))
    except Exception:
        # Best effort: a filtering failure yields "no signal", not a crash.
        return 0.0
    return power if np.isfinite(power) else 0.0

def blood_perfusion_coherence(crops_rgb):
    """Correlation between left- and right-cheek mean green intensity over time.

    Naive proxy for spatial rPPG coherence: for every crop the green channel is
    averaged over two fixed windows (rows 20-60 % of the height; columns
    10-40 % and 60-90 % of the width), and the Pearson correlation of the two
    series is returned. Fewer than two crops, or an undefined (NaN)
    correlation, give 0.0.
    """
    if len(crops_rgb) == 0:
        return 0.0
    left_series, right_series = [], []
    for crop in crops_rgb:
        rows, cols, _ = crop.shape
        top, bottom = int(rows*0.2), int(rows*0.6)
        left_series.append(crop[top:bottom, int(cols*0.1):int(cols*0.4), 1].mean())
        right_series.append(crop[top:bottom, int(cols*0.6):int(cols*0.9), 1].mean())
    if len(left_series) < 2:
        return 0.0
    corr = np.corrcoef(left_series, right_series)[0,1]
    return 0.0 if np.isnan(corr) else float(corr)


In [None]:
def high_freq_residual_energy(img):
    """Variance of the Laplacian of an RGB image's grayscale version.

    Used as a simple measure of high-frequency content.
    """
    laplacian = cv2.Laplacian(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), cv2.CV_64F)
    return float(laplacian.var())

def jpeg_blockiness_score(img):
    """Mean, over non-overlapping 8x8 blocks, of the std of each block's DCT coefficients.

    The RGB image is converted to float32 grayscale; partial blocks at the right
    and bottom edges are ignored. Returns 0.0 when the image holds no full block.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY).astype(np.float32)
    rows, cols = gray.shape
    side = 8
    spreads = [
        np.std(cv2.dct(gray[r:r+side, c:c+side]))
        for r in range(0, rows-side+1, side)
        for c in range(0, cols-side+1, side)
    ]
    return float(np.mean(spreads)) if spreads else 0.0

def specular_highlight_stability(crops):
    """Average frame-to-frame jump of the bright-highlight centroid.

    For each RGB crop, the centroid (row, col) of pixels at or above the 99th
    grayscale percentile is located; the result is the mean Euclidean distance
    between centroids of consecutive crops. Fewer than two usable crops give 0.0.
    """
    centroids = []
    for crop in crops:
        gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY)
        bright = gray >= np.percentile(gray, 99)
        if not bright.any():
            continue
        centroids.append(np.argwhere(bright).mean(axis=0))
    track = np.array(centroids)
    if track.shape[0] < 2:
        return 0.0
    jumps = np.linalg.norm(np.diff(track, axis=0), axis=1)
    return float(np.mean(jumps))


In [None]:
def mouth_opening_from_landmarks(landmarks):
    """Distance in pixels between the upper-lip and lower-lip landmarks.

    Uses landmark indices 13 and 14 (approximate inner-lip points per the
    original author; adapt if the mapping changes). Returns 0.0 when the
    landmarks are missing or malformed.
    """
    upper, lower = 13, 14  # example indices
    try:
        gap = np.array(landmarks[upper][:2]) - np.array(landmarks[lower][:2])
        return float(np.linalg.norm(gap))
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return 0.0

def lip_audio_sync_score(landmarks_list, audio_path, fps=25):
    """Pearson correlation between the per-frame audio envelope and mouth opening.

    The audio is split into windows of ``int(sr / fps)`` samples and the mean
    absolute amplitude of each window forms the envelope. Mouth opening comes
    from ``mouth_opening_from_landmarks`` (0.0 for frames without landmarks).
    Both series are truncated to the shorter length.

    Returns 0.0 when the audio cannot be loaded, fewer than three aligned
    samples exist, either series is constant, or the window length would be
    zero. The last case (``fps`` above the sample rate, or ``fps <= 0``)
    previously raised from ``range()`` with a zero step.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return 0.0
    # audio envelope per frame
    hop = int(sr / fps) if fps > 0 else 0
    if hop <= 0:
        return 0.0
    env = [np.mean(np.abs(y[i:i+hop])) for i in range(0, len(y), hop)]
    mouth = [mouth_opening_from_landmarks(lm) if lm else 0.0 for lm in landmarks_list]
    # align lengths
    L = min(len(env), len(mouth))
    if L < 3:
        return 0.0
    env = np.array(env[:L])
    mouth = np.array(mouth[:L])
    # correlation is undefined for a constant series
    if env.std() == 0 or mouth.std() == 0:
        return 0.0
    corr = np.corrcoef(env, mouth)[0,1]
    if np.isnan(corr):
        return 0.0
    return float(corr)


In [None]:
def probs_entropy(probs):
    """Mean and std of the per-frame Bernoulli entropy (in nats).

    ``probs`` is a sequence of scalar probabilities, one per frame; values are
    clipped to [1e-6, 1 - 1e-6] before taking logs.
    """
    q = np.clip(np.array(probs), 1e-6, 1-1e-6)
    per_frame = -(q*np.log(q) + (1-q)*np.log(1-q))
    return float(per_frame.mean()), float(per_frame.std())

def kl_divergence(p, q):
    """Mean element-wise Bernoulli KL divergence KL(p || q).

    Both inputs are clipped to [1e-6, 1 - 1e-6] before taking logs.
    """
    eps = 1e-6
    p_arr = np.clip(np.array(p), eps, 1-eps)
    q_arr = np.clip(np.array(q), eps, 1-eps)
    on_term = p_arr*np.log(p_arr/q_arr)
    off_term = (1-p_arr)*np.log((1-p_arr)/(1-q_arr))
    return float(np.mean(on_term + off_term))


In [None]:
@torch.no_grad()
def model_stability_checks(crop, model, preprocess, noise_levels=[0.01,0.02], scales=[1.0,0.75,0.5]):
    """Measure how much the model's class-1 probability moves under small perturbations.

    Args:
        crop: RGB uint8 image.
        model: classifier returning logits of shape (1, C) for a single-image batch.
        preprocess: callable turning an image into a model-ready tensor.
        noise_levels: Gaussian noise std-devs, as fractions of 255.
        scales: down-then-up resize factors; 1.0 is skipped.

    Returns:
        ``(std, base_p)``: the standard deviation of the class-1 probability
        over the clean crop plus all perturbed variants, and the clean-crop
        probability itself.
    """
    def class1_prob(image):
        batch = torch.unsqueeze(preprocess(image), 0).to(device)
        return float(model(batch).softmax(dim=1)[0,1].cpu())

    base_p = class1_prob(crop)
    perturbed = []
    for sigma in noise_levels:
        noisy = np.clip(crop + np.random.normal(0, sigma*255, crop.shape), 0, 255).astype(np.uint8)
        perturbed.append(class1_prob(noisy))
    height, width = crop.shape[0], crop.shape[1]
    for factor in scales:
        if factor == 1.0:
            continue
        shrunk = cv2.resize(crop, (int(width*factor), int(height*factor)))
        perturbed.append(class1_prob(cv2.resize(shrunk, (width, height))))
    # instability metric: spread of the probabilities across all variants
    return float(np.std([base_p] + perturbed)), base_p


In [None]:
def process_video(video_frames_dir, audio_path=None, sample_n=16, video_id=None, fps=25):
    """Extract deepfake-detection features for one video stored as a folder of frames.

    Pipeline: sample up to ``sample_n`` frames, crop the face in each, score the
    crops with the fast model, optionally run the heavy backbone + temporal head,
    compute auxiliary cues (embedding drift, EAR, head pose, optical flow, rPPG,
    lip sync, frequency/compression statistics, model stability), fuse them
    into a heuristic score and pick an action.

    Args:
        video_frames_dir: directory holding the extracted frames.
        audio_path: optional audio file for the lip-sync cue.
        sample_n: maximum number of frames to sample.
        video_id: identifier for the output; defaults to the directory name.
        fps: frame rate handed to the rPPG and lip-sync helpers.
            NOTE(review): frames are evenly subsampled, so their effective rate
            may be lower than ``fps`` — confirm what should be passed here.

    Returns:
        dict of features including ``fused``, ``action`` and ``explanation``.

    Side effects:
        Appends one JSON line to ``alerts.jsonl`` in the working directory.

    Fixes over the previous version: unreadable frames are skipped instead of
    crashing on ``img.shape``; the heavy path runs under ``torch.no_grad()``
    (the old ``.cpu().numpy()`` on a grad-tracking tensor raised); a filter
    design error in the rPPG helper no longer aborts the whole video; bare
    ``except:`` narrowed to ``except Exception``.
    """
    video_id = video_id or os.path.basename(video_frames_dir.rstrip('/'))
    paths = load_frames_from_dir(video_frames_dir)
    sampled = sample_frames(paths, n=sample_n)
    crops, landmarks_list, frame_imgs = [], [], []
    for p in sampled:
        img = read_img(p)
        if img is None:
            # Corrupt/unreadable frame: skip it rather than crash downstream.
            continue
        frame_imgs.append(img)
        crop, lm = detect_face_and_landmarks(img)
        if crop is None:
            # No face found: fall back to the whole frame.
            crop = cv2.resize(img, (224,224))
        else:
            crop = crop_and_align(crop, size=224)
        crops.append(crop)
        landmarks_list.append(lm)
    has_crops = len(crops) > 0

    # FAST triage
    fast_probs = run_fast_model(crops).tolist() if has_crops else []
    fast_avg = float(np.mean(fast_probs)) if len(fast_probs)>0 else 0.0

    # HEAVY (conditional): only when the fast model is suspicious enough
    heavy_prob = None
    do_heavy = fast_avg >= 0.7 or (len(fast_probs)>0 and np.mean(np.array(fast_probs)>0.5)>0.4)
    if do_heavy and has_crops:
        with torch.no_grad():
            batch = torch.stack([preprocess(c) for c in crops]).to(device)
            feats = extract_backbone_features(batch)  # T x D
            feats_t = feats.unsqueeze(0).to(device)   # 1 x T x D
            logits = temporal_model(feats_t)
            heavy_prob = float(torch.softmax(logits, dim=1)[0,1].cpu().item())

    # embedding drift
    embs = []
    for c in crops:
        try:
            embs.append(face_embedding(c))
        except Exception:
            # Keep the sequence length; note a zero vector inflates the drift.
            embs.append(np.zeros(512))
    emb_mean, emb_std = embedding_drift_from_list(embs)

    # EAR
    mean_ear, std_ear, blink_count = compute_ear_stats(landmarks_list)
    # head pose
    mean_head, var_head = head_pose_stats(landmarks_list, img_size=frame_imgs[0].shape if len(frame_imgs)>0 else (224,224))
    # optical flow
    of_mean, of_std = optical_flow_stats(frame_imgs)
    # micro expressions
    micro_mean, micro_std = micro_expression_stat(landmarks_list)
    # rPPG (best effort: filter design can reject very low frame rates)
    try:
        rppg_pow = compute_rppg_power(crops, fps=fps)
    except ValueError:
        rppg_pow = 0.0
    blood_corr = blood_perfusion_coherence(crops)
    # lip-audio sync
    lip_sync = lip_audio_sync_score(landmarks_list, audio_path, fps=fps) if audio_path else 0.0
    # freq & compression
    hf_res = float(np.mean([high_freq_residual_energy(c) for c in crops])) if has_crops else 0.0
    blockiness = float(np.mean([jpeg_blockiness_score(c) for c in crops])) if has_crops else 0.0
    spec_stab = specular_highlight_stability(crops)
    # temporal/prob stats
    ent_mean, ent_std = probs_entropy(fast_probs) if len(fast_probs)>0 else (0.0, 0.0)
    kl_fast_heavy = kl_divergence(fast_probs, [heavy_prob]*len(fast_probs)) if (heavy_prob is not None and len(fast_probs)>0) else 0.0
    # noise / stability (checked on the first crop only)
    if has_crops:
        unstable_score, base_p = model_stability_checks(crops[0], fast_model, preprocess)
    else:
        unstable_score, base_p = 0.0, 0.0

    # assemble features dict
    features = {
        'video_id': video_id,
        'fast_avg': fast_avg,
        'heavy_prob': heavy_prob,
        'emb_drift_mean': emb_mean, 'emb_drift_std': emb_std,
        'mean_ear': mean_ear, 'std_ear': std_ear, 'blink_count': blink_count,
        'head_mean_pitch': mean_head[0] if isinstance(mean_head, list) else 0.0,
        'head_mean_yaw': mean_head[1] if isinstance(mean_head, list) else 0.0,
        'head_mean_roll': mean_head[2] if isinstance(mean_head, list) else 0.0,
        'head_var_pitch': var_head[0] if isinstance(var_head, list) else 0.0,
        'optflow_mean': of_mean, 'optflow_std': of_std,
        'micro_mean': micro_mean, 'micro_std': micro_std,
        'rppg_power': rppg_pow, 'blood_corr': blood_corr,
        'lip_sync': lip_sync,
        'hf_residual': hf_res, 'blockiness': blockiness, 'specular_instability': spec_stab,
        'prob_entropy_mean': ent_mean, 'prob_entropy_std': ent_std,
        'kl_fast_heavy': kl_fast_heavy,
        'stability_std': unstable_score, 'stability_base_p': base_p
    }

    # simple unsupervised fused score (replace with a trained model once labels exist):
    # weight the model probabilities heavily, but include auxiliary cues
    w_fast, w_heavy = 0.35, 0.55
    aux = 0.0
    aux += (0.5 - min(0.5, features['mean_ear']))  # lower EAR more suspicious
    # x / (x + 1e-6) is ~1 for any non-zero x, so these two act as on/off flags.
    aux += (features['emb_drift_mean'] / (features['emb_drift_mean']+1e-6))
    aux += (1.0 - min(1.0, features['blood_corr']))
    aux += (features['optflow_std'] / (features['optflow_std']+1e-6))
    aux = aux / 4.0
    fused = features['fast_avg'] if heavy_prob is None else (w_fast*features['fast_avg'] + w_heavy*heavy_prob + 0.1*aux)
    features['fused'] = float(fused)

    # decision
    action = 'safe'
    if fused >= 0.85:
        action = 'quarantine'
    elif fused >= 0.6:
        action = 'analyst_review'
    features['action'] = action

    # explanation text
    explanation = f"fast={features['fast_avg']:.2f}"
    if heavy_prob is not None:
        explanation += f", heavy={heavy_prob:.2f}"
    explanation += f", emb_drift={features['emb_drift_mean']:.3f}, rPPG={features['rppg_power']:.3f}. Action: {action}"
    features['explanation'] = explanation

    # write alert
    out = {'video_id': video_id, 'features': features}
    with open('alerts.jsonl','a') as f:
        f.write(json.dumps(out) + '\n')
    return features


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import glob

# Search the mounted Drive recursively for the frames manifest and print every match.
for path in glob.glob("/content/drive/**/video_frames_manifest.csv", recursive=True):
    print(path)


/content/drive/MyDrive/MUFG SIH/processed/video_frames_manifest.csv


In [None]:
import pandas as pd

# Load the frames manifest from Drive and preview its rows and column names.
# NOTE(review): the recorded preview shows `frames_dir` values under
# /content/processed_video_frames, i.e. Colab's local disk rather than Drive —
# presumably those folders do not survive a new session; verify they exist
# before processing.
manifest_path = "/content/drive/MyDrive/MUFG SIH/processed/video_frames_manifest.csv"
df_manifest = pd.read_csv(manifest_path)
print(df_manifest.head())
print(df_manifest.columns)


                                      video_rel_path  \
0  DFD_manipulated_sequences/DFD_manipulated_sequ...   
1  DFD_manipulated_sequences/DFD_manipulated_sequ...   
2  DFD_manipulated_sequences/DFD_manipulated_sequ...   
3  DFD_manipulated_sequences/DFD_manipulated_sequ...   
4  DFD_manipulated_sequences/DFD_manipulated_sequ...   

                                          frames_dir  num_frames  fps_out  \
0  /content/processed_video_frames/DFD_manipulate...          42        5   
1  /content/processed_video_frames/DFD_manipulate...         116        5   
2  /content/processed_video_frames/DFD_manipulate...         209        5   
3  /content/processed_video_frames/DFD_manipulate...         146        5   
4  /content/processed_video_frames/DFD_manipulate...         181        5   

   size  src_fps  
0   224     24.0  
1   224     24.0  
2   224     24.0  
3   224     24.0  
4   224     24.0  
Index(['video_rel_path', 'frames_dir', 'num_frames', 'fps_out', 'size',
       'src_fp

In [None]:
# Run process_video over every manifest row and save the feature table as CSV.
rows = []
skipped = []  # frames_dir values that do not exist on disk

for idx, row in df_manifest.iterrows():
    frames_dir = row["frames_dir"]
    video_id = os.path.basename(frames_dir.rstrip("/"))
    fps = int(row["fps_out"]) if "fps_out" in row else 25

    # Fix: a missing frames folder used to flow through process_video and yield
    # an all-zero feature row that looks like real data. Skip and report instead.
    if not os.path.isdir(frames_dir):
        skipped.append(frames_dir)
        continue

    print(f"Processing {video_id} ({row['num_frames']} frames @ {fps} fps)")

    feats = process_video(
        video_frames_dir=frames_dir,
        audio_path=None,      # your manifest has no audio column
        sample_n=16,
        video_id=video_id,
        fps=fps
    )
    rows.append(feats)

if skipped:
    print(f"Skipped {len(skipped)} videos whose frames_dir does not exist (first: {skipped[0]})")

import pandas as pd
df = pd.DataFrame(rows)
df.to_csv("video_features_table.csv", index=False)
print("✅ Saved video_features_table.csv with", len(df), "videos")


Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__exit_phone_room__YVGY8LOK (42 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__hugging_happy__YVGY8LOK (116 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__meeting_serious__YVGY8LOK (209 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__outside_talking_still_laughing__YVGY8LOK (146 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__secret_conversation__YVGY8LOK (181 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__talking_against_wall__YVGY8LOK (169 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__talking_angry_couch__YVGY8LOK (291 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__01_02__walk_down_hall_angry__YVGY8LOK (44 frames @ 5 fps)
Processing DFD_mani

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  greens = greens - greens.mean()


Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_09__outside_talking_still_laughing__RCETIXYL (188 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_09__secret_conversation__8DTEGQ54 (4 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_09__secret_conversation__RCETIXYL (159 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_09__talking_against_wall__8DTEGQ54 (182 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_09__talking_angry_couch__RCETIXYL (291 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_09__walk_down_hall_angry__8DTEGQ54 (88 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_11__exit_phone_room__P08VGHTA (107 frames @ 5 fps)
Processing DFD_manipulated_sequences__DFD_manipulated_sequences__03_11__kitchen_pan__P08VGHTA (107 frames @ 5 fps)
Processing DFD_man

In [None]:
import pandas as pd
# Reload the saved feature table as a sanity check; show its first rows and shape.
# NOTE(review): the recorded preview shows 0.0 / NaN in every visible feature
# column, which suggests no frames were read for these videos — verify the
# frames_dir paths in the manifest.
df = pd.read_csv("video_features_table.csv")
print(df.head())
print(df.shape)


                                            video_id  fast_avg  heavy_prob  \
0  DFD_manipulated_sequences__DFD_manipulated_seq...       0.0         NaN   
1  DFD_manipulated_sequences__DFD_manipulated_seq...       0.0         NaN   
2  DFD_manipulated_sequences__DFD_manipulated_seq...       0.0         NaN   
3  DFD_manipulated_sequences__DFD_manipulated_seq...       0.0         NaN   
4  DFD_manipulated_sequences__DFD_manipulated_seq...       0.0         NaN   

   emb_drift_mean  emb_drift_std  mean_ear  std_ear  blink_count  \
0             0.0            0.0       0.0      0.0            0   
1             0.0            0.0       0.0      0.0            0   
2             0.0            0.0       0.0      0.0            0   
3             0.0            0.0       0.0      0.0            0   
4             0.0            0.0       0.0      0.0            0   

   head_mean_pitch  head_mean_yaw  ...  blockiness  specular_instability  \
0                0              0  ...        

In [None]:
# List the project folder on Drive to see which archives and notebooks are present.
!ls "/content/drive/MyDrive/MUFG SIH"


'01_data_preprocessing .ipynb'	  Deepfake_audio.zip
 02_intake_classification.ipynb   Deepfake_video.zip
 03_text_pipeline_agentic.ipynb   MUFG
 06_visualizations.ipynb	  phishnet
 07_dashboard_greenIT.ipynb	  processed
 AgenticAI_approach.svg		  processed_audio_mfcc.csv
'Dataset files_MUFG_SIH.gdoc'	  SIH2025_Presentation.pptx.pdf


In [None]:
# Mount Google Drive if not already mounted
from google.colab import drive
drive.mount('/content/drive')

# Unzip the Deepfake video dataset into a `processed_video` folder on Drive.
# NOTE(review): re-running this cell extracts over the existing folder; consider
# `unzip -n` (never overwrite) so the cell is safe to re-run.
!unzip -q "/content/drive/MyDrive/MUFG SIH/Deepfake_video.zip" -d "/content/drive/MyDrive/MUFG SIH/processed_video"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

# NOTE(review): design sketch, not runnable as written. None of the extract_* /
# compute_* helpers called below are defined in the visible part of this
# notebook, and `video_files` is only assigned in later cells (the recorded
# output of this cell is a NameError).
# The locals `blood_perfusion_coherence` and `specular_highlight_stability`
# also reuse the names of functions defined in earlier cells, and would
# overwrite them if this loop ever ran.
# Placeholder lists
video_features = []
video_names = []

for video_path in video_files:
    # Extract core signals
    fast_prob = extract_fast_prob(video_path)
    heavy_prob = extract_heavy_prob(video_path)
    entropy_prob = compute_frame_entropy(video_path)
    kl_fast_heavy = compute_kl_fast_heavy(video_path)

    # Landmark/geometry
    ear_mean, ear_var = compute_ear(video_path)
    head_pose_var = compute_head_pose_var(video_path)
    gaze_stability = compute_gaze_stability(video_path)
    lip_audio_sync = compute_lip_audio_sync(video_path)

    # Temporal / embedding
    embedding_drift_mean, embedding_drift_std = compute_embedding_drift(video_path)
    temporal_consistency = compute_temporal_consistency(video_path)
    optical_flow_anomaly = compute_optical_flow_anomaly(video_path)

    # Physiological / visual
    rPPG_quality = compute_rppg_signal_quality(video_path)
    blood_perfusion_coherence = compute_rppg_spatial_coherence(video_path)
    skin_flicker_index = compute_skin_flicker(video_path)
    specular_highlight_stability = compute_specular_stability(video_path)

    # Artifact / forensics
    compression_artifact = compute_compression_artifact(video_path)
    fft_residual_energy = compute_frequency_residual(video_path)
    jpeg_double_quant = compute_jpeg_double_quant(video_path)

    # Aggregate features (order must match the column list further down)
    features = [
        fast_prob, heavy_prob, entropy_prob, kl_fast_heavy,
        ear_mean, ear_var, head_pose_var, gaze_stability, lip_audio_sync,
        embedding_drift_mean, embedding_drift_std, temporal_consistency, optical_flow_anomaly,
        rPPG_quality, blood_perfusion_coherence, skin_flicker_index, specular_highlight_stability,
        compression_artifact, fft_residual_energy, jpeg_double_quant
    ]

    video_features.append(features)
    video_names.append(video_path)

# Convert to DataFrame: one row per video, one column per feature
df_features = pd.DataFrame(video_features, columns=[
    "fast_prob","heavy_prob","entropy_prob","kl_fast_heavy",
    "ear_mean","ear_var","head_pose_var","gaze_stability","lip_audio_sync",
    "embedding_drift_mean","embedding_drift_std","temporal_consistency","optical_flow_anomaly",
    "rPPG_quality","blood_perfusion_coherence","skin_flicker_index","specular_highlight_stability",
    "compression_artifact","fft_residual_energy","jpeg_double_quant"
])
df_features['video'] = video_names


NameError: name 'video_files' is not defined

# **NEW START**

In [None]:
# Fresh start: mount Drive (no-op when already mounted) and discover the dataset.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import glob, os
# find candidate video roots (commonly used names in your Drive)
# NOTE(review): the third pattern ("Deepfake*") also matches everything the
# first one ("Deepfake_video*") does, so entries repeat in `cands` (see the
# duplicated Deepfake_video.zip in the recorded output).
cands = glob.glob("/content/drive/MyDrive/**/Deepfake_video*", recursive=True) + \
        glob.glob("/content/drive/MyDrive/**/processed_video*", recursive=True) + \
        glob.glob("/content/drive/MyDrive/**/Deepfake*", recursive=True)

# fallback: search for any video files (.mp4, .mkv, .avi) anywhere under MyDrive
video_files = glob.glob("/content/drive/MyDrive/**/*.mp4", recursive=True) + \
              glob.glob("/content/drive/MyDrive/**/*.mkv", recursive=True) + \
              glob.glob("/content/drive/MyDrive/**/*.avi", recursive=True)

# Report what was found so the next cells can pick a root.
print("Candidate video folders (first 5):", cands[:5])
print("Total video files found under MyDrive:", len(video_files))
if len(video_files)>0:
    print("Example:", video_files[:3])
else:
    print("No video files found in MyDrive. If your videos are zipped, extract them first.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Candidate video folders (first 5): ['/content/drive/MyDrive/MUFG SIH/Deepfake_video.zip', '/content/drive/MyDrive/MUFG SIH/processed_video', '/content/drive/MyDrive/MUFG SIH/Deepfake_video.zip', '/content/drive/MyDrive/MUFG SIH/Deepfake_audio.zip']
Total video files found under MyDrive: 3431
Example: ['/content/drive/MyDrive/MUFG SIH/processed_video/DFD_manipulated_sequences/DFD_manipulated_sequences/01_02__exit_phone_room__YVGY8LOK.mp4', '/content/drive/MyDrive/MUFG SIH/processed_video/DFD_manipulated_sequences/DFD_manipulated_sequences/01_02__hugging_happy__YVGY8LOK.mp4', '/content/drive/MyDrive/MUFG SIH/processed_video/DFD_manipulated_sequences/DFD_manipulated_sequences/01_02__meeting_serious__YVGY8LOK.mp4']


In [None]:
# system + python packages (may take a minute)
# NOTE(review): this overlaps with the first install cell (mediapipe,
# facenet-pytorch, librosa, tqdm, xgboost are installed there too). Only OpenCV
# is version-pinned, and tesseract/pytesseract and praat-parselmouth are not
# used in the visible part of the notebook — confirm they are needed.
!apt-get -qq update
!apt-get -qq install -y ffmpeg tesseract-ocr >/dev/null
!pip -q install mediapipe opencv-python-headless==4.7.0.72 pytesseract librosa soundfile praat-parselmouth scikit-image facenet-pytorch tqdm xgboost joblib


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [None]:
import cv2, os, glob

# Destination folder for the extracted JPEG frames; created up front if missing.
out_dir = "/content/drive/MyDrive/processed/video_frames"
os.makedirs(out_dir, exist_ok=True)

# Every .mp4 found at any depth below the processed_video folder.
video_files = glob.glob(
    "/content/drive/MyDrive/MUFG SIH/processed_video/**/*.mp4",
    recursive=True,
)

def extract_frames(video_path, every_n=20, dest_dir=None):
    """Save every ``every_n``-th frame of a video as a JPEG.

    Frames are written as ``<video file name>_f<frame index>.jpg``.

    Args:
        video_path: Path of the video file to decode.
        every_n: Sampling stride; frame indices 0, every_n, 2*every_n, ... are saved.
        dest_dir: Folder that receives the JPEGs. When omitted, the notebook-level
            ``out_dir`` is used (the original behaviour).

    Returns:
        Number of frames that were actually written to disk.

    Raises:
        ValueError: If ``every_n`` is smaller than 1.
    """
    if every_n < 1:
        # Previously this surfaced as a ZeroDivisionError from the modulo below.
        raise ValueError(f"every_n must be >= 1, got {every_n}")

    target_dir = out_dir if dest_dir is None else dest_dir
    video_name = os.path.basename(video_path)

    cap = cv2.VideoCapture(video_path)
    count, saved = 0, 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % every_n == 0:
                fname = os.path.join(target_dir, f"{video_name}_f{count}.jpg")
                # cv2.imwrite reports failure through its return value rather than
                # raising, so only count frames that really reached the disk.
                if cv2.imwrite(fname, frame):
                    saved += 1
            count += 1
    finally:
        # Release the decoder even if reading or writing raised.
        cap.release()
    return saved

# Smoke-test the extractor on the first video only. Guard against an empty glob
# result so a missing or unmounted folder gives a clear message instead of an IndexError.
if video_files:
    print("Example frames saved:", extract_frames(video_files[0]))
else:
    print("No .mp4 files found under processed_video; nothing to extract.")


Example frames saved: 11


In [None]:
# ============================
# CELL 3 — Video Frame Feature Extraction
# ============================
import cv2
import numpy as np
import os
import glob
import pandas as pd
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# -----------------
# Helper functions
# -----------------

def high_freq_residual_energy(img):
    """Return the variance of the Laplacian of the grayscale version of ``img``.

    ``img`` is converted from BGR to grayscale, filtered with a 64-bit float
    Laplacian, and the variance of the response is returned as a Python float.
    It serves as a sharpness / high-frequency-content proxy.
    """
    laplacian = cv2.Laplacian(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), cv2.CV_64F)
    return float(laplacian.var())

def jpeg_blockiness_score(img, block_size=8):
    """Rough blockiness estimate from pixel differences at a ``block_size`` offset.

    The grayscale image is compared with itself shifted by ``block_size`` pixels
    horizontally and vertically; the summed absolute differences are normalised
    by the pixel count.

    Args:
        img: BGR image array as accepted by ``cv2.cvtColor``.
        block_size: Pixel offset between compared samples (8 matches the JPEG block size).

    Returns:
        The normalised difference as a Python float; 0.0 for an image with no pixels.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    # Widen before subtracting: on uint8 data a negative difference wraps around
    # (0 - 10 == 246) and np.abs cannot undo it, which inflated the score.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.int32)
    h, w = gray.shape
    if h == 0 or w == 0:
        return 0.0

    vert = np.abs(gray[:, block_size:] - gray[:, :-block_size]).sum()
    horiz = np.abs(gray[block_size:, :] - gray[:-block_size, :]).sum()
    return float((vert + horiz) / (h * w))

def blink_detection(frame_sequence, eye_cascade):
    """Very basic blink proxy: variance of the per-frame detected-eye count.

    Args:
        frame_sequence: Iterable of BGR frames. ``None`` entries (what
            ``cv2.imread`` returns for an unreadable file) are skipped.
        eye_cascade: Detector exposing ``detectMultiScale(gray, scale, neighbors)``.

    Returns:
        Variance of the eye counts as a Python float. 0.0 is returned when no
        usable frame was supplied, instead of ``nan`` plus a NumPy RuntimeWarning.
    """
    eye_counts = []
    for frame in frame_sequence:
        if frame is None:
            continue
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        eyes = eye_cascade.detectMultiScale(gray, 1.1, 3)
        eye_counts.append(len(eyes))

    if not eye_counts:
        return 0.0
    return float(np.var(eye_counts))

# -----------------
# Embedding model setup (torchvision ResNet-18 with its default pretrained
# weights, used as a generic image feature extractor — not a face-specific model)
# -----------------
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use the `weights=` API, as the later cells of this notebook do, instead of the
# deprecated `pretrained=True` flag.
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc = torch.nn.Identity()  # drop final classifier so the model returns pooled features
resnet = resnet.to(device).eval()

# Preprocessing for PIL images: fixed 224x224 resize, tensor conversion, then
# per-channel normalisation with the mean/std constants below.
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

def get_resnet_embedding(img):
    """Return the flattened ResNet output for a single BGR frame.

    The frame is converted to RGB, wrapped as a PIL image, passed through the
    notebook-level ``transform`` and ``resnet`` (gradients disabled), and the
    result is returned as a 1-D NumPy array.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    batch = transform(Image.fromarray(rgb)).unsqueeze(0).to(device)
    with torch.no_grad():
        output = resnet(batch)
    return output.cpu().numpy().flatten()

# -----------------
# Run feature extraction
# -----------------

# Read frames from the folder the frame-extraction cell writes to. That cell stores
# all frames flat in one folder as "<video file name>_f<frame index>.jpg". The
# previous path/layout assumption (one sub-folder per video under a different
# directory) matched nothing, so an empty 1-byte CSV was written.
video_frame_dir = "/content/drive/MyDrive/processed/video_frames"
output_csv = "/content/drive/MyDrive/processed/video_frame_features.csv"

FRAME_STRIDE = 10      # use every 10th extracted frame for the hand-crafted features
MAX_EMBED_FRAMES = 5   # embed only the first few sampled frames to save time

eye_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_eye.xml")


def _frame_index(path):
    """Return the numeric suffix of a '<video>_f<idx>.jpg' name, or -1 if there is none."""
    stem = os.path.splitext(os.path.basename(path))[0]
    tail = stem.rsplit("_f", 1)[-1]
    return int(tail) if tail.isdigit() else -1


def _group_frames_by_video(frame_dir):
    """Map each video id to its frame paths, ordered by frame index.

    Two layouts are accepted: a sub-folder per video containing ``*.jpg`` files
    (the folder name is the video id), and a flat folder of
    ``<video>_f<idx>.jpg`` files (the part before ``_f<idx>`` is the video id).
    """
    groups = {}
    for entry in sorted(glob.glob(os.path.join(frame_dir, "*"))):
        if os.path.isdir(entry):
            jpgs = glob.glob(os.path.join(entry, "*.jpg"))
            if jpgs:
                groups.setdefault(os.path.basename(entry), []).extend(jpgs)
        elif entry.lower().endswith(".jpg"):
            stem = os.path.splitext(os.path.basename(entry))[0]
            groups.setdefault(stem.rsplit("_f", 1)[0], []).append(entry)
    # Sort numerically so "f100" does not come before "f20"; the path breaks ties.
    return {
        vid: sorted(paths, key=lambda p: (_frame_index(p), p))
        for vid, paths in groups.items()
    }


all_features = []

for video_id, frame_paths in _group_frames_by_video(video_frame_dir).items():
    # Load a subset of frames; cv2.imread returns None for unreadable files, drop those.
    sampled_frames = [cv2.imread(fp) for fp in frame_paths[::FRAME_STRIDE]]
    sampled_frames = [f for f in sampled_frames if f is not None]
    if not sampled_frames:
        continue

    # Hand-crafted features, averaged over the sampled frames
    hf_res = np.mean([high_freq_residual_energy(f) for f in sampled_frames])
    blockiness = np.mean([jpeg_blockiness_score(f) for f in sampled_frames])
    blink_var = blink_detection(sampled_frames, eye_cascade)

    # Embeddings (average across the first MAX_EMBED_FRAMES sampled frames)
    emb_list = [get_resnet_embedding(f) for f in sampled_frames[:MAX_EMBED_FRAMES]]
    emb_avg = np.mean(emb_list, axis=0)

    row = {
        "video_id": video_id,
        "hf_residual": hf_res,
        "blockiness": blockiness,
        "blink_var": blink_var
    }

    # Merge embedding vector as emb_0 ... emb_N columns
    row.update({f"emb_{j}": val for j, val in enumerate(emb_avg)})

    all_features.append(row)

df_video = pd.DataFrame(all_features)

if all_features:
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    df_video.to_csv(output_csv, index=False)
    print(f"Saved video frame features: {output_csv}")
else:
    # Do not report success (or leave an empty CSV behind) when nothing was processed.
    print(f"No frames found under {video_frame_dir}; nothing written to {output_csv}")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 29.9MB/s]


Saved video frame features: /content/drive/MyDrive/processed/video_frame_features.csv


In [None]:
import os

# Sanity-check the feature CSV written by the extraction cell.
video_csv = "/content/drive/MyDrive/processed/video_frame_features.csv"
csv_exists = os.path.exists(video_csv)
print("File exists?", csv_exists)
# os.path.getsize raises FileNotFoundError for a missing file, so only query the
# size when the file is actually there.
if csv_exists:
    print("File size (bytes):", os.path.getsize(video_csv))


File exists? True
File size (bytes): 1


In [None]:
import cv2

# Probe one known video to confirm that OpenCV can open and decode it.
test_video = "/content/drive/MyDrive/MUFG SIH/processed_video/DFD_manipulated_sequences/DFD_manipulated_sequences/01_02__exit_phone_room__YVGY8LOK.mp4"

cap = cv2.VideoCapture(test_video)
# Frame total as reported by OpenCV for this capture.
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print("Frame count:", frame_count)

# Decode at most five frames from the start and print each frame's array shape.
grabbed = 0
while True:
    if grabbed >= 5 or not cap.isOpened():
        break
    ret, frame = cap.read()
    if not ret:
        break
    print("Frame shape:", frame.shape)
    grabbed += 1
cap.release()


Frame count: 210
Frame shape: (1080, 1920, 3)
Frame shape: (1080, 1920, 3)
Frame shape: (1080, 1920, 3)
Frame shape: (1080, 1920, 3)
Frame shape: (1080, 1920, 3)


In [None]:
import cv2, torch, numpy as np, os, pandas as pd
from torchvision import models, transforms
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
# Replace the classification head so the network returns pooled features rather
# than class logits, matching the other feature-extraction cells in this notebook.
resnet.fc = torch.nn.Identity()
resnet = resnet.to(device).eval()

# Preprocessing for an RGB uint8 frame: PIL conversion, 224x224 resize, tensor
# conversion, then per-channel normalisation with the constants below.
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

rows = []
video_folder = "/content/drive/MyDrive/MUFG SIH/processed_video/DFD_manipulated_sequences/DFD_manipulated_sequences"
# Sorted so the 100-video test subset is the same on every run (os.listdir order is arbitrary).
video_files = sorted(
    os.path.join(video_folder, f) for f in os.listdir(video_folder) if f.endswith(".mp4")
)

print("Found", len(video_files), "videos")

for vf in tqdm(video_files[:100]):  # limit to 100 for testing
    cap = cv2.VideoCapture(vf)
    frame_feats = []
    frame_count = 0
    try:
        while frame_count < 10 and cap.isOpened():  # sample the first 10 frames
            ret, frame = cap.read()
            if not ret: break
            # OpenCV decodes to BGR; the normalisation constants above are given in
            # RGB channel order, so convert before preprocessing.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_t = preprocess(frame_rgb).unsqueeze(0).to(device)
            with torch.no_grad():
                feat = resnet(img_t).cpu().numpy().flatten()
            frame_feats.append(feat)
            frame_count += 1
    finally:
        # Always free the decoder, even if a frame failed to process.
        cap.release()

    if len(frame_feats) > 0:
        video_vec = np.mean(frame_feats, axis=0)  # mean-pool the frame features
        # NOTE(review): every path here sits under a folder whose name contains
        # "manipulated", so this label is always 1 for this video_folder.
        label = 1 if "manipulated" in vf.lower() else 0
        rows.append([vf, label] + video_vec.tolist())

if rows:
    cols = ["file", "label"] + [f"f{i}" for i in range(len(rows[0])-2)]
    df_video = pd.DataFrame(rows, columns=cols)
    out_csv = "/content/drive/MyDrive/processed/video_frame_features.csv"
    df_video.to_csv(out_csv, index=False)
    print("✅ Saved:", out_csv, "with rows:", len(df_video))
else:
    print("⚠️ No video features extracted.")


Found 3068 videos


 31%|███       | 31/100 [01:34<14:34, 12.68s/it]

In [None]:
import os, cv2, torch, numpy as np, pandas as pd
from tqdm import tqdm
import torchvision.models as models
import torchvision.transforms as transforms

# Paths
video_root = "/content/drive/MyDrive/MUFG SIH/processed_video"
out_csv    = "/content/drive/MyDrive/processed/video_frame_features.csv"

# Collect video list (sorted so the result does not depend on os.walk ordering)
video_files = sorted(
    os.path.join(root, f)
    for root, _, files in os.walk(video_root)
    for f in files
    if f.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))
)
print(f"Found {len(video_files)} videos")

# 🔹 LIMIT videos for demo/testing
MAX_VIDEOS   = 200   # reduce if needed
SAMPLE_FRAMES = 5    # only pick few frames
SUBSET_SEED  = 42    # makes the demo subset reproducible

# Draw the demo subset at random (seeded) rather than taking the first MAX_VIDEOS
# paths, which may all come from a single sub-folder.
if len(video_files) > MAX_VIDEOS:
    rng = np.random.default_rng(SUBSET_SEED)
    keep = np.sort(rng.choice(len(video_files), size=MAX_VIDEOS, replace=False))
    video_files = [video_files[i] for i in keep]

# Model (ResNet18 backbone)
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc = torch.nn.Identity()  # remove classifier, keep features
resnet = resnet.to(device).eval()

# Transform applied to each RGB frame before it is fed to the model
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

features = []

# Loop with error handling: one bad video is reported and skipped, not fatal
for v in tqdm(video_files, desc="Extracting"):
    cap = None
    try:
        cap = cv2.VideoCapture(v)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            continue

        # Pick SAMPLE_FRAMES evenly spaced indices; np.unique removes the repeats
        # that linspace produces when a clip has fewer frames than SAMPLE_FRAMES.
        idxs = np.unique(np.linspace(0, frame_count-1, SAMPLE_FRAMES, dtype=int))

        frame_feats = []
        for idx in idxs:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_t = transform(frame_rgb).unsqueeze(0).to(device)
            with torch.no_grad():
                feat = resnet(img_t).cpu().numpy().flatten()
            frame_feats.append(feat)

        if frame_feats:
            # average features for the video
            vid_feat = np.mean(frame_feats, axis=0)
            features.append({
                "video": os.path.basename(v),
                # Same convention as the earlier extraction cell: 1 when the path
                # contains "manipulated". NOTE(review): confirm this matches the
                # folder naming of the genuine videos in this dataset.
                "label": int("manipulated" in v.lower()),
                **{f"f{i}": val for i,val in enumerate(vid_feat)}
            })

    except Exception as e:
        print(f"❌ Error with {v}: {e}")
    finally:
        # Release the decoder on every path, including errors and early `continue`.
        if cap is not None:
            cap.release()

# Save as CSV (skip the write when nothing was extracted, so no empty file is left behind)
df = pd.DataFrame(features)
if df.empty:
    print(f"⚠️ No video features extracted; {out_csv} was not written.")
else:
    os.makedirs(os.path.dirname(out_csv), exist_ok=True)
    df.to_csv(out_csv, index=False)
    print(f"✅ Saved optimized video features: {out_csv}, shape={df.shape}")


Found 3431 videos


Extracting: 100%|██████████| 200/200 [19:28<00:00,  5.84s/it]


✅ Saved optimized video features: /content/drive/MyDrive/processed/video_frame_features.csv, shape=(200, 513)


In [None]:
# ===============================
# 0. Install dependencies
# ===============================
# pytorch-tabnet provides the TabNetClassifier imported below; no version is pinned here.
!pip install pytorch-tabnet --quiet

# ===============================
# 1. Imports
# ===============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# ===============================
# 2. Load dataset
# ===============================
video_csv = "/content/drive/MyDrive/processed/video_frame_features.csv"
df_video = pd.read_csv(video_csv)

print("Shape of video dataset:", df_video.shape)
print(df_video.head())

# ===============================
# 3. Labels: use the CSV's own labels when available, else dummy labels for demo
# ===============================
np.random.seed(42)
if "label" in df_video.columns and df_video["label"].nunique() == 2:
    print("Using labels stored in the CSV.")
else:
    # No usable labels in the file: fall back to random labels. Metrics obtained
    # this way only show that the pipeline runs; they say nothing about detection quality.
    print("No two-class 'label' column in the CSV; using random dummy labels (demo only).")
    labels = np.random.randint(0, 2, size=df_video.shape[0])
    if len(labels) > 1 and labels.min() == labels.max():
        # Flip one entry so both classes exist (the old `labels[0] = 1` was a
        # no-op when every label was already 1).
        labels[0] = 1 - labels[0]
    df_video["label"] = labels

# ===============================
# 4. Prepare features & labels
# ===============================
# Identifier/label columns are excluded; "file" is the identifier used by the
# other extraction cell that writes this same CSV path.
non_feature_cols = {"video", "file", "label"}
feature_cols = [c for c in df_video.columns if c not in non_feature_cols]
X = df_video[feature_cols].to_numpy(dtype=np.float32)
y = df_video["label"].values

# Hold out the test set first (same split parameters as before), then carve a
# validation set out of the remainder. Early stopping watches the validation set,
# so the reported test metrics are not tuned on the data they are computed from.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42, stratify=y_trainval
)

# ===============================
# 5. Initialize and train TabNet
# ===============================
clf = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=0,
    seed=42
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=["valid"],
    eval_metric=["accuracy"],
    max_epochs=200,
    patience=20,
    batch_size=32,
    virtual_batch_size=16
)

# ===============================
# 6. Evaluate model on the untouched test split
# ===============================
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=2)

print(f"\nAccuracy: {acc:.2f}\n")
print("Classification Report:\n", report)

# ===============================
# 7. Agentic Reflection + Green IT metrics
# ===============================
# Both numbers below are ad-hoc proxies defined in this notebook, not measurements.
# Energy proxy: number of epochs actually run * number of features
num_epochs_ran = len(clf.history['loss'])
energy_proxy = num_epochs_ran * X_train.shape[1]

# Reflection proxy: variance of TabNet's per-feature importances
feature_importance_var = np.var(clf.feature_importances_)

print("\n--- Agentic & Green IT Metrics ---")
print(f"Energy Proxy (lower = greener): {energy_proxy}")
print(f"Feature Importance Variance (higher = more reflective): {feature_importance_var:.6f}")


Shape of video dataset: (200, 513)
                                               video        f0        f1  \
0               01_02__exit_phone_room__YVGY8LOK.mp4  1.205889  0.510280   
1                 01_02__hugging_happy__YVGY8LOK.mp4  0.661470  1.122913   
2               01_02__meeting_serious__YVGY8LOK.mp4  0.145870  1.735320   
3  01_02__outside_talking_still_laughing__YVGY8LO...  0.821929  0.556383   
4           01_02__secret_conversation__YVGY8LOK.mp4  0.149407  1.534087   

         f2        f3        f4        f5        f6        f7        f8  ...  \
0  1.744059  1.820704  0.452797  0.615823  1.329797  0.392381  0.730904  ...   
1  0.594673  0.998632  0.713414  1.114628  0.407519  0.984603  0.873089  ...   
2  1.282610  0.462198  0.541554  3.627952  0.582905  0.854114  1.190425  ...   
3  0.128962  0.371354  1.796759  0.132320  1.230159  0.258587  1.965587  ...   
4  1.089437  1.135186  0.271337  0.386557  0.008120  0.337250  1.290210  ...   

       f502      f503      

