In [3]:
import sys

# Report which Python interpreter is running this kernel.
print(f"Notebook is using: {sys.executable}")

Notebook is using: C:\Users\swani\AppData\Local\Programs\Python\Python312\python.exe


In [10]:
!{sys.executable} -m pip install --upgrade opencv-python

Collecting opencv-python
  Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\swani\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [12]:
# Import here so the cell also works on a fresh kernel (Restart & Run All):
# it previously relied on `torch` and `cv2` being left over from cells that
# appear later in the notebook.
import cv2
import torch

print("CUDA available:", torch.cuda.is_available(), "  CUDA version:", torch.version.cuda)
print("cv2:", cv2.__version__)

CUDA available: True   CUDA version: 11.8
cv2: 4.11.0


In [4]:
# In your Python 3.12 environment:
!{sys.executable} -m pip install timm torch torchvision pillow

Collecting timm
  Using cached timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Using cached timm-1.0.15-py3-none-any.whl (2.4 MB)
Installing collected packages: timm
Successfully installed timm-1.0.15



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\swani\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [14]:
import os
import json
import cv2
import torch
from transformers import CLIPImageProcessor, CLIPModel
from PIL import Image
from tqdm import tqdm


def main(json_path="train_val_test_split_with_multihot.json",
         output_json_path="train_val_test_split_with_clip.json"):
    """
    Add a 512-d CLIP embedding of each person's face to a split JSON file.

    Steps:
    1. Read the JSON at ``json_path``. It is expected to be a dict holding any
       of the keys 'train', 'val', 'test', each mapping to a list of records.
       A bare top-level list of records is also accepted. A dict with none of
       those keys is written back unchanged.
    2. For each record that has an 'image_path' and a 4-element 'bbox'
       (used as [x1, y1, x2, y2] pixel coordinates):
       a. Crop the bbox (clamped to the image) and detect the largest face
          with a Haar cascade.
       b. Fall back to the full bbox crop if no face is found.
       c. Extract the image features of that region with CLIP ViT-B/32.
       d. Store them as a list of floats under 'face_embedding'.
       Records that cannot be processed (missing path/bbox, unreadable image,
       empty crop) keep 'face_embedding' = None.
    3. Write the updated structure to ``output_json_path``.

    Args:
        json_path: Path of the input JSON file.
        output_json_path: Path the updated JSON is written to.

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    # Load JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        all_data = json.load(f)

    # A bare list of records is wrapped in a synthetic '' split so the loop
    # below can treat both layouts the same way; it is unwrapped before saving.
    is_flat_list = isinstance(all_data, list)
    if is_flat_list:
        all_data = {'': all_data}
        splits = ['']
    else:
        splits = [k for k in ('train', 'val', 'test') if k in all_data]

    # Initialize Haar cascade
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
    if face_cascade.empty():
        raise RuntimeError("Failed to load Haar cascade for face detection.")

    # Initialize CLIP model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor = CLIPImageProcessor.from_pretrained('openai/clip-vit-base-patch32')
    clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
    clip_model.eval().to(device)

    # Most recently decoded image. Consecutive records that point at the same
    # file (several people in one picture) reuse it instead of re-reading disk.
    cached_path, cached_img = None, None

    # Process each split. Records are updated in place, so the lists inside
    # all_data do not need to be rebuilt.
    for split in splits:
        records = all_data.get(split, [])

        for rec in tqdm(records, desc=f"Embedding faces for '{split or 'all'}' split"):
            rec['face_embedding'] = None
            img_path = rec.get('image_path')
            # `or []` also covers a bbox that is present but null
            bbox = rec.get('bbox') or []
            if not img_path or len(bbox) != 4:
                continue

            if img_path != cached_path:
                cached_path, cached_img = img_path, cv2.imread(img_path)
            img = cached_img
            if img is None:  # unreadable or missing image
                continue

            # Clamp the bbox to the image; skip it if nothing is left
            x1, y1, x2, y2 = map(int, bbox)
            h, w = img.shape[:2]
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)
            if x2 <= x1 or y2 <= y1:
                continue
            person_crop = img[y1:y2, x1:x2]

            # Preprocess for face detection
            gray = cv2.cvtColor(person_crop, cv2.COLOR_BGR2GRAY)
            gray = cv2.equalizeHist(gray)
            faces = face_cascade.detectMultiScale(
                gray,
                scaleFactor=1.05,
                minNeighbors=3,
                minSize=(20, 20),
                flags=cv2.CASCADE_SCALE_IMAGE
            )

            # Select the largest detected face, or fall back to the whole crop
            face_img = person_crop
            if len(faces) > 0:
                fx, fy, fw, fh = max(faces, key=lambda f: f[2] * f[3])
                candidate = person_crop[fy:fy+fh, fx:fx+fw]
                if candidate.size > 0:
                    face_img = candidate

            # CLIP embedding (OpenCV images are BGR; CLIP expects RGB)
            rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
            pil = Image.fromarray(rgb)
            inputs = processor(images=pil, return_tensors='pt').to(device)
            with torch.no_grad():
                feats = clip_model.get_image_features(**inputs)  # (1,512)
            rec['face_embedding'] = feats.squeeze(0).cpu().tolist()

    # Save updated JSON in the same layout it was read in
    output_data = all_data[''] if is_flat_list else all_data
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4)
    print(f"Wrote embeddings to {output_json_path}")


# Run the face-embedding pipeline when this cell is executed
# (or when the code is run as a script).
if __name__ == "__main__":
    main()


Embedding faces for 'train' split: 100%|█████████████████████████████████████████| 16659/16659 [19:59<00:00, 13.88it/s]
Embedding faces for 'val' split: 100%|█████████████████████████████████████████████| 3503/3503 [03:39<00:00, 15.94it/s]
Embedding faces for 'test' split: 100%|████████████████████████████████████████████| 3544/3544 [03:47<00:00, 15.55it/s]


Wrote embeddings to train_val_test_split_with_clip.json


In [15]:
import os
import json
import cv2
import torch
from tqdm import tqdm
import numpy as np
from torchvision.models.detection import keypointrcnn_resnet50_fpn

In [16]:
def main(json_path="train_val_test_split_with_clip.json",
         output_json_path="train_val_test_split_with_pose.json"):
    """
    Add a 51-d body-pose vector to every record of a split JSON file.

    Steps:
    1. Read the JSON at ``json_path``: either a dict holding any of the keys
       'train', 'val', 'test' (each a list of records) or a bare top-level
       list of records.
    2. For each record that has an 'image_path' and a 4-element 'bbox'
       (used as [x1, y1, x2, y2] pixel coordinates):
       a. Crop the bbox (clamped to the image).
       b. Run Keypoint R-CNN (ResNet50-FPN) on the crop and keep the
          highest-scoring detection.
       c. Flatten (x, y, score) of its 17 COCO keypoints into a 51-d vector.
          x and y are relative to the crop, because only the crop is passed
          to the model. The score is the per-keypoint value from the
          detection's 'keypoints_scores'.
       d. Store it as a list of floats under 'pose_embedding'.
       Records that cannot be processed (missing path/bbox, unreadable image,
       empty crop, no detection) keep 'pose_embedding' = None.
    3. Write the updated structure to ``output_json_path`` in the same layout
       it was read in.

    Args:
        json_path: Path of the input JSON file.
        output_json_path: Path the updated JSON is written to.

    Raises:
        ValueError: If the JSON is neither a list of records nor a dict with
            at least one of the 'train' / 'val' / 'test' keys.
    """
    # Load JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        all_data = json.load(f)

    # Determine splits present. A bare list of records is wrapped in a
    # synthetic '' split and unwrapped again before saving.
    splits = [k for k in ('train', 'val', 'test') if k in all_data]
    is_flat_list = not splits and isinstance(all_data, list)
    if is_flat_list:
        splits = ['']
        all_data = {'': all_data}
    elif not splits:
        # Previously such input was treated as a list and failed later with an
        # unrelated TypeError; fail early with an actionable message instead.
        raise ValueError(
            f"{json_path} must be a list of records or a dict with "
            "'train', 'val' and/or 'test' keys."
        )

    # Initialize Keypoint R-CNN model. `weights="COCO_V1"` is the checkpoint
    # the deprecated `pretrained=True` resolved to.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = keypointrcnn_resnet50_fpn(weights="COCO_V1").to(device)
    model.eval()

    # Most recently decoded image. Consecutive records that point at the same
    # file (several people in one picture) reuse it instead of re-reading disk.
    cached_path, cached_img = None, None

    # Iterate over splits. Records are updated in place, so the lists inside
    # all_data do not need to be rebuilt.
    for split in splits:
        records = all_data[split]

        for rec in tqdm(records, desc=f"Pose embedding for '{split or 'all'}'"):
            rec['pose_embedding'] = None
            img_path = rec.get('image_path')
            # `or []` also covers a bbox that is present but null
            bbox = rec.get('bbox') or []
            if not img_path or len(bbox) != 4:
                continue

            if img_path != cached_path:
                cached_path, cached_img = img_path, cv2.imread(img_path)
            img = cached_img
            if img is None:  # unreadable or missing image
                continue

            # Clamp the bbox to the image; skip it if nothing is left
            x1, y1, x2, y2 = map(int, bbox)
            h, w = img.shape[:2]
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w, x2), min(h, y2)
            if x2 <= x1 or y2 <= y1:
                continue
            person_crop = img[y1:y2, x1:x2]

            # Convert BGR uint8 HxWx3 to an RGB float CxHxW tensor in [0, 1]
            rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
            img_tensor = torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0
            img_tensor = img_tensor.to(device)

            # Forward pass
            with torch.no_grad():
                outputs = model([img_tensor])

            if not outputs or outputs[0]['scores'].numel() == 0:
                continue  # no person detected in the crop

            # Pick top detection
            detection = outputs[0]
            top_idx = detection['scores'].argmax().item()

            # The third column of 'keypoints' is torchvision's visibility
            # flag, not a confidence, so combine the (x, y) columns with the
            # per-keypoint values from 'keypoints_scores'.
            kp_xy = detection['keypoints'][top_idx, :, :2]                   # [17, 2]
            kp_score = detection['keypoints_scores'][top_idx].unsqueeze(1)   # [17, 1]

            # Flatten to 51-dim vector: x0, y0, s0, x1, y1, s1, ...
            vect = torch.cat([kp_xy, kp_score], dim=1).reshape(-1).cpu().tolist()
            rec['pose_embedding'] = vect

    # Save updated JSON in the same layout it was read in
    output_data = all_data[''] if is_flat_list else all_data
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4)
    print(f"Wrote pose embeddings to {output_json_path}")


# Run the pose-embedding pipeline when this cell is executed
# (or when the code is run as a script).
if __name__ == "__main__":
    main()




Downloading: "https://download.pytorch.org/models/keypointrcnn_resnet50_fpn_coco-fc266e95.pth" to C:\Users\swani/.cache\torch\hub\checkpoints\keypointrcnn_resnet50_fpn_coco-fc266e95.pth


100%|███████████████████████████████████████████████████████████████████████████████| 226M/226M [00:17<00:00, 13.5MB/s]
Pose embedding for 'train': 100%|████████████████████████████████████████████████| 16659/16659 [41:30<00:00,  6.69it/s]
Pose embedding for 'val': 100%|████████████████████████████████████████████████████| 3503/3503 [08:35<00:00,  6.79it/s]
Pose embedding for 'test': 100%|███████████████████████████████████████████████████| 3544/3544 [08:33<00:00,  6.90it/s]


Wrote pose embeddings to train_val_test_split_with_pose.json
