In [1]:
import os
import cv2
import pandas as pd
import torch
from tqdm import tqdm
import torch.nn as nn

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

Using device: cuda


In [3]:
# Paths
VIDEO_ROOT = '/content/drive/MyDrive/DAD/inner_mirror/'
ANNOTATION_CSV = '/content/drive/MyDrive/DAD/activities/inner_mirror/objectlevel.chunks_90.csv'
OUTPUT_DIR = '/content/drive/MyDrive/DAD/processed_frames/'

In [4]:
# Load annotations
df = pd.read_csv(ANNOTATION_CSV)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Loop over annotations


for idx, row in tqdm(df.iterrows(), total=len(df)):
    participant = row['participant_id']
    file_id = row['file_id']
    activity = row['activity']
    start = int(row['frame_start'])
    end = int(row['frame_end'])
    chunk_id = row['chunk_id']


    # Skip bad labels or unreadable files
    if pd.isna(file_id) or activity == 'none':
        continue

    video_path = os.path.join(VIDEO_ROOT, file_id + '.mp4')
    if not os.path.exists(video_path):
        print(f"Missing: {video_path}")
        continue

    save_dir = os.path.join(OUTPUT_DIR, activity, f"vp{participant}_{chunk_id}")
    os.makedirs(save_dir, exist_ok=True)

    # Open video
    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_POS_FRAMES, start)
    frame_idx = 0

    for fnum in range(start, end + 1):
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (224, 224))
        out_path = os.path.join(save_dir, f"frame_{frame_idx:03d}.jpg")
        cv2.imwrite(out_path, frame)
        frame_idx += 1

    cap.release()

  0%|          | 1/9969 [00:08<23:25:56,  8.46s/it]


KeyboardInterrupt: 

In [7]:
import os
import torch
import pandas as pd
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# Paths
CSV_PATH = "/content/drive/MyDrive/DAD/clip_index.csv"
SOURCE_ROOT = "/content/drive/MyDrive/DAD/processed_frames"
DEST_ROOT = "/content/DAD_tensor_clips1"
FRAMES_PER_CLIP = 32
IMAGE_SIZE = 112

#Load clip list
df = pd.read_csv(CSV_PATH)
to_tensor = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor()
])

#Create target root
os.makedirs(DEST_ROOT, exist_ok=True)

failed = []

#Convert each clip
for idx, row in tqdm(df.iterrows(), total=len(df)):
    label = row['label']
    orig_path = row['clip_path']
    clip_name = os.path.basename(orig_path)
    source_folder = os.path.join(SOURCE_ROOT, label, clip_name)
    dest_path = os.path.join(DEST_ROOT, label, clip_name + ".pt")

    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

    try:
        frame_files = sorted([
            f for f in os.listdir(source_folder)
            if f.endswith(".jpg")
        ])[:FRAMES_PER_CLIP]

        frames = []
        for fname in frame_files:
            img = Image.open(os.path.join(source_folder, fname)).convert("RGB")
            img = to_tensor(img)
            frames.append(img)

        if not frames:
            raise ValueError("No valid frames")

        # Pad if fewer frames
        while len(frames) < FRAMES_PER_CLIP:
            frames.append(frames[-1])

        clip_tensor = torch.stack(frames)  # [T, C, H, W]
        torch.save(clip_tensor, dest_path)
    except Exception as e:
        print(f"[ERROR] Failed: {source_folder} â†’ {e}")
        failed.append(source_folder)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 695/695 [1:04:50<00:00,  5.60s/it]


In [8]:
import os
import pandas as pd

ROOT = "/content/drive/MyDrive/DAD/processed_frames/"
data = []

for activity in os.listdir(ROOT):
    act_dir = os.path.join(ROOT, activity)
    if not os.path.isdir(act_dir):
        continue
    for clip in os.listdir(act_dir):
        clip_path = os.path.join(act_dir, clip)
        if os.path.isdir(clip_path):
            data.append([clip_path, activity])

df_clips = pd.DataFrame(data, columns=["clip_path", "label"])
df_clips.to_csv("/content/drive/MyDrive/DAD/clip_index.csv", index=False)
print("Saved: clip_index.csv")

# Replace frame-folder paths with .pt paths
df_clips['clip_path'] = df_clips.apply(
    lambda row: os.path.join(DEST_ROOT, row['label'], os.path.basename(row['clip_path']) + ".pt"),
    axis=1
)

# Optional: Save updated CSV
df_clips.to_csv("/content/clip_index_pt.csv", index=False)

print(df_clips)


âœ… Saved: clip_index.csv
                                            clip_path         label
0    /content/DAD_tensor_clips1/reaching_for/vp1_0.pt  reaching_for
1    /content/DAD_tensor_clips1/reaching_for/vp1_1.pt  reaching_for
2    /content/DAD_tensor_clips1/reaching_for/vp2_0.pt  reaching_for
3    /content/DAD_tensor_clips1/reaching_for/vp2_1.pt  reaching_for
4    /content/DAD_tensor_clips1/reaching_for/vp3_0.pt  reaching_for
..                                                ...           ...
690      /content/DAD_tensor_clips1/closing/vp12_3.pt       closing
691      /content/DAD_tensor_clips1/closing/vp12_4.pt       closing
692      /content/DAD_tensor_clips1/closing/vp14_0.pt       closing
693      /content/DAD_tensor_clips1/closing/vp14_1.pt       closing
694      /content/DAD_tensor_clips1/closing/vp14_2.pt       closing

[695 rows x 2 columns]


In [41]:
# Define once globally
CLASS_NAMES = ['closing', 'interacting', 'opening', 'placing_moving_to', 'reaching_for', 'retracting_from']
label2id = {name: i for i, name in enumerate(CLASS_NAMES)}
id2label = {i: name for name, i in label2id.items()}

df_clips['label_idx'] = df_clips['label'].map(label2id)

In [42]:
from torch.utils.data import Dataset

class ClipTensorDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        label = label2id[row['label']]  # now fixed globally
        tensor = torch.load(row['clip_path'])  # [T, C, H, W]
        return tensor.permute(1, 0, 2, 3), label



In [43]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader


# Stratify on integer label
df_train, df_val = train_test_split(df_clips, test_size=0.2, stratify=df_clips['label_idx'], random_state=42)

#Dataset definition should use 'label_idx', not 'label'
train_ds = ClipTensorDataset(df_train)
val_ds = ClipTensorDataset(df_val)

#Normalization for visual models
transform = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

#Weighted sampler on label_idx (int class labels)
class_counts = df_train['label_idx'].value_counts()
class_weights = 1. / class_counts
weights = df_train['label_idx'].map(class_weights).values

sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

#Dataloaders
train_loader = DataLoader(train_ds, batch_size=4, sampler=sampler, num_workers=4)
val_loader = DataLoader(val_ds, batch_size=4, shuffle=False, num_workers=4)

In [44]:
import torchvision.models.video as video_models

NUM_CLASSES = df['label'].nunique()

model = video_models.r3d_18(pretrained=False)  # or pretrained=True for Kinetics
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
model = model.to(device)




In [45]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [46]:
from tqdm import tqdm

def evaluate(model, loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for clips, labels in loader:
            clips, labels = clips.to(device), labels.to(device)
            outputs = model(clips)
            _, preds = outputs.max(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

def train(model, epochs):
    for epoch in range(epochs):
        model.train()
        total, correct = 0, 0
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for clips, labels in loop:
            clips, labels = clips.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(clips)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            _, preds = outputs.max(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            loop.set_postfix(train_acc=correct/total)

        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1} | Val Acc: {val_acc:.4f}")


In [47]:
print("Loading one batch for debug...")
for clips, labels in train_loader:
    print(f"[DEBUG] Loaded batch of shape: {clips.shape}")
    print(f"[DEBUG] Labels: {labels}")
    break  # just test the first batch

Loading one batch for debug...
[DEBUG] Loaded batch of shape: torch.Size([4, 3, 32, 112, 112])
[DEBUG] Labels: tensor([5, 5, 4, 4])


In [49]:
train(model, epochs=8)


Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.09it/s, train_acc=0.358]


Epoch 1 | Val Acc: 0.5468


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.02it/s, train_acc=0.556]


Epoch 2 | Val Acc: 0.5612


Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.22it/s, train_acc=0.7]


Epoch 3 | Val Acc: 0.2230


Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.18it/s, train_acc=0.82]


Epoch 4 | Val Acc: 0.6691


Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.10it/s, train_acc=0.867]


Epoch 5 | Val Acc: 0.7266


Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.23it/s, train_acc=0.921]


Epoch 6 | Val Acc: 0.6906


Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.23it/s, train_acc=0.892]


Epoch 7 | Val Acc: 0.6835


Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 139/139 [00:09<00:00, 14.27it/s, train_acc=0.962]


Epoch 8 | Val Acc: 0.7626


In [51]:
import pandas as pd
import os, cv2, torch
from tqdm import tqdm
from torchvision import transforms

# === CONFIG ===
CSV_PATH = "/content/drive/MyDrive/DAD/clip_index.csv"
FRAMES_DIR = "/content/drive/MyDrive/DAD/processed_frames"
OUT_VIDEO = "/content/annotated_from_frames1.mp4"
MAX_FRAMES = 2000
FPS = 5
FRAME_SIZE = (112, 112)

# === Label Mapping ===
CLASS_NAMES = ['closing', 'interacting', 'opening', 'placing_moving_to', 'reaching_for', 'retracting_from']
label2id = {name: i for i, name in enumerate(CLASS_NAMES)}
id2label = {i: name for name, i in label2id.items()}

# === Model should already be loaded ===
model.eval()

# === Transform for inference ===
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# === Prediction helper ===
def predict_clip(frames):
    frames_tensor = [transform(f) for f in frames]
    clip = torch.stack(frames_tensor).permute(1, 0, 2, 3).unsqueeze(0).to(device)  # [1, C, T, H, W]
    with torch.no_grad():
        output = model(clip)
        probs = torch.softmax(output, dim=1).squeeze().cpu().numpy()
        top_idx = probs.argmax()
        top_label = id2label[top_idx]
        print(dict(zip(CLASS_NAMES, probs.round(3))))  # Optional debug
        return top_label, probs[top_idx]

# === Annotate video ===
df = pd.read_csv(CSV_PATH)

# Make sure ground truth labels are interpreted correctly
if 'label_idx' not in df.columns:
    df['label_idx'] = df['label'].map(label2id)

df_sample = df.sample(5, random_state=246).reset_index(drop=True)

video_writer = None
frame_count = 0

for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    if frame_count >= MAX_FRAMES:
        break

    clip_path = row['clip_path']
    gt_label_idx = row['label_idx']
    gt_label = id2label[gt_label_idx]

    full_path = os.path.join(FRAMES_DIR, clip_path)
    frame_files = sorted(os.listdir(full_path))
    rgb_frames = []

    for f in frame_files:
        img = cv2.imread(os.path.join(full_path, f))
        if img is None:
            continue
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        rgb_frames.append(img_rgb)

    if len(rgb_frames) == 0:
        continue

    pred_label, pred_conf = predict_clip(rgb_frames)

    for img in rgb_frames[::2]:  # annotate every 2nd frame
        if frame_count >= MAX_FRAMES:
            break

        frame = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        cv2.putText(frame, f"GT: {gt_label}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,255,0), 2)
        cv2.putText(frame, f"Pred: {pred_label} ({pred_conf:.2f})", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,100,255), 2)

        if video_writer is None:
            h, w = frame.shape[:2]
            video_writer = cv2.VideoWriter(OUT_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), FPS, (w, h))

        video_writer.write(frame)
        frame_count += 1

if video_writer:
    video_writer.release()

print(f"ðŸ“¼ Saved video (limited to {MAX_FRAMES} frames): {OUT_VIDEO}")


 40%|â–ˆâ–ˆâ–ˆâ–ˆ      | 2/5 [01:00<01:14, 24.86s/it]

{'closing': np.float32(0.003), 'interacting': np.float32(0.655), 'opening': np.float32(0.0), 'placing_moving_to': np.float32(0.004), 'reaching_for': np.float32(0.338), 'retracting_from': np.float32(0.0)}
{'closing': np.float32(0.004), 'interacting': np.float32(0.808), 'opening': np.float32(0.0), 'placing_moving_to': np.float32(0.034), 'reaching_for': np.float32(0.153), 'retracting_from': np.float32(0.0)}


 60%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ    | 3/5 [01:22<00:46, 23.43s/it]

{'closing': np.float32(0.004), 'interacting': np.float32(0.605), 'opening': np.float32(0.0), 'placing_moving_to': np.float32(0.018), 'reaching_for': np.float32(0.373), 'retracting_from': np.float32(0.0)}


 80%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  | 4/5 [02:11<00:33, 33.63s/it]

{'closing': np.float32(0.001), 'interacting': np.float32(0.833), 'opening': np.float32(0.0), 'placing_moving_to': np.float32(0.002), 'reaching_for': np.float32(0.164), 'retracting_from': np.float32(0.0)}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [02:56<00:00, 35.32s/it]

{'closing': np.float32(0.003), 'interacting': np.float32(0.859), 'opening': np.float32(0.0), 'placing_moving_to': np.float32(0.012), 'reaching_for': np.float32(0.126), 'retracting_from': np.float32(0.0)}
ðŸ“¼ Saved video (limited to 2000 frames): /content/annotated_from_frames1.mp4



