In [1]:
import os
import torch
import cv2
import json
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torchvision.io import read_video
from torch.utils.data import TensorDataset, DataLoader
from timesformer.models.vit import TimeSformer
from tqdm import tqdm 
from torchvision import transforms
from models.timersformer_moe import MoEMLP

ModuleNotFoundError: No module named 'timesformer'

In [2]:
import os

folder = './output_7/'

# Sort files numerically (excluding extensions)
files = sorted(os.listdir(folder), key=lambda x: int(os.path.splitext(x)[0]))  # sort by number

for idx, filename in enumerate(files, start=1):
    # Ensure filename ends with .jpg (or any other image type you need)
    src = os.path.join(folder, filename)
    dst = os.path.join(folder, f'{idx:06d}.jpg')  # This will pad with zeroes (000001.jpg)
    os.rename(src, dst)

In [3]:
#define a detect human function
def detect_humans():
    detections={}
    with open("./tracking.seq", "r") as f:
        for line in f:
            frame_id, person_id, x1, y1, x2, y2 = map(int, line.strip().split(","))
            if frame_id not in detections:
                detections[frame_id] = []
            detections[frame_id].append((x1, y1, x2, y2, person_id))
    return detections

In [4]:
def apply_sliding_window(sequence,window_size=45,stride=22):
    num_frames=len(sequence)
    windows=[]

    for start in range(0, num_frames-window_size+1,stride):
        window=sequence[start:start+window_size]
        windows.append(window)

    return windows

In [5]:
def compute_optical_flow(prev_patch, next_patch):
    """Compute dense optical flow between two frames using Farneback method."""
    flow = cv2.calcOpticalFlowFarneback(prev_patch, next_patch, None, 
                                        0.5, 3, 15, 3, 5, 1.2, 0)
    
    # Normalize flow to range [0, 255]
    flow_x = cv2.normalize(flow[..., 0], None, 0, 255, cv2.NORM_MINMAX)
    flow_y = cv2.normalize(flow[..., 1], None, 0, 255, cv2.NORM_MINMAX)
    
    # # Stack as 3-channel (third channel can be zero)
    # flow_rgb = np.stack([flow_x, flow_y, np.zeros_like(flow_x)], axis=-1)
    
    return np.stack([flow_x,flow_y,np.zeros_like(flow_x)],axis=-1).astype(np.uint8)

In [None]:
# def process_frames(frame_folder,timesformer_model,num_frames=45,resize=(224,224), threshold=0.6):
#     #detections = detect_humans(video_path)  # Dictionary {frame_idx: [(x1, y1, x2, y2, person_id), ...]}
#     detections=detect_humans()

#     frame_idx=0
#     person_patches={}
#     prev_patches={}
#     frame_indices=sorted(detections.keys())
    
#     for frame_idx in frame_indices:
#         frame_path=os.path.join(frame_folder,f"{frame_idx:06}.jpg")
#         frame=cv2.imread(frame_path)

#         if frame is None:
#             print(f"Warning:Frame{frame_path} not found.")
#             continue
#         for (x1,y1,x2,y2,person_id) in detections[frame_idx]:
#             patch=frame[y1:y2,x1:x2]

#             if patch.size==0:
#                 print(f"Warning:Empty patch at frame{frame_idx},person {person_id}")
#                 continue

#             person_patch=cv2.resize(patch,resize)
#             person_patch=cv2.cvtColor(person_patch, cv2.COLOR_BGR2GRAY)

    #         if person_id in prev_patches:
    #                 prev_patch = prev_patches[person_id]
    #                 flow_patch = compute_optical_flow(prev_patch, person_patch)

    #                 if person_id not in person_patches:
    #                     person_patches[person_id] = []

    #                 person_patches[person_id].append(flow_patch)

    #         prev_patches[person_id] = person_patch


    # # Convert to tensors
    # person_tensors = {}
    # for person_id, patches in person_patches.items():
    #     if len(patches) < num_frames:
    #         last_patch = patches[-1] if patches else np.zeros((224, 224, 3), dtype=np.uint8)
    #         patches.extend([last_patch] * (num_frames - len(patches)))
    #     else:
    #         patches = patches[:num_frames]

    #     # Convert to tensor and normalize
    #     patches = np.array(patches, dtype=np.float32) / 255.0
    #     video_tensor = torch.tensor(patches, dtype=torch.float32).permute(3, 0, 1, 2)

    #     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    #     for t in range(video_tensor.shape[1]):
    #         video_tensor[:, t, :, :] = normalize(video_tensor[:, t, :, :])

    #     person_tensors[person_id] = video_tensor.unsqueeze(0)  # Add batch dim (1, C, T, H, W)

    #  # Pass through TimeSformer model
    # results = {}
    # timesformer_model = torch.load('full_timeSformer.pth',weights_only=False)
    # timesformer_model.to('cuda' if torch.cuda.is_available() else 'cpu')
    # timesformer_model.eval()
    # with torch.no_grad():
    #     for person_id, tensor in person_tensors.items():
    #         tensor = tensor.to('cuda')  # Move to GPU if available
            
    #         outputs = timesformer_model(tensor)  # Forward pass
    #         probs = torch.softmax(outputs, dim=1)
    #         predicted_label = torch.argmax(probs, dim=1).item()
    #         confidence = probs.max().item()

    #         if confidence<threshold:
    #             predicted_label=-1

    #         results[person_id] = {"action": predicted_label, "confidence": confidence}
    # return results

In [None]:
import os
import cv2
import numpy as np
import torch
import torchvision.transforms as transforms


def process_frames(frame_folder, timesformer_model, num_frames=45, resize=(224, 224), threshold=0.6):
    detections = detect_humans()  # Should return {frame_idx: [(x1, y1, x2, y2, person_id), ...]}
    
    frame_indices = sorted(detections.keys())
    person_patches = {}
    prev_patches = {}

    for frame_idx in frame_indices:
        frame_path = os.path.join(frame_folder, f"{frame_idx:06}.jpg")
        frame = cv2.imread(frame_path)

        if frame is None:
            print(f"Warning: Frame {frame_path} not found.")
            continue

        for (x1, y1, x2, y2, person_id) in detections[frame_idx]:
            patch = frame[y1:y2, x1:x2]
            if patch.size == 0:
                print(f"Warning: Empty patch at frame {frame_idx}, person {person_id}")
                continue

            # Resize and convert to grayscale
            patch = cv2.resize(patch, resize)
            gray_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)

            # Optical flow between current and previous frame
            if person_id in prev_patches:
                prev_patch = prev_patches[person_id]
                flow = compute_optical_flow(prev_patch, gray_patch)  # Output shape (H,W,2) or (H,W,3)

                if person_id not in person_patches:
                    person_patches[person_id] = []
                person_patches[person_id].append(flow)

            prev_patches[person_id] = gray_patch

    # Padding/Trimming and Tensor conversion
    person_tensors = {}
    for person_id, patches in person_patches.items():
        if len(patches) < num_frames:
            last_patch = patches[-1] if patches else np.zeros((resize[0], resize[1], 3), dtype=np.float32)
            patches.extend([last_patch] * (num_frames - len(patches)))
        else:
            patches = patches[:num_frames]

        patches = np.array(patches, dtype=np.float32) / 255.0  # Normalize to [0,1]
        video_tensor = torch.tensor(patches, dtype=torch.float32).permute(3, 0, 1, 2)  # (C,T,H,W)

        # Normalization (RGB optical flow assumed)
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        for t in range(video_tensor.shape[1]):
            video_tensor[:, t, :, :] = normalize(video_tensor[:, t, :, :])

        person_tensors[person_id] = video_tensor.unsqueeze(0)  # (1,C,T,H,W)

    # Load Model & Inference
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    timesformer_model.to(device)
    timesformer_model.eval()

    results = {}
    with torch.no_grad():
        for person_id, tensor in person_tensors.items():
            tensor = tensor.to(device)
            outputs = timesformer_model(tensor)
            probs = torch.softmax(outputs, dim=1)
            pred_label = torch.argmax(probs, dim=1).item()
            confidence = probs.max().item()

            if confidence < threshold:
                pred_label = -1  # Unknown action

            results[person_id] = {
                "action": pred_label,
                "confidence": confidence
            }

    return results


In [46]:
def process_frames(frame_folder, timesformer_model, num_frames=45, resize=(224, 224), threshold=0.6, stride=22):
    detections = detect_humans()  # {frame_idx: [(x1, y1, x2, y2, person_id), ...]}

    frame_indices = sorted(detections.keys())
    person_patches = {}
    prev_patches = {}

    for frame_idx in frame_indices:
        frame_path = os.path.join(frame_folder, f"{frame_idx:06}.jpg")
        frame = cv2.imread(frame_path)

        if frame is None:
            print(f"Warning: Frame {frame_path} not found.")
            continue

        for (x1, y1, x2, y2, person_id) in detections[frame_idx]:
            patch = frame[y1:y2, x1:x2]
            if patch.size == 0:
                print(f"Warning: Empty patch at frame {frame_idx}, person {person_id}")
                continue

            person_patch = cv2.resize(patch, resize)
            person_patch = cv2.cvtColor(person_patch, cv2.COLOR_BGR2GRAY)

            if person_id in prev_patches:
                prev_patch = prev_patches[person_id]
                flow_patch = compute_optical_flow(prev_patch, person_patch)

                if person_id not in person_patches:
                    person_patches[person_id] = []

                person_patches[person_id].append(flow_patch)

            prev_patches[person_id] = person_patch

    results = {}
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    timesformer_model.to(device)
    timesformer_model.eval()

    normalize = transforms.Normalize(mean=[0.485], std=[0.229])

    with torch.no_grad():
        for person_id, patches in person_patches.items():
            if len(patches) < num_frames:
                continue  # Not enough frames to classify

            results[person_id] = []

            for start in range(0, len(patches) - num_frames + 1, stride):
                window = patches[start:start + num_frames]
                window = np.array(window, dtype=np.float32) / 255.0  # Normalize
                # print(window.shape)

                video_tensor = torch.tensor(window) # (T, H, W, 3)
                video_tensor = video_tensor.permute(3, 0, 1, 2)  # (C, T, H, W)

                for t in range(video_tensor.shape[1]):
                    video_tensor[:, t, :, :] = normalize(video_tensor[:, t, :, :])

                video_tensor = video_tensor.unsqueeze(0).to(device)  # Add batch dim

                outputs = timesformer_model(video_tensor)
                predicted_label= torch.softmax(outputs, dim=1).argmax(dim=1).item()
                confidence = torch.softmax(outputs,dim=1).max().item()

                if confidence < threshold:
                    predicted_label = -1

                results[person_id].append({
                    "action": predicted_label,
                    "confidence": confidence,
                    "start_frame": frame_indices[start],
                    "end_frame": frame_indices[start + num_frames - 1]
                })

    return results


In [41]:
timesformer=torch.load('full_timeSformer.pth',weights_only=False)

In [47]:

results=process_frames(frame_folder="./output_7/",timesformer_model=timesformer)

In [48]:
print(results)

{1: [{'action': -1, 'confidence': 0.43191686272621155, 'start_frame': 18, 'end_frame': 62}, {'action': -1, 'confidence': 0.4689951241016388, 'start_frame': 40, 'end_frame': 92}, {'action': 3, 'confidence': 0.6006878614425659, 'start_frame': 62, 'end_frame': 114}, {'action': 3, 'confidence': 0.6938800811767578, 'start_frame': 92, 'end_frame': 136}, {'action': -1, 'confidence': 0.5224723815917969, 'start_frame': 114, 'end_frame': 158}, {'action': 3, 'confidence': 0.9364585280418396, 'start_frame': 136, 'end_frame': 180}, {'action': 3, 'confidence': 0.8982377052307129, 'start_frame': 158, 'end_frame': 202}, {'action': 3, 'confidence': 0.6278750896453857, 'start_frame': 180, 'end_frame': 224}, {'action': 2, 'confidence': 0.8564113974571228, 'start_frame': 202, 'end_frame': 246}, {'action': 3, 'confidence': 0.6683403253555298, 'start_frame': 224, 'end_frame': 268}, {'action': 0, 'confidence': 0.6907159090042114, 'start_frame': 246, 'end_frame': 290}, {'action': -1, 'confidence': 0.504008948