In [1]:
# !pip install opencv-python
# !pip install simplejson
# !pip install fvcore
# !pip install einops

In [1]:
import os
import torch
import cv2
import json
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torchvision.io import read_video
from torch.utils.data import TensorDataset, DataLoader
from timesformer.models.vit import TimeSformer
from tqdm import tqdm 

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

使用设备: cuda


In [3]:
torch.cuda.empty_cache()  
torch.cuda.reset_peak_memory_stats() 

Preprocess the dataset

In [12]:
with open("trainning data\labels.json", "r", encoding="utf-8") as f:
    labels = json.load(f)

print(labels) 

{'take': 0, 'put': 1, 'drop': 2, 'pick': 3}


In [13]:
dataset_dir = "trainning data"

video_paths, video_labels = [], []

for action, label in labels.items():
    action_dir = os.path.join(dataset_dir, action)
    if not os.path.exists(action_dir):
        continue  # 跳过不存在的目录

    for video_file in os.listdir(action_dir):
        if video_file.endswith(".mp4"):
            video_paths.append(os.path.join(action_dir, video_file))
            video_labels.append(label)

print(f"找到 {len(video_paths)} 个训练视频")

找到 215 个训练视频


In [4]:
from torchvision import transforms

In [9]:
def compute_optical_flow(prev_frame, next_frame):
    """Compute dense optical flow between two frames using Farneback method."""
    flow = cv2.calcOpticalFlowFarneback(prev_frame, next_frame, None, 
                                        0.5, 3, 15, 3, 5, 1.2, 0)
    
    # Normalize flow to range [0, 255]
    flow_x = cv2.normalize(flow[..., 0], None, 0, 255, cv2.NORM_MINMAX)
    flow_y = cv2.normalize(flow[..., 1], None, 0, 255, cv2.NORM_MINMAX)
    
    # Stack as 3-channel (third channel can be zero)
    flow_rgb = np.stack([flow_x, flow_y, np.zeros_like(flow_x)], axis=-1)
    
    return flow_rgb.astype(np.uint8)

In [10]:
def load_video(video_path, num_frames=45, resize=(224, 224), mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]):
    cap = cv2.VideoCapture(video_path)
    frames = []
    prev_frame=None

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, resize)  # 统一尺寸
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # OpenCV 默认是 BGR，需要转换
        
        if prev_frame is not None:
            prev_frame = cv2.resize(prev_frame, resize)
            flow_frame=compute_optical_flow(prev_frame,frame)
            frames.append(flow_frame)

        prev_frame=frame

    cap.release()

    frame_count = len(frames)

    if frame_count < num_frames:
        last_frame = frames[-1] if frames else np.zeros((224, 224, 3), dtype=np.uint8)
        frames.extend([last_frame] * (num_frames - frame_count))
    else:
        frames = frames[:num_frames]


    #convert to tensor cuz NN onlu process numerical data
    #/255 normalization to [0,1]
    frames=np.array(frames,dtype=np.float32)/255.0
    
    video_tensor=torch.tensor(frames,dtype=torch.float32)
    video_tensor = video_tensor.permute(3, 0, 1, 2)  # (H, W, T, C) → (C, T, H, W)

    normalize=transforms.Normalize(mean=mean,std=std)
    
    # Apply normalization on each channel
    for t in range(video_tensor.shape[1]):  # Loop over frames
        video_tensor[:, t, :, :] = normalize(video_tensor[:, t, :, :])
    return video_tensor

In [10]:
from torchvision import transforms

video_tensor = load_video(video_paths[0]) 
print(video_tensor.shape)

torch.Size([3, 45, 224, 224])


In [14]:
video_tensors = []
for path in video_paths:
    video_tensor = load_video(path, num_frames = 45, resize = (224,224))
    video_tensors.append(video_tensor)

In [12]:
from torch.utils.data import random_split, TensorDataset
X = torch.stack(video_tensors).to(device)
y = torch.tensor(video_labels, dtype=torch.long).to(device)

dataset = TensorDataset(X, y)

train_size=int(0.8*len(dataset))
val_size=len(dataset)-train_size

train_dataset,val_dataset=random_split(dataset,[train_size,val_size])


train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=1,shuffle=False)

print(f"dataset size: {len(dataset)}, training dataset size:{len(train_dataset)},Tensor Shape: {X.shape}, 标签: {y.shape}")

dataset size: 215, training dataset size:172,Tensor Shape: torch.Size([215, 3, 45, 224, 224]), 标签: torch.Size([215])


Train model with pretrained timesformer model
use Base model

In [13]:
model = TimeSformer( img_size=224,  # Define image size
                    patch_size=16,
                    num_classes=4,  # Output for 4 classes
                    num_frames=45
                    )

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoEMLP(nn.Module):
    def __init__(self, in_features=768, hidden_features=3072, num_experts=8, top_k=2):
        super(MoEMLP, self).__init__()
        self.num_experts = num_experts
        self.top_k = top_k  # Number of experts to activate per input

        # Experts (each expert has its own MLP)
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(in_features, hidden_features),
                nn.GELU(),
                nn.Linear(hidden_features, in_features)
            ) for _ in range(num_experts)
        ])

        # Gating network (decides which experts to use)
        self.gate = nn.Linear(in_features, num_experts)

    def forward(self, x):
        batch_size, seq_len, dim = x.shape

        # Compute gating scores and select top-k experts
        gate_logits = self.gate(x)  # (batch, seq_len, num_experts)
        top_k_values, top_k_indices = torch.topk(gate_logits, self.top_k, dim=-1)  # Get top-k expert indices

        # Normalize gate weights using softmax
        gate_scores = F.softmax(top_k_values, dim=-1)

        # Compute expert outputs
        outputs = torch.zeros_like(x)
        for i in range(self.top_k):
            expert_idx = top_k_indices[..., i]
            expert_idx = expert_idx.view(-1) 
            expert_out = torch.stack([self.experts[idx](x[b]) for b, idx in zip(range(x.size(0)),expert_idx)])
            outputs += gate_scores[..., i, None] * expert_out  # Weighted sum of experts

        return outputs


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load the checkpoint
checkpoint = torch.load("TimeSformer_divST_8x32_224_K400.pyth", map_location="cuda")
pretrained_dict = checkpoint['model_state']
# Resize positional embedding to match new num_frames
pretrained_time_embed = checkpoint['model_state']["model.time_embed"]  # Shape: [1, 8, 768]

# Remove the `head` and `time_embed` layers from the checkpoint (to avoid loading mismatched layers)
pretrained_dict.pop('model.time_embed', None)
pretrained_dict.pop('model.head.weight', None)
pretrained_dict.pop('model.head.bias', None)

print(pretrained_dict.keys())
pretrained_dict={k: v for k, v in pretrained_dict.items() if "mlp.fc1" not in k or "mlp.fc2" not in k}

# Check removed keys
print("Removed keys:", [k for k in pretrained_dict if "mlp.fc1" in k or "mlp.fc2" in k])


# Load the rest of the checkpoint into the model (this will load all layers except the modified ones)
model.load_state_dict(pretrained_dict, strict=False)

# Modify classification head for 4 classes
model.head = nn.Linear(768, 4)



# Permute to [1, 768, 8] for interpolation
pretrained_time_embed = pretrained_time_embed.permute(0, 2, 1)
new_time_embed = F.interpolate(pretrained_time_embed, size=45, mode="linear", align_corners=False)
new_time_embed = new_time_embed.permute(0, 2, 1)

# Set the modified time_embed layer
model.time_embed = nn.Parameter(new_time_embed)

# Print the model architecture to verify changes
print(model)


odict_keys(['model.cls_token', 'model.pos_embed', 'model.patch_embed.proj.weight', 'model.patch_embed.proj.bias', 'model.blocks.0.norm1.weight', 'model.blocks.0.norm1.bias', 'model.blocks.0.attn.qkv.weight', 'model.blocks.0.attn.qkv.bias', 'model.blocks.0.attn.proj.weight', 'model.blocks.0.attn.proj.bias', 'model.blocks.0.temporal_norm1.weight', 'model.blocks.0.temporal_norm1.bias', 'model.blocks.0.temporal_attn.qkv.weight', 'model.blocks.0.temporal_attn.qkv.bias', 'model.blocks.0.temporal_attn.proj.weight', 'model.blocks.0.temporal_attn.proj.bias', 'model.blocks.0.temporal_fc.weight', 'model.blocks.0.temporal_fc.bias', 'model.blocks.0.norm2.weight', 'model.blocks.0.norm2.bias', 'model.blocks.0.mlp.fc1.weight', 'model.blocks.0.mlp.fc1.bias', 'model.blocks.0.mlp.fc2.weight', 'model.blocks.0.mlp.fc2.bias', 'model.blocks.1.norm1.weight', 'model.blocks.1.norm1.bias', 'model.blocks.1.attn.qkv.weight', 'model.blocks.1.attn.qkv.bias', 'model.blocks.1.attn.proj.weight', 'model.blocks.1.attn.pr

In [16]:
for block in model.model.blocks:
    block.mlp = MoEMLP(in_features=768, hidden_features=3072, num_experts=4, top_k=2)
    

In [17]:
for block in model.model.blocks:
    for param in block.mlp.parameters():
        if len(param.shape) == 2 and param.requires_grad:
            nn.init.xavier_uniform_(param)  # Xavier init for stability
        elif len(param.shape)==1 and param.requires_grad:
            nn.init.zeros_(param)


In [18]:
loss_fn=torch.nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
model.to(device)


TimeSformer(
  (model): VisionTransformer(
    (dropout): Dropout(p=0.0, inplace=False)
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
        )
        (temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (temporal_attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
   

In [19]:
epochs=20

In [20]:
# for epoch in range(epochs):
#     model.train()
    
#     progress_bar=tqdm(train_loader,desc=f'Epoch{epoch+1}')
#     for batch_videos, batch_labels in progress_bar:
#         batch_videos, batch_labels = batch_videos.to(device), batch_labels.to(device)

#         optimizer.zero_grad()
#         outputs = model(batch_videos)
#         loss = loss_fn(outputs, batch_labels)
#         loss.backward()
#         optimizer.step()

#         #total_loss += loss.item()
#         progress_bar.set_postfix({"loss": loss.item()})
#     #print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}")

# #print("训练完成！")


# model.eval()
# val_loss=0.0
# correct=0
# total=0
# predictions = []
# ground_truth = []
# with torch.no_grad():
#     for images, labels in val_loader:
#         images, labels=images.to(device),labels.to(device)

#         outputs=model(images)
#         loss=loss_fn(outputs,labels)
#         val_loss+=loss.item()

#         _, predicted=torch.max(outputs,1)
#         total+=labels.size(0)
#         correct+=(predicted==labels).sum().item()

#         progress_bar.set_postfix({'val_loss':loss.item()})


# # Average loss and accuracy
# val_loss /= len(val_loader)
# accuracy = 100 * correct / total
# print(f"Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%")


In [21]:
torch.backends.cudnn.benchmark = True

In [22]:
from torch.amp import autocast, GradScaler

accumulation_steps = 4
scaler = GradScaler()
best_loss=float('inf')
patience_count=0
patience=5

#假如混合精度和梯度累积

for epoch in range(epochs):
    model.train()
    
    progress_bar=tqdm(train_loader,desc=f'Epoch{epoch+1}')
    
    optimizer.zero_grad()
    total_loss=0.0

    #需要加一个i来计算参数更新
    for i, (batch_videos, batch_labels) in enumerate(progress_bar):
        batch_videos, batch_labels = batch_videos.to(device), batch_labels.to(device)
        
        #用混合进度计算损失，同时进行梯度缩放
        with autocast(device_type='cuda',dtype=torch.float16):
           outputs = model(batch_videos)
           loss = loss_fn(outputs, batch_labels)/ accumulation_steps
        
        
        # 反向传播
        scaler.scale(loss).backward()
        
        # 每累计4次更新一次参数
        if (i + 1) % accumulation_steps == 0 or (i+1==len(train_loader)):
           scaler.step(optimizer)
           scaler.update()
           optimizer.zero_grad()
        # 显示时回归原始损失
        total_loss+=loss.item()*accumulation_steps
        progress_bar.set_postfix({"loss": loss.item() * accumulation_steps})
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}")

    if total_loss<best_loss:
        best_loss=total_loss
        patience_count=0
        torch.save(model.state_dict(),'trained_timeSformer.pth')
    else:
        patience_count+=1
    
    if patience_count>=patience:
        print(f"Early stopping triggered after {epoch+1} epochs!")
        break



model.load_state_dict(torch.load('trained_timeSformer.pth'))

#validation
model.eval()
val_loss=0.0
correct=0
total=0

progress_bar = tqdm(val_loader, desc="Validation") 
with torch.no_grad():
    for images, labels in progress_bar:
        images, labels=images.to(device),labels.to(device)

        outputs=model(images)
        loss=loss_fn(outputs,labels)
        val_loss+=loss.item()

        _, predicted=torch.max(outputs,1)
        total+=labels.size(0)
        correct+=(predicted==labels).sum().item()

        progress_bar.set_postfix({'val_loss':loss.item()})


# Average loss and accuracy
val_loss /= len(val_loader)
accuracy = 100 * correct / total
print(f"Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%")


Epoch1: 100%|██████████| 86/86 [1:48:44<00:00, 75.87s/it, loss=1.2]  


Epoch [1/20], Loss: 123.0649


Epoch2: 100%|██████████| 86/86 [1:51:26<00:00, 77.76s/it, loss=1.24] 


Epoch [2/20], Loss: 108.9175


Epoch3: 100%|██████████| 86/86 [1:50:22<00:00, 77.00s/it, loss=1.48] 


Epoch [3/20], Loss: 108.5699


Epoch4: 100%|██████████| 86/86 [1:51:01<00:00, 77.46s/it, loss=1.81] 


Epoch [4/20], Loss: 110.9294


Epoch5: 100%|██████████| 86/86 [1:51:04<00:00, 77.49s/it, loss=1.8]  


Epoch [5/20], Loss: 99.1488


Epoch6: 100%|██████████| 86/86 [1:49:24<00:00, 76.34s/it, loss=0.31] 


Epoch [6/20], Loss: 85.8115


Epoch7: 100%|██████████| 86/86 [1:48:57<00:00, 76.02s/it, loss=0.598]


Epoch [7/20], Loss: 77.3073


Epoch8: 100%|██████████| 86/86 [1:49:28<00:00, 76.38s/it, loss=1.21] 


Epoch [8/20], Loss: 77.9404


Epoch9: 100%|██████████| 86/86 [1:48:53<00:00, 75.97s/it, loss=0.19]  


Epoch [9/20], Loss: 64.0252


Epoch10: 100%|██████████| 86/86 [1:50:37<00:00, 77.18s/it, loss=1.04]   


Epoch [10/20], Loss: 68.5458


Epoch11: 100%|██████████| 86/86 [1:50:39<00:00, 77.20s/it, loss=0.287] 


Epoch [11/20], Loss: 56.7090


Epoch12: 100%|██████████| 86/86 [1:49:57<00:00, 76.72s/it, loss=0.202] 


Epoch [12/20], Loss: 49.4753


Epoch13: 100%|██████████| 86/86 [1:50:14<00:00, 76.91s/it, loss=0.183]  


Epoch [13/20], Loss: 37.5016


Epoch14: 100%|██████████| 86/86 [1:50:14<00:00, 76.91s/it, loss=1.21]   


Epoch [14/20], Loss: 33.1151


Epoch15: 100%|██████████| 86/86 [1:48:22<00:00, 75.61s/it, loss=0.0169]  


Epoch [15/20], Loss: 35.1390


Epoch16: 100%|██████████| 86/86 [1:49:14<00:00, 76.22s/it, loss=0.0505]  


Epoch [16/20], Loss: 23.7513


Epoch17: 100%|██████████| 86/86 [1:48:38<00:00, 75.80s/it, loss=0.0681] 


Epoch [17/20], Loss: 25.4924


Epoch18: 100%|██████████| 86/86 [1:35:36<00:00, 66.71s/it, loss=0.0761]  


Epoch [18/20], Loss: 18.9526


Epoch19: 100%|██████████| 86/86 [1:32:41<00:00, 64.67s/it, loss=0.046]   


Epoch [19/20], Loss: 19.0681


Epoch20: 100%|██████████| 86/86 [1:32:18<00:00, 64.40s/it, loss=0.0717] 


Epoch [20/20], Loss: 20.4514


Validation: 100%|██████████| 43/43 [00:39<00:00,  1.10it/s, val_loss=0.387]  

Validation Loss: 0.7125, Accuracy: 72.09%





In [23]:
torch.save(model, "full_timeSformer.pth")


In [7]:
new_model = torch.load('full_timeSformer.pth',weights_only=False)
new_model

TimeSformer(
  (model): VisionTransformer(
    (dropout): Dropout(p=0.0, inplace=False)
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
        )
        (temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (temporal_attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
   

In [8]:
new_model.eval()
new_model.to('cuda' if torch.cuda.is_available() else 'cpu')

TimeSformer(
  (model): VisionTransformer(
    (dropout): Dropout(p=0.0, inplace=False)
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
        )
        (temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (temporal_attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
   

In [34]:
from torch.utils.data import random_split, TensorDataset
X = torch.stack(video_tensors).to(device)
y = torch.tensor(video_labels, dtype=torch.long).to(device)

In [25]:

test_dataset=TensorDataset(X,y)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
num_classes=4
correct = [0]*num_classes
total = [0]*num_classes

with torch.no_grad():
    for videos, labels in test_loader:
        videos, labels = videos.to(device), labels.to(device)

        outputs = new_model(videos)  # Forward pass
        _, predicted = torch.max(outputs, 1)  # Get class with highest score

        for label,pred in zip(labels,predicted):
            total[label.item()] += 1
            if label==pred:
                correct[label.item()] +=1

        print(f"Predicted: {predicted.item()}, Actual: {labels.item()}")

# Compute accuracy
class_accuracies=[100*correct[i]/total[i] if total[i]>0 else 0 for i in range(num_classes)] 
# Print accuracy for each class
for i in range(num_classes):
    print(f"Class {i} Accuracy: {class_accuracies[i]:.2f}%")



Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 1, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Actual: 0
Predicted: 0, Ac

TypeError: unsupported operand type(s) for /: 'list' and 'list'