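This notebook builds a stacking ensemble of the trained Conv3D and TCN gesture-recognition models: their predicted class probabilities are used as input features for a logistic-regression meta-model.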

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms

class VideoDataset(Dataset):
    """Map-style dataset of video clips and labels stored in a single ``.npz`` archive.

    Every key of the archive is read as one sample: a dictionary (wrapped in an
    object array) with a ``"video"`` entry holding the frames and a ``"label"``
    entry that can be converted to ``int``.
    """

    def __init__(self, npz_path, transform=None, target_size=(224, 224)):
        """
        Open the archive and remember the preprocessing options.

        Args:
            npz_path (str): Path to the .npz file.
            transform (callable, optional): Applied to each frame tensor after
                it has been resized and converted with ``ToTensor``.
            target_size (tuple): Target frame size (height, width) for resizing.
        """
        # NOTE(security): allow_pickle=True is needed because every entry is a
        # pickled dictionary, but unpickling can execute arbitrary code.
        # Only open archives from a trusted source.
        self.data = np.load(npz_path, allow_pickle=True)
        self.keys = list(self.data.keys())  # One key per sample
        self.transform = transform
        self.target_size = target_size

    def __len__(self):
        """Return the number of samples (keys) in the archive."""
        return len(self.keys)

    def __getitem__(self, idx):
        """
        Return ``(video, label)`` for the sample at position ``idx``.

        ``video`` is the stacked frame tensor with its first two axes swapped,
        i.e. [T, C, H, W] -> [C, T, H, W]; ``label`` is a scalar ``torch.long``
        tensor.
        """
        key = self.keys[idx]
        item = self.data[key].item()  # Unwrap the dictionary from the object array
        video = item["video"]
        label = item["label"]

        # Per-frame preprocessing yields [T, C, H, W]; move channels to the front
        video = self._process_video(video)
        video = video.permute(1, 0, 2, 3)

        label = torch.tensor(int(label), dtype=torch.long)

        return video, label

    def _process_video(self, video):
        """
        Resize every frame to ``target_size``, convert it to a tensor, apply the
        optional transform, and stack all frames into one tensor.
        """
        # Build the converters once per clip instead of once per frame
        to_pil = transforms.ToPILImage()
        to_tensor = transforms.ToTensor()

        # target_size is (height, width), whereas PIL's resize expects
        # (width, height); swap so non-square targets come out as documented.
        height, width = self.target_size
        pil_size = (width, height)

        processed_frames = []
        for frame in video:
            frame = to_pil(frame)           # NumPy array -> PIL image
            frame = frame.resize(pil_size)  # Resize to target size
            frame = to_tensor(frame)        # PIL image -> tensor
            if self.transform:
                frame = self.transform(frame)
            processed_frames.append(frame)

        return torch.stack(processed_frames)


In [2]:
# Augmentation pipeline kept for experiments; it is not passed to either dataset below.
spatial_transform = transforms.Compose(
    [
        # Random crop, resized to the target size
        transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.0)),
        # transforms.RandomRotation(degrees=10),  # random rotation
        # Normalization
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Training split: frames are only normalized
# (pass transform=spatial_transform instead to enable augmentation).
train_dataset = VideoDataset(
    "./gesture-recognition-dataset/train_data_combined.npz",
    target_size=(224, 224),
    transform=transforms.Normalize(mean=[0.5], std=[0.5]),
)

# Validation split: same normalization, no augmentation.
val_dataset = VideoDataset(
    "./gesture-recognition-dataset/val_data_combined.npz",
    target_size=(224, 224),
    transform=transforms.Normalize(mean=[0.5], std=[0.5]),
)

print(f"Dataset size: {len(train_dataset)}")

# Sanity check: look at the first sample of each split
for dataset in (train_dataset, val_dataset):
    video, label = dataset[0]
    print(f"Video shape: {video.shape}, Label: {label}")

Dataset size: 663
Video shape: torch.Size([3, 30, 224, 224]), Label: 0
Video shape: torch.Size([3, 30, 224, 224]), Label: 0


In [3]:
from torch.utils.data import DataLoader
# Both loaders use the same batch size and load in the main process.
loader_kwargs = dict(batch_size=4, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

# Print the shapes of the first batch from each loader, then stop iterating.
for loader in (train_loader, val_loader):
    for batch in loader:
        inputs, targets = batch[0], batch[1]
        print(inputs.shape)   # input data
        print(targets.shape)  # label
        break


torch.Size([4, 3, 30, 224, 224])
torch.Size([4])
torch.Size([4, 3, 30, 224, 224])
torch.Size([4])


In [17]:
import torch
import torch.nn as nn

class FlexibleConv3DModel(nn.Module):
    """3D-convolutional classifier with a configurable number of conv stages.

    Each stage is Conv3d (3x3x3, padding 1) -> ReLU -> MaxPool3d, and the
    channel count doubles from one stage to the next. The linear classifier is
    created on the first forward pass, once the flattened feature size is known.
    """

    def __init__(self, num_classes=5, num_conv_layers=3, initial_channels=16, use_dropout=False,dropout_rate=0.5):
        """
        Args:
            num_classes: Number of output classes for classification.
            num_conv_layers: Number of convolutional stages in the model.
            initial_channels: Number of output channels for the first conv layer.
            use_dropout: If True, append Dropout3d to every stage whose index is
                greater than ``num_conv_layers // 2``.
            dropout_rate: Drop probability used by those Dropout3d layers and by
                the dropout applied to the flattened features in front of the
                classifier (the latter is applied regardless of ``use_dropout``).
        """
        super().__init__()
        self.num_conv_layers = num_conv_layers
        self.layers = nn.ModuleList()
        self.dropout_rate = dropout_rate

        in_channels = 3  # RGB input channels
        out_channels = initial_channels

        for i in range(num_conv_layers):
            self.layers.append(
                nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3, 3), stride=1, padding=1)
            )
            self.layers.append(nn.ReLU())
            if i == 0:
                # First stage halves only the two spatial axes, not the first (temporal) one
                self.layers.append(nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)))
            else:
                self.layers.append(nn.MaxPool3d(kernel_size=(2, 2, 2), stride=2))
            if use_dropout and i > num_conv_layers // 2:
                self.layers.append(nn.Dropout3d(p=dropout_rate))
            in_channels = out_channels
            out_channels *= 2  # Double channels after each stage

        # Dropout for the classifier input. It is a separate module (not part of
        # `self.fc`) so that it acts on the features rather than on the logits,
        # follows train()/eval() from construction time on, and leaves the
        # state-dict keys of the head (`fc.0.weight`, `fc.0.bias`) unchanged.
        self.fc_dropout = nn.Dropout(p=dropout_rate)

        self.fc = None  # Placeholder; built in forward() from the observed feature size
        self.num_classes = num_classes

    def forward(self, x):
        """Run the conv stages, flatten, apply dropout, and return class logits."""
        for layer in self.layers:
            x = layer(x)

        # Flatten everything except the batch axis
        x = torch.flatten(x, start_dim=1)

        if self.fc is None:
            # The head's parameters only exist from this point on: run one batch
            # through the model before building an optimizer or loading a
            # checkpoint that contains `fc.*` entries.
            self.fc = nn.Sequential(
                nn.Linear(x.size(1), self.num_classes)
            ).to(x.device)

        x = self.fc_dropout(x)  # Dropout before the final layer
        return self.fc(x)
conv3d = FlexibleConv3DModel(num_classes=5, num_conv_layers=5, initial_channels=16, use_dropout=True,dropout_rate=0.5)
# The classifier head (`fc`) is only created during the first forward pass, so
# push one dummy batch through the network before loading the checkpoint.
dummy_input = torch.randn(4, 3, 30, 224, 224)
with torch.no_grad():  # Shape probe only: no autograd graph is needed for this pass
    conv3d(dummy_input)
conv3d_path = "./checkpoints/Conv3D.pth"
# NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
conv3d.load_state_dict(torch.load(conv3d_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [18]:
from TCN import TCNWithResNet
# Build the TCN base model and restore its weights on the CPU.
tcn_path = "./checkpoints/tcn.pth"
tcn_model = TCNWithResNet(in_channels=512, num_classes=5)
tcn_state = torch.load(tcn_path, map_location=torch.device('cpu'))
tcn_model.load_state_dict(tcn_state)



<All keys matched successfully>

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.utils.data import Subset

def generate_meta_features(models, loader, device):
    """
    Build the meta-model inputs: the softmax outputs of every base model,
    concatenated side by side, together with the true labels.

    All models are moved to ``device`` and switched to eval mode; they are left
    in that state when the function returns.

    Args:
        models (list): List of base models.
        loader: Iterable yielding ``(videos, labels)`` batches.
        device: PyTorch device (cuda or cpu).
    Returns:
        X_meta: Array with one row per sample, holding the probabilities of all
            models stacked horizontally in the order of ``models``.
        y_true: Array of true labels.
    """
    # Prepare every base model for inference on the target device
    for base_model in models:
        base_model.to(device)
        base_model.eval()

    feature_batches = []
    collected_labels = []

    with torch.no_grad():
        for videos, labels in loader:
            videos = videos.to(device)

            # One probability matrix per base model for this batch
            per_model_probs = [
                F.softmax(base_model(videos), dim=1).cpu().numpy()
                for base_model in models
            ]

            # Side-by-side concatenation: columns of model 0, then model 1, ...
            feature_batches.append(np.hstack(per_model_probs))
            collected_labels.extend(labels.cpu().numpy())

    return np.vstack(feature_batches), np.array(collected_labels)

def stacking_with_partial_train(models, train_loader, val_loader, train_percent=0.2, seed=None):
    """
    Train a meta-model using a subset of the training data and evaluate on validation data.

    A random subset of ``train_loader.dataset`` is pushed through the base
    models, a logistic-regression meta-model is fitted on the resulting
    probabilities, and accuracy / one-vs-rest AUC on the validation set are
    printed.

    NOTE(review): if the base models were themselves trained on this training
    set, their probabilities on it are likely over-confident compared with
    unseen data — confirm whether a held-out split should be used instead.

    Args:
        models (list): List of trained base models.
        train_loader: DataLoader for the full training dataset.
        val_loader: DataLoader for the validation dataset.
        train_percent (float): Fraction of the training data, in (0, 1], used
            for training the meta-model.
        seed (int, optional): Seed for the subset selection. With the default
            ``None`` the global NumPy random state is used, as before.

    Returns:
        The fitted ``LogisticRegression`` meta-model.

    Raises:
        ValueError: If ``train_percent`` is outside (0, 1] or selects no sample.
    """
    if not 0 < train_percent <= 1:
        raise ValueError(f"train_percent must be in (0, 1], got {train_percent!r}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 1: Randomly select a subset of train data
    full_train_set = train_loader.dataset
    n_total = len(full_train_set)
    n_selected = int(n_total * train_percent)
    if n_selected < 1:
        raise ValueError(
            f"train_percent={train_percent!r} selects no samples from a dataset of size {n_total}"
        )

    if seed is None:
        train_indices = np.random.choice(n_total, n_selected, replace=False)
    else:
        train_indices = np.random.default_rng(seed).choice(n_total, n_selected, replace=False)

    # The subset is already random and is only used for inference, so there is
    # no need to shuffle it again.
    partial_train_loader = torch.utils.data.DataLoader(
        Subset(full_train_set, train_indices.tolist()),
        batch_size=train_loader.batch_size,
        shuffle=False
    )

    print(f"Using {len(train_indices)} samples ({train_percent * 100:.0f}%) from training data for meta-model.")

    # Step 2: Generate meta-features for partial train and validation data
    print("Generating meta-features for partial train set...")
    X_train_meta, y_train_meta = generate_meta_features(models, partial_train_loader, device)

    print("Generating meta-features for validation set...")
    X_val_meta, y_val_meta = generate_meta_features(models, val_loader, device)

    # Step 3: Train meta-model (Logistic Regression)
    # NOTE(review): newer scikit-learn releases deprecate the `multi_class`
    # argument — confirm against the installed version.
    print("Training meta-model...")
    meta_model = LogisticRegression(max_iter=1000, multi_class="ovr", solver="lbfgs")
    meta_model.fit(X_train_meta, y_train_meta)

    # Step 4: Evaluate on validation set
    y_val_pred = meta_model.predict(X_val_meta)
    y_val_proba = meta_model.predict_proba(X_val_meta)

    # Compute accuracy and AUC
    acc = accuracy_score(y_val_meta, y_val_pred)
    auc = roc_auc_score(y_val_meta, y_val_proba, multi_class="ovr")

    print("\nMeta-Model Evaluation on Validation Set:")
    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation AUC: {auc:.4f}")

    # Hand the fitted meta-model back so it can be reused for inference
    return meta_model

# Fit the stacking meta-model on a 20% sample of the training data,
# using the Conv3D and TCN networks as base models.
models = [conv3d, tcn_model]
stacking_with_partial_train(
    models=models,
    train_loader=train_loader,
    val_loader=val_loader,
    train_percent=0.2,
)


Using 132 samples (20%) from training data for meta-model.
Generating meta-features for partial train set...
Generating meta-features for validation set...
Training meta-model...

Meta-Model Evaluation on Validation Set:
Validation Accuracy: 0.9700
Validation AUC: 0.9971
