#### 数据预处理

In [1]:
# pytorchvideo需要修改依赖，导航到此路径： .venv\Lib\site-packages\pytorchvideo\transforms\augmentations.py
# 然后修改第9行代码 "import torchvision.transforms.functional_tensor as F_t"  为：
# "import torchvision.transforms._functional_tensor as F_t"   新增了 _ 符号

# 下载视频预训练模型：uv run modelscope download --model tiansz/videomae-base --local_dir models/videomae-base

# 数据集链接（解压后放到 VIDEO_PATH 指向的目录，其中需包含 train/val/test 三个子目录）：https://hf-mirror.com/datasets/sayakpaul/ucf101-subset/blob/main/UCF101_subset.tar.gz

In [2]:
import os
import pathlib

import evaluate
import numpy as np
import pytorchvideo.data
import torch
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
from transformers import TrainingArguments, Trainer
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import pipeline

In [3]:
# --- Training hyper-parameters ---
EPOCHS = 10  # number of passes over the training set
BATCH_SIZE = 2  # samples per device per step (used for training and evaluation)

# --- Inputs ---
VIDEO_TYPE = "avi"  # file extension used when globbing for videos
VIDEO_PATH = "../datasets/视频分类数据集"  # video dataset root (train/, val/ and test/ are globbed below it)
PRETRAINED_MODEL_NAME_OR_PATH = "../models/videomae-base"  # pretrained VideoMAE checkpoint
ACCURACY_PATH = "../common/accuracy.py"  # local metric script passed to evaluate.load

# --- Outputs ---
# NOTE: "OUPUT" is a misspelling of "OUTPUT"; the name is kept because the
# training cell refers to it under this spelling.
OUPUT_CHECKPOINT_PATH = "../models/video_classification_checkpoint"  # checkpoints written during training
OUTPUT_MODEL_PATH = "../models/video_classification_model"  # fine-tuned video-classification model

In [4]:
dataset_root_path = pathlib.Path(VIDEO_PATH)

# Collect the video files of every split; the globs expect the layout
# <root>/<split>/<class>/<video>.<VIDEO_TYPE>.
train_video_file_paths = list(dataset_root_path.glob(f"train/*/*.{VIDEO_TYPE}"))
val_video_file_paths = list(dataset_root_path.glob(f"val/*/*.{VIDEO_TYPE}"))
test_video_file_paths = list(dataset_root_path.glob(f"test/*/*.{VIDEO_TYPE}"))
print("训练集视频数量：", len(train_video_file_paths))
print("验证集视频数量：", len(val_video_file_paths))
print("测试集视频数量：", len(test_video_file_paths))
all_video_file_paths = (
    train_video_file_paths + val_video_file_paths + test_video_file_paths
)

# Map every class label to a unique integer id (and back).
# The label is the name of the directory that directly contains the video.
# Reading it from `path.parent.name`, rather than from a fixed index into the
# separator-split path string, keeps the labels correct when VIDEO_PATH has a
# different depth or is an absolute path.
class_labels = sorted({path.parent.name for path in all_video_file_paths})
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}
print("视频标签映射关系：", label2id)

训练集视频数量： 20
验证集视频数量： 6
测试集视频数量： 21
视频标签映射关系： {'BaseballPitch': 0, 'Basketball': 1}


In [5]:
# Image processor that ships with the pretrained checkpoint; it carries the
# normalisation statistics and the expected input size.
image_processor = VideoMAEImageProcessor.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

# Pretrained backbone with a video-classification head configured for our
# label mapping. `ignore_mismatched_sizes=True` is passed so that a head whose
# shape differs from the checkpoint does not abort loading.
model = VideoMAEForVideoClassification.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# Per-channel mean / std used to normalise the frames.
mean, std = image_processor.image_mean, image_processor.image_std

# Target spatial size: the processor stores either a single "shortest_edge"
# value (used for both sides here) or explicit "height" / "width" entries.
if "shortest_edge" not in image_processor.size:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
else:
    height = width = image_processor.size["shortest_edge"]
resize_to = (height, width)

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at ../models/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Frame sampling and data augmentation for the training split.
# (The validation split is built without augmentation so that it better
# reflects how the model is used in practice.)
num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30
# Clip length handed to the clip sampler: frames x sampling stride / fps
# (i.e. seconds, assuming `fps` matches the videos -- TODO confirm).
clip_duration = (num_frames_to_sample * sample_rate) / fps

# Operations applied, in order, to the "video" entry of every sample.
train_video_ops = [
    UniformTemporalSubsample(num_frames_to_sample),
    Lambda(lambda frames: frames / 255.0),  # presumably rescales 0-255 pixel values to [0, 1]
    Normalize(mean, std),
    RandomShortSideScale(min_size=256, max_size=320),
    RandomCrop(resize_to),
    RandomHorizontalFlip(p=0.5),
]
train_transform = Compose(
    [ApplyTransformToKey(key="video", transform=Compose(train_video_ops))]
)

# Training clips are drawn with the "random" clip sampler.
train_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "train"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
    decode_audio=False,
    transform=train_transform,
)

In [7]:
# Validation pipeline: same temporal subsampling and normalisation as for
# training, but a plain resize instead of the random scale / crop / flip.
val_video_ops = [
    UniformTemporalSubsample(num_frames_to_sample),
    Lambda(lambda frames: frames / 255.0),  # presumably rescales 0-255 pixel values to [0, 1]
    Normalize(mean, std),
    Resize(resize_to),
]
val_transform = Compose(
    [ApplyTransformToKey(key="video", transform=Compose(val_video_ops))]
)

# Validation clips are drawn with the "uniform" clip sampler.
val_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "val"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)

#### 模型微调

In [8]:
# Training configuration. The run length is expressed in steps through
# `max_steps`: (number of training videos // batch size) * EPOCHS.
args = TrainingArguments(
    output_dir=OUPUT_CHECKPOINT_PATH,
    # --- schedule / optimisation ---
    max_steps=(train_dataset.num_videos // BATCH_SIZE) * EPOCHS,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    fp16=True,
    # --- batching ---
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    remove_unused_columns=False,
    # --- evaluation and checkpointing (both once per epoch) ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Accuracy metric, loaded from the local script at ACCURACY_PATH.
metric = evaluate.load(ACCURACY_PATH)


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, one row
            per sample) and ``label_ids`` (the reference labels).

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        (argmax over axis 1) against the reference labels.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)


def collate_fn(examples):
    """Assemble a list of dataset samples into one model-ready batch.

    Args:
        examples: iterable of dicts, each holding a 4-D ``"video"`` tensor
            and a ``"label"`` value.

    Returns:
        dict with ``"pixel_values"`` (the stacked videos, each with its first
        two axes swapped) and ``"labels"`` (a tensor of the labels).
    """
    clips = []
    targets = []
    for sample in examples:
        # Swap the first two axes -- presumably (channels, frames, H, W) ->
        # (frames, channels, H, W), the layout the model expects; TODO confirm.
        clips.append(sample["video"].permute(1, 0, 2, 3))
        targets.append(sample["label"])
    return {"pixel_values": torch.stack(clips), "labels": torch.tensor(targets)}


# Wire model, datasets, collation and metric together and fine-tune.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=image_processor,
    compute_metrics=compute_metrics,
)
trainer.train()

# Export: cast the weights to half precision, then write the model and its
# image processor to the same directory so it can be loaded as one unit.
model.half()
model.save_pretrained(OUTPUT_MODEL_PATH)
image_processor.save_pretrained(OUTPUT_MODEL_PATH)
# Note: the epoch count shown in the progress output looks very large; this is
# harmless and can be ignored (presumably a side effect of sizing the run with
# max_steps -- TODO confirm).

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.778638,0.5
1,No log,0.547746,0.625
2,No log,0.229008,0.875
3,No log,0.257173,0.875
4,No log,2.148985,0.625
5,No log,0.725921,0.875
6,No log,0.635504,0.75
7,No log,1.357455,0.75
8,No log,1.812296,0.625
9,No log,1.846175,0.625


['../models/video_classification_model\\preprocessor_config.json']

#### 模型推理

In [10]:
# Load the exported model into a video-classification pipeline. float16 is
# requested because the weights were cast with model.half() before saving.
# (`pipeline` is imported with the other dependencies at the top of the notebook.)
video_cls = pipeline(
    task="video-classification",
    model=OUTPUT_MODEL_PATH,
    torch_dtype=torch.float16,
)

# Classify one clip from the test split. The path is derived from VIDEO_PATH
# so it follows the dataset location configured at the top of the notebook
# instead of repeating it as a literal.
sample_video_path = f"{VIDEO_PATH}/test/BaseballPitch/v_BaseballPitch_g24_c04.avi"
print(video_cls(sample_video_path))

Device set to use cpu


[{'score': 0.54931640625, 'label': 'BaseballPitch'}, {'score': 0.45068359375, 'label': 'Basketball'}]
