# TimeSformer Exercise Prediction

This notebook fine-tunes a pretrained TimeSformer model to classify exercises from video data.
Data is located in `data/`: videos are stored as `data/films/<id>.mp4` and per-frame labels as `data/<id>.csv`.
`split.csv` assigns each recording `id` to either the training or the testing set.

In [29]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TimesformerForVideoClassification, VideoMAEImageProcessor, TrainingArguments, Trainer
from transformers import AutoImageProcessor, AutoModelForVideoClassification
import decord
from decord import VideoReader, cpu
from PIL import Image
from sklearn.metrics import accuracy_score

# Select decord's 'torch' bridge so frames read through VideoReader are handed
# back as torch tensors (presumably instead of decord's own array type — the
# dataset code further down still copes with either).
decord.bridge.set_bridge('torch')

## Configuration

In [30]:
# Root folder holding the per-recording label files (<id>.csv).
DATA_DIR = 'data'
# Folder holding the video files (<id>.mp4).
FILMS_DIR = os.path.join(DATA_DIR, 'films')
# CSV with 'id' and 'split' columns; resolved relative to the working
# directory, not to DATA_DIR.
SPLIT_FILE = 'split.csv'
# Pretrained checkpoint used for both the image processor and the model.
MODEL_CKPT = "facebook/timesformer-base-finetuned-k400"
# Per-device batch size used for training and for evaluation.
BATCH_SIZE = 4
NUM_FRAMES = 8 # Number of frames to sample per clip
# NOTE(review): RESIZE_TO is not referenced in the visible cells; resizing is
# presumably left to the image processor — confirm before relying on this value.
RESIZE_TO = 224

## Data Preprocessing

We need to parse the per-recording CSV files to identify labeled segments (clips).
Each row in a CSV corresponds to one video frame. Consecutive rows that share the same valid label form one clip; rows labeled `-1` are background and are skipped.

In [31]:
def parse_intervals(csv_path):
    """
    Parse a per-frame label CSV into contiguous labeled intervals.

    The file is read without a header. The row position is used as the frame
    index and the third column (index 2) as the label. Rows whose label is
    -1 or missing (NaN) are treated as background: they are never part of an
    interval and they terminate the interval that precedes them.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        A list of dicts {'label': label, 'start_frame': start, 'end_frame': end}
        in file order. 'start_frame' and 'end_frame' are inclusive 0-based row
        indices. An empty file yields an empty list.
    """
    try:
        df = pd.read_csv(csv_path, header=None)
    except pd.errors.EmptyDataError:
        return []

    intervals = []
    if len(df) == 0:
        return intervals

    def make_interval(label, start, end):
        return {'label': label, 'start_frame': start, 'end_frame': end}

    labels = df.iloc[:, 2].values

    current_label = None
    start_frame = None

    for i, label in enumerate(labels):
        # NaN never compares equal to anything (itself included), so without
        # the explicit check every missing label would pass as "valid" and be
        # emitted as its own one-frame interval with a NaN label.
        is_background = pd.isna(label) or label == -1

        if is_background:
            if current_label is not None:
                # The running interval ended on the previous row.
                intervals.append(make_interval(current_label, start_frame, i - 1))
                current_label = None
                start_frame = None
            continue

        if current_label is None:
            # First valid row after background: open a new interval.
            current_label = label
            start_frame = i
        elif label != current_label:
            # Label switched without a background gap: close and reopen.
            intervals.append(make_interval(current_label, start_frame, i - 1))
            current_label = label
            start_frame = i

    # The file may end while an interval is still open.
    if current_label is not None:
        intervals.append(make_interval(current_label, start_frame, len(labels) - 1))

    return intervals

def prepare_dataset(split_file):
    """
    Build the train and test clip lists described by the split file.

    The split file must provide an 'id' and a 'split' column. For every id the
    label CSV (DATA_DIR/<id>.csv) and the video (FILMS_DIR/<id>.mp4) are looked
    up; ids with either file missing are reported and skipped. Each labeled
    interval found in the CSV becomes one clip dict with the keys
    'video_path', 'start_frame', 'end_frame' and 'label'.

    Returns:
        (train_clips, test_clips, labels) where labels is the sorted list of
        all distinct labels seen. Rows whose split is anything other than
        'train' end up in test_clips.
    """
    split_table = pd.read_csv(split_file)

    clips_by_split = {'train': [], 'test': []}
    seen_labels = set()

    print(f"Processing {len(split_table)} files...")

    for _, entry in split_table.iterrows():
        recording_id = entry['id']
        # Everything that is not explicitly marked 'train' goes to the test list.
        target = clips_by_split['train' if entry['split'] == 'train' else 'test']

        labels_csv = os.path.join(DATA_DIR, f"{recording_id}.csv")
        video_file = os.path.join(FILMS_DIR, f"{recording_id}.mp4")

        both_present = os.path.exists(labels_csv) and os.path.exists(video_file)
        if not both_present:
            print(f"Missing file for ID: {recording_id}")
            continue

        for segment in parse_intervals(labels_csv):
            seen_labels.add(segment['label'])
            target.append({
                'video_path': video_file,
                'start_frame': segment['start_frame'],
                'end_frame': segment['end_frame'],
                'label': segment['label'],
            })

    return clips_by_split['train'], clips_by_split['test'], sorted(seen_labels)

# Build the clip lists once for the whole notebook; unique_labels is what the
# label <-> class-index mappings are derived from in the following cell.
train_clips, test_clips, unique_labels = prepare_dataset(SPLIT_FILE)
print(f"Train clips: {len(train_clips)}, Test clips: {len(test_clips)}")
print(f"Unique labels: {unique_labels}")

Processing 60 files...
Missing file for ID: 027_e84uoke1
Train clips: 3165, Test clips: 3179
Unique labels: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [32]:
# Lookup tables between raw labels and contiguous class indices.
# Labels are cast to plain Python ints (they may arrive as numpy scalars)
# so that both mappings stay JSON-serializable.
label2id = dict(
    (int(raw_label), class_index)
    for class_index, raw_label in enumerate(unique_labels)
)
id2label = dict((class_index, raw_label) for raw_label, class_index in label2id.items())
print("Label Mappings:", label2id)

Label Mappings: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15}


## Dataset Class

In [33]:
class ExerciseDataset(Dataset):
    """
    Torch dataset that turns labeled clip descriptions into model inputs.

    Each clip is a dict with the keys 'video_path', 'start_frame', 'end_frame'
    and 'label'. For every item, `num_frames` frames are sampled evenly from
    the clip's (inclusive) frame range, run through the processor and returned
    together with the class index of the clip's label.
    """

    def __init__(self, clips, processor, num_frames=8, label2id=None):
        """
        Args:
            clips: Sequence of clip dicts (see class docstring).
            processor: Callable invoked as processor(frames, return_tensors="pt");
                its result must expose a `pixel_values` tensor.
            num_frames: Number of frames sampled per clip.
            label2id: Optional mapping from clip label to class index. When
                omitted, the module-level `label2id` is looked up at
                item-access time.
        """
        self.clips = clips
        self.processor = processor
        self.num_frames = num_frames
        self.label2id = label2id

    def __len__(self):
        """Return the number of clips."""
        return len(self.clips)

    def __getitem__(self, idx):
        """
        Return {'pixel_values': tensor, 'labels': tensor} for clip `idx`.

        If the clip's video cannot be opened (or contains no frames) the
        error is printed and the next clip is tried instead, wrapping around
        the dataset at most once.

        Raises:
            IndexError: if `idx` is out of range.
            RuntimeError: if no clip in the dataset could be read.
        """
        # Index the sequence directly first so an out-of-range idx still
        # raises IndexError instead of being wrapped by the modulo below.
        clip = self.clips[idx]

        # Bounded retry: a recursive fallback would never terminate when
        # every video in the dataset is unreadable.
        for _ in range(len(self)):
            video_path = clip['video_path']
            try:
                vr = VideoReader(video_path, ctx=cpu(0))
                if len(vr) == 0:
                    raise ValueError("video contains no frames")
            except Exception as e:
                print(f"Error reading {video_path}: {e}")
                idx = (idx + 1) % len(self)
                clip = self.clips[idx]
                continue
            return self._build_item(vr, clip)

        raise RuntimeError("None of the clips in the dataset could be read")

    def _build_item(self, vr, clip):
        """Sample frames of `clip` from the opened reader `vr` and build the item dict."""
        total_frames = len(vr)

        # Clamp the labeled range to the frames that actually exist in the video.
        start_f = max(0, min(clip['start_frame'], total_frames - 1))
        end_f = max(0, min(clip['end_frame'], total_frames - 1))

        if start_f >= end_f:
            # Degenerate range: repeat the single available frame.
            indices = [start_f] * self.num_frames
        else:
            # Sample indices evenly across the inclusive range.
            indices = np.linspace(start_f, end_f, self.num_frames).astype(int)

        video = vr.get_batch(indices)
        # The batch type depends on the active decord bridge; handle both the
        # `asnumpy()` and the `numpy()` flavour.
        if hasattr(video, 'asnumpy'):
            video = video.asnumpy()
        else:
            video = video.numpy()

        inputs = self.processor(list(video), return_tensors="pt")

        mapping = self.label2id if self.label2id is not None else label2id

        return {
            # Drop only the leading batch axis (assumes the processor returns a
            # batch of size 1). A bare squeeze() would also remove the frame
            # axis when num_frames == 1.
            "pixel_values": inputs.pixel_values.squeeze(0),
            "labels": torch.tensor(mapping[clip['label']])
        }

## Training Setup

In [34]:
# Processor and Model
processor = AutoImageProcessor.from_pretrained(MODEL_CKPT)
model = TimesformerForVideoClassification.from_pretrained(
    MODEL_CKPT,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True 
)

train_dataset = ExerciseDataset(train_clips, processor, num_frames=NUM_FRAMES)
test_dataset = ExerciseDataset(test_clips, processor, num_frames=NUM_FRAMES)

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([16, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([16]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Training configuration, grouped by concern.
args = TrainingArguments(
    # Output locations and reporting
    output_dir="timesformer_results",
    logging_dir="./logs",
    report_to="none",
    logging_steps=10,
    disable_tqdm=False,

    # The dataset already yields ready-made dicts, so keep every key it returns.
    remove_unused_columns=False,

    # Optimization; gradients are accumulated over 4 batches per optimizer step.
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),

    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    # as ranked by the "accuracy" value from compute_metrics.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_metrics(eval_pred):
    """
    Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair (predictions, labels); predictions holds one row of
            per-class scores per sample.

    Returns:
        Dict with a single "accuracy" entry.
    """
    scores, references = eval_pred
    # The highest-scoring class along axis 1 is the predicted class.
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(references, predicted_classes)}

# NOTE(review): the test split is also the evaluation set, and `args` selects
# the best checkpoint by its accuracy (load_best_model_at_end /
# metric_for_best_model). The reported test accuracy is therefore presumably
# optimistic — consider a separate validation split for checkpoint selection.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    
)

In [36]:
# Run fine-tuning; evaluation runs once per epoch as configured in `args`.
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0351,0.230442,0.944322
2,0.0164,0.183605,0.958792
3,0.0011,0.177067,0.960994


TrainOutput(global_step=594, training_loss=0.20587469969785174, metrics={'train_runtime': 1995.768, 'train_samples_per_second': 4.758, 'train_steps_per_second': 0.298, 'total_flos': 8.319757294526792e+18, 'train_loss': 0.20587469969785174, 'epoch': 3.0})

In [37]:
# Persist the fine-tuned model.
# NOTE(review): the image processor is not handed to the Trainer, so it is
# presumably not written to this directory — confirm, and save it separately
# if the directory has to be self-contained for inference.
trainer.save_model("./models/model1")

In [38]:
# Sanity check: reload the saved model from disk through the Auto class.
# The printed message is Polish for "Model loaded successfully!".
test_model = AutoModelForVideoClassification.from_pretrained("./models/model1")
print("Model wczytany pomyślnie!")

Model wczytany pomyślnie!
