In [None]:
from unsloth import FastVisionModel 
import torch 
# Load a vision-language checkpoint. The rest of this notebook fine-tunes vision
# layers and feeds the trainer through the vision data collator, so the base
# model must be a VLM (Qwen2.5-VL) rather than the text-only Qwen2.5-3B.
# NOTE(review): for vision models the second return value is presumably a
# processor rather than a plain tokenizer; the name `tokenizer` is kept because
# later cells reference it.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-3B-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth's PEFT helper.
model = FastVisionModel.get_peft_model(
    model,

    # --- LoRA hyperparameters ---
    r = 16,               # adapter rank: higher can improve accuracy but may overfit
    lora_alpha = 16,      # keep alpha at least equal to r
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,   # rank-stabilized LoRA is available but not used here
    loftq_config = None,  # LoftQ is available but not used here
    random_state = 3407,

    # --- Which parts of the network get adapters (set a flag to False to skip) ---
    finetune_vision_layers     = True,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    # target_modules = "all-linear",  # optional; a list of module names may be given instead
)

### Load PHOENIX-2014-T dataset

In [None]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
import os
from pathlib import Path
import pandas as pd

def create_phoenix14t_dataset(
    base_path,
    splits=("train", "dev", "test"),
    features_subdir="PHOENIX-2014-T/features/fullFrame-210x260px",
):
    """
    Create a dataset of video file paths from Phoenix14T videos.

    For every split, the ``.mp4`` files directly inside
    ``<base_path>/<features_subdir>/<split>`` are listed (sub-directories are
    not searched) and turned into a ``Dataset`` with two columns:

    * ``video_path`` -- the split directory joined with the file name
    * ``video_name`` -- the file name without its extension

    Only paths are recorded; no video is decoded and no annotation/label
    column is added here.

    Args:
        base_path: Path to the directory that contains ``features_subdir``.
        splits: Names of the split sub-directories to scan. Defaults to the
            train/dev/test splits.
        features_subdir: Location of the split directories relative to
            ``base_path``.

    Returns:
        DatasetDict mapping each split name to its ``Dataset``.

    Raises:
        FileNotFoundError: If a split directory does not exist.
    """
    split_datasets = {}

    for split in splits:
        split_path = os.path.join(base_path, features_subdir, split)
        if not os.path.exists(split_path):
            raise FileNotFoundError(
                f"Split directory not found: {split_path!r} "
                f"(check base_path={base_path!r})"
            )

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order (and anything derived from it) reproducible across runs.
        video_files = sorted(f for f in os.listdir(split_path) if f.endswith('.mp4'))

        # Build the Dataset straight from the column dict; a pandas
        # round-trip is not needed for two plain string columns.
        split_datasets[split] = Dataset.from_dict({
            'video_path': [os.path.join(split_path, vid) for vid in video_files],
            'video_name': [os.path.splitext(vid)[0] for vid in video_files],
        })

    return DatasetDict(split_datasets)

# Build the dataset; change base_path to wherever your copy of the release lives.
base_path = "../PHOENIX-2014-T-release-v3/"
dataset = create_phoenix14t_dataset(base_path)

# Keep a direct handle on each split for the cells that follow.
train_dataset, dev_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "dev", "test")
)

# Report how many videos each split contains.
print(f"Train set size: {len(train_dataset)}")
print(f"Dev set size: {len(dev_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# Show the first training record as a sanity check.
print("\nExample item from training set:")
print(train_dataset[0])

In [None]:
import torch
import torchvision
from torchvision.io import read_video

def load_video(example):
    """Decode the video for a single example and attach its frames.

    Args:
        example: Mapping with a ``'video_path'`` entry pointing at a video file.

    Returns:
        The same ``example`` object, with the decoded frame tensor stored under
        ``'video'``. No resizing, normalization or frame sub-sampling is applied.

    NOTE(review): per the torchvision docs the frames come back as a
    (T, H, W, C) uint8 tensor by default -- confirm for the installed version.
    """
    # pts_unit="sec": torchvision warns that the default "pts" unit is
    # unreliable. With no start/end given the whole clip is read either way.
    # The audio tensor and metadata dict are returned too but are unused here.
    frames, _audio, _info = read_video(example['video_path'], pts_unit="sec")
    # Preprocess video if needed (resize, normalize, etc.)
    example['video'] = frames
    return example

# Run load_video over every training example so each row gains a 'video' entry.
# NOTE(review): this decodes every training video up front and stores the
# frames in the mapped dataset -- presumably large; confirm the memory/disk
# budget or switch to lazy decoding if it is a problem.
train_sign_dataset = train_dataset.map(load_video)

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Switch the model into training mode before building the trainer.
FastVisionModel.for_training(model)

# Pick the mixed-precision mode once: bf16 where supported, otherwise fp16.
bf16_available = is_bf16_supported()

sft_config = SFTConfig(
    # Batching and schedule
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 30,
    # num_train_epochs = 1,  # use this instead of max_steps for a full training run

    # Optimizer
    learning_rate = 2e-4,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",

    # Precision
    fp16 = not bf16_available,
    bf16 = bf16_available,

    # Logging, reproducibility, output
    logging_steps = 1,
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",  # set to a tracker (e.g. Weights and Biases) to enable reporting

    # The settings below are required for vision fine-tuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 4,
    max_seq_length = 2048,
)

# NOTE(review): train_sign_dataset only carries video_path / video_name / video
# columns (no text target) -- confirm the vision collator accepts this schema.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),  # required for vision data
    train_dataset = train_sign_dataset,
    args = sft_config,
)

In [None]:
# Run the fine-tuning loop configured above; the value returned by
# trainer.train() is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()