In [2]:
import torch 
torch.cuda.empty_cache()

In [3]:
from unsloth import FastVisionModel 
import torch 
print(torch.cuda.is_available())
model, tokenizer = FastVisionModel.from_pretrained(
    "./unsloth/Qwen2.5-3B-4bit",
    load_in_4bit = True , # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = True , # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm



🦥 Unsloth Zoo will now patch everything to make training faster!
[2025-03-08 13:46:07,494] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


W0308 13:46:08.234000 105080 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


True
==((====))==  Unsloth 2025.3.8: Fast Qwen2_5_Vl vision patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.999 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  _ = torch.tensor([0], device=i)
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

### Load PHOENIX-2014-T dataset

In [5]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict, Video 
import os
from pathlib import Path
import pandas as pd
import gzip 
import pickle

def load_dataset_file(filename):
    with gzip.open(filename, "rb") as f:
        loaded_object = pickle.load(f)
        return loaded_object



   

def create_phoenix14t_dataset(base_path,dataset_path ):
    """
    Create a dataset from Phoenix14T videos
    
    Args:
        base_path: Path to the PHOENIX-2014-T directory containing train/dev/test splits
    """
    list_data_dict = load_dataset_file(dataset_path)
    key_lst = [key for key, value in list_data_dict.items()]
    print("key_lst length ",len(key_lst))
    # Process each split (train, dev, test)
    dataset = []
    for key in key_lst:
        sources = list_data_dict[key]
        video_files = sources["name"]+".mp4"
        video_files = os.path.join(base_path,   video_files)
        data = {
            'video':  str(Path(video_files).resolve()).replace("\\", "/"),
            'video_name': sources["name"], 
            'text': sources["text"]
        }
        dataset.append(data)
    # Convert to DataFrame first (easier to handle)
    df = pd.DataFrame(dataset)
    
    # Convert DataFrame to Dataset
    dataset = Dataset.from_pandas(df)
    return dataset


base_path = "../PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px"

train_dataset = create_phoenix14t_dataset(base_path,"data\Phonexi-2014T\labels.train")
dev_dataset = create_phoenix14t_dataset(base_path,"data/Phonexi-2014T/labels.dev")
test_dataset = create_phoenix14t_dataset(base_path,"data/Phonexi-2014T/labels.test")



# Print some information about the dataset
print(f"Train set size: {len(train_dataset)}")
print(f"Dev set size: {len(dev_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# Example of accessing a single item
print("\nExample item from training set:")
print(dev_dataset[-1])
sample = dev_dataset[-1]

key_lst length  7096
key_lst length  519
key_lst length  642
Train set size: 7096
Dev set size: 519
Test set size: 642

Example item from training set:
{'video': 'E:/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/dev/27January_2013_Sunday_tagesschau-8836.mp4', 'video_name': 'dev/27January_2013_Sunday_tagesschau-8836', 'text': 'am oberrhein heute nacht bis plus drei grad sonst verbreitet werte um den gefrierpunkt oder leichter frost'}


In [6]:
from unsloth_zoo.vision_utils import get_padding_tokens_ids, _get_dtype, process_vision_info

user_instruction = "Translate the sign language video to German."
system_instruction = "You are a professional German sign language translator."
from decord import VideoReader
def convert_to_conversation(sample):
 

    conversation = [
        { "role": "system",
            "content": [
                {"type": "text", "text": system_instruction}
            ]
        },
        { "role": "user",
            "content": [
                {"type": "text",  "text": user_instruction},
                {"type": "video", "video":sample["video"] }
            ]
        },
        { "role": "assistant",
        
            "content": [
                {"type": "text", "text": sample["text"]}
            ]
        },
    ]

    return conversation

class SignDataCollator: 
    __slots__ = ("padding_token_ids", "dtype", "ignore_index", "processor", "formatting_func")

    def __init__(self, model, processor,  ignore_index=-100):
        self.padding_token_ids = get_padding_tokens_ids(processor)
        self.dtype = _get_dtype(
            model.config.torch_dtype
            if hasattr(model.config, "torch_dtype")
            else model.get_input_embeddings().weight.dtype
        )
        self.ignore_index = ignore_index
        self.processor = processor
    
    def __call__(self, examples): 
        texts = [] 
        videos = [] 

        for example in examples: 
            messages = convert_to_conversation(example)
            message= self.processor.apply_chat_template(
                messages,
                tokenize = False,
                add_generation_prompt = False,
            )
            image, video = process_vision_info(messages)
            texts.append(message)
            videos.append(video)

        # Tokenize the texts and process the images
        batch = self.processor(
            text    = texts,
            videos  = videos,
            padding = True,
            # [TODO] Truncating to max_seq_length does NOT work for VLMs
            # truncation = True,
            return_tensors = "pt",
        )

        batch.pop("token_type_ids", None)
        # Pixtral accepts multiple images, so we have to cast it individually
        pixel_values = batch["pixel_values_videos"]
        if type(pixel_values) is list:
            for j, pixel_value_j in enumerate(pixel_values):
                if type(pixel_value_j) is list:
                    for k, pixel_value_k in enumerate(pixel_value_j):
                        pixel_value_j[k] = pixel_value_k.to(self.dtype)
                else:
                    pixel_values[j] = pixel_value_j.to(self.dtype)
            pass
            batch["pixel_values_videos"] = pixel_values
        else:
            batch["pixel_values_videos"] = batch["pixel_values_videos"].to(self.dtype)
        pass

        # Mask image tokens and pad tokens
        labels = batch["input_ids"].clone()
        labels[torch.isin(labels, self.padding_token_ids)] = self.ignore_index
        batch["labels"] = labels
        return batch


### Testing inference 

In [7]:
import torch 
from qwen_vl_utils import process_vision_info
torch.cuda.empty_cache()
FastVisionModel.for_inference(model) # Enable for inference!

print(train_dataset[-1])
messages = convert_to_conversation(train_dataset[-1])
# Preparation for inference
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = tokenizer(
    text = [text],
    videos = [video_inputs],
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
outputs = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

{'video': 'E:/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/train/27January_2013_Sunday_tagesschau-8842.mp4', 'video_name': 'train/27January_2013_Sunday_tagesschau-8842', 'text': 'es bleibt windig'}


qwen-vl-utils using decord to read video.


Es bleibt windig.<|im_end|>


In [8]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = SignDataCollator(model, tokenizer), # Must use!
    train_dataset = train_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        #max_steps = 30,
        num_train_epochs = 60 , # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ), 
)

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,096 | Num Epochs = 60 | Total steps = 53,220
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,084,928/2,093,459,456 (1.96% trained)


Step,Training Loss
1,5.0901
2,5.1713
3,5.1295
4,5.2079
5,4.8603
6,3.8967
