In [2]:
import sys

# Make the parent directory importable so the `heron` package imported in the
# following cell can be resolved. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate ".." entries.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
from heron.datasets.base_datasets import BaseDataset
from heron.datasets.llava_datasets import LlavaDataset

import os
from typing import Any

import deepspeed
import fire
import torch
import yaml
from transformers import Trainer, TrainingArguments

from heron.datasets.utils import get_dataset
from heron.models.utils import (
    apply_lora_model,
    load_model,
    load_pretrained_weight,
    set_trainable_params,
)

  from .autonotebook import tqdm as notebook_tqdm


[2023-08-31 07:49:28,031] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Experiment definition to load (relative path, resolved against the current
# working directory).
config_file = "projects/video_blip_st_llava_ja/exp001.yml"

# safe_load parses plain YAML data without constructing arbitrary Python objects.
with open(config_file, "r") as config_stream:
    config = yaml.safe_load(config_stream)

# When a W&B run name is provided, nest the output directory under it.
# The result must be written back to the "training_config" section (the one
# that is read here and later unpacked into TrainingArguments); writing to a
# different key would either raise KeyError or be silently ignored.
wandb_run_name = os.environ.get("WANDB_NAME")
if wandb_run_name is not None:
    config["training_config"]["output_dir"] = os.path.join(
        config["training_config"]["output_dir"], wandb_run_name
    )

# Shorthand for the model section, used by the cells below.
model_config = config["model_config"]

In [3]:
# Assign values from the config
# Name filters selecting which parameters are trained and which are frozen.
keys_to_finetune = config["model_config"]["keys_to_finetune"]
keys_to_freeze = config["model_config"]["keys_to_freeze"]

# The two lists are mutually exclusive: at most one of them may be non-empty.
assert not (
    len(keys_to_finetune) != 0 and len(keys_to_freeze) != 0
), "either keys_to_finetune or keys_to_freeze should be empty"

In [4]:
# Load the datasets: get_dataset returns the training and validation datasets
# described by the config, in that order.
dataset_splits = get_dataset(config)
train_dataset, val_dataset = dataset_splits

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Found cached dataset json (/home/kotaro/.cache/huggingface/datasets/turing-motors___json/turing-motors--LLaVA-Instruct-150K-JA-29845189be995094/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 170.85it/s]
Loading cached split indices for dataset at /home/kotaro/.cache/huggingface/datasets/turing-motors___json/turing-motors--LLaVA-Instruct-150K-JA-29845189be995094/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-b2678b154479ce1c.arrow and /home/kotaro/.cache/huggingface/datasets/turing-motors___json/turing-motors--LLaVA-Instruct-150K-JA-29845189be995094/0.0.0/8bb11242116d547c741b2e8a1f18598ffd

In [5]:
# Training-related configuration.
# Build the TrainingArguments kwargs from a filtered copy of the
# "training_config" section instead of popping from it in place: the shared
# config stays intact, a missing "deepspeed" key no longer raises KeyError,
# and the cell can be re-run safely.
# NOTE(review): the "deepspeed" entry is dropped presumably because this
# notebook is not started through the DeepSpeed launcher -- confirm.
train_config = {
    key: value
    for key, value in config["training_config"].items()
    if key != "deepspeed"
}
training_args = TrainingArguments(**train_config)

In [28]:
#train_dataset[0]["attention_mask"].shape, train_dataset[0]["input_ids"].shape, train_dataset[0]["pixel_values"].shape

In [6]:
# Instantiate the model from the "model_config" section of the loaded config
# (model_config is the same dict object as config["model_config"]).
model = load_model(model_config)

Loading checkpoint shards: 100%|██████████| 3/3 [00:18<00:00,  6.28s/it]
You are using a model of type blip-2 to instantiate a model of type video_blip. This is not supported for all configurations of models and can yield errors.


model_path facebook/opt-2.7b


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.31s/it]
Some weights of VideoBlipForConditionalGeneration were not initialized from the model checkpoint at Salesforce/blip2-opt-2.7b and are newly initialized: ['qformer.embeddings.word_embeddings.weight', 'qformer.embeddings.layernorm.weight', 'qformer.cls.predictions.transform.LayerNorm.bias', 'text_projection.weight', 'qformer.embeddings.position_embeddings.weight', 'itm_head.bias', 'itm_head.weight', 'qformer.cls.predictions.bias', 'qformer.cls.predictions.transform.dense.bias', 'qformer.cls.predictions.transform.LayerNorm.weight', 'qformer.cls.predictions.decoder.weight', 'qformer.embeddings.layernorm.bias', 'qformer.cls.predictions.transform.dense.weight', 'img_temperal_embedding.0', 'temp', 'qformer.cls.predictions.decoder.bias', 'text_projection.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VideoBlipForConditionalGeneration we

In [7]:
# Optionally wrap the model with LoRA. "lora" is added to the fine-tune keys
# first, then the wrapper is applied; the order is kept because
# keys_to_finetune is the same list object stored inside model_config.
if model_config["use_lora"]:
    keys_to_finetune.append("lora")
    model = apply_lora_model(model, model_config)

# Load pretrained weights when the config provides a path for them.
pretrained_path = model_config.get("pretrained_path")
if pretrained_path is not None:
    print("load pretrained")
    load_pretrained_weight(model, pretrained_path)
    success_message = f'Successfully loading pretrained weights from {model_config["pretrained_path"]}'
    print(success_message)

In [8]:
# Mark parameters as trainable or frozen according to the key lists; the call
# returns two lists, kept here as trainable_list and untrainable_list.
trainable_list, untrainable_list = set_trainable_params(
    model,
    keys_to_finetune,
    keys_to_freeze,
)

In [9]:
# Assemble the Hugging Face Trainer from the model, the training arguments and
# the train / validation datasets.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the training loop inside a CUDA autocast context.
autocast_context = torch.autocast(device_type="cuda")
with autocast_context:
    trainer.train()