In [21]:
import re
import os
import itertools
import json
# Pin the working directory to the LLaVA checkout. Moving the .ipynb inside
# JupyterLab does not update the working directory that `!pwd` reports, and the
# cells below use relative paths ('./data/...', './results/...') and os.getcwd().
os.chdir('/fsx/wpq/github/metasummer2024/external/LLaVA')

from rosemary.submit import shell_scripts_template, submit_job_slurm, multiline_to_singleline

## Pretrain

In [30]:
# Pretraining sweep: build one `deepspeed llava/train/train_mem.py` command per
# (language model, vision tower, projector type) combination and hand each one
# to `submit_job_slurm`.

# Slurm resources requested for every job.
nodes, num_gpus, cpu_mem, num_cpus = 1, 8, 1000, 96

# NOTE(review): `attn_implementation` is not referenced anywhere else in this cell.
attn_implementation = 'flash_attention_2'
data_path = './data/liuhaotian/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json'
image_folder = './data/liuhaotian/LLaVA-Pretrain/images'

# Ablation axes (lm / vision / projector); commented-out entries are alternatives.
job_name = 'pt_1'
model_name_or_path_list = [
    './results/baselines/lmsys/vicuna-7b-v1.5',
    # './results/baselines/NousResearch/Llama-2-7b-hf',
    # './results/baselines/unsloth/llama-3-8b',
]
vision_tower_list = [
    # './results/baselines/openai/clip-vit-large-patch14', # 224
    './results/baselines/openai/clip-vit-large-patch14-336',
]
mm_projector_type_list = [
    # 'linear',
    'mlp2x_gelu',
]
# When falsy, no `--train_size` flag is added to the command.
train_size = None

# Forwarded to `submit_job_slurm(test_run=...)`; when true the formatted command
# is also printed. Presumably a dry run -- confirm against rosemary.
test_run = bool(1)

# Lazy cartesian product over the three axes; consumed by the loop below.
options_list = itertools.product(
    model_name_or_path_list,
    vision_tower_list,
    mm_projector_type_list,
)
cmds = []

for model_name_or_path, vision_tower, mm_projector_type in options_list:
    # Run directory: ./results/<job_name>/lm=<lm>_vis=<tower>_mm=<projector>
    output_dir = os.path.join(
        './results',
        job_name,
        f"lm={os.path.basename(model_name_or_path)}"
        f"_vis={os.path.basename(vision_tower)}"
        f"_mm={mm_projector_type}",
    )

    # The trailing backslashes are string-level line continuations: Python drops
    # each backslash-newline pair, leaving the arguments separated by runs of spaces.
    cmd = f"""
    WANDB_NAME={output_dir} deepspeed llava/train/train_mem.py \
        --deepspeed ./scripts/zero2.json \
        --model_name_or_path {model_name_or_path} \
        --version plain \
        --data_path {data_path} \
        --image_folder {image_folder} \
        --vision_tower {vision_tower} \
        --mm_projector_type {mm_projector_type} \
        --tune_mm_mlp_adapter True \
        --mm_vision_select_layer -2 \
        --mm_use_im_start_end False \
        --mm_use_im_patch_token False \
        --bf16 True \
        {"--train_size " + str(train_size) if train_size else ""} \
        --num_train_epochs 1 \
        --per_device_train_batch_size 32 \
        --per_device_eval_batch_size 4 \
        --gradient_accumulation_steps 1 \
        --evaluation_strategy "no" \
        --save_strategy "steps" \
        --save_steps 24000 \
        --save_total_limit 1 \
        --learning_rate 1e-3 \
        --weight_decay 0. \
        --warmup_ratio 0.03 \
        --lr_scheduler_type "cosine" \
        --logging_steps 1 \
        --tf32 True \
        --model_max_length 2048 \
        --gradient_checkpointing True \
        --dataloader_num_workers 4 \
        --lazy_preprocess True \
        --report_to wandb \
        --output_dir {output_dir} \
    """

    if test_run:
        # Pretty-print: split on runs of 3+ whitespace characters and show one
        # argument per line, each followed by a shell-style continuation.
        print('\n' + ' \\\n\t'.join(piece.strip() for piece in re.split(r'\s{3,}', cmd)))

    # Helper from rosemary; presumably collapses the text to one shell line -- confirm.
    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)

    # Fill the job-script template; logs go to the current working directory.
    shell_scripts = shell_scripts_template.format(
        profile='/fsx/wpq/.profile_local.sh',
        conda_env='llava',
        cwd=os.getcwd(),
        cmd=cmd,
        log_dir=os.getcwd(),
        save_dir=output_dir
    )

    out = submit_job_slurm(
        shell_scripts,
        job_name=job_name,
        partition='learnai4p',
        nodes=nodes,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=num_gpus,
        test_run=test_run,
    )

    # Show what the submission helper returned for this job.
    print(json.dumps(out, indent=4))



[
    {
        "args": "sbatch /fsx/wpq/.sbatch/2024-06-07_05:42:49_07b8b584-5dcb-42fa-a8e7-ca92e451dcca.sh",
        "job_id": 350166
    }
]


## Finetune

In [14]:

# Finetuning (SFT) job: build a single `deepspeed llava/train/train_mem.py`
# command that loads the pretrained projector and submit it via `submit_job_slurm`.

# Forwarded to `submit_job_slurm(test_run=...)`; when true the formatted command
# is also printed. Presumably False means a real submission -- confirm against rosemary.
test_run = False

# Was 'pt', a leftover pretraining label: this cell finetunes and writes to
# ./results/sft/..., so the Slurm job is named to match.
job_name = 'sft'
nodes = 1; num_gpus = 8; cpu_mem = 1000; num_cpus = 96

# NOTE(review): `attn_implementation` is not referenced anywhere else in this cell.
attn_implementation = 'flash_attention_2'

model_name_or_path = './results/baselines/lmsys/vicuna-7b-v1.5'
pretrain_mm_mlp_adapter = './results/pretrain/llava-v1.5-7b/mm_projector.bin'
data_path = './data/liuhaotian/LLaVA-Instruct-150K/llava_v1_5_mix665k.json'
image_folder = './data/'
vision_tower = './results/baselines/openai/clip-vit-large-patch14-336'
mm_projector_type = 'mlp2x_gelu'
# When falsy, no `--train_size` flag is added to the command.
train_size = None

output_dir = './results/sft/llava-v1.5-7b'

# `train_size` was previously defined but never forwarded. Emit the flag only
# when it is set (same truthiness rule as the Pretrain cell) and append it at the
# end, so the command is byte-identical to before while `train_size` is None.
train_size_flag = f" --train_size {train_size}" if train_size else ""

cmds = []

# The trailing backslashes are string-level line continuations: Python drops
# each backslash-newline pair, leaving the arguments separated by runs of spaces.
cmd = f"""
deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path {model_name_or_path} \
    --version v1 \
    --data_path {data_path} \
    --image_folder {image_folder} \
    --vision_tower {vision_tower} \
    --pretrain_mm_mlp_adapter {pretrain_mm_mlp_adapter} \
    --mm_projector_type {mm_projector_type} \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --bf16 True \
    --output_dir {output_dir} \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb{train_size_flag}
"""

if test_run:
    # Pretty-print: split on runs of 3+ whitespace characters and show one
    # argument per line, each followed by a shell-style continuation.
    print('\n'+' \\\n\t'.join([x.strip() for x in re.split(r'\s{3,}', cmd)]))

# Helper from rosemary; presumably collapses the text to one shell line -- confirm.
cmd = multiline_to_singleline(cmd)
cmds.append(cmd)


# Fill the job-script template; logs go to the current working directory.
shell_scripts = shell_scripts_template.format(
    profile='/fsx/wpq/.profile_local.sh',
    conda_env='llava',
    cwd=os.getcwd(),
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir
)

out = submit_job_slurm(
    shell_scripts,
    job_name=job_name,
    partition='learnai4p',
    nodes=nodes,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    num_gpus=num_gpus,
    test_run=test_run,
)

# Show what the submission helper returned for this job.
print(json.dumps(out, indent=4))

[
    {
        "args": "sbatch /fsx/wpq/.sbatch/2024-06-07_04:16:12_a5df1694-51be-4a80-adf4-3e186b251008.sh",
        "job_id": 350160
    }
]
