In [1]:
import re
import os
from dataclasses import dataclass
import itertools
import json
import glob
os.chdir('/fsx/wpq/github/metasummer2024/external/LLaVA') # jupyter lab moving ipynb does not change !pwd properly.
import pandas as pd
from rosemary.submit import shell_scripts_template, submit_job_slurm, multiline_to_singleline

# Directory for Slurm job logs: every submission cell below passes it to the
# shell-script template (`log_dir=`) and uses it as the parent of the per-job
# `%J.out` file given to `submit_job_slurm` as `log_path`.
log_dir = '/fsx/wpq/.slurm_log'


  from .autonotebook import tqdm as notebook_tqdm


## Pretrain

In [30]:
# ---- Resources handed to `submit_job_slurm` for every pretraining job ----
nodes = 1
num_gpus = 8
cpu_mem = 1000
num_cpus = 96

# ---- Logging backend and training data ----
report_to = 'wandb'
data_path = './data/liuhaotian/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json'
image_folder = './data/liuhaotian/LLaVA-Pretrain/images'
train_size = None  # falsy => the `--train_size` flag is left out of the command


# ---- Ablation grid: language model x vision tower x projector type ----
job_name = 'pt_1'
model_name_or_path_list = [
    './results/baselines/lmsys/vicuna-7b-v1.5',
    # './results/baselines/NousResearch/Llama-2-7b-hf',
    # './results/baselines/unsloth/llama-3-8b',
]
vision_tower_list = [
    # './results/baselines/openai/clip-vit-large-patch14', # 224
    './results/baselines/openai/clip-vit-large-patch14-336',
]
mm_projector_type_list = [
    # 'linear',
    'mlp2x_gelu',
]


# When true: the assembled command is printed, `--report_to` becomes 'none',
# and the flag is forwarded to `submit_job_slurm`.
# NOTE(review): presumably `submit_job_slurm` treats it as a dry run -- confirm.
test_run = bool(1)


# One job per element of the cartesian product of the three lists above.
options_list = itertools.product(
    model_name_or_path_list,
    vision_tower_list,
    mm_projector_type_list,
)
cmds = []

for model_name_or_path, vision_tower, mm_projector_type in options_list:
    # Run directory name encodes the three ablated choices.
    run_name = '_'.join([
        f"lm={os.path.basename(model_name_or_path)}",
        f"vis={os.path.basename(vision_tower)}",
        f"mm={mm_projector_type}",
    ])
    output_dir = os.path.join('./results', job_name, run_name)

    # Set in this notebook's environment right before submission.
    # NOTE(review): presumably inherited by the submitted job -- confirm.
    os.environ['WANDB_NAME'] = output_dir.replace('./results/', '')

    # Trailing backslashes are Python line continuations inside this
    # (non-raw) f-string, so `cmd` holds one long line whose arguments are
    # separated by the indentation whitespace.
    cmd = f"""
    deepspeed llava/train/train_mem.py \
        --deepspeed ./scripts/zero2.json \
        --model_name_or_path {model_name_or_path} \
        --version plain \
        --data_path {data_path} \
        --image_folder {image_folder} \
        --vision_tower {vision_tower} \
        --mm_projector_type {mm_projector_type} \
        --tune_mm_mlp_adapter True \
        --mm_vision_select_layer -2 \
        --mm_use_im_start_end False \
        --mm_use_im_patch_token False \
        --bf16 True \
        {"--train_size " + str(train_size) if train_size else ""} \
        --num_train_epochs 1 \
        --per_device_train_batch_size 32 \
        --per_device_eval_batch_size 4 \
        --gradient_accumulation_steps 1 \
        --evaluation_strategy "no" \
        --save_strategy "steps" \
        --save_steps 24000 \
        --save_total_limit 1 \
        --learning_rate 1e-3 \
        --weight_decay 0. \
        --warmup_ratio 0.03 \
        --lr_scheduler_type "cosine" \
        --logging_steps 1 \
        --tf32 True \
        --model_max_length 2048 \
        --gradient_checkpointing True \
        --dataloader_num_workers 4 \
        --lazy_preprocess True \
        --report_to {'none' if test_run else report_to} \
        --output_dir {output_dir}
    """

    if test_run:
        # Pretty-print for inspection: split on runs of 3+ whitespace
        # characters and show one argument per line.
        cmd_parts = [part.strip() for part in re.split(r'\s{3,}', cmd)]
        print('\n' + ' \\\n\t'.join(cmd_parts))

    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)

    shell_scripts = shell_scripts_template.format(
        profile='/fsx/wpq/.profile_local.sh',
        conda_env='llava',
        cwd=os.getcwd(),
        cmd=cmd,
        log_dir=log_dir,
        save_dir=output_dir,
    )

    out = submit_job_slurm(
        shell_scripts,
        job_name=job_name,
        partition='learnai4p',
        nodes=nodes,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=num_gpus,
        log_path=os.path.join(log_dir, '%J.out'),
        test_run=test_run,
    )

    print(json.dumps(out, indent=4))



[
    {
        "args": "sbatch /fsx/wpq/.sbatch/2024-06-07_05:42:49_07b8b584-5dcb-42fa-a8e7-ca92e451dcca.sh",
        "job_id": 350166
    }
]


## Finetune

In [None]:
# ---- Resources handed to `submit_job_slurm` for every finetuning job ----
nodes = 1
num_gpus = 8
cpu_mem = 1000
num_cpus = 96

# ---- Logging backend and training data ----
report_to = 'wandb'
train_size = None # all data
data_path = './data/liuhaotian/LLaVA-Instruct-150K/llava_v1_5_mix665k.json'
image_folder = './data/'

# model_name_or_path = './results/baselines/lmsys/vicuna-7b-v1.5'
# pretrain_mm_mlp_adapter = './results/pretrain/llava-v1.5-7b/mm_projector.bin'
# vision_tower = './results/baselines/openai/clip-vit-large-patch14-336'
# mm_projector_type = 'mlp2x_gelu'

# ablate lm/vision/projector
job_name = 'sft_1'
# The projector is read from the run of the same name under the job obtained
# by swapping 'sft' for 'pt' in `job_name` (e.g. 'sft_1' -> 'pt_1').
pretrain_job_name = job_name.replace('sft', 'pt')
model_name_or_path_list = [
    './results/baselines/lmsys/vicuna-7b-v1.5',
    './results/baselines/NousResearch/Llama-2-7b-hf',
]
vision_tower_list = [
    './results/baselines/openai/clip-vit-large-patch14', # 224
    # './results/baselines/openai/clip-vit-large-patch14-336',
]
mm_projector_type_list = [
    # 'linear',
    'mlp2x_gelu',
]


# When true: the assembled command is printed, `--report_to` becomes 'none',
# and the flag is forwarded to `submit_job_slurm`.
# NOTE(review): presumably `submit_job_slurm` treats it as a dry run -- confirm.
test_run = 1
test_run = bool(test_run)


# Materialised as a list so the grid can be validated first and submitted
# afterwards.
options_list = list(itertools.product(
    model_name_or_path_list,
    vision_tower_list,
    mm_projector_type_list,
))

# ---- Pass 1: resolve paths and check every pretrained projector ----
# Checking everything before the first submission avoids a half-submitted
# grid when a later combination is missing its projector.
planned_runs = []
missing_adapters = []
for model_name_or_path, vision_tower, mm_projector_type in options_list:
    run_name  = f"lm={os.path.basename(model_name_or_path)}"
    run_name += f"_vis={os.path.basename(vision_tower)}"
    run_name += f"_mm={mm_projector_type}"

    output_dir = os.path.join('./results', job_name, run_name)

    # Built from components rather than by string-replacing `job_name` in the
    # whole path, so a model name that happens to contain the job name cannot
    # corrupt the result.
    mm_adaptor_job_dir = os.path.join('./results', pretrain_job_name, run_name)
    pretrain_mm_mlp_adapter = os.path.join(mm_adaptor_job_dir, 'mm_projector.bin')
    if not os.path.isfile(pretrain_mm_mlp_adapter):
        missing_adapters.append(pretrain_mm_mlp_adapter)

    planned_runs.append((
        model_name_or_path,
        vision_tower,
        mm_projector_type,
        output_dir,
        pretrain_mm_mlp_adapter,
    ))

if missing_adapters:
    raise ValueError(
        '\n'.join(f'{path} does not exist.' for path in missing_adapters)
    )

# ---- Pass 2: build the command and submit one job per combination ----
cmds = []

for (
    model_name_or_path,
    vision_tower,
    mm_projector_type,
    output_dir,
    pretrain_mm_mlp_adapter,
) in planned_runs:
    # Set in this notebook's environment right before submission.
    # NOTE(review): presumably inherited by the submitted job -- confirm.
    os.environ['WANDB_NAME'] = output_dir.replace('./results/', '')

    # Trailing backslashes are Python line continuations inside this
    # (non-raw) f-string, so `cmd` holds one long line whose arguments are
    # separated by the indentation whitespace.
    cmd = f"""
    deepspeed llava/train/train_mem.py \
        --deepspeed ./scripts/zero3.json \
        --model_name_or_path {model_name_or_path} \
        --version v1 \
        --data_path {data_path} \
        --image_folder {image_folder} \
        --vision_tower {vision_tower} \
        --pretrain_mm_mlp_adapter {pretrain_mm_mlp_adapter} \
        --mm_projector_type {mm_projector_type} \
        --mm_vision_select_layer -2 \
        --mm_use_im_start_end False \
        --mm_use_im_patch_token False \
        --image_aspect_ratio pad \
        --group_by_modality_length True \
        --bf16 True \
        {"--train_size " + str(train_size) if train_size else ""} \
        --num_train_epochs 1 \
        --per_device_train_batch_size 16 \
        --per_device_eval_batch_size 4 \
        --gradient_accumulation_steps 1 \
        --evaluation_strategy "no" \
        --save_strategy "steps" \
        --save_steps 50000 \
        --save_total_limit 1 \
        --learning_rate 2e-5 \
        --weight_decay 0. \
        --warmup_ratio 0.03 \
        --lr_scheduler_type "cosine" \
        --logging_steps 1 \
        --tf32 True \
        --model_max_length 2048 \
        --gradient_checkpointing True \
        --dataloader_num_workers 4 \
        --lazy_preprocess True \
        --report_to {'none' if test_run else report_to} \
        --output_dir {output_dir}
    """
    
    if test_run:
        # Pretty-print for inspection: split on runs of 3+ whitespace
        # characters and show one argument per line.
        print('\n'+' \\\n\t'.join([x.strip() for x in re.split(r'\s{3,}', cmd)]))
    
    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)
    
    
    shell_scripts = shell_scripts_template.format(
        profile='/fsx/wpq/.profile_local.sh',
        conda_env='llava',
        cwd=os.getcwd(),
        cmd=cmd,
        log_dir=log_dir,
        save_dir=output_dir
    )
    
    out = submit_job_slurm(
        shell_scripts,
        job_name=job_name,
        partition='learnai4p',
        nodes=nodes,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=num_gpus,
        log_path=os.path.join(log_dir, '%J.out'),
        test_run=test_run,
    )
    
    print(json.dumps(out, indent=4))

## Evaluations

In [2]:
from utils import download_eval_server_results
# Judging from the output recorded below, this fetches submission result files
# and places them under the matching `<run>/eval/<task>` directories.
# NOTE(review): how the CSV drives the download is defined in `utils`, which is
# not visible here -- confirm before relying on this description.
download_eval_server_results('./eval_server_results.csv', verbose=True)

Downloed https://evalai.s3.amazonaws.com/media/submission_files/submission_451709/7decbad8-5ee2-4c10-93ff-aad4ad0d7a5c.json
	->results/baselines/liuhaotian/llava-v1.5-7b/eval/vizwiz
Downloed https://evalai.s3.amazonaws.com/media/submission_files/submission_450628/9bef3c4f-d7e2-4ff9-8d8a-84768881b20a.json
	->results/baselines/liuhaotian/llava-v1.5-7b/eval/vqav2
Downloed https://evalai.s3.amazonaws.com/media/submission_files/submission_452449/1abc8d56-b08f-44ca-a6a8-88edb742d940.json
	->results/sft_1/lm=Llama-2-7b-hf_vis=clip-vit-large-patch14-336_mm=mlp2x_gelu/eval/vizwiz
Downloed https://evalai.s3.amazonaws.com/media/submission_files/submission_452451/08618a88-3928-47f4-a352-673ac10dc1c5.json
	->results/sft_1/lm=Llama-2-7b-hf_vis=clip-vit-large-patch14-336_mm=mlp2x_gelu/eval/vqav2


In [155]:
from rosemary.submit import submit_job_slurm
# ---- Resources handed to `submit_job_slurm` (GPU count is set per task later) ----
nodes = 1
num_gpus = 8
cpu_mem = 1000
num_cpus = 96

# ---- Defaults; the selection section below overrides what it needs ----
eval_script_dir = 'scripts/v1_5/eval_mod'
eval_rest = True            # True: only (run, task) pairs without a finished result
subdir_path_list = []       # explicit run directories; empty => discover from `exp_dirs`
exp_dirs = []               # experiment directories scanned when `subdir_path_list` is empty
subdir_filter_fn = lambda x: True   # keeps a discovered sub-directory name when truthy


task_names_vqa = ['vqav2', 'gqa', 'vizwiz', 'textvqa', 'scienceqa']
task_names_sft = ['pope', 'mme', 'mmbench', 'seed', 'llavabench', 'mmvet']
task_names_all = task_names_vqa + task_names_sft
task_names_llm_evaluator = ['llavabench', 'mmvet']
# Default so the cell never depends on a name left over from an earlier run.
task_names = task_names_all

######
# Selection: keep exactly one of the configurations below active.

## baselines
subdir_path_list = [
    'results/baselines/liuhaotian/llava-v1.5-7b',
]
task_names = ['llavabench']
eval_rest = False

# # sft_1
# exp_dirs = ['results/sft_1']
# subdir_filter_fn = lambda x: 'Llama-2' in x
# task_names = ['pope', 'mme', 'mmbench', 'seed']
# eval_rest = False


######

test_run = 1
test_run = bool(test_run)

# No explicit run list: collect runs from the experiment directories.
if len(subdir_path_list) == 0:
    for exp_dir in exp_dirs:
        # `os.listdir` returns entries in arbitrary order; sort so the
        # discovered list is the same on every execution.
        for subdir in sorted(os.listdir(exp_dir)):
            if not subdir_filter_fn(subdir):
                continue
            subdir_path = os.path.join(exp_dir, subdir)
            if not os.path.isfile(os.path.join(subdir_path, 'config.json')): # skip runs not yet finished
                continue
            subdir_path_list.append(subdir_path)

@dataclass
class TaskConfig:
    """Per-task settings used when planning and launching an evaluation job."""

    # Glob pattern joined onto `<run>/eval/<task>/`; any match marks the task
    # as already evaluated for that run.
    task_finish_proof: str
    # Number of GPUs requested for the task's job.
    num_gpus: int
    # Script basename: the job runs `<eval_script_dir>/<bash_script_name>.sh`.
    bash_script_name: str
# Task name -> (glob proving the task finished, GPUs to request, script basename).
task_configs = {
    'vqav2': TaskConfig('*/answers_upload.json', 8, 'vqav2'),
    'gqa': TaskConfig('*/*_predictions.json', 8, 'gqa'),
    'vizwiz': TaskConfig('answers_upload.json', 1, 'vizwiz'),
    'textvqa': TaskConfig('answers.jsonl', 1, 'textvqa'),
    'scienceqa': TaskConfig('results.jsonl', 1, 'sqa'),
    'pope': TaskConfig('answers.jsonl', 1, 'pope'),
    'mme': TaskConfig('results', 1, 'mme'),
    'mmbench': TaskConfig('*.xlsx', 1, 'mmbench'),
    'mmvet': TaskConfig('results_*.csv', 1, 'mmvet'),
    'seed': TaskConfig('answers_upload.jsonl', 8, 'seed'),
    'llavabench': TaskConfig('reviews.jsonl', 1, 'llavabench'),
}

# Reject unknown task names before anything is planned or submitted; without
# this check the dictionary lookups below would fail with a bare KeyError.
for task_name in task_names:
    if task_name not in task_configs:
        raise ValueError(f'{task_name} not supported.')

if eval_rest:
    # Keep only (task, run) pairs whose finish-proof glob matches nothing yet.
    task_name_and_model = []
    for subdir_path in subdir_path_list:
        for task_name in task_names:
            if not glob.glob(os.path.join(subdir_path, 'eval', task_name, task_configs[task_name].task_finish_proof)):
                task_name_and_model.append((task_name, subdir_path))
else:
    # Evaluate every requested task on every run, finished or not.
    task_name_and_model = list(itertools.product(task_names, subdir_path_list))

print('cmds: #=', len(task_name_and_model))
print(task_name_and_model, '\n\n')


# Group the planned pairs by run: {run directory: [task names]}.
dfo = pd.DataFrame(task_name_and_model, columns=['task_name', 'model_name_or_path'])
model_and_task_list = dfo.groupby('model_name_or_path')['task_name'].agg(list).to_dict()

cmds = []
for model_name_or_path, task_name_list in model_and_task_list.items():
    model_name_or_path = model_name_or_path.rstrip('/')
    for task_name in task_name_list:
        task_config = task_configs[task_name]
        job_name = f'eval.{task_name}'
        save_dir = f'{model_name_or_path}/eval/{task_name}'
        os.makedirs(save_dir, exist_ok=True)

        # The script's stdout is redirected into the task's result directory.
        cmd = f"bash {eval_script_dir}/{task_config.bash_script_name}.sh {model_name_or_path} > {os.path.join(save_dir,'bash_script_log.txt')}"
        cmd = multiline_to_singleline(cmd)
        cmds.append(cmd)

        shell_scripts = shell_scripts_template.format(
            profile='/fsx/wpq/.profile_local.sh',
            conda_env='llava',
            cwd=os.getcwd(),
            cmd=cmd,
            log_dir=log_dir,
            save_dir=save_dir
        )
        out = submit_job_slurm(
            shell_scripts,
            job_name=job_name,
            partition='learnai4p',
            nodes=nodes,
            num_cpus=num_cpus,
            cpu_mem=cpu_mem,
            # Per-task GPU count, passed directly so the notebook-level
            # `num_gpus` is not overwritten as a side effect.
            num_gpus=task_config.num_gpus,
            log_path=os.path.join(log_dir, '%J.out'),
            test_run=test_run,
        )
        # Attach the command to each returned record for easier inspection.
        for x in out:
            x.update({'cmd': cmd})

        print(json.dumps(out[0], indent=4))

cmds: #= 1
[('llavabench', 'results/baselines/liuhaotian/llava-v1.5-7b')] 


{
    "args": "sbatch /fsx/wpq/.sbatch/2024-06-14_00:42:06_b517823a-482c-4d57-9ed3-b2eca9be6c26.sh",
    "cmd": "bash scripts/v1_5/eval_mod/llavabench.sh results/baselines/liuhaotian/llava-v1.5-7b > results/baselines/liuhaotian/llava-v1.5-7b/eval/llavabench/bash_script_log.txt"
}


In [3]:
from utils import TaskResult
    
        

# Pick the task and the run whose stored evaluation results should be loaded.
task_name = 'mme'
# model_dir = '/fsx/wpq/github/metasummer2024/external/LLaVA/results/sft_1/lm=Llama-2-7b-hf_vis=clip-vit-large-patch14-336_mm=mlp2x_gelu'
model_dir = '/fsx/wpq/github/metasummer2024/external/LLaVA/results/baselines/liuhaotian/llava-v1.5-7b'
save_dir = f'{model_dir}/eval/{task_name}'

# Bare last expression so the notebook renders the returned metrics.
r = TaskResult(save_dir)
r.get_metrics()

{'perception': {'total_score': 1506.7612044817927,
  'existence_score': 190.0,
  'count_score': 155.0,
  'position_score': 128.33333333333334,
  'color_score': 170.0,
  'posters_score': 147.61904761904762,
  'celebrity_score': 137.05882352941177,
  'scene_score': 158.0,
  'landmark_score': 163.75,
  'artwork_score': 119.5,
  'ocr_score': 137.5},
 'cognition': {'total_score': 355.7142857142857,
  'commonsense_reasoning_score': 110.71428571428571,
  'numerical_calculation_score': 70.0,
  'text_translation_score': 107.5,
  'code_reasoning_score': 67.5}}



{'all': {'score_model/score_ref': 65.3,
  'score_ref': 90.8,
  'score_model': 59.3},
 'llava_bench_complex': {'score_model/score_ref': 79.3,
  'score_ref': 87.1,
  'score_model': 69.1},
 'llava_bench_conv': {'score_model/score_ref': 51.5,
  'score_ref': 96.5,
  'score_model': 49.7},
 'llava_bench_detail': {'score_model/score_ref': 56.9,
  'score_ref': 91.3,
  'score_model': 52.0}}