# Resizable Wrapper Evaluation

## Environment

### Imports

In [None]:
import sys
sys.path.append('')

In [None]:
import os
import itertools
from datetime import datetime
import json
import yaml
import pandas as pd
import hashlib

In [None]:
import torch

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
from trl import SFTTrainer

In [None]:
from transformer_wrappers.wrappers import ResizableCausalLMWrapper, CausalLMWrapper
from transformer_wrappers.wrappers.resizable import ResizableTokenizer

In [None]:
from transformer_wrappers.data import OpenAssistantGuanaco

In [None]:
import lm_eval

In [None]:
from typing import Dict, List, Optional

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

### Constants and globals

In [None]:
# Access token passed as `token` to both the model and the tokenizer loaders
# (see MODEL_CONFIGS and TOKENIZER_CONFIGS). Do not commit a real token.
TOKEN = None  # HF Token

In [None]:
# Root directory under which every experiment artefact is written.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
EXPERIMENTS_DIR_PATH: str = '/home/vincenzoscotti/Projects/transformer_wrappers/experiments'
# Sub-directory of EXPERIMENTS_DIR_PATH grouping this series of runs; the shared
# `results.csv` is stored directly inside it.
EXPERIMENT_SERIES_ID: str = 'resizable_wrapper'

In [None]:
# Task names handed, one at a time, to `lm_eval.simple_evaluate(tasks=[...])`.
BENCHMARKS: List[str] = ['hellaswag', 'winogrande', 'truthfulqa_mc1', 'gsm8k', 'arc_challenge', 'mmlu']

In [None]:
# Batch size passed to `lm_eval.simple_evaluate` in both evaluation loops.
BATCH_SIZE: int = 8

In [None]:
# Checkpoint identifier used for loading the model and, with '/' replaced by
# '-', as the name of the per-model experiment directory.
MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'
# MODEL = 'meta-llama/Llama-2-7b-hf'
# MODEL = 'google/gemma-7b'
# Keyword arguments used when loading the model.
MODEL_CONFIGS = {
    'torch_dtype': torch.bfloat16,
    'device_map': 'auto',  # torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'token': TOKEN
}
# Keyword arguments for `BitsAndBytesConfig` (only used in the fine-tuning section).
QUANTIZATION_CONFIGS = {
    'load_in_4bit': True,
    'bnb_4bit_use_double_quant': True,
    'bnb_4bit_quant_type': 'nf4',
    'bnb_4bit_compute_dtype': torch.bfloat16
}
# The tokenizer is loaded from the same checkpoint as the model.
TOKENIZER = MODEL
# NOTE(review): '</s>' as padding token presumably matches the Mistral/Llama
# vocabularies -- confirm before switching MODEL to another family.
TOKENIZER_CONFIGS = {'token': TOKEN, 'pad_token': '</s>'}
# Names of the wrapper attributes swept during evaluation ...
WRAPPER_CONFIGS_KEYS: List[str] = ['max_token_len']
# ... and, in the same order, the candidate values for each of them. The
# Cartesian product of these lists defines the evaluated configurations;
# `None` is a valid value for `max_token_len`.
WRAPPER_CONFIGS_VALUES: List[List[Optional[int]]] = [[1, 2, 3, 4, 5, 6, 7, 8, None]]

In [None]:
# Keyword arguments for `peft.LoraConfig` (adapter used in the fine-tuning section).
LORA_CONFIGS = {
    'lora_alpha': 16,
    'lora_dropout': 0.1,
    'r': 16,
    'bias': 'none',
    'task_type': 'CAUSAL_LM'
}
# Keyword arguments for `transformers.TrainingArguments`; `output_dir` and
# `logging_dir` are supplied separately where the arguments are instantiated.
TRAINING_ARGS = {
    'num_train_epochs': 3,
    # Batch of 1 with 32 accumulation steps.
    'per_device_train_batch_size': 1,
    'gradient_accumulation_steps': 32,
    'per_device_eval_batch_size': 1,
    'gradient_checkpointing': True,
    'optim': 'paged_adamw_32bit',
    'logging_steps': 50,
    'report_to': 'tensorboard',
    # Save and evaluate on the same schedule (once per epoch).
    'save_strategy': 'epoch',
    # NOTE(review): newer `transformers` releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    'evaluation_strategy': 'epoch',
    'load_best_model_at_end': True,
    'learning_rate': 2.e-5,
    'bf16': True,
    'tf32': True,
    'max_grad_norm': 1.0,
    'warmup_ratio': 0.03,
    'lr_scheduler_type': 'cosine',
    # 'disable_tqdm': True
}

In [None]:
# Directory holding the artefacts of the current model:
#   <EXPERIMENTS_DIR_PATH>/<EXPERIMENT_SERIES_ID>/<MODEL with '/' -> '-'>
current_experiments_dir = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID, MODEL.replace('/', '-'))
# Create the whole chain in one call: missing parents are created as needed and
# already existing directories are left untouched.
os.makedirs(current_experiments_dir, exist_ok=True)

### Helper functions

In [None]:
def get_configs_hash(configs):
    """Return the hexadecimal SHA-256 digest of ``str(configs)``.

    The digest names the per-configuration result directories. It is computed
    on the string form of *configs*, so two dicts with the same items in a
    different insertion order produce different digests.
    """
    hasher = hashlib.sha256()
    hasher.update(str(configs).encode())
    return hasher.hexdigest()

In [None]:
def append_results(**kwargs):
    """Append one row to the series-wide ``results.csv``.

    The keyword arguments become the columns of the new row. If the file
    already exists its content is loaded and the row is concatenated to it
    (columns missing on either side are left empty); otherwise the file is
    created with this single row. The file is rewritten without the index
    column.
    """
    file_path = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID, 'results.csv')
    # A list with a single dict yields a one-row frame keyed by the dict's keys.
    row_df = pd.DataFrame([kwargs])
    if os.path.exists(file_path):
        df = pd.concat([pd.read_csv(file_path), row_df], ignore_index=True)
    else:
        df = row_df
    df.to_csv(file_path, index=False)

In [None]:
def load_results_df():
    """Load the series-wide ``results.csv`` into a DataFrame.

    The file is written without an index column, so the default integer index
    is used when reading it back.
    """
    file_path = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID, 'results.csv')
    # `read_csv` has no `index` parameter; passing one raises a TypeError.
    return pd.read_csv(file_path)

In [None]:
def save_results(results, configs, benchmark):
    """Persist the outcome of one benchmark run for one wrapper configuration.

    Writes, inside ``<current_experiments_dir>/<hash of configs>``:
    ``configs.yml`` (only if not already there) and ``<benchmark>.json`` with
    every entry of *results* except ``'config'``. The metrics found under
    ``results['results'][benchmark]`` are then appended, together with
    *configs*, to the shared CSV.
    """
    run_dir = os.path.join(current_experiments_dir, get_configs_hash(configs))
    if not os.path.exists(run_dir):
        os.mkdir(run_dir)

    # The configuration file is written once per directory
    yaml_path = os.path.join(run_dir, 'configs.yml')
    if not os.path.exists(yaml_path):
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(configs, yaml_file)

    # Benchmark output, minus the 'config' entry
    with open(os.path.join(run_dir, f'{benchmark}.json'), 'w') as json_file:
        payload = {key: value for key, value in results.items() if key != 'config'}
        json.dump(payload, json_file)

    # One CSV row per (model, benchmark, configuration)
    metrics = results['results'][benchmark]
    append_results(model=MODEL, benchmark=benchmark, **metrics, **configs)

In [None]:
def check_results_exist(configs, benchmark):
    """Tell whether ``<benchmark>.json`` already exists for *configs*.

    The file is looked up in the directory named after the hash of *configs*
    inside ``current_experiments_dir``.
    """
    return os.path.exists(
        os.path.join(current_experiments_dir, get_configs_hash(configs), f'{benchmark}.json')
    )

In [None]:
def setup_fine_tuning(configs):
    """Create the directory tree of a fine-tuning run and record its settings.

    Layout: ``<current_experiments_dir>/FT-<hash of configs>/<timestamp>/`` with
    a ``logs`` sub-directory and a ``configs.yml`` describing the run (wrapper
    configs, model, tokenizer, quantisation, LoRA and training settings).

    Access tokens are not written to disk: any non-``None`` ``'token'`` entry of
    the model/tokenizer settings is replaced by ``'<redacted>'``.

    Returns the path of the timestamped run directory. Raises
    ``FileExistsError`` if a run directory with the same timestamp (one-second
    resolution) already exists.
    """
    def _dumpable(settings, stringify=()):
        # Copy of *settings* with secrets masked and the values listed in
        # *stringify* (objects YAML cannot represent cleanly) converted to str.
        cleaned = {}
        for key, value in settings.items():
            if key == 'token' and value is not None:
                cleaned[key] = '<redacted>'
            elif key in stringify:
                cleaned[key] = str(value)
            else:
                cleaned[key] = value
        return cleaned

    # Create dir (if necessary)
    dir_path = os.path.join(current_experiments_dir, f'FT-{get_configs_hash(configs)}')
    os.makedirs(dir_path, exist_ok=True)
    # Every call gets its own timestamped directory
    instance_dir_path = os.path.join(dir_path, datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
    os.mkdir(instance_dir_path)
    os.mkdir(os.path.join(instance_dir_path, 'logs'))
    # Save configs (always: the run directory has just been created)
    configs_file_path = os.path.join(instance_dir_path, 'configs.yml')
    with open(configs_file_path, 'w') as f:
        yaml.dump(
            {
                'wrapper_configs': configs,
                'model': MODEL,
                'model_configs': _dumpable(MODEL_CONFIGS, stringify={'torch_dtype', 'device_map'}),
                'quantization_configs': _dumpable(QUANTIZATION_CONFIGS, stringify={'bnb_4bit_compute_dtype'}),
                'tokenizer': TOKENIZER,
                'tokenizer_configs': _dumpable(TOKENIZER_CONFIGS),
                'lora_configs': LORA_CONFIGS,
                'training_args': TRAINING_ARGS
            },
            f
        )

    return instance_dir_path

In [None]:
def save_fine_tuning_results(results, configs, benchmark, istance_dir_path):
    """Persist the outcome of one benchmark run of the fine-tuned model.

    Writes ``<benchmark>.json`` (every entry of *results* except ``'config'``)
    into *istance_dir_path*, then appends the metrics found under
    ``results['results'][benchmark]``, together with *configs*, to the shared CSV.
    """
    json_path = os.path.join(istance_dir_path, f'{benchmark}.json')
    with open(json_path, 'w') as json_file:
        payload = {key: value for key, value in results.items() if key != 'config'}
        json.dump(payload, json_file)

    # One CSV row per (model, benchmark, configuration)
    metrics = results['results'][benchmark]
    append_results(model=MODEL, benchmark=benchmark, **metrics, **configs)

## Experiment:

### Model

Load model

In [None]:
# Load the checkpoint through the resizable wrapper; MODEL_CONFIGS and
# TOKENIZER_CONFIGS are handed over as `model_kwargs` / `tokenizer_kwargs`.
model = ResizableCausalLMWrapper.from_pretrained(
# model = CausalLMWrapper.from_pretrained(
    MODEL, 
    model_kwargs=MODEL_CONFIGS,
    tokenizer_kwargs=TOKENIZER_CONFIGS
)
# NOTE(review): wrapper-specific switch, presumably required before handing the
# wrapper to lm_eval below -- confirm in transformer_wrappers.
model.enable_benchmarking()

### Evaluation

In [None]:
# One dict per point of the Cartesian product of the candidate values, keyed by
# the corresponding wrapper attribute name.
experiment_configs: List[Dict] = [
    dict(zip(WRAPPER_CONFIGS_KEYS, values))
    for values in itertools.product(*WRAPPER_CONFIGS_VALUES)
]

In [None]:
# Evaluate every wrapper configuration on every benchmark, skipping the
# (configuration, benchmark) pairs whose results are already on disk.
for benchmark in BENCHMARKS:
    # Name the benchmark in the progress header so the log can be followed
    print(f"# Benchmark: {benchmark}")
    # Iterate over configs
    for configs in experiment_configs:
        print(f"## Configs: {configs}")
        # Nothing to do if the results are already available
        if check_results_exist(configs, benchmark):
            continue
        # Set attribute values  # TODO fixme
        for k, v in configs.items():
            setattr(model, k, v)
        # Run evaluation
        results = lm_eval.simple_evaluate(
            model="hf",
            model_args={'pretrained': model, 'tokenizer': model.tokenizer, 'backend': 'causal'},
            # model_args='pretrained=mistralai/Mistral-7B-Instruct-v0.2,attn_implementation=eager,device_map=cuda',
            tasks=[benchmark],
            batch_size=BATCH_SIZE,
            log_samples=False,
        )
        # Save results
        save_results(results, configs, benchmark)

## Fine-tuning

In [None]:
# Configuration recorded for the fine-tuning run: it names the run directory
# (through its hash) and is appended to the results CSV with each benchmark.
configs = {'max_token_len': 1, 'fine_tuning': True}

### Model

Load model

In [None]:
# Base model, loaded with the quantisation settings of QUANTIZATION_CONFIGS
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    **MODEL_CONFIGS,
    quantization_config=BitsAndBytesConfig(**QUANTIZATION_CONFIGS)
)
# The token length limit comes from `configs` so that the value recorded with
# the results is always the one the tokenizer was actually built with.
tokenizer = ResizableTokenizer(
    AutoTokenizer.from_pretrained(TOKENIZER, **TOKENIZER_CONFIGS),
    max_token_len=configs['max_token_len']
)

Adapters

In [None]:
# Adapter configuration built from the LORA_CONFIGS constants
lora_configs = LoraConfig(**LORA_CONFIGS)

In [None]:
# Prepare the quantised model for training, then wrap it with the LoRA adapter.
# From here on `model` is the PEFT-wrapped model.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_configs)

### Data

In [None]:
# One OpenAssistantGuanaco instance per split name, all built with the same
# (resizable) tokenizer.
data_splits = dict(
    (name, OpenAssistantGuanaco(name, tokenizer))
    for name in ['train', 'validation', 'test']
)

### Training

In [None]:
# Create the timestamped run directory (with its `logs` sub-directory and
# `configs.yml`) and keep its path for the trainer and the result files.
fine_tuning_dir = setup_fine_tuning(configs)

Trainer args

In [None]:
# NOTE(review): `output_dir` and `logging_dir` point to the same `logs`
# directory, so trainer outputs and logs end up together -- confirm this is intended.
training_args = TrainingArguments(
    output_dir=os.path.join(fine_tuning_dir, 'logs'),
    logging_dir=os.path.join(fine_tuning_dir, 'logs'),
    **TRAINING_ARGS
)

Trainer

In [None]:
# Supervised fine-tuning on the `text` column built from the train split, with
# the validation split used for evaluation.
# NOTE(review): `model` has already been wrapped with `get_peft_model`, yet
# `peft_config` is passed as well -- check how the installed TRL version
# handles the combination.
q_trainer = SFTTrainer(
    model=model,
    train_dataset=Dataset.from_dict({'text': data_splits['train'].data}),
    eval_dataset=Dataset.from_dict({'text': data_splits['validation'].data}),
    peft_config=lora_configs,
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_args,
    # data_collator=data_splits['train'].huggingface_collate
    # formatting_func=lambda x: x['text']
    dataset_text_field='text'
)

In [None]:
# Run the fine-tuning, then save the resulting model through the trainer
q_trainer.train()
q_trainer.save_model()

In [None]:
# Evaluate on the trainer's evaluation dataset (the validation split).
# The trainer exposes `evaluate()`; it has no `eval()` method.
q_trainer.evaluate()

### Evaluation

In [None]:
# Evaluate the fine-tuned model on every benchmark and store each result in the
# fine-tuning run directory (and in the shared CSV).
for benchmark in BENCHMARKS:
    # Name the benchmark in the progress header so the log can be followed
    print(f"# Benchmark: {benchmark}")
    # Run evaluation
    results = lm_eval.simple_evaluate(
        model="hf",
        model_args={'pretrained': model, 'tokenizer': tokenizer, 'backend': 'causal'},
        # model_args='pretrained=mistralai/Mistral-7B-Instruct-v0.2,attn_implementation=eager,device_map=cuda',
        tasks=[benchmark],
        batch_size=BATCH_SIZE,
        log_samples=False,
    )
    # Save results
    save_fine_tuning_results(results, configs, benchmark, fine_tuning_dir)

## Results

Gather the experiment results

In [None]:
# Load the shared results CSV (one row per model / benchmark / configuration)
# and display it.
results_df = load_results_df()
results_df

In [None]:
null_acc_norm_mask = results_df['acc_norm,none'].isnull()
null_acc_norm_mask
results_df['accuracy'] = results_df['acc,none']
results_df.loc[~null_acc_norm_mask, 'accuracy'] = results_df[~null_acc_norm_mask]['acc_norm,none'].values

null_max_token_len_mask = results_df['max_token_len'].isnull()
null_max_token_len_mask
results_df.loc[null_max_token_len_mask, 'max_token_len'] = 5
results_df['max_token_len'] = results_df['max_token_len'].astype(int)

results_df

In [None]:
fig = plt.figure()
sns.relplot(
    data=results_df,
    # col='p_rate',
    y='accuracy',
    hue='benchmark',
    x='max_token_len',
    # size=...,
    # hue='iterative',
    style='model',
    kind='line',
    markers=True
    # row=...
    # xlim=[-0.1, 1.0],
    # ylim=[0.0, 1.1]
)
plt.xlim([0.5, 5.5])
plt.xticks(range(1, 6), [1,2,3,4,'Original'])
plt.ylim([0.0, 1.0])
# plt.hlines(0.8366859191396137, 0, 1, colors='g', linestyles='dashed', label='base_model')
plt.grid()
# plt.legend(loc='center')
plt.show()