# Parallel Wrapper Evaluation

## Environment

### Imports

In [ ]:
import os
import sys

# `sys.path` entries are used verbatim (no shell-style `~` expansion), so the
# home directory has to be expanded explicitly for the entry to resolve.
sys.path.append(os.path.expanduser('~/Projects/transformer_wrappers/src'))

In [ ]:
import os
import itertools
from datetime import datetime
import json
import pandas as pd

In [ ]:
import torch

In [ ]:
from transformer_wrappers.wrappers import ParallelCausalLMWrapper

In [ ]:
import lm_eval

In [ ]:
from typing import List, Dict

In [ ]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

### Constants and globals

In [ ]:
# Hugging Face access token; passed as `token` in MODEL_CONFIGS and TOKENIZER_CONFIGS.
TOKEN = None  # HF Token

In [ ]:
# Root directory for all experiment outputs. `~` is expanded here because
# `os.path.exists`, `os.mkdir` and `open` treat it as a literal directory name.
EXPERIMENTS_DIR_PATH: str = os.path.expanduser('~/Projects/transformer_wrappers/experiments')
# Name of the sub-directory of EXPERIMENTS_DIR_PATH that holds this series' JSON results.
EXPERIMENT_SERIES_ID: str = 'parallel_wrapper'

In [ ]:
# Task names passed as `tasks` to `lm_eval.simple_evaluate`; also compared against
# the stored result keys to decide whether a run is already complete.
BENCHMARKS: List[str] = ['hellaswag']

In [ ]:
# Batch size passed as `batch_size` to `lm_eval.simple_evaluate`.
BATCH_SIZE: int = 8

In [ ]:
# Checkpoint under evaluation; the commented alternatives can be swapped in.
MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'
# MODEL = 'meta-llama/Llama-2-7b-hf'
# MODEL = 'google/gemma-7b'

# Keyword arguments handed to the wrapper as `model_kwargs` when loading the model.
MODEL_CONFIGS = dict(
    torch_dtype=torch.bfloat16,
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device_map=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    token=TOKEN,
    # Optional 4-bit quantisation settings (currently disabled):
    # load_in_4bit=True,
    # bnb_4bit_use_double_quant=True,
    # bnb_4bit_quant_type='nf4',
    # bnb_4bit_compute_dtype=torch.bfloat16,
)

# The tokenizer is loaded from the same checkpoint as the model.
TOKENIZER = MODEL
# Keyword arguments handed to the wrapper as `tokenizer_kwargs`.
TOKENIZER_CONFIGS = dict(token=TOKEN)

# Names of the wrapper attributes swept by the experiment grid, in grid order.
WRAPPER_CONFIGS_KEYS: List[str] = ['p_rate', 'block_parallel', 'iterative', 'scaling']

In [ ]:
# Full grid of wrapper settings: the first key takes the values 2 and 4, each of the
# remaining three keys takes True and False. Every grid point becomes one dict
# keyed by WRAPPER_CONFIGS_KEYS.
experiment_configs: List[Dict] = [
    dict(zip(WRAPPER_CONFIGS_KEYS, values))
    for values in itertools.product((2, 4), *([(True, False)] * 3))
]

### Helper functions

In [ ]:
def save_res(results, model, config):
    """Write one evaluation run to a timestamped JSON file.

    The file is created under EXPERIMENTS_DIR_PATH/EXPERIMENT_SERIES_ID and is
    named after the current local time (second resolution). It contains every
    entry of `results` except its 'config' entry, plus the 'model' and 'config'
    values supplied by the caller.

    Args:
        results: Mapping produced by the evaluation run; must be JSON-serialisable
            once its 'config' entry is dropped.
        model: Identifier of the evaluated model, stored under 'model'.
        config: Wrapper configuration of the run, stored under 'config'.
    """
    dir_path = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID)
    # makedirs also creates missing parent directories and tolerates an
    # already-existing target, unlike a chain of exists()/mkdir() calls.
    os.makedirs(dir_path, exist_ok=True)
    # Format the timestamp outside the f-string: reusing the same quote character
    # inside an f-string expression is a syntax error before Python 3.12.
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    file_path = os.path.join(dir_path, f'{timestamp}.json')
    # The caller-supplied config replaces the one embedded in `results`.
    payload = {k: v for k, v in results.items() if k != 'config'}
    payload['model'] = model
    payload['config'] = config
    with open(file_path, 'w') as f:
        json.dump(payload, f)


In [ ]:
def load_res():
    """Load the 'results' entry of every stored run of this experiment series.

    Returns:
        A list with one item per `.json` file found in
        EXPERIMENTS_DIR_PATH/EXPERIMENT_SERIES_ID: the value stored under that
        file's 'results' key.

    Raises:
        FileNotFoundError: If the series directory does not exist.
        KeyError: If a file has no 'results' entry.
    """
    dir_path = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID)
    experiment_results = []
    # Sorted so the output order does not depend on the filesystem's listing order.
    for file_name in sorted(os.listdir(dir_path)):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(dir_path, file_name), 'r') as f:
            experiment_results.append(json.load(f)['results'])
    # The collected results were previously dropped (the function returned None).
    return experiment_results

In [ ]:
def check_res_exists(model, config):
    """Tell whether a stored run already covers `model` evaluated with `config`.

    A stored run matches when its 'model' and 'config' entries equal the given
    arguments and the keys of its 'results' entry are exactly the BENCHMARKS.

    Args:
        model: Model identifier to look for.
        config: Wrapper configuration to look for.

    Returns:
        True if a matching `.json` file exists in the series directory, False
        otherwise (including when the directory itself is missing).
    """
    series_dir = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID)
    if not (os.path.exists(EXPERIMENTS_DIR_PATH) and os.path.exists(series_dir)):
        return False
    for entry in os.listdir(series_dir):
        if not entry.endswith('.json'):
            continue
        with open(os.path.join(series_dir, entry), 'r') as f:
            stored = json.load(f)
        # Reject on the first mismatching field, in the order model -> config -> benchmarks
        if stored['model'] != model:
            continue
        if stored['config'] != config:
            continue
        if set(BENCHMARKS) == set(stored['results'].keys()):
            return True
    return False

In [ ]:
def load_res_df():
    """Collect the stored runs of this experiment series into a DataFrame.

    Each `.json` file in EXPERIMENTS_DIR_PATH/EXPERIMENT_SERIES_ID contributes
    one row per benchmark listed under its 'results' entry.

    Returns:
        A pandas DataFrame with the columns 'benchmark', 'score' (the benchmark's
        'acc_norm,none' metric), 'model', and one column per key of the run's
        'config' entry. The frame is empty when no `.json` file is present.

    Raises:
        FileNotFoundError: If the series directory does not exist.
        KeyError: If a file lacks an expected entry.
    """
    dir_path = os.path.join(EXPERIMENTS_DIR_PATH, EXPERIMENT_SERIES_ID)
    rows = []
    # Sorted so the row order does not depend on the filesystem's listing order.
    for file_name in sorted(os.listdir(dir_path)):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(dir_path, file_name), 'r') as f:
            run = json.load(f)
        # Per-benchmark metrics live under the 'results' entry, not at the top
        # level of the stored document.
        for benchmark, metrics in run['results'].items():
            rows.append({
                'benchmark': benchmark,
                'score': metrics['acc_norm,none'],
                'model': run['model'],
                **run['config'],
            })
    return pd.DataFrame(rows)

## Experiment 1:

Restrict the attention mask so that the self-attention layers can "see" only 0, 1, 2, 4, 8, ..., 64, 128, 256, ..., 8k tokens into the past.

### Model

Load model

In [ ]:
# Build the wrapper from the checkpoint named by MODEL, forwarding the model and
# tokenizer keyword arguments defined in the constants section.
model = ParallelCausalLMWrapper.from_pretrained(
    MODEL, 
    model_kwargs=MODEL_CONFIGS,
    tokenizer_kwargs=TOKENIZER_CONFIGS
)
# NOTE(review): presumably puts the wrapper into the mode needed by the evaluation
# harness; confirm against the transformer_wrappers documentation.
model.enable_benchmarking()

### Evaluation

In [ ]:
for config in experiment_configs:
    # Skip configurations whose results are already stored on disk
    if check_res_exists(MODEL, config):
        continue
    # Apply this configuration's settings to the wrapped transformer
    for k, v in config.items():
        setattr(model.transformer_wrapper, k, v)
    # Run the benchmarks on the already-loaded (wrapped) model
    results = lm_eval.simple_evaluate(
        model='hf',
        model_args={
            'pretrained': model,
            'tokenizer': model.tokenizer,
            'backend': 'causal',
        },
        # String-form alternative that loads an unwrapped checkpoint instead:
        # model_args='pretrained=mistralai/Mistral-7B-Instruct-v0.2,attn_implementation=eager,device_map=cuda',
        tasks=BENCHMARKS,
        batch_size=BATCH_SIZE,
        log_samples=True,
    )
    # Persist the results together with the model name and configuration
    save_res(results, MODEL, config)

### Results

Gather experiment results

In [ ]:
# Load every stored run into one table (one row per run and benchmark) and display it.
results_df = load_res_df()
results_df

In [ ]:
# Use the same filesystem-safe timestamp format as the per-run JSON files: the
# default string form of a datetime contains spaces and colons, which are awkward
# in file names and not allowed on every filesystem.
results_df.to_csv(
    os.path.join(
        EXPERIMENTS_DIR_PATH,
        f"results_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.csv",
    )
)

Visualise results

In [ ]:
# `sns.lineplot` is an axes-level function without a `col` parameter, so the
# per-benchmark facets are drawn with the figure-level `sns.relplot` instead.
# The x axis uses `p_rate`, one of the configuration columns of `results_df`;
# the table has no 'Attention size' column.
# TODO: also encode block_parallel / iterative / scaling (e.g. via `style=` or
# `row=`); as written, runs sharing a p_rate are aggregated into a single line.
grid = sns.relplot(
    data=results_df,
    kind='line',
    x='p_rate',
    y='score',
    hue='model',
    col='benchmark',
)
# Keep the fixed [0, 1] score range. The former log-scaled x axis limited to
# [0, 1024] is dropped: 0 is not a valid bound on a log axis and the range does
# not match the p_rate values being swept.
grid.set(ylim=(0, 1))
# Expose the figure under the name `fig` so it can be saved afterwards.
fig = grid.figure
plt.show()

In [ ]:
# Save the figure as a PDF in the experiments root directory
fig.savefig(os.path.join(EXPERIMENTS_DIR_PATH, 'results_attn.pdf'))