In [1]:
# Print the running IPython version so the environment is recorded in the
# notebook output.
import IPython
print(IPython.__version__)  # Check your IPython version

# Load the autoreload extension and switch it to mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

8.18.1


In [2]:
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoConfig
import json
from typing import List, Optional, Tuple, Dict
import time
from dataclasses import dataclass
import lm_eval
from lm_eval import evaluator, tasks, utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.tasks import get_task_dict
from functools import partial
from tqdm import tqdm
import numpy as np
import multiprocessing
import ftfy
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from itertools import zip_longest

# Record the library versions next to the results; a single print call with a
# newline separator emits the same two lines as two separate prints.
print(
    f"torch version: {torch.__version__}",
    f"transformers version: {transformers.__version__}",
    sep="\n",
)

  Referenced from: <4C793A59-B32A-3AF1-BEA5-03AD7C5C80C6> /opt/anaconda3/envs/deep-learning-env/lib/python3.9/site-packages/torchvision/image.so
  warn(


torch version: 2.6.0.dev20241112
transformers version: 4.46.3


In [5]:
import os
import sys

# Make the project packages (data, models, scripts) importable whether the
# notebook is launched from the repository root or from a sub-directory.
# Only entries that are not already present are appended, so re-running this
# cell does not keep growing sys.path with duplicates.
sys.path.extend(
    [entry for entry in ("../", "./") if entry not in sys.path]
)

In [10]:
from data.config import BenchmarkConfig, DatasetConfig
from data.data import DatasetManager, ModelDataset
from data.metrics import BenchmarkMetrics
from models.h2o.h2o_gptneox import GPTNeoXAttention_Mask, convert_kvcache_gpt_neox_heavy_recent
from models.h2o.h2o_llama import LlamaAttention_heavy_hitter, convert_kvcache_llama_heavy_recent
from models.h2o.h2o_opt import OPTAttention_Mask, convert_kvcache_opt_heavy_recent
from models.base_models import ModelLoader
from scripts.run_benchmark import run_single_strategy_benchmark
import copy


In [7]:
# Choose the compute backend by priority: Apple MPS first, then CUDA, and
# finally the CPU when no accelerator is available.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


In [12]:
# Configuration for a quick smoke run: LLaMA-7B on a 10-example slice of the
# SuperGLUE COPA test split (split spec "test[:10]").
# The device is the value chosen by the device-detection cell above rather than
# a hard-coded "mps", so this cell also runs on CUDA or CPU-only hosts.
base_config = BenchmarkConfig(
    model_name="huggyllama/llama-7b",
    model_type="llama",
    device=device,
    sequence_length=256,
    max_tokens=32,
    temperature=0.7,
    datasets=[
        DatasetConfig(
            name="super_glue",
            config="copa",
            splits=["test[:10]"],
            input_prefix="Question: ",
            output_prefix="Answer: "
        )
    ]
)

In [16]:
# Baseline run: work on a deep copy so the shared base_config template is not
# modified, select the "default" attention type, and benchmark the "full"
# strategy with cache_size=100 (logged by the runner as "100% cache").
full_config = copy.deepcopy(base_config)
full_config.attention_type = "default"
run_single_strategy_benchmark(full_config, strategy="full", cache_size=100)


Testing full strategy with 100% cache
Cleaning up memory...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Cleaning up memory...


KeyboardInterrupt: 

: 

In [None]:
# Base configuration template shared by every strategy below: LLaMA-7B on the
# SuperGLUE COPA "test" split. The device is the value chosen by the
# device-detection cell earlier in the notebook instead of a hard-coded
# "cuda", so the sweep runs on whichever backend is actually available.
base_config = BenchmarkConfig(
    model_name="huggyllama/llama-7b",
    model_type="llama",
    device=device,
    sequence_length=256,
    max_tokens=32,
    temperature=0.7,
    datasets=[
        DatasetConfig(
            name="super_glue",
            config="copa",
            splits=["test"],
            input_prefix="Question: ",
            output_prefix="Answer: "
        )
    ]
)

# Cache sizes swept for each reduced-cache strategy, in run order. The runner
# logs this value as a percentage (e.g. "100% cache").
CACHE_SIZES = (100, 80, 40, 20, 4)


def run_cache_sweep(config, strategy, cache_sizes=CACHE_SIZES):
    """Benchmark one strategy at each cache size, in the order given.

    Args:
        config: BenchmarkConfig already specialised for the strategy.
        strategy: Strategy name forwarded to run_single_strategy_benchmark.
        cache_sizes: Iterable of cache_size values; one benchmark run each.
    """
    for cache_size in cache_sizes:
        run_single_strategy_benchmark(config, strategy=strategy, cache_size=cache_size)


# Full attention (baseline)
full_config = copy.deepcopy(base_config)
full_config.attention_type = "default"
run_single_strategy_benchmark(full_config, strategy="full", cache_size=100)

# H2O (Heavy-Hitter + Recent)
h2o_config = copy.deepcopy(base_config)
h2o_config.attention_type = "h2o"
h2o_config.heavy_ratio = 0.1
h2o_config.recent_ratio = 0.1
run_cache_sweep(h2o_config, "h2o")

# Streaming Attention
streaming_config = copy.deepcopy(base_config)
streaming_config.attention_type = "streaming"
streaming_config.window_size = 64
streaming_config.sink_size = 4
streaming_config.sink_update_rate = 0.1
run_cache_sweep(streaming_config, "streaming")

# Local/Fixed Window
local_config = copy.deepcopy(base_config)
local_config.attention_type = "local"
local_config.window_size = 64
run_cache_sweep(local_config, "local")

# Liquid Fusion
liquid_config = copy.deepcopy(base_config)
liquid_config.attention_type = "liquid_fusion"
liquid_config.window_size = 64
liquid_config.sink_size = 2
liquid_config.sink_update_rate = 0.1
liquid_config.heavy_ratio = 0.1
liquid_config.recent_ratio = 0.1
# NOTE(review): unlike the other strategies, the original sweep stopped at 20
# here (no cache_size=4 run). Kept as-is; confirm whether that was deliberate.
run_cache_sweep(liquid_config, "liquid_fusion", cache_sizes=(100, 80, 40, 20))
