In [1]:
import IPython
print(IPython.__version__)  # Check your IPython version

%load_ext autoreload
%autoreload 2

8.18.1


In [2]:
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoConfig
import json
from typing import List, Optional, Tuple, Dict
import time
from dataclasses import dataclass
import lm_eval
from lm_eval import evaluator, tasks, utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.tasks import get_task_dict
from functools import partial
from tqdm import tqdm
import numpy as np
import multiprocessing
import ftfy
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from itertools import zip_longest

print(f"torch version: {torch.__version__}")
print(f"transformers version: {transformers.__version__}")

  Referenced from: <4C793A59-B32A-3AF1-BEA5-03AD7C5C80C6> /opt/anaconda3/envs/deep-learning-env/lib/python3.9/site-packages/torchvision/image.so
  warn(


torch version: 2.6.0.dev20241112
transformers version: 4.46.3


In [3]:
import sys, os
sys.path.append('../')
sys.path.append('./')

In [4]:
from data.config import BenchmarkConfig, DatasetConfig
from data.data import DatasetManager, ModelDataset
from data.metrics import BenchmarkMetrics
from models.h2o.h2o_gptneox import GPTNeoXAttention_Mask, convert_kvcache_gpt_neox_heavy_recent
from models.h2o.h2o_llama import LlamaAttention_heavy_hitter, convert_kvcache_llama_heavy_recent
from models.h2o.h2o_opt import OPTAttention_Mask, convert_kvcache_opt_heavy_recent
from models.base_models import ModelLoader
from scripts.run_benchmark import run_single_strategy_benchmark
import copy


In [5]:
device = (
    "mps" if torch.backends.mps.is_available() else
    "cuda" if torch.cuda.is_available() else
    "cpu"
)
print(f"Using device: {device}")

Using device: mps


In [6]:
base_config = BenchmarkConfig(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v0.1",
    model_type="llama",
    device="mps",
    sequence_length=256,
    max_tokens=32,
    temperature=0.7,
    datasets=[
        DatasetConfig(
            name="super_glue",
            config="copa",
            splits=["test[:10]"],
            input_prefix="Question: ",
            output_prefix="Answer: "
        )
    ]
)

In [7]:
full_config = copy.deepcopy(base_config)
full_config.attention_type = "default"
run_single_strategy_benchmark(full_config, strategy="full", cache_size=20)


Testing full strategy with 100% cache
Cleaning up memory...


config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

  with torch.cuda.amp.autocast():


Loading super_glue dataset (copa) (test[:10] split) with batch_size=1...
Dataset cached! Size: 10 examples


  with torch.cuda.amp.autocast():
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Processing examples:   0%|          | 0/10 [00:10<?, ?it/s]



Results saved to benchmark_results/full_TinyLlama-1.1B-Chat-v0.1_cache100_20241208_001421.json
Cleaning up memory...


In [None]:
result = run_single_strategy_benchmark(
    strategy_config, 
    strategy=strategy, 
    cache_size=cache_size
)

In [None]:
# Base configuration template
base_config = BenchmarkConfig(
    model_name="huggyllama/llama-7b",
    model_type="llama",
    device="cuda",
    sequence_length=256,
    max_tokens=32,
    temperature=0.7,
    datasets=[
        DatasetConfig(
            name="super_glue",
            config="copa",
            splits=["test"],
            input_prefix="Question: ",
            output_prefix="Answer: "
        )
    ]
)
# Full attention (baseline)
full_config = copy.deepcopy(base_config)
full_config.attention_type = "default"
run_single_strategy_benchmark(full_config, strategy="full", cache_size=100)

# H2O (Heavy-Hitter + Recent)
h2o_config = copy.deepcopy(base_config)
h2o_config.attention_type = "h2o"
h2o_config.heavy_ratio = 0.1
h2o_config.recent_ratio = 0.1
run_single_strategy_benchmark(h2o_config, strategy="h2o", cache_size=100)
run_single_strategy_benchmark(h2o_config, strategy="h2o", cache_size=80)
run_single_strategy_benchmark(h2o_config, strategy="h2o", cache_size=40)
run_single_strategy_benchmark(h2o_config, strategy="h2o", cache_size=20)
run_single_strategy_benchmark(h2o_config, strategy="h2o", cache_size=4)

# # Streaming Attention
streaming_config = copy.deepcopy(base_config)
streaming_config.attention_type = "streaming"
streaming_config.window_size = 64
streaming_config.sink_size = 4
streaming_config.sink_update_rate = 0.1
run_single_strategy_benchmark(streaming_config, strategy="streaming", cache_size=100)
run_single_strategy_benchmark(streaming_config, strategy="streaming", cache_size=80)
run_single_strategy_benchmark(streaming_config, strategy="streaming", cache_size=40)
run_single_strategy_benchmark(streaming_config, strategy="streaming", cache_size=20)
run_single_strategy_benchmark(streaming_config, strategy="streaming", cache_size=4)


# # Local/Fixed Window
local_config = copy.deepcopy(base_config)
local_config.attention_type = "local"
local_config.window_size = 64
run_single_strategy_benchmark(local_config, strategy="local", cache_size=100)
run_single_strategy_benchmark(local_config, strategy="local", cache_size=80)
run_single_strategy_benchmark(local_config, strategy="local", cache_size=40)
run_single_strategy_benchmark(local_config, strategy="local", cache_size=20)
run_single_strategy_benchmark(local_config, strategy="local", cache_size=4)

# # Liquid Fusion
liquid_config = copy.deepcopy(base_config)
liquid_config.attention_type = "liquid_fusion"
liquid_config.window_size = 64
liquid_config.sink_size = 2
liquid_config.sink_update_rate = 0.1
liquid_config.heavy_ratio = 0.1
liquid_config.recent_ratio = 0.1
run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=100)
run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=80)
run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=40)
run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=20)


In [8]:
liquid_config = copy.deepcopy(base_config)
liquid_config.attention_type = "liquid_fusion"
liquid_config.window_size = 64
liquid_config.sink_size = 2
liquid_config.sink_update_rate = 0.1
liquid_config.heavy_ratio = 0.1
liquid_config.recent_ratio = 0.1
# run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=100)
# run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=80)
# run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=40)
run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=20)


Testing liquid_fusion strategy with 20% cache
Cleaning up memory...
Converting to liquid_fusion attention...
Error: The size of tensor a (2048) must match the size of tensor b (256) at non-singleton dimension 0
Cleaning up memory...


In [55]:
import logging
import copy
from data.config import BenchmarkConfig, DatasetConfig

# Setup logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def test_liquid_fusion():
    # Base configuration with explicit position embeddings
    base_config = BenchmarkConfig(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v0.1",
        model_type="llama",
        device="mps",
        sequence_length=256,  # Match TinyLlama's architecture
        max_tokens=32,
        temperature=0.7,
        max_position_embeddings=256,  # TinyLlama's default
        datasets=[
            DatasetConfig(
                name="super_glue",
                config="copa",
                splits=["test"],
                input_prefix="Question: ",
                output_prefix="Answer: ",
                max_samples=5
            )
        ]
    )

    # Configure Liquid Fusion with matching dimensions
    liquid_config = copy.deepcopy(base_config)
    liquid_config.attention_type = "liquid_fusion"
    liquid_config.window_size = 64
    liquid_config.sink_size = 2
    liquid_config.sink_update_rate = 0.1
    liquid_config.heavy_ratio = 0.1
    liquid_config.recent_ratio = 0.1
    
    logger.info("Configuration created:")
    logger.info(f"Model: {liquid_config.model_name}")
    logger.info(f"Sequence length: {liquid_config.sequence_length}")
    logger.info(f"Max position embeddings: {liquid_config.max_position_embeddings}")
    logger.info(f"Window size: {liquid_config.window_size}")
    
    try:
        from scripts.run_benchmark import run_single_strategy_benchmark
        logger.info("Starting benchmark with cache_size=20")
        result = run_single_strategy_benchmark(liquid_config, strategy="liquid_fusion", cache_size=20)
        logger.info("Benchmark completed successfully")
        return result
    except Exception as e:
        logger.error(f"Error during benchmark: {str(e)}", exc_info=True)
        raise

if __name__ == "__main__":
    test_liquid_fusion()

2024-12-08:01:29:47,121 INFO     [2674569922.py:40] Configuration created:
2024-12-08:01:29:47,122 INFO     [2674569922.py:41] Model: TinyLlama/TinyLlama-1.1B-Chat-v0.1
2024-12-08:01:29:47,122 INFO     [2674569922.py:42] Sequence length: 256
2024-12-08:01:29:47,123 INFO     [2674569922.py:43] Max position embeddings: 256
2024-12-08:01:29:47,123 INFO     [2674569922.py:44] Window size: 64
2024-12-08:01:29:47,123 INFO     [2674569922.py:48] Starting benchmark with cache_size=20



Testing liquid_fusion strategy with 20% cache
Cleaning up memory...
Converting to liquid_fusion attention...
Loading super_glue dataset (copa) (test split) with batch_size=1...
Dataset cached! Size: 500 examples


Processing examples:   0%|          | 0/500 [00:01<?, ?it/s]



Results saved to benchmark_results/liquid_fusion_TinyLlama-1.1B-Chat-v0.1_cache20_20241208_012957.json
Cleaning up memory...


2024-12-08:01:29:58,101 INFO     [2674569922.py:50] Benchmark completed successfully
