In [1]:
"""
This script is adapted from 
https://github.com/gkamradt/LLMTest_NeedleInAHaystack
"""

import os
import glob
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import numpy as np
import argparse
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

from datetime import datetime, timezone
import time
import torch

In [2]:
# --- Sweep configuration -------------------------------------------------
# Context lengths (in tokens) are sampled with np.linspace between min and max,
# inclusive, using `context_lengths_num_intervals` points.
context_lengths_min=120000
context_lengths_max=1048000
context_lengths_num_intervals=40
# Upper bound used by the driver loop: longer context lengths are skipped.
pretrained_len=1048000
# NOTE(review): this value is rebound in a later cell by the tuple returned from
# sparsify_attention_heads(...), which is called with a literal 0.5 — confirm
# whether that call was meant to read this setting.
sparsity=0.5
# Needle depths (percent of the haystack) are sampled between min and max using
# `document_depth_percent_intervals` points; the spacing is "linear" or "sigmoid".
document_depth_percent_min=0
document_depth_percent_max=100
document_depth_percent_intervals=10
document_depth_percent_interval_type="linear"
# Tokens subtracted from the target context length before the needle is inserted
# (see insert_needle), leaving room for the prompt template and question.
final_context_length_buffer=200
# Presumably the number of trailing prompt tokens reserved for a multi-round
# simulation; the code that would consume it is disabled — TODO confirm.
simulation_length=50
# Number of prompt tokens fed to the model per forward pass during prefill;
# None means the whole prompt is sent in a single pass.
prefilling_chunk_size=32000

# The fact hidden in the haystack, and the question/answer-prefix used to retrieve it.
needle="\n\nRemember, the best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n\n"
retrieval_question="what is the best thing to do in San Francisco?\n\nAnswer: The best thing to do in San Francisco is"
# Directory whose *.txt files are concatenated to build the haystack text.
haystack_dir="eval/needle/PaulGrahamEssays"
# Accumulates one result dict per evaluated (context_length, depth_percent) pair.
testing_results = []

# Evenly spaced context lengths (both endpoints included), rounded to the
# nearest whole number of tokens and stored as an integer array.
context_lengths = (
    np.linspace(
        context_lengths_min,
        context_lengths_max,
        num=context_lengths_num_intervals,
        endpoint=True,
    )
    .round()
    .astype(int)
)

def _logistic_depth(x, L=100, x0=50, k=0.1):
    """Map a linear depth percentage onto a logistic (S-shaped) curve.

    The endpoints 0 and 100 are returned unchanged so the sweep still covers
    the very start and very end of the document. Every other value is mapped
    to ``L / (1 + exp(-k * (x - x0)))`` and rounded to three decimals.
    """
    if x in (0, 100):
        return x
    return np.round(L / (1 + np.exp(-k * (x - x0))), 3)


# Build the list of needle depths (as percentages of the haystack).
if document_depth_percent_interval_type == "linear":
    # Evenly spaced integer percentages, endpoints included.
    document_depth_percents = np.round(
        np.linspace(
            document_depth_percent_min,
            document_depth_percent_max,
            num=document_depth_percent_intervals,
            endpoint=True,
        )
    ).astype(int)
elif document_depth_percent_interval_type == "sigmoid":
    # Same linear grid, pushed through the logistic curve. The previous code
    # called an undefined `logistic` here and raised NameError.
    document_depth_percents = [
        _logistic_depth(x)
        for x in np.linspace(
            document_depth_percent_min,
            document_depth_percent_max,
            document_depth_percent_intervals,
        )
    ]
else:
    # Fail here rather than with a NameError on `document_depth_percents`
    # in a later cell.
    raise ValueError(
        "document_depth_percent_interval_type must be 'linear' or 'sigmoid', "
        f"got {document_depth_percent_interval_type!r}"
    )

# Checkpoint directory; the same string is stored as the model label in results.
model_name = "models/Llama-3-8B-Instruct-Gradient-1048k"
model_to_test_description = model_name

# Tokenizer (slow implementation) and the checkpoint's generation settings.
enc = AutoTokenizer.from_pretrained(model_name, use_fast=False)
generation_config = GenerationConfig.from_pretrained(model_name)

# Always keep the EOS id(s) as a list so `in eos_token_ids` works either way.
eos_token_ids = generation_config.eos_token_id
eos_token_ids = eos_token_ids if isinstance(eos_token_ids, list) else [eos_token_ids]

# Tokenizers without a pad token fall back to the EOS id, or to 0 if there is none.
if enc.pad_token_id is None:
    enc.pad_token_id = enc.eos_token_id if enc.eos_token_id is not None else 0

print("Loading from %s" % model_name)

# Load the weights in bfloat16 with FlashAttention-2 and switch to inference mode.
model_to_test = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model_to_test = model_to_test.eval()

Loading from models/Llama-3-8B-Instruct-Gradient-1048k


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from duo_attn.utils import load_attn_pattern, sparsify_attention_heads
from duo_attn.patch import enable_duo_attention_eval

# Load the attention pattern
# load_attn_pattern returns the per-head pattern array together with the
# sink and recent sizes associated with it.
attn_heads, sink_size, recent_size = load_attn_pattern(
    "attn_patterns/Llama-3-8B-Instruct-Gradient-1048k/lr=0.02-reg=0.05-ctx=1000_32000-multi_passkey10"
)

print(attn_heads.shape)
print(sink_size)
print(recent_size)

# Sparsify attention heads
# NOTE(review): the second return value rebinds the module-level `sparsity`
# setting, and the requested sparsity is the literal 0.5 rather than that
# setting — confirm this is intended.
attn_heads_after, sparsity = sparsify_attention_heads(attn_heads, sparsity=0.5)

print(attn_heads, attn_heads_after, sparsity)

# Enable DuoAttention evaluation on the model using the sparsified head pattern.
# NOTE(review): sink_size and recent_size are passed as literals (64 / 256),
# not the `sink_size` / `recent_size` values returned by load_attn_pattern
# above — confirm the loaded values are meant to be ignored.
enable_duo_attention_eval(
    model_to_test,
    attn_heads_after,
    sink_size=64,
    recent_size=256,
)

(32, 8)
128
256
[[8.59375494e-01 6.52343938e-01 1.00000032e+00 3.39843808e-01
  4.09967454e-07 6.79687889e-01 3.49610313e-01 2.73438213e-01]
 [5.73501730e-07 5.51068665e-07 5.32082884e-07 1.00000024e+00
  8.73311763e-07 8.24219616e-01 4.82941833e-07 3.61803403e-07]
 [7.22682208e-07 6.91406515e-01 4.76164900e-08 9.60938032e-01
  7.30469461e-01 9.84375899e-01 9.64844422e-01 9.38280445e-07]
 [7.85156515e-01 2.34008409e-05 3.90153507e-05 2.30391647e-07
  8.71094682e-01 9.76562762e-01 1.00000023e+00 4.21875728e-01]
 [3.53405988e-07 2.60187280e-07 1.17676117e-01 8.63281454e-01
  1.00000087e+00 9.88281270e-01 8.35938315e-01 2.61545667e-04]
 [7.89063004e-01 9.84375303e-01 1.00000001e+00 5.82031273e-01
  1.00000041e+00 9.45312536e-01 9.52149964e-02 1.00000049e+00]
 [5.00000188e-01 8.63281538e-01 7.03125830e-01 7.18750321e-01
  9.10156503e-01 1.00000075e+00 1.00000024e+00 8.94531651e-01]
 [1.00000022e+00 8.32031354e-01 1.44052526e-02 1.00000059e+00
  6.20250915e-05 1.00000031e+00 1.00000069e+00 

In [4]:
# Move the model to the GPU; it was loaded on CPU and had
# enable_duo_attention_eval applied in the earlier cells.
model_to_test = model_to_test.cuda()

In [5]:
# Print a short banner summarising the sweep: model, number and range of
# context lengths, number and range of needle depths, and the needle text.
print("\n")
print("Starting Needle In A Haystack Testing...")
print(f"- Model: {model_name}")
print(
    f"- Context Lengths: {len(context_lengths)}, Min: {min(context_lengths)}, Max: {max(context_lengths)}"
)
print(
    f"- Document Depths: {len(document_depth_percents)}, Min: {min(document_depth_percents)}%, Max: {max(document_depth_percents)}%"
)
# The needle is stripped only for display; the stored needle keeps its newlines.
print(f"- Needle: {needle.strip()}")
print("\n\n")



Starting Needle In A Haystack Testing...
- Model: models/Llama-3-8B-Instruct-Gradient-1048k
- Context Lengths: 40, Min: 120000, Max: 1048000
- Document Depths: 10, Min: 0%, Max: 100%
- Needle: Remember, the best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.





In [6]:

def get_context_length_in_tokens(context):
    """Return the number of token ids `enc.encode` produces for `context`."""
    token_ids = enc.encode(context)
    return len(token_ids)
    
def read_context_files():
    """Concatenate the haystack ``*.txt`` files until enough tokens are available.

    The files under ``haystack_dir`` are read in glob order and appended to one
    string; the whole set is appended again, pass after pass, until the text
    tokenizes to at least ``max(context_lengths)`` tokens.

    Returns:
        The concatenated haystack text.

    Raises:
        FileNotFoundError: if more text is needed but a full pass over the
            haystack directory adds nothing (no ``*.txt`` files, or only empty
            ones). Previously this case looped forever.
    """
    context = ""
    max_context_length = max(context_lengths)
    # The file list does not change between passes, so glob only once.
    haystack_files = glob.glob(f"{haystack_dir}/*.txt")
    while get_context_length_in_tokens(context) < max_context_length:
        size_before_pass = len(context)
        for file in haystack_files:
            with open(file, "r") as f:
                context += f.read()
        if len(context) == size_before_pass:
            # Nothing was added, so another pass can never reach the target.
            raise FileNotFoundError(
                f"No haystack text found in {haystack_dir!r} (*.txt); "
                f"cannot build a context of {max_context_length} tokens"
            )
    return context
    
def get_tokens_from_context(context):
    """Tokenize `context` with `enc.encode` using the tokenizer's default settings."""
    token_ids = enc.encode(context)
    return token_ids
    
def decode_tokens(tokens, context_length=None):
    """Decode the first `context_length` tokens back to text.

    With `context_length=None` every token is decoded. Special tokens are
    dropped from the decoded text.
    """
    kept_tokens = tokens[:context_length]
    return enc.decode(kept_tokens, skip_special_tokens=True)
    
def encode_and_trim(context, context_length):
    """Cut `context` down to at most `context_length` tokens.

    Text that already fits is returned unchanged; otherwise the first
    `context_length` tokens are decoded back into a string.
    """
    tokens = get_tokens_from_context(context)
    if len(tokens) <= context_length:
        return context
    return decode_tokens(tokens, context_length)

def encode_text_to_tokens(text):
    """Tokenize `text` with `enc.encode`, without adding special tokens."""
    token_ids = enc.encode(text, add_special_tokens=False)
    return token_ids
    
def insert_needle(context, depth_percent, context_length):
    """Insert the needle into `context` at the requested depth.

    Args:
        context: Haystack text.
        depth_percent: Where to place the needle, as a percentage of the
            (possibly truncated) haystack's token count. 100 appends the
            needle at the very end.
        context_length: Target length in tokens, before the buffer is removed.

    Returns:
        The decoded text of haystack plus needle. When the haystack had to be
        truncated, the token count before decoding is at most
        ``context_length - final_context_length_buffer``, unless the needle
        alone is longer than that, in which case only the needle is returned.
    """
    tokens_needle = encode_text_to_tokens(needle)
    tokens_context = encode_text_to_tokens(context)

    # Reserve `final_context_length_buffer` tokens for the system message,
    # the user question and the response.
    context_length -= final_context_length_buffer

    # If haystack + needle exceed the budget (they usually will), drop tokens
    # from the end of the haystack to make room for the needle.
    if len(tokens_context) + len(tokens_needle) > context_length:
        # Clamp at zero: a negative bound would slice relative to the END of
        # the list and keep most of the haystack instead of emptying it.
        haystack_budget = max(context_length - len(tokens_needle), 0)
        tokens_context = tokens_context[:haystack_budget]

    if depth_percent == 100:
        # Depth 100 means the needle is the last thing in the document.
        tokens_new_context = tokens_context + tokens_needle
    else:
        insertion_point = int(len(tokens_context) * (depth_percent / 100))
        tokens_new_context = (
            tokens_context[:insertion_point]
            + tokens_needle
            + tokens_context[insertion_point:]
        )

    # Convert back to a string and return it
    return decode_tokens(tokens_new_context)
    
def generate_context(context_length, depth_percent):
    """Build a haystack of `context_length` tokens with the needle at `depth_percent`.

    Steps: read the haystack files into one string, trim that string to the
    requested token length, then insert the needle at the requested depth.
    """
    haystack = read_context_files()
    trimmed_haystack = encode_and_trim(haystack, context_length)
    return insert_needle(trimmed_haystack, depth_percent, context_length)
    
def generate_prompt(context):
    """Wrap `context` in the book-style prompt and append the retrieval question."""
    return f"<|im_start|> This is a very long story book: <book> {context} </book>.\n\nQuestion: Based on the content of the book, {retrieval_question}"
    
def bound_evaluate_and_log(context_length, depth_percent):
    """Run one needle-in-a-haystack trial and persist its context and result.

    The function builds a haystack of `context_length` tokens with the needle
    at `depth_percent`, prefills the model with the prompt (in chunks of
    `prefilling_chunk_size` tokens when that is set), greedily decodes up to
    51 tokens, and scores the response against the needle with ROUGE-1
    F-measure scaled to 0-10.

    Side effects:
        * appends the result dict to `testing_results`;
        * writes the context to ``contexts/<model>/<name>_context.txt``;
        * writes the result dict to ``results/<model>/<name>_results.json``;
        * prints a short summary.

    Args:
        context_length: Target context length in tokens.
        depth_percent: Needle depth as a percentage of the haystack.

    Returns:
        ``(None, generated_prompt)`` where `generated_prompt` is the prompt text
        sent to the model.
    """
    # Go generate the required length context and place your needle statement in
    context = generate_context(context_length, depth_percent)

    # Prepare your message to send to the model you're going to evaluate
    generated_prompt = generate_prompt(context)

    test_start_time = time.time()

    encoded_prompt = enc(generated_prompt, return_tensors="pt")
    prompt_input_ids = encoded_prompt["input_ids"].to(model_to_test.device)

    with torch.no_grad():
        # Prefill: feed the prompt through the model, carrying the KV cache
        # from chunk to chunk; `output` ends up holding the last chunk's logits.
        if prefilling_chunk_size is not None:
            past_key_values = None
            for i in range(0, prompt_input_ids.size(1), prefilling_chunk_size):
                chunk = prompt_input_ids[:, i : i + prefilling_chunk_size]
                output = model_to_test(
                    input_ids=chunk,
                    past_key_values=past_key_values,
                    use_cache=True,
                )
                past_key_values = output.past_key_values
        else:
            output = model_to_test(
                input_ids=prompt_input_ids, past_key_values=None, use_cache=True
            )
            past_key_values = output.past_key_values

        # Greedy decoding: the first token comes from the prefill logits, then
        # up to 50 more are generated one at a time.
        pred_token_idx = output.logits[:, -1, :].argmax(dim=-1).unsqueeze(1)
        generated_content = [pred_token_idx.item()]
        for _ in range(50):
            # Checking at the top of the loop also covers the first token, so
            # an immediate EOS stops generation instead of being decoded past.
            if generated_content[-1] in eos_token_ids:
                break
            outputs = model_to_test(
                input_ids=pred_token_idx,
                past_key_values=past_key_values,
                use_cache=True,
            )
            past_key_values = outputs.past_key_values
            pred_token_idx = outputs.logits[:, -1, :].argmax(dim=-1).unsqueeze(1)
            generated_content.append(pred_token_idx.item())

    response = enc.decode(generated_content, skip_special_tokens=True).strip()

    test_end_time = time.time()
    test_elapsed_time = test_end_time - test_start_time
    # ROUGE-1 F-measure between needle and response, rescaled to a 0-10 score.
    score = scorer.score(needle, response)["rouge1"].fmeasure * 10

    results = {
        # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
        "model": model_to_test_description,
        "context_length": int(context_length),
        "depth_percent": float(depth_percent),
        "needle": needle,
        "model_response": response,
        "score": score,
        "test_duration_seconds": test_elapsed_time,
        "test_timestamp_utc": datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M:%S%z"
        ),
    }

    testing_results.append(results)
    print("-- Test Summary -- ")
    print(f"Duration: {test_elapsed_time:.1f} seconds")
    print(f"Context: {context_length} tokens")
    print(f"Depth: {depth_percent}%")
    print(f"Score: {score}")
    print(f"Response: {response}\n")

    model_version = model_name.split("/")[-1]
    context_file_location = f'{model_version.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'

    # `results` is the same dict object already stored in `testing_results`.
    results["file_name"] = context_file_location

    # Save the context to file for retesting
    context_dir = f"contexts/{model_version}"
    os.makedirs(context_dir, exist_ok=True)
    with open(
        f"{context_dir}/{context_file_location}_context.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.write(context)

    # Save the result to file for retesting
    results_dir = f"results/{model_version}"
    os.makedirs(results_dir, exist_ok=True)
    results_path = f"{results_dir}/{context_file_location}_results.json"
    print("Writing at %s" % results_path)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f)

    return None, generated_prompt

# Only context lengths inside [s_len, e_len] are evaluated.
s_len = 1
e_len = pretrained_len
tasks = []
for context_length in context_lengths:
    print(context_length)
    if not (s_len <= context_length <= e_len):
        continue
    # Evaluate every needle depth at this context length.
    for depth_percent in document_depth_percents:
        print(depth_percent)
        task = bound_evaluate_and_log(context_length, depth_percent)

    # NOTE(review): this stops the sweep after the first in-range context
    # length, so the remaining lengths are never evaluated — confirm whether
    # it is a leftover from debugging.
    break

120000
0
4 28
4 28
8 24
8 24
8 24
16 16
8 24
16 16
24 8
16 16
20 12
12 20
8 24
24 8
20 12
20 12
20 12
24 8
12 20
20 12
24 8
12 20
12 20
24 8
16 16
20 12
12 20
16 16
24 8
20 12
32 0
8 24
4 28
4 28
8 24
8 24
8 24
16 16
8 24
16 16
24 8
16 16
20 12
12 20
8 24
24 8
20 12
20 12
20 12
24 8
12 20
20 12
24 8
12 20
12 20
24 8
16 16
20 12
12 20
16 16
24 8
20 12
32 0
8 24
4 28
4 28
8 24
8 24
8 24
16 16
8 24
16 16
24 8
16 16
20 12
12 20
8 24
24 8
20 12
20 12
20 12
24 8
12 20
20 12
24 8
12 20
12 20
24 8
16 16
20 12
12 20
16 16
24 8
20 12
32 0
8 24
4 28
4 28
8 24
8 24
8 24
16 16
8 24
16 16
24 8
16 16
20 12
12 20
8 24
24 8
20 12
20 12
20 12
24 8
12 20
20 12
24 8
12 20
12 20
24 8
16 16
20 12
12 20
16 16
24 8
20 12
32 0
8 24
[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 1. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 1.]
 [0. 0. 0. 0.

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 1 but got size 0 for tensor number 1 in the list.