In [1]:
import sys


In [2]:
# Make the ARMT research checkout importable.
# The checkout root can be overridden with the ARMT_ROOT environment variable; the default
# is the original hard-coded location, so existing setups keep working unchanged.
import os

armt_root = os.environ.get("ARMT_ROOT", "/home/jovyan/gkuzmin/rmt_it/optimized_armt")
for extra_path in (
    os.path.join(armt_root, "grouped_batching", "associative-recurrent-memory-transformer"),
    os.path.join(armt_root, "grouped_batching"),
    armt_root,
):
    # Skip entries that are already present so re-running this cell does not grow sys.path.
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [3]:
import cutlass

In [4]:
import copy
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

from grouped_batching.llama1b_grouping import wrap_model_with_armt, get_grouped_states, make_grouped_layer_from_single_layer, make_grouped_model_from_naive
from grouped_batching.batching import GroupedBatcher
from grouped_batching.executor import ArmtGroupedExecutor
# switch to fast version for generation
#from grouped_batching.fast_executor import FastGroupedArmtExecutor, GroupedLayerContext, associate_with_context, update_mem_with_context
#from grouped_batching.llama1b_grouping_autograd import make_grouped_training_layer_from_single_layer

In [5]:
from modeling_amt.language_modeling_old import AssociativeRecurrentWrapper, AssociativeMemoryCell

In [6]:
#!wget "https://huggingface.co/AIRI-NLP/ARMT-Llama3-1b-Instruct-2x1024-v2/resolve/main/pytorch_model.bin"

In [7]:
# Local path of the pretrained ARMT checkpoint loaded below with torch.load
# (presumably the `pytorch_model.bin` from the commented-out wget above — confirm).
armt_cpt_path = "../grouped_batching_old//pytorch_model.bin"

In [8]:
# Make CUDA device 0 the default device for tensors created from here on.
torch.set_default_device("cuda:0")

In [9]:
# bfloat16 is used both as the global default dtype and as the dtype the base model is loaded in.
dtype = torch.bfloat16
torch.set_default_dtype(dtype)
# This notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f8588e00700>

In [10]:
# Base Llama checkpoint: tokenizer first, then the model itself.
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")

# The weights are placed on the CPU here; the wrapped model is moved to the GPU later.
# nn.Module.eval() returns the module, so the call can be chained onto the load.
source_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Llama-3.2-1B-Instruct",
    torch_dtype=dtype,
    device_map="cpu",
    attn_implementation="flash_attention_2",
).eval()

In [11]:
# Wrap the base model in the original (non-grouped) ARMT implementation and load the
# pretrained weights into it.
# The wrapper's effective segment size is segment_size - mem_size; the same expression is
# reused by the tests below.
segment_size = 1024
mem_size = 16
segment_alignment = "left"
attend_to_previous_input = False
device = "cpu"
max_n_segments = 32
mem_cell_args = dict(
    base_model=source_model,
    num_mem_tokens=mem_size,
    d_mem=64,
    layers_attr="model.layers",
    wrap_pos=False,
    correction=True,
)

cell = AssociativeMemoryCell(**mem_cell_args)
original_model = AssociativeRecurrentWrapper(cell,
                                            segment_size=segment_size-mem_size,
                                            max_n_segments=max_n_segments,
                                            segment_alignment=segment_alignment,
                                            attend_to_previous_input=attend_to_previous_input,
).to(device)

# The checkpoint is loaded on the CPU (`device`) and only then moved to the GPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source (consider passing weights_only=True if the file holds a plain state dict).
cpt = torch.load(armt_cpt_path, map_location=device)
# strict=True: every key in the checkpoint must match the wrapped model exactly.
original_model.load_state_dict(cpt, strict=True)
original_model.to("cuda")

AssociativeRecurrentWrapper(
  (memory_cell): AssociativeMemoryCell(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0-15): 16 x AssociativeLayerWrapper(
            (W_mq): Linear(in_features=2048, out_features=64, bias=False)
            (W_mk): Linear(in_features=2048, out_features=64, bias=False)
            (W_mv): Linear(in_features=2048, out_features=2048, bias=False)
            (W_mb): Linear(in_features=2048, out_features=1, bias=True)
            (layer): LlamaDecoderLayer(
              (self_attn): LlamaAttention(
                (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (k_proj): Linear(in_features=2048, out_features=512, bias=False)
                (v_proj): Linear(in_features=2048, out_features=512, bias=False)
                (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
              )
      

In [12]:
# Build the grouped-batching variant from a deep copy of the reference model, so
# `original_model` itself is left untouched for the comparisons below.
armt_model = copy.deepcopy(original_model)
grouped_states = get_grouped_states(armt_model)
# The grouped layer is created from a copy of the first wrapped decoder layer plus the
# grouped states (presumably the per-layer weights stacked together — confirm in llama1b_grouping).
grouped_layer = make_grouped_layer_from_single_layer(
    copy.deepcopy(armt_model.memory_cell.model.model.layers[0]), *grouped_states)
# grouped_layer._grouped_execution = True
# grouped_layer._skip_associating = True
armt_grouped_model, source_model_layers = make_grouped_model_from_naive(armt_model, grouped_layer)


In [13]:
# Hugging Face config of the base model; its layer count and hidden size are passed to the batcher.
model_config = source_model.config

In [14]:
# Batcher and executor for the grouped model.
# Note that seg_size is the full segment_size (1024) here, whereas the reference wrapper was
# built with segment_size - mem_size.
batcher = GroupedBatcher(
    armt_grouped_model, 
    n_layers=model_config.num_hidden_layers, 
    seg_size=segment_size, 
    hid_dim=model_config.hidden_size, 
    pos_embed_dim=model_config.hidden_size
)
executor = ArmtGroupedExecutor(armt_grouped_model, grouped_layer, batcher)


## Test 1 - check on random ids

In [15]:
# Release cached, currently unused GPU memory held by PyTorch's allocator before running the test.
torch.cuda.empty_cache()

In [16]:
# Random token ids covering exactly `num_segments` segments of segment_size - mem_size
# tokens each (the segment size the reference wrapper was built with).
# Seed the RNG so the comparison below is reproducible after a kernel restart.
torch.manual_seed(0)
num_segments = 10
input_ids = torch.randint(
    0, 5000,
    (1, num_segments*(segment_size-mem_size)),
    dtype=torch.long,
    device="cuda"
)


In [17]:
# Reference run: reset the memory cell via zero_mem(), then run the original ARMT on the input.
original_model.memory_cell.zero_mem()
reference_output = original_model.forward(input_ids)

In [18]:
# Same input through the grouped executor.
# NOTE(review): the executor's memory is not explicitly reset here, unlike in the Test 2
# loop below (executor.armt_model.memory_cell.zero_mem()) — confirm a fresh executor needs no reset.
output = executor.forward(input_ids)

jit compile As: [torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048]), torch.Size([1024, 2048])] Bs: [torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64]), torch.Size([2048, 64])]

// Gemm operator cutlass_tensorop_bf16_s16816gemm_grouped_bf16_256x128_64x3_tt_align8
using cutlass_tensorop_bf16_s16816gemm_grouped_bf16_256x128_64x3_tt_align8_base =
  typenam

In [19]:
# Logits produced by the grouped executor (for a visual comparison with the reference logits).
output.logits

tensor([[[ 6.4062,  6.3438,  5.1562,  ..., -3.2969, -3.2969, -3.2969],
         [ 3.3594, 10.1250,  4.6875,  ..., -1.8281, -1.8281, -1.8359],
         [ 3.1250,  6.8750,  3.8281,  ..., -3.0000, -3.0000, -3.0000],
         ...,
         [ 6.3125,  5.5000,  6.4062,  ..., -0.7734, -0.7734, -0.7734],
         [ 6.9688,  6.0625,  6.0312,  ..., -1.7578, -1.7578, -1.7578],
         [ 6.8125,  5.5938,  5.6250,  ..., -1.7109, -1.7031, -1.7031]]],
       device='cuda:0')

In [20]:
# Logits produced by the original ARMT model on the same input.
reference_output.logits

tensor([[[ 6.4062,  6.3438,  5.1562,  ..., -3.2969, -3.2969, -3.2969],
         [ 3.3594, 10.1250,  4.6875,  ..., -1.8281, -1.8281, -1.8359],
         [ 3.1250,  6.8750,  3.8281,  ..., -3.0000, -3.0000, -3.0000],
         ...,
         [ 6.2812,  5.4688,  6.4688,  ..., -0.7969, -0.7930, -0.7930],
         [ 6.9688,  6.0000,  6.0312,  ..., -1.8047, -1.8047, -1.8047],
         [ 6.8125,  5.5312,  5.5938,  ..., -1.7422, -1.7422, -1.7422]]],
       device='cuda:0')

In [21]:
# Relative error of the executor logits: ||output - reference|| / ||reference||.
(output.logits - reference_output.logits).norm() / reference_output.logits.norm()

tensor(0.0201, device='cuda:0')

## Test 2 - check on some short text and gradually increase length

In [25]:
# Repeat a short paragraph 200 times to build one long user message.
base_text = "The invention of the printing press by Johannes Gutenberg in the 15th century revolutionized the way information was shared. Before this, books were copied by hand, making them rare and expensive. The printing press allowed for faster and cheaper production of books, leading to a wider spread of knowledge. This innovation played a key role in the Renaissance, the Reformation, and the Scientific Revolution by making texts more accessible to the general public."
stacked_text = " ".join(base_text for _ in range(200)).strip()
messages = [dict(role="user", content=stacked_text)]

# Tokenize through the chat template and keep the first two segments' worth of tokens.
input_ids = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    tokenize=True,
    add_generation_prompt=False,
)[..., :2*(segment_size-mem_size)]
input_ids.shape

torch.Size([1, 2016])

In [26]:
# Compare the grouped executor with the reference model on prompts of 1..16 segments.
# The chat-template tokenization does not depend on the loop variable, so it is done once
# and each iteration only slices the prefix it needs.
full_input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=False,
    return_tensors="pt"
)
for segm in range(1,17):
    input_ids = full_input_ids[..., :segm*(segment_size-mem_size)]
    # Reset the memory of both models so every prompt length starts from the same state.
    original_model.memory_cell.zero_mem()
    reference_output = original_model.forward(input_ids)
    executor.armt_model.memory_cell.zero_mem()
    output = executor.forward(input_ids)
    # Relative error of the executor logits with respect to the reference logits.
    diff = torch.norm(output.logits-reference_output.logits)/torch.norm(reference_output.logits)
    print(f"Norm on text with {segm} segments: {diff}")

Norm on text with 1 segments: 0.0
Norm on text with 2 segments: 0.0106201171875
Norm on text with 3 segments: 0.01422119140625
Norm on text with 4 segments: 0.01531982421875
Norm on text with 5 segments: 0.0150146484375
Norm on text with 6 segments: 0.01507568359375
Norm on text with 7 segments: 0.01531982421875
Norm on text with 8 segments: 0.015625
Norm on text with 9 segments: 0.0155029296875
Norm on text with 10 segments: 0.01544189453125
Norm on text with 11 segments: 0.0157470703125
Norm on text with 12 segments: 0.015625
Norm on text with 13 segments: 0.01556396484375
Norm on text with 14 segments: 0.015625
Norm on text with 15 segments: 0.0157470703125
Norm on text with 16 segments: 0.0157470703125


## Test 3 - check on BABILong qa1

In [27]:
# Rebuild the grouped model from a fresh deep copy of the reference model for the
# generation test (same steps as the construction used for Tests 1 and 2).
armt_model = copy.deepcopy(original_model)
grouped_states = get_grouped_states(armt_model)
grouped_layer = make_grouped_layer_from_single_layer(
    copy.deepcopy(armt_model.memory_cell.model.model.layers[0]), *grouped_states)
# grouped_layer._grouped_execution = True
# grouped_layer._skip_associating = True
#grouped_layer.generate_mode = True
armt_grouped_model, source_model_layers = make_grouped_model_from_naive(armt_model, grouped_layer)


In [28]:
# New batcher and executor for the rebuilt grouped model.
batcher = GroupedBatcher(
    armt_grouped_model, 
    n_layers=model_config.num_hidden_layers, 
    seg_size=segment_size, 
    hid_dim=model_config.hidden_size, 
    pos_embed_dim=model_config.hidden_size
)
# Unlike the executor used in Tests 1 and 2, this one also receives `original_model` as a
# fourth argument. NOTE(review): presumably needed for generate() — confirm in ArmtGroupedExecutor.
executor = ArmtGroupedExecutor(armt_grouped_model, grouped_layer, batcher, original_model)


In [34]:
from babilong.prompts import DEFAULT_PROMPTS, DEFAULT_TEMPLATE, get_formatted_input
from tqdm import tqdm
import datasets
from pathlib import Path
import json
import pandas as pd

In [35]:
# Evaluation configuration consumed by the evaluation loop below.
tasks = ["qa1"]                       # BABILong tasks to run
split_names = ["2k", "4k", "8k"]      # dataset configurations, one per context length
dataset_name = "RMT-team/babilong"
results_folder = "./test_res"         # root folder for the CSV/JSON outputs
model_name = "unsloth/Llama-3.2-1B-Instruct"  # used only to build the output path
# Each flag switches the corresponding part of the prompt on or off.
use_instruction = True
use_examples = True
use_post_prompt = True
use_chat_template = True
# Falsy value: the loop runs the local model instead of posting requests to a server.
api_url = False

In [36]:
# Evaluate the grouped executor. `model_cpt` becomes part of the output path.
model = executor
model_cpt = "mem_code_fix_executor_mem_patch_armt-1b-it-v2"
# The evaluation loop reads these two attributes: "custom_rmt" selects the branch that
# tokenizes with return_dict=True and decodes the whole generate() output.
model.name_or_path = "custom_rmt"
model.device = "cuda"

In [37]:
# Keyword arguments forwarded to model.generate(); they are also written to the JSON
# config file saved next to the predictions.
generate_kwargs = {
    'max_new_tokens': 20,
    'max_length': None,
    'num_beams': 1,
    'do_sample': False,
    'temperature': None,
    'top_p': None,
    'top_k': None,
    'pad_token_id': tokenizer.pad_token_id,
    'eos_token_id': tokenizer.eos_token_id,
    #'logits_processor': [NormLogitsWrapper()],
}

In [38]:
# Run `model` on every task in `tasks` for every dataset configuration in `split_names`.
# For each (task, split) pair the predictions go to a CSV file and the prompt/generation
# settings to a JSON file with the same stem in the same folder.
template_to_use = DEFAULT_TEMPLATE
print(f'prompt template:\n{template_to_use}')

for task in tqdm(tasks, desc='tasks'):
    # configure the prompt
    prompt_cfg = {
        'instruction': DEFAULT_PROMPTS[task]['instruction'] if use_instruction else '',
        'examples': DEFAULT_PROMPTS[task]['examples'] if use_examples else '',
        'post_prompt': DEFAULT_PROMPTS[task]['post_prompt'] if use_post_prompt else '',
        'template': template_to_use,
        'chat_template': use_chat_template,
    }
    # e.g. "instruction_yes_examples_yes_..." — encodes which prompt parts are enabled
    prompt_name = [f'{k}_yes' if prompt_cfg[k] else f'{k}_no' for k in prompt_cfg if k != 'template']
    prompt_name = '_'.join(prompt_name)

    for split_name in tqdm(split_names, desc='lengths'):
        # load dataset
        data = datasets.load_dataset(dataset_name, split_name)
        task_data = data[task]#.select([1])

        # Prepare files with predictions, prompt, and generation configurations
        outfile = Path(f'{results_folder}/{model_name.replace("../", "")}/{model_cpt.replace("../", "")}/{task}_{split_name}_{prompt_name}.csv')
        outfile.parent.mkdir(parents=True, exist_ok=True)
        # Derive the config path from the CSV path so both always land in the directory
        # created above, even when results_folder is an absolute path.
        cfg_file = outfile.with_suffix('.json')
        # The context manager closes (and flushes) the config file deterministically.
        with open(cfg_file, 'w') as cfg_fh:
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, cfg_fh, indent=4)

        df = pd.DataFrame({'target': [], 'output': [], 'question': []})

        for sample in tqdm(task_data, desc=f'task: {task} length: {split_name}'):
            target = sample['target']
            context = sample['input']
            question = sample['question']

            # format input text
            input_text = get_formatted_input(context, question, prompt_cfg['examples'],
                                             prompt_cfg['instruction'], prompt_cfg['post_prompt'],
                                             template=prompt_cfg['template'])
            if api_url:
                # model is running via llamacpp's serve command
                # NOTE(review): this branch uses `requests`, which this notebook does not
                # import — add `import requests` before setting api_url.
                headers = {'Content-Type': 'application/json'}
                # Fall back to temperature 0.0 for the request without mutating the shared
                # generate_kwargs dict that is also used for the saved config.
                temperature = generate_kwargs['temperature']
                if temperature is None:
                    temperature = 0.0

                if use_chat_template:
                    input_text = [{'role': 'user', 'content': input_text}]
                    model_inputs = tokenizer.apply_chat_template(input_text, tokenize=True,
                                                                 add_generation_prompt=True)
                else:
                    model_inputs = tokenizer.encode(input_text, add_special_tokens=True)

                request_data = {'prompt': model_inputs, 'temperature': temperature}
                response = requests.post(api_url, headers=headers, json=request_data).json()
                output = response['content'].strip()
            else:
                # generate output using local model
                if model.name_or_path in ['THUDM/chatglm3-6b-128k', 'THUDM/LongAlign-6B-64k-base', 'THUDM/LongAlign-6B-64k']:
                    # have to add special code to run chatglm as tokenizer.chat_template tokenization is not
                    # the same as in model.chat (recommended in https://huggingface.co/THUDM/chatglm3-6b-128k)
                    with torch.no_grad():
                        output, _ = model.chat(tokenizer, input_text, history=[], **generate_kwargs)
                else:
                    if use_chat_template:
                        input_text = [{'role': 'user', 'content': input_text}]
                        # For "custom_rmt" the tokenizer returns a dict that is unpacked into generate().
                        model_inputs = tokenizer.apply_chat_template(input_text, add_generation_prompt=True,
                                                                     return_tensors='pt', return_dict=model.name_or_path=="custom_rmt").to(model.device)
                        if model.name_or_path != "custom_rmt":
                            model_inputs = {'input_ids': model_inputs}
                    else:
                        model_inputs = tokenizer(input_text, return_tensors='pt',
                                                 add_special_tokens=True).to(model.device)

                    sample_length = model_inputs['input_ids'].shape[1]
                    with torch.no_grad():
                        output = model.generate(**model_inputs, **generate_kwargs)
                        # we need to reset memory states between samples for activation-beacon models
                        if 'activation-beacon' in model.name_or_path and hasattr(model, 'memory'):
                            model.memory.reset()
                    if model.name_or_path != "custom_rmt":
                        # drop the prompt tokens and keep only the continuation
                        output = output[0][sample_length:]
                    else:
                        # for "custom_rmt" the whole first returned sequence is decoded
                        output = output[0]
                    output = tokenizer.decode(output, skip_special_tokens=True).strip()

            df.loc[len(df)] = [target, output, question]
            # write results to csv file (rewritten after every sample, so partial results
            # are on disk if the run is interrupted)
            df.to_csv(outfile, escapechar='\\')

prompt template:
{instruction}

{examples}

{post_prompt}

<context>
{context}
</context>

Question: {question}


tasks:   0%|          | 0/1 [00:00<?, ?it/s]
lengths:   0%|          | 0/3 [00:00<?, ?it/s][A

task: qa1 length: 2k:   0%|          | 0/100 [00:00<?, ?it/s][A[A

task: qa1 length: 2k:   1%|          | 1/100 [00:00<01:06,  1.48it/s][A[A

task: qa1 length: 2k:   2%|▏         | 2/100 [00:01<01:02,  1.58it/s][A[A

task: qa1 length: 2k:   3%|▎         | 3/100 [00:01<01:00,  1.61it/s][A[A

task: qa1 length: 2k:   4%|▍         | 4/100 [00:02<00:58,  1.64it/s][A[A

task: qa1 length: 2k:   5%|▌         | 5/100 [00:03<00:57,  1.65it/s][A[A

task: qa1 length: 2k:   6%|▌         | 6/100 [00:03<00:56,  1.66it/s][A[A

task: qa1 length: 2k:   7%|▋         | 7/100 [00:04<00:56,  1.66it/s][A[A

task: qa1 length: 2k:   8%|▊         | 8/100 [00:04<00:49,  1.84it/s][A[A

task: qa1 length: 2k:   9%|▉         | 9/100 [00:05<00:50,  1.80it/s][A[A

task: qa1 length: 2k:  10%|█         | 10/100 [00:05<00:51,  1.76it/s][A[A

task: qa1 length: 2k:  11%|█         | 11/100 [00:06<00:51,  1.7

In [39]:
# Evaluation configuration for the reference run (same values as for the executor run).
tasks = ["qa1"]                       # BABILong tasks to run
split_names = ["2k", "4k", "8k"]      # dataset configurations, one per context length
dataset_name = "RMT-team/babilong"
results_folder = "./test_res"         # root folder for the CSV/JSON outputs
model_name = "unsloth/Llama-3.2-1B-Instruct"  # used only to build the output path
# Each flag switches the corresponding part of the prompt on or off.
use_instruction = True
use_examples = True
use_post_prompt = True
use_chat_template = True
# Falsy value: the loop runs the local model instead of posting requests to a server.
api_url = False

In [40]:
# Evaluate the original (non-grouped) ARMT model; `model_cpt` becomes part of the output
# path, so these results are stored separately from the executor's.
model = original_model
model_cpt = "vanilla_armt-1b-it-v2"
# The evaluation loop reads these two attributes: "custom_rmt" selects the branch that
# tokenizes with return_dict=True and decodes the whole generate() output.
model.name_or_path = "custom_rmt"
model.device = "cuda"

In [41]:
# Keyword arguments forwarded to model.generate() (same values as for the executor run);
# they are also written to the JSON config file saved next to the predictions.
generate_kwargs = {
    'max_new_tokens': 20,
    'max_length': None,
    'num_beams': 1,
    'do_sample': False,
    'temperature': None,
    'top_p': None,
    'top_k': None,
    'pad_token_id': tokenizer.pad_token_id,
    'eos_token_id': tokenizer.eos_token_id,
    #'logits_processor': [NormLogitsWrapper()],
}

In [42]:
# Run `model` on every task in `tasks` for every dataset configuration in `split_names`.
# For each (task, split) pair the predictions go to a CSV file and the prompt/generation
# settings to a JSON file with the same stem in the same folder.
template_to_use = DEFAULT_TEMPLATE
print(f'prompt template:\n{template_to_use}')

for task in tqdm(tasks, desc='tasks'):
    # configure the prompt
    prompt_cfg = {
        'instruction': DEFAULT_PROMPTS[task]['instruction'] if use_instruction else '',
        'examples': DEFAULT_PROMPTS[task]['examples'] if use_examples else '',
        'post_prompt': DEFAULT_PROMPTS[task]['post_prompt'] if use_post_prompt else '',
        'template': template_to_use,
        'chat_template': use_chat_template,
    }
    # e.g. "instruction_yes_examples_yes_..." — encodes which prompt parts are enabled
    prompt_name = [f'{k}_yes' if prompt_cfg[k] else f'{k}_no' for k in prompt_cfg if k != 'template']
    prompt_name = '_'.join(prompt_name)

    for split_name in tqdm(split_names, desc='lengths'):
        # load dataset
        data = datasets.load_dataset(dataset_name, split_name)
        task_data = data[task]#.select([1])

        # Prepare files with predictions, prompt, and generation configurations
        outfile = Path(f'{results_folder}/{model_name.replace("../", "")}/{model_cpt.replace("../", "")}/{task}_{split_name}_{prompt_name}.csv')
        outfile.parent.mkdir(parents=True, exist_ok=True)
        # Derive the config path from the CSV path so both always land in the directory
        # created above, even when results_folder is an absolute path.
        cfg_file = outfile.with_suffix('.json')
        # The context manager closes (and flushes) the config file deterministically.
        with open(cfg_file, 'w') as cfg_fh:
            json.dump({'prompt': prompt_cfg, 'generate_kwargs': generate_kwargs}, cfg_fh, indent=4)

        df = pd.DataFrame({'target': [], 'output': [], 'question': []})

        for sample in tqdm(task_data, desc=f'task: {task} length: {split_name}'):
            target = sample['target']
            context = sample['input']
            question = sample['question']

            # format input text
            input_text = get_formatted_input(context, question, prompt_cfg['examples'],
                                             prompt_cfg['instruction'], prompt_cfg['post_prompt'],
                                             template=prompt_cfg['template'])
            if api_url:
                # model is running via llamacpp's serve command
                # NOTE(review): this branch uses `requests`, which this notebook does not
                # import — add `import requests` before setting api_url.
                headers = {'Content-Type': 'application/json'}
                # Fall back to temperature 0.0 for the request without mutating the shared
                # generate_kwargs dict that is also used for the saved config.
                temperature = generate_kwargs['temperature']
                if temperature is None:
                    temperature = 0.0

                if use_chat_template:
                    input_text = [{'role': 'user', 'content': input_text}]
                    model_inputs = tokenizer.apply_chat_template(input_text, tokenize=True,
                                                                 add_generation_prompt=True)
                else:
                    model_inputs = tokenizer.encode(input_text, add_special_tokens=True)

                request_data = {'prompt': model_inputs, 'temperature': temperature}
                response = requests.post(api_url, headers=headers, json=request_data).json()
                output = response['content'].strip()
            else:
                # generate output using local model
                if model.name_or_path in ['THUDM/chatglm3-6b-128k', 'THUDM/LongAlign-6B-64k-base', 'THUDM/LongAlign-6B-64k']:
                    # have to add special code to run chatglm as tokenizer.chat_template tokenization is not
                    # the same as in model.chat (recommended in https://huggingface.co/THUDM/chatglm3-6b-128k)
                    with torch.no_grad():
                        output, _ = model.chat(tokenizer, input_text, history=[], **generate_kwargs)
                else:
                    if use_chat_template:
                        input_text = [{'role': 'user', 'content': input_text}]
                        # For "custom_rmt" the tokenizer returns a dict that is unpacked into generate().
                        model_inputs = tokenizer.apply_chat_template(input_text, add_generation_prompt=True,
                                                                     return_tensors='pt', return_dict=model.name_or_path=="custom_rmt").to(model.device)
                        if model.name_or_path != "custom_rmt":
                            model_inputs = {'input_ids': model_inputs}
                    else:
                        model_inputs = tokenizer(input_text, return_tensors='pt',
                                                 add_special_tokens=True).to(model.device)

                    sample_length = model_inputs['input_ids'].shape[1]
                    with torch.no_grad():
                        output = model.generate(**model_inputs, **generate_kwargs)
                        # we need to reset memory states between samples for activation-beacon models
                        if 'activation-beacon' in model.name_or_path and hasattr(model, 'memory'):
                            model.memory.reset()
                    if model.name_or_path != "custom_rmt":
                        # drop the prompt tokens and keep only the continuation
                        output = output[0][sample_length:]
                    else:
                        # for "custom_rmt" the whole first returned sequence is decoded
                        output = output[0]
                    output = tokenizer.decode(output, skip_special_tokens=True).strip()

            df.loc[len(df)] = [target, output, question]
            # write results to csv file (rewritten after every sample, so partial results
            # are on disk if the run is interrupted)
            df.to_csv(outfile, escapechar='\\')

prompt template:
{instruction}

{examples}

{post_prompt}

<context>
{context}
</context>

Question: {question}


tasks:   0%|          | 0/1 [00:00<?, ?it/s]
lengths:   0%|          | 0/3 [00:00<?, ?it/s][A

task: qa1 length: 2k:   0%|          | 0/100 [00:00<?, ?it/s][A[A

task: qa1 length: 2k:   1%|          | 1/100 [00:00<00:58,  1.70it/s][A[A

task: qa1 length: 2k:   2%|▏         | 2/100 [00:01<00:57,  1.70it/s][A[A

task: qa1 length: 2k:   3%|▎         | 3/100 [00:01<00:56,  1.72it/s][A[A

task: qa1 length: 2k:   4%|▍         | 4/100 [00:02<00:56,  1.71it/s][A[A

task: qa1 length: 2k:   5%|▌         | 5/100 [00:02<00:55,  1.72it/s][A[A

task: qa1 length: 2k:   6%|▌         | 6/100 [00:03<00:54,  1.72it/s][A[A

task: qa1 length: 2k:   7%|▋         | 7/100 [00:04<00:54,  1.72it/s][A[A

task: qa1 length: 2k:   8%|▊         | 8/100 [00:04<00:53,  1.72it/s][A[A

task: qa1 length: 2k:   9%|▉         | 9/100 [00:05<00:52,  1.72it/s][A[A

task: qa1 length: 2k:  10%|█         | 10/100 [00:05<00:52,  1.73it/s][A[A

task: qa1 length: 2k:  11%|█         | 11/100 [00:06<00:51,  1.7

In [43]:
import numpy as np
def compare_generated_ids(orig, opt, tokenizer):
    """Return the fraction of positions at which two texts tokenize to the same id.

    Both texts are tokenized with ``tokenizer`` and compared position by position over
    the length of the shorter token sequence; surplus tokens of the longer sequence
    are ignored.

    Args:
        orig: Reference text. Values that are not strings are coerced: ``None`` and NaN
            (which pandas produces for an empty CSV cell) become ``""``, anything else
            goes through ``str()``.
        opt: Text to compare against ``orig``; coerced the same way.
        tokenizer: Callable that maps a text to a mapping with an ``"input_ids"`` sequence.

    Returns:
        The share of matching positions, between 0 and 1. When at least one sequence is
        empty there is nothing to compare: the result is 1.0 if both are empty and 0.0
        otherwise (instead of NaN plus a NumPy warning).

    NOTE(review): if the tokenizer prepends a special start token to every text,
    position 0 always matches and inflates the score — confirm whether that is intended.
    """
    def _as_text(value):
        # Normalize missing or non-string cells to text so the tokenizer does not raise.
        if isinstance(value, str):
            return value
        if value is None or value != value:  # NaN is the only value unequal to itself
            return ""
        return str(value)

    toks_orig = np.asarray(tokenizer(_as_text(orig))["input_ids"])
    toks_opt = np.asarray(tokenizer(_as_text(opt))["input_ids"])
    length = min(len(toks_orig), len(toks_opt))
    if length == 0:
        return float(len(toks_orig) == len(toks_opt))
    return np.mean(toks_orig[:length] == toks_opt[:length])

In [44]:
# Token-level agreement between the generations of the original model and of the grouped
# executor, averaged over the samples of each split and printed as a percentage.
for split in split_names:
    path_orig = f"./test_res/unsloth/Llama-3.2-1B-Instruct/vanilla_armt-1b-it-v2/qa1_{split}_instruction_yes_examples_yes_post_prompt_yes_chat_template_yes.csv"
    path_opt = f"./test_res/unsloth/Llama-3.2-1B-Instruct/mem_code_fix_executor_mem_patch_armt-1b-it-v2/qa1_{split}_instruction_yes_examples_yes_post_prompt_yes_chat_template_yes.csv"
    orig_answers = pd.read_csv(path_orig)["output"]
    opt_answers = pd.read_csv(path_opt)["output"]
    # Pair the answers row by row; zip stops at the shorter of the two columns.
    scores = [
        compare_generated_ids(orig, opt, tokenizer)
        for orig, opt in zip(orig_answers, opt_answers)
    ]
    mean_pct = np.round(np.mean(scores)*100,2)
    print(split, f"{mean_pct}%")

2k 8.4%
4k 6.3%
8k 6.5%
