## Load From C4

In [7]:
import gzip
import json

def load_json_gz(filename):
    with gzip.open(filename, 'r') as f:
        i = 0
        ret = []
        for json_line in f:
            if i == 10000:
                return ret
            data = json.loads(json_line)
            text = data['text']
            if len(text) > 2000:
                ret.append(text)
                i += 1

# Load 10000 strings from C4 dataset: https://huggingface.co/datasets/allenai/c4/tree/main/en
strings = load_json_gz('c4-train.00000-of-01024.json.gz')
import torch
from tqdm import tqdm

device = 'cuda:0'

In [2]:
def copy_task(batch_size=64, batches=10, model=None, tokenizer=None, token_max_len=25, shuffle=False):
    string_idx = 0
    success_copies = 0
    for _ in tqdm(range(batches)):
        cur_batch = []
        for count in range(batch_size):
            cur_batch.append(strings[count + string_idx])
        input_ids = tokenizer(cur_batch, return_tensors="pt", truncation=True, max_length=token_max_len).to(device)["input_ids"]
        if shuffle:
            col_perm = torch.randperm(input_ids.size(1))
            input_ids = input_ids[:, col_perm]
        input_ids = torch.cat([input_ids, input_ids], dim=1)
        input_ids = torch.cat([input_ids, input_ids[:, 0:1]], dim=1)
        output_ids = model.generate(input_ids, max_new_tokens = token_max_len-1)
        for count in range(batch_size):
            gold_token_len = (input_ids.shape[1]-1) // 2
            if torch.equal(input_ids[count][:gold_token_len], output_ids[count][gold_token_len*2:]):
                success_copies += 1
        string_idx += batch_size
    return success_copies / (batch_size * batches)

In [3]:
from transformers import GPTNeoXForCausalLM
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
pythia_14b_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-1.4b",
  revision="step3000",
  cache_dir="./pythia-14b/step3000",
)
pythia_14b_model.to(device)
pythia_14b_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-1.4b",
  revision="step3000",
  cache_dir="./pythia-14b/step3000",
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# print(copy_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=25, shuffle=True))
# print(copy_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=50, shuffle=True))
# print(copy_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=100, shuffle=True))
# print(copy_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=150, shuffle=True))
# print(copy_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=200, shuffle=True))
# print(copy_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=250, shuffle=True))

# print(copy_task(model=pythia_14b_model, tokenizer=pythia_14b_tokenizer, token_max_len=25, shuffle=True))
# print(copy_task(batch_size=32, batches=20, model=pythia_14b_model, tokenizer=pythia_14b_tokenizer, token_max_len=50, shuffle=True))
# print(copy_task(batch_size=32, batches=20, model=pythia_14b_model, tokenizer=pythia_14b_tokenizer, token_max_len=100, shuffle=True))
# print(copy_task(batch_size=16, batches=40, model=pythia_14b_model, tokenizer=pythia_14b_tokenizer, token_max_len=150, shuffle=True))
# print(copy_task(batch_size=12, batches=50, model=pythia_14b_model, tokenizer=pythia_14b_tokenizer, token_max_len=200, shuffle=True))
# print(copy_task(batch_size=4, batches=80, model=pythia_14b_model, tokenizer=pythia_14b_tokenizer, token_max_len=250, shuffle=True))
# Shuffle Results for 410m:
# 50, 0.7375
# 100, 0.510936
# 200, 0.2703125, 92.8s
# 300, 0.1796875, 187.3s
# 400, 0.090625, 300.9s
# 500, 0.0484375

# Shuffle Results for 1.4b
# 50, 0.74685, 41.4s
# 100, 0.640625s, 102.9s
# 200, 0.46875, 247.6s
# 300, 0.3984375, 8m 56.1s
# 400 0.23125 15min 12.1s

  0%|          | 0/20 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  5%|▌         | 1/20 [00:05<01:35,  5.02s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
 10%|█         | 2/20 [00:10<01:30,  5.00s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
 15%|█▌        | 3/20 [00:15<01:25,  5.01s/it]The attention mask and the pad token id were not set. As a consequence, you may obs

0.6171875





In [3]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
from transformers import GPTNeoXForCausalLM
pythia_2_8b_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-2.8b",
  revision="step3000",
  cache_dir="./pythia-2.8b/step3000",
)
pythia_2_8b_model.to(device)
pythia_2_8b_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-2.8b",
  revision="step3000",
  cache_dir="./pythia-2.8b/step3000",
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:



# print(copy_task(batch_size=32, batches=20, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=25, shuffle=True))
# print(copy_task(batch_size=16, batches=40, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=50, shuffle=True))
# print(copy_task(batch_size=8, batches=80, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=100, shuffle=True))
# print(copy_task(batch_size=4, batches=160, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=150, shuffle=True))
# print(copy_task(batch_size=4, batches=160, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=200, shuffle=True))
print(copy_task(batch_size=2, batches=160, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=250, shuffle=True)) # Too long to run full 640 samples


#
# 50 0.7953125, 1min 25.1s
# 100 0.784375 4min 11.6s
# 200 0.6796875 14min 17.1s
# 300 0.601525 30m 28.2s
# 400 0.4796 43min 2.2s
# 500 0.41875 44min 55.5s


  0%|          | 0/160 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 1/160 [00:18<49:38, 18.73s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|▏         | 2/160 [00:35<46:17, 17.58s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  2%|▏         | 3/160 [00:52<45:00, 17.20s/it]The attention mask and the pad token id were not set. As a consequence, you may

0.41875





## Phone Book Experiments

In [1]:
# We found the phone book experiment hard to reproduce as the author did not give the exact prompt in the paper. 
# In addition, the accuracy fluctutaed a lot with the prompt we used.
import ast
import torch
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer

device = 'cuda:0'

name_phone_pairs = []
with open('./phonebook.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if line[-1] == ',':
            line = line[:-1]
        pair = ast.literal_eval(line)
        name_phone_pairs.append((pair[0], pair[1]))

import random
def phone_book_task(batch_size=64, batches=10, book_size=20, model=None, tokenizer=None):
    book = ''
    success_lookups = 0
    for i in range(book_size):
        name = name_phone_pairs[i][0]
        phone = name_phone_pairs[i][1]
        book = book + name + ': ' + phone + '.\n'
    book += 'Extract the person\'s phone number in the phonebook above. For example:\nPerson: Liam\nNumber: 436-725-2906\nPerson: Olivia\nNumber: 192-311-5790\n\n'
    for _ in tqdm(range(batches)):
        cur_batch = []
        gold_num_tokens_batch = []
        max_num_tokens = -1
        for _ in range(batch_size):
            query_pair_idx = random.randint(2, book_size)
            query = book + 'Person: ' + name_phone_pairs[query_pair_idx][0] + '\nNumber:'
            gold_num_tokens = tokenizer(name_phone_pairs[query_pair_idx][1], return_tensors="pt", padding=True).to(device)["input_ids"]
            max_num_tokens = max(max_num_tokens, gold_num_tokens.shape[1])
            gold_num_tokens_batch.append(gold_num_tokens[0])
            cur_batch.append(query)
        input_ids = tokenizer(cur_batch, return_tensors="pt", padding=True).to(device)["input_ids"]
        output_ids = model.generate(input_ids, max_new_tokens = max_num_tokens)
        
        for count in range(batch_size):
            true_number = tokenizer.decode(gold_num_tokens_batch[count])
            output_answer = tokenizer.decode(output_ids[count])
            
            if output_answer.count(true_number) > 1:
                success_lookups += 1
    return success_lookups / (batch_size * batches)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pythia_410m_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-410m",
  revision="step3000",
  cache_dir="./pythia-410m/step3000",
)
pythia_410m_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-410m",
  revision="step3000",
  cache_dir="./pythia-410m/step3000",
)
pythia_410m_tokenizer.pad_token = pythia_410m_tokenizer.eos_token 
pythia_410m_model.to(device)
print(phone_book_task(batch_size = 32, batches = 20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer))


# print(phone_book_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, book_size=40))
# print(phone_book_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, book_size=80))
# 0.6828125
# 0.1953125
# 0.053125

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 0/20 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [2]:
from transformers import GPTNeoXForCausalLM
pythia_1_4b_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-1.4b",
  revision="step3000",
  cache_dir="./pythia-1.4b/step3000",
)
pythia_1_4b_model.to(device)
pythia_1_4b_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-1.4b",
  revision="step3000",
  cache_dir="./pythia-1.4b/step3000",
)

pythia_1_4b_tokenizer.pad_token = pythia_1_4b_tokenizer.eos_token 
pythia_1_4b_model.to(device)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
  

In [3]:
print(phone_book_task(batch_size=16, batches=40, model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer))


  0%|          | 0/40 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▎         | 1/40 [00:04<02:41,  4.15s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  5%|▌         | 2/40 [00:07<02:13,  3.52s/it]The attention mask and the pad

0.0



