### 1. Load texts from C4 dataset

In [1]:
import gzip
import json

def load_json_gz(filename):
    with gzip.open(filename, 'r') as f:
        i = 0
        ret = []
        for json_line in f:
            if i == 10000:
                return ret
            data = json.loads(json_line)
            text = data['text']
            if len(text) > 2000:
                ret.append(text)
                i += 1

In [2]:
# Load 10000 strings from C4 dataset: https://huggingface.co/datasets/allenai/c4/tree/main/en
strings = load_json_gz('c4-train.00000-of-01024.json.gz')

In [3]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

device = 'cuda:0'

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def copy_task(batch_size=64, batches=10, model=None, tokenizer=None, token_max_len=25, shuffle=False):
    string_idx = 0
    success_copies = 0
    for _ in tqdm(range(batches)):
        cur_batch = []
        for count in range(batch_size):
            cur_batch.append(strings[count + string_idx])
        input_ids = tokenizer(cur_batch, return_tensors="pt", truncation=True, max_length=token_max_len).to(device)["input_ids"]
        if shuffle:
            col_perm = torch.randperm(input_ids.size(1))
            input_ids = input_ids[:, col_perm]
        input_ids = torch.cat([input_ids, input_ids], dim=1)
        input_ids = torch.cat([input_ids, input_ids[:, 0:1]], dim=1)
        output_ids = model.generate(input_ids, max_new_tokens = token_max_len-1)
        for count in range(batch_size):
            gold_token_len = (input_ids.shape[1]-1) // 2
            if torch.equal(input_ids[count][:gold_token_len], output_ids[count][gold_token_len*2:]):
                success_copies += 1
        string_idx += batch_size
    return success_copies / (batch_size * batches)

### 2. Pythia Copying (with or without shuffle) Experiments

#### 2.1 Pythia 410m results

In [8]:
from transformers import GPTNeoXForCausalLM
pythia_410m_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-410m",
  revision="step3000",
  cache_dir="./pythia-410m/step3000",
)
pythia_410m_model.to(device)
pythia_410m_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-410m",
  revision="step3000",
  cache_dir="./pythia-410m/step3000",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
print(copy_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=25))
# print(copy_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=50))
# print(copy_task(model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=100))
# print(copy_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=150))
# print(copy_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=200))
# print(copy_task(batch_size=32, batches=20, model=pythia_410m_model, tokenizer=pythia_410m_tokenizer, token_max_len=250))

# 50, 0.896
# 100, 0.7625
# 200, 0.5562
# 300, 0.364
# 400, 0.214
# 500, 0.1

#### 2.2 Pythia 1.4B results

In [6]:
from transformers import GPTNeoXForCausalLM
pythia_1_4b_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-1.4b",
  revision="step3000",
  cache_dir="./pythia-1.4b/step3000",
)
pythia_1_4b_model.to(device)
pythia_1_4b_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-1.4b",
  revision="step3000",
  cache_dir="./pythia-1.4b/step3000",
)

Error during conversion: ValueError('Queue is full! Please try again.')
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# print(copy_task(model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer, token_max_len=25))
# print(copy_task(batch_size=32, batches=20, model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer, token_max_len=50))
# print(copy_task(batch_size=32, batches=20, model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer, token_max_len=100))
# print(copy_task(batch_size=16, batches=40, model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer, token_max_len=150))
# print(copy_task(batch_size=16, batches=40, model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer, token_max_len=200))
print(copy_task(batch_size=8, batches=80, model=pythia_1_4b_model, tokenizer=pythia_1_4b_tokenizer, token_max_len=250))

# pythia-1.4b
# 50, 0.8109
# 100, 0.74375
# 200, 0.7375
# 300, 0.665625
# 400, 0.575
# 500, 0.484375

#### 2.3 Pythia 2.8B results

In [4]:
from transformers import GPTNeoXForCausalLM
pythia_2_8b_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-2.8b",
  revision="step3000",
  cache_dir="./pythia-2.8b/step3000",
)
pythia_2_8b_model.to(device)
pythia_2_8b_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-2.8b",
  revision="step3000",
  cache_dir="./pythia-2.8b/step3000",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Error during conversion: ValueError('Queue is full! Please try again.')


In [7]:
# print(copy_task(batch_size=32, batches=20, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=25))
# print(copy_task(batch_size=16, batches=40, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=50))
# print(copy_task(batch_size=8, batches=80, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=100))
# print(copy_task(batch_size=4, batches=160, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=150))
# print(copy_task(batch_size=4, batches=160, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=200))
print(copy_task(batch_size=2, batches=160, model=pythia_2_8b_model, tokenizer=pythia_2_8b_tokenizer, token_max_len=250)) # Too long to run full 640 samples

# pythia-2.8B
# 50, 0.8296875
# 100, 0.796875
# 200, 0.7390625
# 300, 0.6796875
# 400, 0.6171875
# 500, 0.525

  0%|          | 0/160 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 1/160 [00:19<50:44, 19.15s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|▏         | 2/160 [00:35<46:48, 17.77s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  2%|▏         | 3/160 [00:52<45:17, 17.31s/it]The attention mask and the pad token id were not set. As a consequence, you may

0.525





### 3. Mamba Copying (with or without shuffle) Experiments

#### 3.1 Mamba-370m results

In [None]:
mamba_370m_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-370m-hf")
mamba_370m_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-370m-hf")
mamba_370m_model.to(device)
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=25))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=50))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=100))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=150))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=200))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=250))
# 0.8578125, 00:21
# 0.703125, 00:36
# 0.509375, 01:10
# 0.3265625, 01:46
# 0.196875, 02:19
# 0.1, 02:54

In [None]:
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=25, shuffle=True))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=50, shuffle=True))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=100, shuffle=True))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=150, shuffle=True))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=200, shuffle=True))
print(copy_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, token_max_len=250, shuffle=True))
# 0.9296875
# 0.6953125
# 0.078125
# 0.003125
# 0.0
# 0.0

#### 3.2 Mamba 1.4B results

In [5]:
mamba_1_4b_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-1.4b-hf")
mamba_1_4b_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-1.4b-hf")
mamba_1_4b_model.to(device)
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=25))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=50))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=100))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=150))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=200))
print(copy_task(batch_size=32, batches=20, model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=250))
# 0.946875, 01:38
# 0.8953125, 02:12
# 0.7875, 03:55
# 0.6296875, 04:09
# 0.49375, 06:01
# 0.346875, 07:25

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
100%|██████████| 20/20 [07:25<00:00, 22.29s/it]

0.346875





In [None]:
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=25, shuffle=True))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=50, shuffle=True))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=100, shuffle=True))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=150, shuffle=True))
print(copy_task(model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=200, shuffle=True))
print(copy_task(batch_size=32, batches=20, model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, token_max_len=250, shuffle=True))
# 0.9296875
# 0.796875
# 0.3125
# 0.0390625
# 0.0
# 0.0

#### 3.2 Mamba 2.8B results

In [None]:
mamba_2_8b_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-2.8b-hf", padding_size='left')
mamba_2_8b_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-2.8b-hf")
mamba_2_8b_model.to(device)
print(copy_task(model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=25))
print(copy_task(model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=50))
print(copy_task(batch_size=32, batches=20, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=100))
print(copy_task(batch_size=16, batches=40, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=150))
print(copy_task(batch_size=8, batches=80, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=200))
print(copy_task(batch_size=8, batches=80, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=250))
# 0.9484375, 00:55
# 0.9296875, 02:02
# 0.878125, 05:57
# 0.7890625, 13:47
# 0.6875, 24:58
# 0.546875, 31:09

In [None]:
print(copy_task(model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=25, shuffle=True))
print(copy_task(model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=50, shuffle=True))
print(copy_task(batch_size=32, batches=20, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=100, shuffle=True))
print(copy_task(batch_size=16, batches=40, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=150, shuffle=True))
print(copy_task(batch_size=8, batches=80, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=200, shuffle=True))
print(copy_task(batch_size=8, batches=80, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer, token_max_len=250, shuffle=True))

### 4. Mamba phone book lookup results

In [2]:
import ast

name_phone_pairs = []
with open('./phonebook.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if line[-1] == ',':
            line = line[:-1]
        pair = ast.literal_eval(line)
        name_phone_pairs.append((pair[0], pair[1]))

In [3]:
# We found the phone book experiment hard to reproduce as the author did not give the exact prompt in the paper. 
# In addition, the accuracy fluctutaed a lot with the prompt we used.

import random
def phone_book_task(batch_size=64, batches=10, book_size=20, model=None, tokenizer=None):
    book = ''
    success_lookups = 0
    for i in range(book_size):
        name = name_phone_pairs[i][0]
        phone = name_phone_pairs[i][1]
        book = book + name + ': ' + phone + '.\n'
    book += 'Extract the person\'s phone number in the phonebook above. For example:\nPerson: Liam\nNumber: 436-725-2906\nPerson: Olivia\nNumber: 192-311-5790\n\n'
    for _ in tqdm(range(batches)):
        cur_batch = []
        gold_num_tokens_batch = []
        max_num_tokens = -1
        for _ in range(batch_size):
            query_pair_idx = random.randint(2, book_size)
            query = book + 'Person: ' + name_phone_pairs[query_pair_idx][0] + '\nNumber:'
            gold_num_tokens = tokenizer(name_phone_pairs[query_pair_idx][1], return_tensors="pt", padding=True).to(device)["input_ids"]
            max_num_tokens = max(max_num_tokens, gold_num_tokens.shape[1])
            gold_num_tokens_batch.append(gold_num_tokens[0])
            cur_batch.append(query)
        input_ids = tokenizer(cur_batch, return_tensors="pt", padding=True).to(device)["input_ids"]
        output_ids = model.generate(input_ids, max_new_tokens = max_num_tokens)
        for count in range(batch_size):
            true_number = tokenizer.decode(gold_num_tokens_batch[count])
            output_answer = tokenizer.decode(output_ids[count])
            if output_answer.count(true_number) > 1:
                success_lookups += 1
    return success_lookups / (batch_size * batches)

In [11]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

#### 4.1 Mamba 370M results

In [None]:
mamba_370m_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-370m-hf", padding_side='left')
mamba_370m_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-370m-hf")
mamba_370m_model.to(device)
print(phone_book_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer))
print(phone_book_task(model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, book_size=40))
print(phone_book_task(batch_size=32, batches=20, model=mamba_370m_model, tokenizer=mamba_370m_tokenizer, book_size=80))
# 0.6828125
# 0.1953125
# 0.053125

#### 4.2 Mamba 1.4B results

In [4]:
mamba_1_4b_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-1.4b-hf", padding_side='left')
mamba_1_4b_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-1.4b-hf")
mamba_1_4b_model.to(device)
print(phone_book_task(batch_size=32, batches=20, model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer))
print(phone_book_task(batch_size=32, batches=20, model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, book_size=40))
print(phone_book_task(batch_size=32, batches=20, model=mamba_1_4b_model, tokenizer=mamba_1_4b_tokenizer, book_size=80))
# 0.625
# 0.2390625
# 0.0296875


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.01s/it]
100%|██████████| 20/20 [02:25<00:00,  7.27s/it]


0.625


100%|██████████| 40/40 [04:12<00:00,  6.32s/it]


0.2390625


100%|██████████| 20/20 [07:56<00:00, 23.84s/it]

0.0296875





#### 4.3 Mamba 2.8B results

In [5]:
mamba_2_8b_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-2.8b-hf", padding_side='left')
mamba_2_8b_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-2.8b-hf")
mamba_2_8b_model.to(device)
print(phone_book_task(batch_size=16, batches=40, model=mamba_2_8b_model, tokenizer=mamba_2_8b_tokenizer))
# 0.9609375
# 0.4546875
# 0.0890625

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 3/3 [01:23<00:00, 27.84s/it]
100%|██████████| 40/40 [04:43<00:00,  7.08s/it]

0.9609375



