In [1]:
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import MemoryGPT as GPT
import random

from my_configuration_roberta import MemoryRobertaConfig
from my_modeling_roberta import MemoryRobertaModel

# Point tiktoken at a local BPE cache directory. The path is machine-specific, so an
# already-exported TIKTOKEN_CACHE_DIR takes precedence instead of being overwritten.
os.environ.setdefault('TIKTOKEN_CACHE_DIR', "/data/yuanhang/tiktoken_cache_dir")


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /data/yuanhang/anaconda3/envs/moe/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /data/yuanhang/anaconda3/envs/moe/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [2]:
# --- generation settings -----------------------------------------------------
out_dir = 'out' # directory holding the memory-model checkpoint (joined into ckpt_path below)
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 2 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337

# --- runtime settings --------------------------------------------------------
device = 'cuda:0' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
# device = 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
# Only probe bf16 support when CUDA is actually present: on a CUDA-less machine the
# probe can raise (it queries the current device on some torch versions).
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster (NOTE: shadows the builtin; name kept because later cells read it)

# exec(open('configurator.py').read()) # overrides from command line or config file

# Seed every RNG this notebook draws from. Python's `random` is used further down to
# pick evaluation segments, so it has to be seeded for the reported losses to be reproducible.
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast context for CUDA runs; a no-op context on CPU.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# Load pretrained model

In [3]:
pretrained_model = 'gpt2'

# Only GPT-2 style checkpoints are accepted as the backbone; bail out early otherwise.
if "gpt" not in pretrained_model:
    raise Exception(f"Unrecognized pretrained model {pretrained_model}")

print(f"Initializing from OpenAI GPT-2 weights: {pretrained_model}")
# initialize from OpenAI GPT-2 weights, with dropout overridden to 0.0
model = GPT.from_pretrained(pretrained_model, {"dropout": 0.0})
model.eval()
model.to(device)
# backbone frozen: no parameter of the pretrained model receives gradients
for param in model.parameters():
    param.requires_grad_(False)

pretrained_model_config = model.config

if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# ok let's assume gpt-2 encodings by default
enc = tiktoken.get_encoding("gpt2")


def encode(s):
    """Tokenize `s` with the GPT-2 encoding, allowing the "<|endoftext|>" special token."""
    return enc.encode(s, allowed_special={"<|endoftext|>"})


def decode(l):
    """Turn a sequence of GPT-2 token ids back into text."""
    return enc.decode(l)


# encode the beginning of the prompt: "FILE:<path>" means the prompt text lives in that file
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as prompt_file:
        start = prompt_file.read()

# Sanity check: the ordinary encoding splits "<|endoftext|>" into plain tokens,
# while the special-aware encoding maps it to a single id.
print(enc.encode_ordinary("<|endoftext|>"))
print(enc.encode("<|endoftext|>",  allowed_special={"<|endoftext|>"}))

Initializing from OpenAI GPT-2 weights: gpt2
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
number of parameters: 123.65M
[27, 91, 437, 1659, 5239, 91, 29]
[50256]


# Load memory model

In [4]:
# Read the memory ("evolver") checkpoint straight onto the target device.
ckpt_path = os.path.join(out_dir, 'repeat_1st_batch_4_seg.pt')
checkpoint = torch.load(ckpt_path, map_location=device)

evolver_config = checkpoint['evolver_config']
state_dict = checkpoint['evolver_model']

# Some keys may carry an '_orig_mod.' prefix (presumably left by saving a
# torch.compile-wrapped model - confirm against the training script); strip it in place.
unwanted_prefix = '_orig_mod.'
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for prefixed_key in prefixed_keys:
    state_dict[prefixed_key[len(unwanted_prefix):]] = state_dict.pop(prefixed_key)

evolver_model = MemoryRobertaModel(evolver_config)
evolver_model.load_state_dict(state_dict)

evolver_model.eval()
# Kept as the last expression so the cell displays the module structure.
evolver_model.to(device)

number of parameters: 49.62M


MemoryRobertaModel(
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): RobertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): RobertaOutput(
          (dense): Linear(in_features=3072, out_featu

# Inference

In [5]:
import numpy as np

def generate_sentence(question, input_parameter=None, top_k=1, max_display_tokens=200):
    """Generate a continuation of `question` with the backbone model and print it.

    Args:
        question: Prompt string; tokenized with the notebook-level `encode`.
        input_parameter: Optional value forwarded unchanged to `model.generate`
            (the notebook passes the result of `generate_parameter`). `None`
            runs the backbone without it.
        top_k: Top-k cutoff forwarded to `model.generate`. Defaults to 1, the
            value that used to be hard-coded here (presumably making decoding
            effectively greedy - confirm against `model.generate`).
        max_display_tokens: Only the first `max_display_tokens` tokens of the
            returned sequence are decoded and printed (default 200, the
            previously hard-coded cut-off).

    Returns:
        None. Prints the prompt token tensor, the decoded text up to (not
        including) the first end-of-text token, and two separator lines.
    """
    start_ids = encode(question)
    x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]
    print(x)

    # run generation without autograd, inside the notebook-level autocast context
    with torch.no_grad(), ctx:
        y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, input_parameter=input_parameter)

    result = y[0].tolist()[:max_display_tokens]

    # Cut the sequence at the first end-of-text token, if there is one.
    if enc.eot_token in result:
        result = result[:result.index(enc.eot_token)]

    print(decode(result))
    print('---------------')
    print('===============================================================')


def generate_parameter(context_list=None, context_id_list=None, run_num=1, memory_lr=1.0):
    """Fold context segments into the memory one at a time and turn the final memory into a parameter.

    Each segment is passed through the backbone (`model(idx=..., output_embeds=True)`),
    and the result is fed to `evolver_model` together with the memory produced by the
    previous segment (`None` for the very first one). After all passes, the last
    memory is handed to `evolver_model(..., produce_parameter_flag=True)`.

    Args:
        context_list: Sequence of segments. A `str` is tokenized with the
            notebook-level `encode`; anything else is treated as a NumPy
            array of token ids and cast to int64.
        context_id_list: Sequence of ready-made token-id tensors, passed to the
            backbone as-is. Takes precedence over `context_list` when both are given.
        run_num: Number of passes over the full list of segments.
        memory_lr: Not used by this function; accepted so the signature matches
            `generate_batch_parameter`.

    Returns:
        Whatever `evolver_model(input_memory=..., produce_parameter_flag=True)`
        returns. It is computed under `torch.no_grad()`, so it carries no
        autograd graph.

    Raises:
        ValueError: If neither `context_list` nor `context_id_list` is given.
    """
    if context_id_list is not None:
        encoded_context = context_id_list
    elif context_list is not None:
        encoded_context = []
        for c in context_list:
            ids = encode(c) if isinstance(c, str) else c.astype(np.int64)
            # add a leading batch dimension of 1
            encoded_context.append(torch.tensor(ids)[None, ...].to(device))
    else:
        raise ValueError("generate_parameter requires context_list or context_id_list")

    input_memory = None

    # Inference only: keep both the memory updates and the final parameter
    # production out of autograd, so downstream losses do not build a graph.
    with torch.no_grad():
        for _ in range(run_num):
            for ec in encoded_context:
                output_embeds = model(idx=ec, output_embeds=True)

                input_memory = evolver_model(inputs_embeds=output_embeds, input_memory=input_memory)["memory_output"]

        # last memory -> X
        target_model_parameter = evolver_model(input_memory=input_memory, produce_parameter_flag=True)

    return target_model_parameter


def generate_batch_parameter(context_list=None, context_id_list=None, run_num=1, memory_lr=1.0):
    """Update one shared memory from batches of segments, then turn it into a parameter.

    Starting from `evolver_model.initial_memory`, every batch is processed as follows:
    the shared memory is repeated once per row of the batch, `evolver_model` produces
    a new memory per row, the per-row change is averaged over the batch, and that
    average, scaled by `memory_lr`, is added to the shared memory. After all passes
    the shared memory (with a leading dimension added) is handed to
    `evolver_model(..., produce_parameter_flag=True)`.

    Args:
        context_list: Sequence of segments. A `str` is tokenized with the
            notebook-level `encode`; anything else is treated as a NumPy array
            of token ids and cast to int64. All segments are concatenated
            along dim 0 into a single batch, so they must have equal length.
        context_id_list: Sequence of ready-made token-id batches, passed to the
            backbone as-is. Takes precedence over `context_list` when both are given.
        run_num: Number of passes over the full list of batches.
        memory_lr: Step size applied to the batch-averaged memory change.

    Returns:
        Whatever `evolver_model(input_memory=..., produce_parameter_flag=True)`
        returns. It is computed under `torch.no_grad()`, so it carries no
        autograd graph.

    Raises:
        ValueError: If neither `context_list` nor `context_id_list` is given.
    """
    if context_id_list is not None:
        encoded_context_list = context_id_list
    elif context_list is not None:
        rows = []
        for c in context_list:
            ids = encode(c) if isinstance(c, str) else c.astype(np.int64)
            rows.append(torch.tensor(ids)[None, ...].to(device))

        # one batch holding every segment as a row
        encoded_context_list = [torch.cat(rows, dim=0)]
    else:
        raise ValueError("generate_batch_parameter requires context_list or context_id_list")

    input_memory = evolver_model.initial_memory

    # Inference only: keep both the memory updates and the final parameter
    # production out of autograd, so downstream losses do not build a graph.
    with torch.no_grad():
        for _ in range(run_num):
            for ec in encoded_context_list:
                # Repeat the shared memory once per row of *this* batch (not the first
                # batch), so batches of different sizes are handled correctly.
                batch_input_memory = input_memory.repeat(ec.shape[0], 1, 1)

                output_embeds = model(idx=ec, output_embeds=True)

                new_batch_input_memory = evolver_model(inputs_embeds=output_embeds, input_memory=batch_input_memory)["memory_output"]
                delta_batch_input_memory = new_batch_input_memory - batch_input_memory
                # presumably (memory_size, hidden_size) after averaging - TODO confirm
                delta_input_memory = delta_batch_input_memory.mean(dim=0) # todo: mean or sum or other?
                input_memory = input_memory + delta_input_memory * memory_lr

        # last memory -> X
        target_model_parameter = evolver_model(input_memory=input_memory.unsqueeze(0), produce_parameter_flag=True)

    return target_model_parameter

In [10]:
context = ["""Joe Biden, born in Scranton, Pennsylvania, on November 20, 1942, had a modest upbringing in a middle-class family. He attended the University of Delaware, where he double-majored in history and political science, graduating in 1965. Afterward, he earned his law degree from Syracuse University College of Law in 1968.\nBiden's early political career began in 1970 when he was elected to the New Castle County Council in Delaware. In 1972, tragedy struck when his wife Neilia and 1-year-old daughter Naomi were killed in a car accident, and his two sons, Beau and Hunter, were injured. Despite this devastating loss, Biden chose to honor his commitment and was sworn in as a senator by his sons' hospital bedsides.\nHe went on to serve as the United States Senator from Delaware for six terms, from 1973 to 2009. During his time in the Senate, Biden was involved in various committees and was particularly known for his expertise in foreign affairs, serving as the chairman of the Senate Foreign Relations Committee on multiple occasions.\nIn 2008, Joe Biden was selected as the running mate for Barack Obama, who went on to win the presidential election. As Vice President, Biden played an integral role in the Obama administration, helping to shape policies and handling issues such as economic recovery, foreign relations, and the implementation of the Affordable Care Act (ACA), commonly known as Obamacare.\nAfter completing two terms as Vice President, Joe Biden decided to run for the presidency in 2020. He secured the Democratic nomination and faced the incumbent President Donald Trump in the general election. Biden campaigned on a platform of unity, promising to heal the divisions in the country and tackle pressing issues, including the COVID-19 pandemic, climate change, racial justice, and economic inequality.\nIn the November 2020 election, Biden emerged victorious, and on January 20, 2021, he was inaugurated as the 46th President of the United States. 
At the age of 78, Biden became the oldest person to assume the presidency in American history.\nAs President, Joe Biden has worked to implement his agenda, focusing on various initiatives, such as infrastructure investment, climate action, immigration reform, and expanding access to healthcare. He has emphasized the importance of diplomacy in international relations and has sought to rebuild alliances with global partners.\nThroughout his long career in public service, Joe Biden has been recognized for his commitment to bipartisanship, empathy, and his dedication to working-class issues. He continues to navigate the challenges facing the nation, striving to bring the country together and create positive change for all Americans."""]
# Compress the context into a memory-derived parameter (one pass over the context).
input_parameter = generate_parameter(context_list=context, run_num=1)

# Recall probe: continue the opening words of the context, first with the plain
# backbone and then with the memory-derived parameter injected.
question = "Joe Biden, born in Scranton, Pennsylvania, on"
generate_sentence(question, input_parameter=None)
generate_sentence(question, input_parameter=input_parameter)

# Question-style probe, answered with the memory-derived parameter only.
question = ' The first name of the current US president is "'
generate_sentence(question, input_parameter=input_parameter)

tensor([[19585, 21010,    11,  4642,   287,  1446,  5250,   261,    11,  9589,
            11,   319]], device='cuda:0')
Joe Biden, born in Scranton, Pennsylvania, on November 20, 1942, a modest upbringing in a middle-class family. He attended a university in the Netherlands, where he attended a double-majored history class in college. After graduating from law school, he earned his law degree from University of Delaware in 1970.

Biden's early career in law school was registered in the New Castle County Council in 1972. In 1972, when tragedy struck his wife and his daughter 1.88 million people killed in a tragedy, Naomi Hunter and her son were killed in a car accident, and Beau Biden, Beau and his daughter were killed in honor. Despite his commitment to honor and commitment to Biden as a senator, he chose his son as a senator from his bedside. He went from serving as Delaware's vice president for Delaware for six years, from 1973 to 2009. During his time in Washington, he was involved

# Loss test

In [None]:
# Context the model should "remember", and the parameter derived from it.
context = ["""I've been reading a lot of articles on nanomedicine and crispr, and it's all really interesting to me. How much about ourselves would we be able to change once we finally crack the code of our own bodies? Would we be able to change our skeletal structure? Eye color? Could we alter our immune system to be more effective against viruses?"""]
input_parameter = generate_parameter(context_list=context)

# Continuation text on which the language-modelling loss is measured.
target_text = """I know currently none of this is even remotely possible, but it feels like we're dusting off the stepping stones to some pretty promising prospects, and I'm curious to see how fast things will go once we get the ball rolling. Or if, y'know, we all die before even a modicum of useful information is found."""
target_ids = torch.tensor(encode(target_text), dtype=torch.long, device=device)[None, ...]
# next-token setup: targets are the inputs shifted left by one position
y = target_ids[:, 1:]
x = target_ids[:, :-1]

encoded_context = torch.tensor(encode(context[0]), dtype=torch.long, device=device)[None, ...]
# Targets of -1 over the context positions; presumably the model's loss ignores
# -1 targets so only the continuation is scored - confirm in model.py.
padding_y = torch.full_like(encoded_context, fill_value=-1, dtype=torch.long, device=device)

context_x = torch.concatenate((encoded_context, x), dim=1)
context_y = torch.concatenate((padding_y, y), dim=1)

# Pure evaluation: no autograd graph is needed for any of the three losses.
with torch.no_grad():
    # 1) context supplied in the prompt
    _, loss = model(context_x, context_y)
    print(loss)

    # 2) no context at all
    _, loss = model(x, y)
    print(loss)

    # 3) context supplied through the memory-derived parameter
    _, loss = model(x, y, input_parameter=input_parameter)
    print(loss)

tensor(2.9607, device='cuda:0')
tensor(3.1499, device='cuda:0')
tensor(6.8289, device='cuda:0', grad_fn=<NllLossBackward0>)


In [11]:
import numpy as np

dataset = "pg19"
# dataset = "openwebtext"

# Token ids are stored as a flat uint16 array; memory-map it read-only.
data_dir = os.path.join('data', dataset)
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')

print(len(train_data))

####-----------------------------------------------------
# Layout inside train_data: [context_len context tokens][input_len evaluated tokens]
context_len = 256
input_len = 512
context_start = 783332
input_start = context_start + context_len
input_end = input_start + input_len

context = [decode(train_data[context_start:input_start])]
context_str = "".join(context)

input_parameter = generate_parameter(context_list=context)

x_str = decode(train_data[input_start:input_end])
y_str = decode(train_data[input_start + 1:input_end + 1])

# next-token setup: y is x shifted by one position
x = torch.from_numpy(train_data[input_start:input_end].astype(np.int64)).unsqueeze(0).to(device)
y = torch.from_numpy(train_data[input_start + 1:input_end + 1].astype(np.int64)).unsqueeze(0).to(device)

# Pure evaluation: no autograd graph is needed for any of the losses below.
with torch.no_grad():
    ####-----------------------------------------------------
    # 1) no context
    _, loss = model(x, y)
    print(loss)

    ####-----------------------------------------------------
    # 2) context supplied in the prompt; context positions get target -1
    #    (presumably ignored by the model's loss - confirm in model.py)
    encoded_context = torch.tensor(encode(context_str), dtype=torch.long, device=device)[None, ...]
    padding_y = torch.full_like(encoded_context, fill_value=-1, dtype=torch.long, device=device)

    # keep at most the last 1024 positions
    context_x = torch.concatenate((encoded_context, x), dim=1)[:, -1024:]
    context_y = torch.concatenate((padding_y, y), dim=1)[:, -1024:]
    _, loss = model(context_x, context_y)
    print(loss)

    ####-----------------------------------------------------
    # 3) context supplied through the memory-derived parameter
    _, loss = model(x, y, input_parameter=input_parameter)
    print(loss)

    ####-----------------------------------------------------
    # 4) loss on the memorised context itself, with the memory-derived parameter
    x = torch.from_numpy(train_data[context_start:input_start].astype(np.int64)).unsqueeze(0).to(device)
    y = torch.from_numpy(train_data[context_start + 1:input_start + 1].astype(np.int64)).unsqueeze(0).to(device)

    _, loss = model(x, y, input_parameter=input_parameter)
    print(loss)

3066539608
tensor(3.0196, device='cuda:0')
tensor(2.5594, device='cuda:0')
tensor(7.7444, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.9256, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
# Show the context text and then the evaluated input text, each followed by a 100-dash rule.
for text in (context_str, x_str):
    print(text)
    print("-"*100)

erved and worried about losing her best link to a president her family takes credit for helping get elected but believes Bannon will be able to maintain his influence, people close to the family said. ….As tensions have heightened in recent weeks, the Bannon and Kushner camps have devolved into opposing firing squads. Team Bannon believes the hosts of MSNBC’s “Morning Joe,” a show the president watches regularly, are speaking regularly with Kushner and projecting his anti-Bannon sentiments. Kushner allies, meanwhile, finger Bannon as responsible for unflattering stories involving the president’s son-in-law, including those focusing on Kushner’s talks with Russians.

There are factions in every White House, but Trump’s factions sound far more like a bunch of squabbling first-graders than most. “You’ve been bad-mouthing me to Joe!” “Yeah, well, who leaked that Russia stuff to the Times?”

Unfortunately, the story ends with this:

For Trump, one bright spot was the decision to launch 59 m

In [30]:
this_start = 332232
this_length = 128
this_segment_num = 100

# Build the context as raw token-id slices: this_segment_num consecutive
# windows of this_length tokens starting at this_start.
context = [
    train_data[this_start + i * this_length:this_start + (i + 1) * this_length]
    for i in range(this_segment_num)
]

# input_parameter = generate_parameter(context_list=context, run_num=1)
input_parameter = generate_batch_parameter(context_list=context, run_num=5, memory_lr=1.0)

####-----------------------------------------------------
# Compare the backbone with and without the memory-derived parameter on
# randomly placed segments of train_data.
random_segment_len = 100  # number of random segments evaluated (name kept for compatibility)
min_segment_tokens = 128
max_segment_tokens = 512
# 2866539608 is the upper bound used for the recorded run; it is clamped so that a
# segment plus its one-token-shifted target can never run past the end of the data
# (relevant when switching to a smaller dataset).
max_context_start = min(2866539608, len(train_data) - max_segment_tokens - 1)

gpt_loss = torch.tensor(0.0)
memory_loss = torch.tensor(0.0)

# Pure evaluation: no autograd graph is needed.
with torch.no_grad():
    for _ in range(random_segment_len):
        context_start = random.randint(0, max_context_start)
        input_start = context_start + random.randint(min_segment_tokens, max_segment_tokens)
        # next-token setup: y is x shifted by one position
        x = torch.from_numpy(train_data[context_start:input_start].astype(np.int64)).unsqueeze(0).to(device)
        y = torch.from_numpy(train_data[context_start + 1:input_start + 1].astype(np.int64)).unsqueeze(0).to(device)

        _, loss = model(x, y, input_parameter=input_parameter)
        memory_loss += loss.item() / random_segment_len
        _, loss = model(x, y)
        gpt_loss += loss.item() / random_segment_len

# exp of the averaged per-segment losses, then the averaged losses themselves
print(torch.exp(memory_loss), torch.exp(gpt_loss))
print(memory_loss, gpt_loss)


tensor(41.5292) tensor(42.5889)
tensor(3.7264) tensor(3.7516)
