In [1]:
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import MemoryGPT as GPT
from peft import prepare_model_for_int8_training

from my_configuration_roberta import MemoryRobertaConfig
from my_modeling_roberta import MemoryRobertaModel

# Default location of the tiktoken cache directory. setdefault keeps a value that is already
# exported in the environment instead of overwriting it with this machine-specific path.
os.environ.setdefault('TIKTOKEN_CACHE_DIR', "/data/yuanhang/tiktoken_cache_dir")

In [2]:
# ---- sampling / runtime configuration -------------------------------------------------
out_dir = 'out' # directory that holds the evolver checkpoint loaded later in this notebook
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 2 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337

# Use the first GPU when one is present, otherwise fall back to the CPU so the notebook
# still runs on CUDA-less machines. Examples of valid values: 'cpu', 'cuda', 'cuda:0', 'cuda:1'.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Only ask about bf16 support when CUDA is actually available; the query is meaningless
# (and may fail) without a CUDA device. 'float32' or 'bfloat16' or 'float16'.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster

# exec(open('configurator.py').read()) # overrides from command line or config file

# ---- reproducibility and numeric settings ---------------------------------------------
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast is only enabled on CUDA; on CPU `dtype` is unused and a no-op context is returned.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# Load pretrained model

In [3]:
pretrained_model = 'gpt2'

# Only OpenAI GPT-2 checkpoints are accepted as the backbone.
if "gpt" not in pretrained_model:
    raise Exception(f"Unrecognized pretrained model {pretrained_model}")

print(f"Initializing from OpenAI GPT-2 weights: {pretrained_model}")
model = GPT.from_pretrained(pretrained_model, dict(dropout=0.0))
model.eval()
model.to(device)
# Freeze the backbone: none of its parameters requires gradients.
for param in model.parameters():
    param.requires_grad_(False)
pretrained_model_config = model.config

if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# GPT-2 BPE encoding is assumed throughout the notebook.
enc = tiktoken.get_encoding("gpt2")


def encode(s):
    """Tokenize `s`, allowing the literal "<|endoftext|>" to be encoded as a special token."""
    return enc.encode(s, allowed_special={"<|endoftext|>"})


def decode(l):
    """Turn a sequence of token ids back into text."""
    return enc.decode(l)


# A prompt of the form "FILE:<path>" is replaced by the content of that file.
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()

# Show how "<|endoftext|>" tokenizes as ordinary text versus as an allowed special token.
print(enc.encode_ordinary("<|endoftext|>"))
print(enc.encode("<|endoftext|>",  allowed_special={"<|endoftext|>"}))

Initializing from OpenAI GPT-2 weights: gpt2
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
number of parameters: 123.65M
[27, 91, 437, 1659, 5239, 91, 29]
[50256]


# Load memory model

In [4]:
peft_method = "prompt"
# peft_method = "lora"

ckpt_path = os.path.join(out_dir, 'predict_1st_kl_1_seg.pt')
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load(ckpt_path, map_location=device)

# Rebuild the evolver from the configuration stored next to its weights.
evolver_config = checkpoint['evolver_config']
evolver_model = MemoryRobertaModel(evolver_config)

# Strip the '_orig_mod.' prefix from weight names in place
# (presumably left by torch.compile when the checkpoint was saved; TODO confirm).
state_dict = checkpoint['evolver_model']
unwanted_prefix = '_orig_mod.'
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for key in prefixed_keys:
    state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
evolver_model.load_state_dict(state_dict)

evolver_model.eval()
evolver_model.to(device)

number of parameters: 49.62M


MemoryRobertaModel(
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): RobertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): RobertaOutput(
          (dense): Linear(in_features=3072, out_featu

In [7]:
# Display the "iter_num" entry stored in the loaded checkpoint
# (presumably the training iteration at which it was saved; TODO confirm).
checkpoint["iter_num"]

6500

# Inference

In [11]:
def generate_sentence(question, input_parameter=None, peft="prompt"):
    """Generate one continuation of `question` with the backbone model and print it.

    Args:
        question: prompt text; it is tokenized with `encode` and only the last 512 tokens are kept.
        input_parameter: forwarded unchanged to `model.generate` (e.g. the value returned by
            `generate_parameter`); None by default.
        peft: forwarded unchanged to `model.generate`.

    Prints the prompt token tensor, then the generated text cut before the first
    end-of-text token, followed by two separator lines. Returns None.
    """
    prompt = torch.tensor(encode(question), dtype=torch.long, device=device).unsqueeze(0)
    x = prompt[:, -512:]
    prompt_len = x.shape[1]
    print(x)

    # Sampling settings (max_new_tokens, temperature, top_k) come from the notebook configuration.
    with torch.no_grad(), ctx:
        y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, input_parameter=input_parameter, peft=peft)

    # Drop the first prompt_len tokens and stop before the first end-of-text token.
    generated = y[0].tolist()[prompt_len:]
    if enc.eot_token in generated:
        generated = generated[:generated.index(enc.eot_token)]

    print(decode(generated))
    print('---------------')
    print('===============================================================')


def generate_parameter(context_list=None, context_id_list=None):
    """Fold context segments into the evolver memory and return the parameter produced from it.

    Args:
        context_list: optional list of strings; each one is tokenized with `encode` into a
            (1, n_tokens) tensor on `device`.
        context_id_list: optional iterable of already tokenized segments, each passed to
            `model` as `idx` as-is. Takes precedence over `context_list` when both are given.

    Returns:
        The output of `evolver_model(input_memory=..., produce_parameter_flag=True)` for the
        memory obtained after the last segment.

    Raises:
        ValueError: if neither `context_list` nor `context_id_list` is provided.
    """
    if context_id_list is not None:
        # Pre-tokenized input wins, so the strings are not tokenized needlessly.
        encoded_context = context_id_list
    elif context_list is not None:
        encoded_context = [torch.tensor(encode(c))[None, ...].to(device) for c in context_list]
    else:
        raise ValueError("generate_parameter requires context_list or context_id_list")

    # Segments are processed in order; the memory written for one segment is the input
    # memory for the next (None for the first segment).
    input_memory = None
    for ec in encoded_context:
        output_embeds = model(idx=ec, output_embeds=True)
        input_memory = evolver_model(inputs_embeds=output_embeds, input_memory=input_memory)["memory_output"]

    # last memory -> parameter for the backbone model
    return evolver_model(input_memory=input_memory, produce_parameter_flag=True)

In [38]:
# Answer the same question in three ways to see how much the context helps.
question = ' The first name of the current US president is "'
context = ["Joe Biden, born in Scranton, Pennsylvania, on November 20, 1942, had a modest upbringing in a middle-class family. He attended the University of Delaware, "]

# 1) question only, no context
generate_sentence(question)

# 2) context supplied through the parameter produced by generate_parameter
input_parameter = generate_parameter(context_list=context)
generate_sentence(question, input_parameter, peft_method)

# 3) context segments prepended to the prompt itself
context_question = "".join(context) + question
generate_sentence(context_question)


# Context / question pair left in scope for the next generation run.
# Replace these two values to try another passage.
context = ['Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. ']

question = "However, conventional approaches fine-tune all the "


tensor([[ 383,  717, 1438,  286,  262, 1459, 1294, 1893,  318,  366]],
       device='cuda:0')
Donald Trump." Trump is the oldest of the Republican candidates, and the first and probably only person president with an actual corporate name.

And yet, the size of his campaign is astonishing, and he's ready to move the American people in a new direction.

Trump's campaign, speaking to the National Enquirer, said there's a "huge resistance" in Washington to the idea of him and his son, Donald Jr., joining the class of 2016.

"The idea of a Trump campaign in which there is very little consensus in Washington is unconscionable, morally wrong," said Doug Stafford, a political scientist at John Jay College of Criminal Justice.

"The reality is that if Donald Trump is elected, there will be a revolt against his idea of an American exceptionalism," said Stafford, adding that the Republican nominee will also make it clear he opposes the Constitution – a theme that has become increasingly central 

# Loss test

In [33]:
# Context passage and the text that is scored against it.
context = ["""When he grew up to an age fit for going to school, he was put under the care of the rev. Mr. Naish at Ambrosbury. He afterwards removed to a school at Salisbury, taught by the rev. Mr. Taylor, thence to the Charter-house, where he was under the tuition of the learned Dr. Ellis, and where he contracted an intimacy with Mr. Steel, afterwards Sir Richard, which continued as long as Mr. Addison lived."""]
input_parameter = generate_parameter(context_list=context)

continuation = """He was not above fifteen years old when he was entered of Queen's College, Oxford, in which his father had been placed: where he applied himself so closely to the study of classical learning, that in a very short time he became master of a very elegant Latin stile, even before he arrived at that age when ordinary scholars begin to write good English."""
token_ids = torch.tensor(encode(continuation), dtype=torch.long, device=device)[None, ...]
# Next-token prediction: inputs are all tokens but the last, targets all tokens but the first.
y = token_ids[:, 1:]
x = token_ids[:, :-1]

# (1) loss of the continuation without any context
_, loss = model(x, y)
print(loss)

# (2) loss with the context prepended to the input; context positions get target -1
#     (presumably ignored by the model's loss; TODO confirm against model.py)
encoded_context = torch.tensor(encode(context[0]), dtype=torch.long, device=device)[None, ...]
padding_y = torch.full_like(encoded_context, fill_value=-1, dtype=torch.long, device=device)

context_x = torch.cat((encoded_context, x), dim=1)
context_y = torch.cat((padding_y, y), dim=1)
_, loss = model(context_x, context_y)
print(loss)

# (3) loss with the context supplied through the generated parameter
_, loss = model(x, y, input_parameter=input_parameter, peft=peft_method)
print(loss)

tensor(3.7923, device='cuda:0')
tensor(3.5367, device='cuda:0')
tensor(3.8945, device='cuda:0', grad_fn=<NllLossBackward0>)


In [31]:
import numpy as np

dataset = "pg19"
# dataset = "openwebtext"

# Token ids of the training split, memory-mapped read-only as uint16.
data_dir = os.path.join('data', dataset)
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')

print(len(train_data))

# Two consecutive 512-token windows: the first is the context, the second the scored input.
context_start = 42182322
input_start = context_start + 512
input_end = input_start + 512

# Context as text (a single segment) and the parameter generated from it.
context = [decode(train_data[context_start:input_start])]
context_str = "".join(context)

input_parameter = generate_parameter(context_list=context)

# Inputs and next-token targets: the target window is the input window shifted by one token.
input_ids = train_data[input_start:input_end]
target_ids = train_data[input_start + 1:input_end + 1]
x_str = decode(input_ids)
y_str = decode(target_ids)

x = torch.from_numpy(input_ids.astype(np.int64)).unsqueeze(0).to(device)
y = torch.from_numpy(target_ids.astype(np.int64)).unsqueeze(0).to(device)

# (1) loss of the input window without any context
_, loss = model(x, y)
print(loss)

# (2) loss with the re-encoded context prepended; context positions get target -1 and
#     the sequence is cut to its last 1024 tokens
encoded_context = torch.tensor(encode(context_str), dtype=torch.long, device=device)[None, ...]
padding_y = torch.full_like(encoded_context, fill_value=-1, dtype=torch.long, device=device)

context_x = torch.cat((encoded_context, x), dim=1)[:, -1024:]
context_y = torch.cat((padding_y, y), dim=1)[:, -1024:]
_, loss = model(context_x, context_y)
print(loss)

# (3) loss with the context supplied through the generated parameter
_, loss = model(x, y, input_parameter=input_parameter, peft=peft_method)
print(loss)

3066539608
tensor(4.2906, device='cuda:0')
tensor(3.9497, device='cuda:0')
tensor(3.8115, device='cuda:0', grad_fn=<NllLossBackward0>)


In [15]:
# Show the context text and the input text, each followed by a 100-dash rule.
separator = "-" * 100
for text in (context_str, x_str):
    print(text)
    print(separator)

 the 6th of May
1672; and being not thought likely to live, was baptized on the same
day, as appears from the church register. When he grew up to an age fit
for going to school, he was put under the care of the rev. Mr. Naish at
Ambrosbury. He afterwards removed to a school at Salisbury, taught by
the rev. Mr. Taylor, thence to the Charter-house, where he was under the
tuition of the learned Dr. Ellis, and where he contracted an intimacy
with Mr. Steel, afterwards Sir Richard, which continued as long as Mr.
Addison lived. He was not above fifteen years old when he was entered of
Queen's College, Oxford, in which his father had been placed: where he
applied himself so closely to the study of classical learning, that in
a very short time he became master of a very elegant Latin stile, even
before he arrived at that age when ordinary scholars begin to write good
English.

In the year 1687 a copy of his verses in that tongue fell into the hands
of Dr. Lancaster dean of Magdalen College, wh