In [1]:
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT, MemoryGPT

from my_configuration_roberta import MemoryRobertaConfig
from my_modeling_roberta import MemoryRobertaModel

# Set the TIKTOKEN_CACHE_DIR environment variable to a fixed directory.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
os.environ.update({"TIKTOKEN_CACHE_DIR": "/data/yuanhang/tiktoken_cache_dir"})


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /data/yuanhang/anaconda3/envs/moe/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /data/yuanhang/anaconda3/envs/moe/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [2]:
# ---- Sampling configuration -------------------------------------------------
init_from = 'gpt2' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out' # ignored if init_from is not 'resume'
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 2 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda:3' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
# device = 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
# Only query bf16 support when CUDA is actually present: depending on the
# PyTorch version, torch.cuda.is_bf16_supported() inspects the current CUDA
# device and can raise on a machine without one, which would break the
# device = 'cpu' option above.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
# NOTE: this name shadows the built-in compile(); it is kept because later cells read it.
compile = False # use PyTorch 2.0 to compile the model to be faster

# exec(open('configurator.py').read()) # overrides from command line or config file

# ---- Reproducibility and numeric settings -----------------------------------
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast is only used on CUDA; on CPU the context manager is a no-op.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [3]:
# Build the target model from the pretrained weights selected by `init_from`,
# overriding dropout to 0.0, then put it in eval mode on the chosen device.
model = MemoryGPT.from_pretrained(init_from, {'dropout': 0.0})

model.eval()
model.to(device)
if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# ok let's assume gpt-2 encodings by default
enc = tiktoken.get_encoding("gpt2")


def encode(s):
    """Tokenize `s` with the gpt2 encoding, allowing the "<|endoftext|>" special token."""
    return enc.encode(s, allowed_special={"<|endoftext|>"})


def decode(l):
    """Turn a list of token ids back into text with the gpt2 encoding."""
    return enc.decode(l)


# A prompt of the form "FILE:<path>" is replaced by the contents of that file.
file_prefix = 'FILE:'
if start.startswith(file_prefix):
    with open(start[len(file_prefix):], 'r', encoding='utf-8') as prompt_file:
        start = prompt_file.read()

loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
number of parameters: 123.65M


# load memory model

In [4]:
# Restore the memory ("evolver") model from the checkpoint stored in `out_dir`.
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load(ckpt_path, map_location=device)

# The checkpoint carries both the model configuration and its weights.
evolver_config = checkpoint['evolver_config']
evolver_model = MemoryRobertaModel(evolver_config)

# Strip the '_orig_mod.' prefix from parameter names in place (presumably left
# by torch.compile -- TODO confirm) so they match the freshly built model.
state_dict = checkpoint['evolver_model']
unwanted_prefix = '_orig_mod.'
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for key in prefixed_keys:
    state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
evolver_model.load_state_dict(state_dict)

evolver_model.eval()
# Last expression of the cell: its return value is what the notebook displays.
evolver_model.to(device)

number of parameters: 49.62M


MemoryRobertaModel(
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): RobertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): RobertaOutput(
          (dense): Linear(in_features=3072, out_featu

In [5]:
# generate input memory
#
# Each context chunk is run through the target model to obtain embeddings,
# the evolver model folds those embeddings into its memory, and the updated
# memory is turned into the parameter tensor that conditions the target model
# for the next chunk (and for generation afterwards).

# Alternative short context for quick experiments:
# context = ["i am a handsome chinese boy. I live in Beijing now."]
context = ["Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the parameters of the pre-trained model, which becomes prohibitive as the model size and the number of tasks grow. Recent work has proposed a variety of parameter-efficient transfer learning methods that only fine-tune a small number of (extra) parameters to attain strong performance. While effective, the critical ingredients for success and the connections among the various methods are poorly understood. In this paper, we break down the design of state-of-the-art parameter-efficient transfer learning methods and present a unified framework that establishes connections between them. Specifically, we re-frame them as modifications to specific hidden states in pre-trained models, and define a set of design dimensions along which different methods vary, such as the function to compute the modification and the position to apply the modification. Through comprehensive empirical studies across machine translation, text summarization, language understanding, and text classification benchmarks, we utilize the unified view to identify important design choices in previous methods. Furthermore, our unified framework enables the transfer of design elements across different approaches, and as a result we are able to instantiate new parameter-efficient fine-tuning methods that tune less parameters than previous methods while being more effective, achieving comparable results to fine-tuning all parameters on all four tasks."]

# Tokenize every chunk, terminate it with the end-of-text token, and add a
# leading batch dimension of size 1.
encoded_context = []

for c in context:
    ids = encode(c)
    ids.append(enc.eot_token)
    encoded_context.append(torch.tensor(ids, dtype=torch.long, device=device)[None, ...])

print(encoded_context[0].shape)

input_memory = None
target_model_parameter = None

# This is inference only, so disable autograd: otherwise a graph is recorded
# and kept alive through `input_memory` across every context chunk.
with torch.no_grad():
    for ec in encoded_context:
        output_embeds = model(idx=ec, input_parameter=target_model_parameter, output_embeds=True)

        input_memory = evolver_model(inputs_embeds=output_embeds, input_memory=input_memory)["memory_output"]

        # last memory -> X
        target_model_parameter = evolver_model(input_memory=input_memory, produce_parameter_flag=True)

print(input_memory)
print(target_model_parameter.shape)

torch.Size([1, 288])


tensor([[[ 1.4442,  1.0662, -0.2800,  ...,  0.3457, -1.1897,  0.0363],
         [-0.1248, -0.3411,  0.0831,  ...,  0.6779, -0.6243,  1.0166],
         [ 2.6703,  0.3512, -0.6350,  ...,  0.7345, -2.5272,  0.9817],
         ...,
         [-0.2002,  0.6070, -1.9469,  ..., -0.3161, -2.1979,  1.0668],
         [ 2.4858, -0.6262,  0.1987,  ...,  0.0397,  0.1034, -0.6450],
         [ 3.7434, -1.3990,  1.0868,  ...,  1.2074, -0.7821, -1.1913]]],
       device='cuda:3', grad_fn=<SliceBackward0>)
torch.Size([12, 1, 10, 768])


# inference

In [6]:
# Alternative prompts for quick experiments:
# question = "We saw that Reinforce worked well. "
# question = "i am a handsome"
# question = "Do you like apple?"
question = "Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the "

start_ids = encode(start + question)
# Number of prompt tokens; the end-of-text search below must skip over them.
prompt_len = len(start_ids)
x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]
print(x)

# run generation
with torch.no_grad():
    with ctx:
        for sample_index in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, input_parameter=target_model_parameter)

            result = y[0].tolist()

            # Cut the sample at the first end-of-text token that was *generated*.
            # Searching from index 0 would truncate at an end-of-text token
            # inside the prompt itself (e.g. start = "<|endoftext|>") and print
            # nothing useful.
            # NOTE(review): assumes generate() returns the prompt followed by
            # the new tokens -- confirm against MemoryGPT.generate.
            try:
                eot_index = result.index(enc.eot_token, prompt_len)
            except ValueError:
                # No end-of-text token was generated: keep the whole sample.
                eot_index = len(result)

            print(decode(result[:eot_index]))
            print('---------------')

tensor([[34389,    12, 28286,   278,  1588,   662,    12, 35311,  3303,  4981,
           319, 33218,  8861,   468,  1716,   262,   390,    12, 22584,    78,
          4673, 23457,   287,   399, 19930,    13,  2102,    11, 10224, 10581,
          3734,    12,    83,  1726,   477,   262,   220]], device='cuda:3')
Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the ills of NLP, but they have become prohibitively prohibitive in the realm of fine-tuning small-to-moderate-to-failure (MBSB) skills. However, recent attempts to obtain a higher level of skill-level support for MBSB have proven unsuccessful. In recent years we have established various methods to integrate a simple model of the weak links in MBSB that leads to a framework-based understanding of the underlying structure of these connections. In the case of MBSB, these methods lead to an understanding of how to apply t