In [1]:
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import GPTConfig, GPT, MemoryGPT

from my_configuration_roberta import MemoryRobertaConfig
from my_modeling_roberta import MemoryRobertaModel

os.environ['TIKTOKEN_CACHE_DIR']="/data/yuanhang/tiktoken_cache_dir"

In [2]:
init_from = 'gpt2' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out' # ignored if init_from is not 'resume'
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 2 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda:3' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster

# exec(open('configurator.py').read()) # overrides from command line or config file

In [3]:
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [4]:
# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a given GPT-2 model
    model = MemoryGPT.from_pretrained(init_from, dict(dropout=0.0))

loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0


number of parameters: 123.65M


In [5]:
model.eval()
model.to(device)
if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# look for the meta pickle in case it is available in the dataset folder
load_meta = False
if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
    meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
    load_meta = os.path.exists(meta_path)
if load_meta:
    print(f"Loading meta from {meta_path}...")
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    # TODO want to make this more general to arbitrary encoder/decoder schemes
    stoi, itos = meta['stoi'], meta['itos']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
else:
    # ok let's assume gpt-2 encodings by default
    print("No meta.pkl found, assuming GPT-2 encodings...")
    enc = tiktoken.get_encoding("gpt2")
    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
    decode = lambda l: enc.decode(l)

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()

No meta.pkl found, assuming GPT-2 encodings...


# Load Memory

In [7]:
ckpt_path = os.path.join(out_dir, 'second_pure_mem_one_seg_ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
# evolver_config = MemoryRobertaConfig(vocab_size=checkpoint['model_args']['vocab_size'] + 20, num_hidden_layers=6,
#                                      num_attention_heads=12, hidden_size=768, max_position_embeddings=512, intermediate_size=3072,
#                                      pad_token_id=0, gpt2_token_id_offset=20, num_memory=10,
#                                      num_target_model_layer=12)
old_evolver_config = checkpoint['evolver_config']
evolver_config = MemoryRobertaConfig(vocab_size=checkpoint['model_args']['vocab_size'] + 20, num_hidden_layers=old_evolver_config.num_hidden_layers,
                                     num_attention_heads=old_evolver_config.num_attention_heads, 
                                     hidden_size=old_evolver_config.hidden_size, max_position_embeddings=old_evolver_config.max_position_embeddings, 
                                     intermediate_size=old_evolver_config.intermediate_size,
                                     pad_token_id=old_evolver_config.pad_token_id, gpt2_token_id_offset=old_evolver_config.gpt2_token_id_offset, 
                                     num_memory=old_evolver_config.num_memory,
                                     num_target_model_layer=old_evolver_config.num_target_model_layer, no_embeddings=False)
evolver_model = MemoryRobertaModel(evolver_config)
state_dict = checkpoint['evolver_model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
evolver_model.load_state_dict(state_dict)

number of parameters: 88.63M


<All keys matched successfully>

In [8]:
evolver_model.eval()
evolver_model.to(device)

MemoryRobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50277, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dr

In [12]:
# generate input memory

# context = ["i am a handsome chinese boy. I live in Beijing now.", "i am a handsome chinese boy. I live in Beijing now.", "i am a handsome chinese boy. I live in Beijing now.", "i am a handsome chinese boy. I live in Beijing now."]
context = ["We saw that Reinforce worked well. However, because we use Monte-Carlo sampling to estimate return (we use an entire episode to calculate the return), we have significant variance in policy gradient estimation.", 
           "i am a handsome chinese boy. I love Japanese anime girls."
           "Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the parameters of the pre-trained model, which becomes prohibitive as the model size and the number of tasks grow. Recent work has proposed a variety of parameter-efficient transfer learning methods that only fine-tune a small number of (extra) parameters to attain strong performance. While effective, the critical ingredients for success and the connections among the various methods are poorly understood. In this paper, we break down the design of state-of-the-art parameter-efficient transfer learning methods and present a unified framework that establishes connections between them. Specifically, we re-frame them as modifications to specific hidden states in pre-trained models, and define a set of design dimensions along which different methods vary, such as the function to compute the modification and the position to apply the modification. Through comprehensive empirical studies across machine translation, text summarization, language understanding, and text classification benchmarks, we utilize the unified view to identify important design choices in previous methods. Furthermore, our unified framework enables the transfer of design elements across different approaches, and as a result we are able to instantiate new parameter-efficient fine-tuning methods that tune less parameters than previous methods while being more effective, achieving comparable results to fine-tuning all parameters on all four tasks.",
           ]
# context = ["i am a handsome chinese boy."]
context = ["Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the parameters of the pre-trained model, which becomes prohibitive as the model size and the number of tasks grow. Recent work has proposed a variety of parameter-efficient transfer learning methods that only fine-tune a small number of (extra) parameters to attain strong performance. While effective, the critical ingredients for success and the connections among the various methods are poorly understood. In this paper, we break down the design of state-of-the-art parameter-efficient transfer learning methods and present a unified framework that establishes connections between them. Specifically, we re-frame them as modifications to specific hidden states in pre-trained models, and define a set of design dimensions along which different methods vary, such as the function to compute the modification and the position to apply the modification. Through comprehensive empirical studies across machine translation, text summarization, language understanding, and text classification benchmarks, we utilize the unified view to identify important design choices in previous methods. Furthermore, our unified framework enables the transfer of design elements across different approaches, and as a result we are able to instantiate new parameter-efficient fine-tuning methods that tune less parameters than previous methods while being more effective, achieving comparable results to fine-tuning all parameters on all four tasks."]


encoded_context = []

for c in context:
    ids = encode(c)
    ids.append(enc.eot_token)
    encoded_context.append(torch.tensor(ids)[None, ...].to(device) + evolver_config.gpt2_token_id_offset)
    # encoded_context.append(torch.tensor(encode(c))[None, ...].to(device))

print(encoded_context[0].shape)

input_memory = None
for index, ec in enumerate(encoded_context):
    output = evolver_model(input_ids=ec, input_memory=input_memory)
    input_memory = output["memory_output"]
    
    if index == 0:
        # print(output["last_hidden_state"])
        # print(input_memory)
        pass

print(input_memory)

target_model_parameter = evolver_model(input_memory=input_memory, produce_parameter_flag=True)
print(target_model_parameter.shape)

torch.Size([1, 288])
tensor([[[-0.0690,  0.3496,  0.1826,  ..., -0.3199, -0.5849,  0.2867],
         [ 0.3603,  0.0108, -0.2806,  ..., -0.2341, -0.2594,  0.0499],
         [-0.0931,  0.0535, -0.1141,  ...,  0.2645,  0.0402, -0.2185],
         ...,
         [ 0.0330, -0.0155,  0.0143,  ..., -0.2151, -0.0232, -0.1247],
         [-0.4123,  0.2260,  0.1207,  ...,  0.0245,  0.2943,  0.1861],
         [-0.6130, -0.1130,  0.3414,  ..., -0.6120,  0.4928,  0.2089]]],
       device='cuda:3', grad_fn=<SliceBackward0>)
torch.Size([12, 1, 10, 768])


# Inference

In [14]:
# question = "We saw that Reinforce worked well. "
# question = "i am a handsome"
question = "Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the "

start_ids = encode(start + question)
x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]
print(x)

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, input_parameter=target_model_parameter)

            result = y[0].tolist()

            eot_index = len(result)
            for ci, c in enumerate(result):
                if c == enc.eot_token:
                    eot_index = ci
                    break
            
            print(decode(result[:eot_index]))
            print('---------------')

tensor([[34389,    12, 28286,   278,  1588,   662,    12, 35311,  3303,  4981,
           319, 33218,  8861,   468,  1716,   262,   390,    12, 22584,    78,
          4673, 23457,   287,   399, 19930,    13,  2102,    11, 10224, 10581,
          3734,    12,    83,  1726,   477,   262,   220]], device='cuda:3')
Fine-tuning large pre-trained language models on downstream tasks has become the de-facto learning paradigm in NLP. However, conventional approaches fine-tune all the Rs parameters, which becomes prohibitively large and the model size grows as tasks. Recent work is proposed by task growth and the number of parameter transfer of fine-tuning parameters that are small (e.m) examples to attain a small (effective) performance. While effective ingredients, the critical ingredients for the connections among well-tuned and the various methods are poorly understood. In this paper, we break down the design of state-of-art parameter transfer theory and present a unified framework that est

In [None]:
question = "i am a handsome chinese boy. I live"

start_ids = encode(start + question)

print(start_ids)

my_ids = [72, 716, 257, 22665, 442, 3762, 2933, 13, 314, 2107]

decode(my_ids)

my_ids = [72, 716, 257, enc.eot_token, 442, 3762, 2933, 13, 314, 2107]

decode(my_ids)

[72, 716, 257, 22665, 442, 3762, 2933, 13, 314, 2107]


'i am a<|endoftext|> chinese boy. I live'