In [1]:
import tensorflow as tf
import numpy as np
import time, os, logging
import sample_tf2, model_tf2, encoder

In [2]:
# Lower the root logger's threshold to DEBUG for this kernel session.
logger = logging.getLogger()
logger.setLevel("DEBUG")

In [3]:
# Display the GPU device name reported by TensorFlow; the value is the cell output.
tf.test.gpu_device_name()

'/device:GPU:0'

In [4]:
# Notebook-wide configuration constants.
CHECKPOINT_ROOT = './checkpoint'
SEQ_LEN = 1024

# Fix the global NumPy and TensorFlow seeds so that stochastic steps run in
# this kernel session (e.g. text sampling) can be reproduced after
# Restart & Run All.
# NOTE(review): assumes sample_tf2 draws from the global TensorFlow/NumPy
# generators — confirm against sample_tf2.sample_sequence.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
# Build the model from the hyper-parameter set registered under '117M'.
hparams_117m = model_tf2.HPARAMS['117M']
gpt2 = model_tf2.GPT2(hparams_117m)

In [6]:
# Small 2x2 integer batch that is fed to the model in the next cell.
dummy_ids = np.array([[35, 789], [98, 69]])
X = tf.convert_to_tensor(dummy_ids)
# mask = model_tf2.create_look_ahead_mask(2)

In [7]:
# Run the model once on the dummy batch X; the third return value is discarded.
# NOTE(review): presumably this first call is also what creates the model's
# variables, which the summary and weight-assignment cells rely on — confirm.
logits, presents, _ = gpt2(X, None)

In [8]:
# Print the per-layer parameter overview of the model.
gpt2.summary()

Model: "gpt2_tf2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
wte (SharedEmbeddings)       multiple                  38597376  
_________________________________________________________________
wpe (Embedding)              multiple                  786432    
_________________________________________________________________
decoder (Decoder)            multiple                  85056000  
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________


In [9]:
# List every trainable variable of the model with its name, shape and dtype.
trainable_vars = gpt2.trainable_variables
print("Total {} trainable variables.".format(len(trainable_vars)))
for v in trainable_vars:
    print(v.name, v.shape, v.dtype)

Total 148 trainable variables.
gpt2_tf2/wte/weight:0 (50257, 768) <dtype: 'float32'>
gpt2_tf2/wpe/embeddings:0 (1024, 768) <dtype: 'float32'>
gpt2_tf2/decoder/h0/attn/c_attn/kernel:0 (768, 2304) <dtype: 'float32'>
gpt2_tf2/decoder/h0/attn/c_attn/bias:0 (2304,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/attn/c_proj/kernel:0 (768, 768) <dtype: 'float32'>
gpt2_tf2/decoder/h0/attn/c_proj/bias:0 (768,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/mlp/c_fc/kernel:0 (768, 3072) <dtype: 'float32'>
gpt2_tf2/decoder/h0/mlp/c_fc/bias:0 (3072,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/mlp/c_proj/kernel:0 (3072, 768) <dtype: 'float32'>
gpt2_tf2/decoder/h0/mlp/c_proj/bias:0 (768,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/ln_1/gamma:0 (768,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/ln_1/beta:0 (768,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/ln_2/gamma:0 (768,) <dtype: 'float32'>
gpt2_tf2/decoder/h0/ln_2/beta:0 (768,) <dtype: 'float32'>
gpt2_tf2/decoder/h1/attn/c_attn/kernel:0 (768, 2304) <dtype: 'float32'>
gpt2_tf2/de

## Load from TF1 Checkpoint

In [10]:
# Location of the TF1 checkpoint (117M model) whose weights are read below.
ckpt_directory = "../models/117M"

In [11]:
# Read every variable stored in the TF1 checkpoint into memory.
#   names   - lookup keys: str() of the "/"-separated path components of each
#             variable, without the leading "model/" scope. This exact key
#             format is what the weight-assignment cell looks up.
#   tensors - the corresponding values, in the same order as `names`.
CKPT_SCOPE_PREFIX = "model/"

ckpt_vars = tf.train.list_variables(ckpt_directory)
# Open the checkpoint once and reuse the reader, instead of letting
# tf.train.load_variable re-open it for every single variable.
ckpt_reader = tf.train.load_checkpoint(ckpt_directory)
names = []
tensors = []

for name, shape in ckpt_vars:
    print("Loading TF weight {} with shape {}".format(name, shape))
    tensor = ckpt_reader.get_tensor(name)
    # Strip the scope prefix explicitly. Blindly slicing off six characters
    # would silently mangle any variable that does not live under "model/".
    if name.startswith(CKPT_SCOPE_PREFIX):
        scoped_name = name[len(CKPT_SCOPE_PREFIX):]
    else:
        scoped_name = name
    names.append(str(scoped_name.split("/")))
    tensors.append(tensor)
assert len(names) == len(tensors)
print("{} vars loaded from ckpt {}".format(len(ckpt_vars), ckpt_directory))

Loading TF weight model/h0/attn/c_attn/b with shape [2304]
Loading TF weight model/h0/attn/c_attn/w with shape [1, 768, 2304]
Loading TF weight model/h0/attn/c_proj/b with shape [768]
Loading TF weight model/h0/attn/c_proj/w with shape [1, 768, 768]
Loading TF weight model/h0/ln_1/b with shape [768]
Loading TF weight model/h0/ln_1/g with shape [768]
Loading TF weight model/h0/ln_2/b with shape [768]
Loading TF weight model/h0/ln_2/g with shape [768]
Loading TF weight model/h0/mlp/c_fc/b with shape [3072]
Loading TF weight model/h0/mlp/c_fc/w with shape [1, 768, 3072]
Loading TF weight model/h0/mlp/c_proj/b with shape [768]
Loading TF weight model/h0/mlp/c_proj/w with shape [1, 3072, 768]
Loading TF weight model/h1/attn/c_attn/b with shape [2304]
Loading TF weight model/h1/attn/c_attn/w with shape [1, 768, 2304]
Loading TF weight model/h1/attn/c_proj/b with shape [768]
Loading TF weight model/h1/attn/c_proj/w with shape [1, 768, 768]
Loading TF weight model/h1/ln_1/b with shape [768]
Lo

148 vars loaded from ckpt ../models/117M


In [12]:
# Lookup table: stringified checkpoint variable path -> loaded value.
ckpt_vmap = {key: value for key, value in zip(names, tensors)}

In [13]:
# Translation of the last path component of a TF2 variable name to the
# short name used in the TF1 checkpoint.
vname_mapping = dict(kernel="w", bias="b", gamma="g", beta="b")

In [14]:
# Copy each checkpoint tensor into the matching variable of the TF2 model.
unused_ckpt_keys = set(ckpt_vmap)
for v in gpt2.trainable_variables:
    # e.g. "gpt2_tf2/decoder/h0/attn/c_attn/kernel:0"
    #   -> ["decoder", "h0", "attn", "c_attn", "kernel"]
    # Drop the ":<n>" suffix and the leading model-name component by
    # structure rather than by fixed character offsets, so the mapping does
    # not depend on the length of the model name.
    tf2_vname = v.name.rsplit(":", 1)[0].split("/")[1:]
    if tf2_vname[0] == "decoder":
        # Checkpoint names carry no "decoder" component.
        tf2_vname = tf2_vname[1:]
    elif tf2_vname[0] in ("wte", "wpe"):
        # Embedding tables are looked up by their top-level name only.
        tf2_vname = [tf2_vname[0]]
    if tf2_vname[-1] in vname_mapping:
        tf2_vname[-1] = vname_mapping[tf2_vname[-1]]

    ckpt_key = str(tf2_vname)
    if ckpt_key not in ckpt_vmap:
        raise KeyError(
            "No checkpoint tensor '{}' found for model variable {}"
            .format("/".join(tf2_vname), v.name))

    # Checkpoint kernels carry a leading singleton axis (e.g. [1, 768, 2304]);
    # squeeze it away before comparing shapes.
    tf2_vvalue = np.squeeze(ckpt_vmap[ckpt_key])
    assert v.shape == tf2_vvalue.shape, \
        "{} has different shape: gpt2_tf2 {} vs gpt2_tf1 {}" \
        .format(v.name, str(v.shape), str(tf2_vvalue.shape))
    v.assign(tf2_vvalue)
    unused_ckpt_keys.discard(ckpt_key)

# Surface checkpoint tensors that were never copied into the model.
if unused_ckpt_keys:
    print("Checkpoint tensors not assigned to any variable: {}"
          .format(sorted(unused_ckpt_keys)))

## Test loaded gpt2_tf2 model

In [15]:
# Obtain the encoder object for the 117M model from the models directory.
encoder_model_name = "117M"
encoder_models_dir = "../models"
enc = encoder.get_encoder(encoder_model_name, encoder_models_dir)

In [16]:
# Encode the prompt and turn it into an int32 tensor with a leading axis
# of size 1 added in front.
prompt_ids = enc.encode("Can we be friends?")
prompt_tensor = tf.convert_to_tensor(prompt_ids, dtype=tf.int32)
context = tf.expand_dims(prompt_tensor, axis=0)

In [17]:
# Sample from the converted model, starting from `context`.
# NOTE(review): no seed is set in this cell, so the generated text presumably
# differs between runs; the exact meaning of top_p=0 is defined in sample_tf2.
sampling_options = dict(length=200, top_k=40, top_p=0, batch_size=1)
output = sample_tf2.sample_sequence(
    gpt2_model=gpt2, context=context, **sampling_options)

In [18]:
# Decode the first sampled sequence back to text; the string is the cell output.
first_sample_ids = output[0].numpy()
enc.decode(first_sample_ids)

'Can we be friends? If not, why not? We just met once back in March. I\'m glad I haven\'t. Thank you."\n\nWith all that being said, I\'m glad you\'re here. I love you.\n\nHere\'s hoping you enjoy everything coming your way, and thank you for everything that you did for the community. There\'s no better way to enjoy all of our events, especially when we\'re surrounded by great people with great things to do. We want to keep it on social media, and let you know you can join us online and let the world know how awesome your event was. Also, please keep up with the latest news.\n\nIf you\'d like to see what we\'re bringing to 2017, check out the official website here. (Thanks again to @Cindroz)<|endoftext|>I\'m not entirely sure what to mean by this article. You should look it up for anything about the story, but it was just a quick overview of what kind of situation'

## Save model

In [20]:
# Persist the converted weights as an HDF5 file.
weights_path = "../models/117M_tf2/pretrained_weights.h5"
# Create the target directory first: on a fresh checkout it does not exist
# yet and writing the file would fail.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
gpt2.save_weights(weights_path)

---
# Load the saved model

In [1]:
import tensorflow as tf
import numpy as np
import time, os, logging
import sample_tf2, model_tf2, encoder

In [2]:
# Build a fresh model from the hyper-parameter set registered under '117M'.
hparams_117m = model_tf2.HPARAMS['117M']
gpt2_loaded = model_tf2.GPT2(hparams_117m)

In [3]:
# Small 2x2 integer batch that is fed to the model in the next cell.
dummy_ids = np.array([[35, 789], [98, 69]])
X = tf.convert_to_tensor(dummy_ids)

In [4]:
# Run the fresh model once on the dummy batch X; the third return value is
# discarded.
# NOTE(review): presumably this call creates the variables that the following
# load_weights cell fills — confirm before reordering these cells.
logits, presents, _ = gpt2_loaded(X, None)

In [5]:
# Restore the weights from the HDF5 file into the freshly built model.
weights_path = "../models/117M_tf2/pretrained_weights.h5"
gpt2_loaded.load_weights(weights_path)

In [6]:
# Print the per-layer parameter overview of the reloaded model.
gpt2_loaded.summary()

Model: "gpt2_tf2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
wte (SharedEmbeddings)       multiple                  38597376  
_________________________________________________________________
wpe (Embedding)              multiple                  786432    
_________________________________________________________________
decoder (Decoder)            multiple                  85056000  
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________


## Test loaded gpt2_tf2 model

In [7]:
# Obtain the encoder object for the 117M model from the models directory.
encoder_model_name = "117M"
encoder_models_dir = "../models"
enc = encoder.get_encoder(encoder_model_name, encoder_models_dir)

In [8]:
# Encode the prompt and turn it into an int32 tensor with a leading axis
# of size 1 added in front.
prompt_ids = enc.encode("Can we be friends?")
prompt_tensor = tf.convert_to_tensor(prompt_ids, dtype=tf.int32)
context = tf.expand_dims(prompt_tensor, axis=0)

In [9]:
# Sample from the reloaded model, starting from `context`.
# NOTE(review): no seed is set in this cell, so the generated text presumably
# differs between runs; the exact meaning of top_p=0 is defined in sample_tf2.
sampling_options = dict(length=200, top_k=40, top_p=0, batch_size=1)
output = sample_tf2.sample_sequence(
    gpt2_model=gpt2_loaded, context=context, **sampling_options)

In [10]:
# Decode the first sampled sequence back to text; the string is the cell output.
first_sample_ids = output[0].numpy()
enc.decode(first_sample_ids)

'Can we be friends?\n\n\nI thought of my mother\n\nShe didn\'t know she was dead.\n\n\nThis wasn\'t my first thing to do.\n\nWe went off together.\n\n"We have a great plan.\n\n\nDon\'t let go\n\nI don\'t let go and I don\'t go\n\n\nWe need to put that back on\n\n\nShe had nothing left to lose.\n\nWe need to get it back on\n\nDon\'t leave her alone.\n\n\nIn some way, I am in good enough shape to go with you<|endoftext|>Sebastian Weidman at her usual blog. (AP Photo/The Washington Post)\n\nI was shocked and saddened at the thought of the loss of a former staffer who wrote about my struggles as a parent.\n\nHe said she died and I feel terrible I didn\'t know him. Or that I shouldn\'t have known. But there are other people who know him. Here\'s one about me in'

In [30]:
# Prompt text that is encoded and fed to the reloaded model in the next cells.
inp_text = "How are you my friend? I am here to chat with you. I like your shirt"

In [31]:
# Encode the prompt, stack it into an array with a leading axis of size 1
# and convert that to an int32 tensor; the tensor is the cell output.
inp_token_ids = enc.encode(inp_text)
inp_batch = np.stack([inp_token_ids])
inp = tf.convert_to_tensor(inp_batch, dtype=tf.int32)
inp

<tf.Tensor: shape=(1, 18), dtype=int32, numpy=
array([[ 2437,   389,   345,   616,  1545,    30,   314,   716,   994,
          284,  8537,   351,   345,    13,   314,   588,   534, 10147]],
      dtype=int32)>

In [32]:
# Single forward pass over the whole prompt; only the first return value is kept.
# NOTE(review): the second argument is None here. If that parameter is an
# attention mask (a commented-out call to model_tf2.create_look_ahead_mask
# appears earlier in this notebook), confirm that the model still hides future
# positions — otherwise the per-position predictions decoded below are not
# valid next-token predictions.
predictions, _, _ = gpt2_loaded(inp, None)

In [33]:
# For the first batch entry, take the arg-max id at every position except the
# last one and decode the resulting ids to text; the string is the cell output.
position_scores = predictions[0, :-1].numpy()
greedy_ids = np.argmax(position_scores, axis=1)
enc.decode(greedy_ids)

',/._ of\n,,tic the the\n won. is in name'

In [34]:
# Sample from the reloaded model, starting from the encoded prompt `inp`.
# NOTE(review): no seed is set in this cell, so the generated text presumably
# differs between runs; the exact meaning of top_p=0 is defined in sample_tf2.
sampling_options = dict(length=200, top_k=40, top_p=0, batch_size=1)
output = sample_tf2.sample_sequence(
    gpt2_model=gpt2_loaded, context=inp, **sampling_options)

In [35]:
# Decode the first sampled sequence back to text; the string is the cell output.
first_sample_ids = output[0].numpy()
enc.decode(first_sample_ids)

"How are you my friend? I am here to chat with you. I like your shirton\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDont'\n\n\n\nyou can do better in the best way to\n\n\ntell me\n\n\nI could hear ya\n\nMy name is\n\nWe can tell you how i mean\n\nYou get what I mean. I like doing\n\nI can't understand\n\nYou make me? What i'm say\n\nI'm not doing well,\n\n\nI'm not in my mouth to\n\n\nI'm not\n\n\nOh\n\n\nyou can I have\n\n\n\nI have\n\n\n\n\nyou\n\n\n\nI like a\n\n\n\n\n\n\nI've been\n\n\n\nwhat you\n\n\n\n\n\n\n\n\n\nI'm\n\n\n\n\n\nyou\n\n\n\nwhat you can\n\n\n\n\nI'm of\n\n\n\n\nyou\n\nyou"