In [3]:
import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time
import numpy

tf.config.run_functions_eagerly(False)

# hyperparameters
batch_size = 24 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 50
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.1
# ------------

In [4]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [5]:
# Train and test splits
data = tf.convert_to_tensor(encode(text), dtype=tf.int64)
n = int(0.95*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [6]:
g1 = tf.random.Generator.from_seed(1331)

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = g1.uniform(minval=0,maxval=len(data)-block_size,shape=[batch_size,],dtype=tf.dtypes.int64)
    
    x = tf.map_fn(lambda i:tf.gather(data, tf.range(i,i+block_size)), ix,dtype=tf.dtypes.int64)
    y = tf.map_fn(lambda i:tf.gather(data, tf.range(i+1,i+block_size+1)), ix,dtype=tf.dtypes.int64)

    return tf.stack(x), tf.stack(y)

In [7]:
x,y = get_batch('train')
print(decode(x[0].numpy().tolist()))

d other skins
Of ill-shaped fishes; and about his shelves
A beggarly account of empty boxes,
Green earthen pots, bladders and musty seeds,
Remnants of packthread and old cakes of roses,
Were thinly scatter'd, to make up a show.
Noting this penury, to mysel


In [8]:
print(decode(y[0].numpy().tolist()))

 other skins
Of ill-shaped fishes; and about his shelves
A beggarly account of empty boxes,
Green earthen pots, bladders and musty seeds,
Remnants of packthread and old cakes of roses,
Were thinly scatter'd, to make up a show.
Noting this penury, to myself


In [9]:
x,y = get_batch('val')
print(decode(x[0].numpy().tolist()))

RANIO:
But say, what to thine old news?

BIONDELLO:
Why, Petruchio is coming in a new hat and an old
jerkin, a pair of old breeches thrice turned, a pair
of boots that have been candle-cases, one buckled,
another laced, an old rusty sword ta'en out of the



In [10]:
print(decode(y[0].numpy().tolist()))

ANIO:
But say, what to thine old news?

BIONDELLO:
Why, Petruchio is coming in a new hat and an old
jerkin, a pair of old breeches thrice turned, a pair
of boots that have been candle-cases, one buckled,
another laced, an old rusty sword ta'en out of the
t


In [11]:
def estimate_loss():
    out = {}
    for split in ['train', 'val']:
        losses =[0]*eval_iters
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.numpy().tolist()
        out[split] = tf.math.reduce_mean(tf.constant(losses))
    return out

In [12]:
class Head(layers.Layer):
    """ one head of self-attention """

    def __init__(self, head_size):
        super(Head, self).__init__()
        self.key = layers.Dense(head_size, use_bias=False,input_shape=(None,n_embd))
        self.query = layers.Dense(head_size, use_bias=False,input_shape=(None,n_embd))
        self.value = layers.Dense(head_size, use_bias=False,input_shape=(None,n_embd))
        self.tril = tf.linalg.LinearOperatorLowerTriangular(tf.ones((block_size, block_size)))
        #self.dropout = layers.Dropout(dropout)

    def call(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        t = tf.transpose(k, perm=[0, 2, 1])
        wei = q @ t * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = tf.where(tf.equal(self.tril.to_dense()[None, :T, :T], 0), float('-inf'), wei) # (B, T, T)
        wei = tf.nn.softmax(wei,axis=-1) # (B, T, T)
        #wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


In [13]:
class MultiHeadAttention(layers.Layer):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = layers.Dense(n_embd,input_shape=(None,head_size * num_heads))
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        head_outputs = [head(x) for head in self.heads]
        out = tf.concat(head_outputs, axis=-1)
        out = self.dropout(out)
        out = self.proj(out)
        return out

In [14]:
from tensorflow.keras import Sequential
class FeedFoward(layers.Layer):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super(FeedFoward,self).__init__()
        self.net = Sequential([layers.Dense(4 * n_embd),
                               layers.ReLU(),
                               layers.Dense(n_embd)#,
                               #layers.Dropout(dropout)
                              ])

    def call(self, x):
        return self.net(x)

In [15]:
class Block(layers.Layer):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super(Block,self).__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = layers.LayerNormalization(epsilon=1e-6)
        self.ln2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, x):
        s = self.sa(self.ln1(x))
        x = x + s
        f = self.ffwd(self.ln2(x))
        x = x + f
        return(x)

In [16]:
import os


class GPTLanguageModel(layers.Layer):

    def __init__(self):
        super(GPTLanguageModel, self).__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = keras.Sequential([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = layers.LayerNormalization() # final layer norm
        self.lm_head = layers.Dense(vocab_size,input_shape=(None,4 * n_embd))

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self._init_weights()
        

    
    def _init_weights(self):
        
        for module in self.submodules:
            if isinstance(module, layers.Dense):
                module.kernel_initializer = keras.initializers.RandomNormal(mean=0.0, stddev=0.02)
                if module.use_bias:
                    module.bias_initializer = keras.initializers.Zeros()
            elif isinstance(module, layers.Embedding):
                module.embeddings_initializer = keras.initializers.RandomNormal(mean=0.0, stddev=0.02)
    


    def call(self, idx, targets=None):
        
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int64)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B*T, C))
            targets = tf.reshape(targets, B*T)
            loss = tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True)
            loss = tf.reduce_mean(loss)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # sample from the distribution
            idx_next = tf.random.categorical(probs, num_samples=1, dtype=tf.int64)  # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx
    
    def saveModel(self):
        folder = "model_" + str(time.time()).split('.')[0]
        os.mkdir(folder)
        x = self.weights
        for i in x:
            fileName = str(i.name.replace('/','__').replace(':','##')) +".txt"
            f = open(str(folder+fileName),"w")
            numpy.savetxt(f,i)
            f.close()

In [17]:
model = GPTLanguageModel()

In [19]:
# create a PyTorch optimizer
optimizer = tf.optimizers.experimental.AdamW(learning_rate=learning_rate)
context = tf.zeros((1,1), dtype=tf.int64)
plotLoss = []
plotValLoss = []
for i in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if i % eval_interval == 0 or i == max_iters - 1:
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        plotValLoss.append(losses['val'])
        sample = decode(model.generate(context, max_new_tokens=100)[0].numpy().tolist())
        print(sample,"\n======================\n")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb)
        plotLoss.append(loss)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
model.saveModel()

step 0: train loss 2.5750, val loss 2.5763

dam3$nnoiPqrgi;'lYSxCBqnNR!EJ$KFxhkqZc3ga$WxXBMYAn.Nq,OPMiSmmEoE;g;?JO lqhE$!PVcIvoikamEAdMRoaXfJriO 

step 100: train loss 2.4946, val loss 2.5031

3Sq;G&TQYBNnJUMgqNyXez&aFI$.KNkyrb!fqYYzpks3HunVAVCfsE oWhwEoop.NXYIvafRaybYGuUC?NcJ xaRr
qXo?dJRMMu 

step 200: train loss 2.4572, val loss 2.4941

c SFuYMqtotnLxc$Sl?Gt
Ei,pUoHWOh.pboGyoaxds&UPg';WNkFkZXTeBl3PA3dNf3JNbhfmKkgqdAGKUfl!nL'Y3?&ZhLFd:
 



KeyboardInterrupt: 

In [None]:
# model.get_weights()

In [None]:
model.count_params()

In [None]:
model.get_config()

In [None]:
# model.get_weights()

In [None]:
len([*range(0,max_iters+1,eval_interval)])

In [None]:
import matplotlib.pyplot as plt
import numpy as np

y1 = np.array(plotLoss)
y2 = np.array(plotValLoss)

plt.plot([*range(0,max_iters)],y1)
plt.plot([*range(0,max_iters+1,eval_interval)],y2)

plt.show()

In [None]:
#import time
#import numpy
#x = model.weights
#folder = "model/"
#for i in x:
#    fileName = str(i.name.replace('/','__').replace(':','##')) +".txt"
#    f = open(str(folder+fileName),"w")
#    numpy.savetxt(f,i)
#    f.close()

In [None]:
#tf.test.gpu_device_name()

In [None]:
context = tf.zeros((1,1), dtype=tf.int64)
print(decode(model.generate(context, max_new_tokens=100)[0].numpy().tolist()))

In [None]:
#step 0: train loss 4.1344, val11 loss 4.1358
#step 500: train loss 1.9654, val1 loss 2.0608
#step 1000: train loss 1.5308, val1 loss 1.7140
#step 1500: train loss 1.3840, val1 loss 1.5959
#step 2000: train loss 1.3039, val1 loss 1.5739
#step 2500: train loss 1.2459, val1 loss 1.5446
#step 3000: train loss 1.1882, val1 loss 1.5414
#step 3500: train loss 1.1322, val1 loss 1.5643
#step 4000: train loss 1.0745, val1 loss 1.5777
#step 4500: train loss 1.0114, val1 loss 1.6254
#step 5000: train loss 0.9423, val1 loss 1.6745