### Load Modules

In [1]:
import tensorflow as tf
import numpy as np
import tiktoken
import math
import wget

### Load & clean data

In [2]:
with open('./lyrics.txt','r',encoding='utf-8') as f:
    Text = f.read()

In [3]:
print(list(set(Text)))

['\n', '/', 'V', '9', 'g', '“', '6', 'u', '!', '*', 's', 'r', 'z', 'h', 'l', 'W', 'a', 'ï', '4', 'Y', 'B', 'w', '?', '(', ')', '\\', 'b', 'ó', '5', 'G', '[', 'x', 'E', 'M', '3', 't', 'í', '—', 'F', 'j', 'n', 'i', 'm', 'I', 'c', 'y', 'X', '”', '-', 'A', '&', 'S', '.', ']', 'C', "'", 'R', 'p', ':', 'J', 'Z', '1', ',', '7', 'o', 'D', 'k', '0', '2', 'v', '"', 'H', 'O', ';', 'L', '8', 'P', '’', 'é', 'd', 'T', 'e', 'f', 'K', 'N', 'Q', ' ', 'U', 'q']


In [4]:
Text = Text.replace("\\","")

In [5]:
len(Text)

817508

In [6]:
vocab = list(set(Text))

In [7]:
vocab_size = len(vocab)

In [8]:
print(vocab)

['\n', '/', 'V', '9', 'g', '“', '6', 'u', '!', '*', 's', 'r', 'z', 'h', 'l', 'W', 'a', 'ï', '4', 'Y', 'B', 'w', '?', '(', ')', 'b', 'ó', '5', 'G', '[', 'x', 'E', 'M', '3', 't', 'í', '—', 'F', 'j', 'n', 'i', 'm', 'I', 'c', 'y', 'X', '”', '-', 'A', '&', 'S', '.', ']', 'C', "'", 'R', 'p', ':', 'J', 'Z', '1', ',', '7', 'o', 'D', 'k', '0', '2', 'v', '"', 'H', 'O', ';', 'L', '8', 'P', '’', 'é', 'd', 'T', 'e', 'f', 'K', 'N', 'Q', ' ', 'U', 'q']


In [9]:
print(vocab_size)

88


### Encode & Decode tokenizer

In [10]:
chr_to_idx = {c:i for i,c in enumerate(vocab)}

In [11]:
idx_to_chr = {i:c for i,c in enumerate(vocab)}

In [12]:
def encode(text:str)->list[int]:
    return [chr_to_idx[c] for c in text]

In [13]:
def decode(tokens:list[int])-> str:
    return [idx_to_chr[i] for i in tokens]

In [14]:
encode("Hello World !")

[70, 80, 14, 14, 63, 85, 15, 63, 11, 14, 78, 85, 8]

In [15]:
Data = tf.constant(encode(Text))
#Data = tf.constant(tokenizer.encode(Text))

In [16]:
Data

<tf.Tensor: shape=(817508,), dtype=int32, numpy=array([70, 80, 85, ..., 14, 80, 69])>

### Data Loader

In [17]:
class DataLoader:
    def __init__(self,token,batch,context):
        self.token = token
        self.batch = batch
        self.context = context
        self.cur_pos = 0
    def get_batch(self):
        b,c = self.batch,self.context
        start_pos = self.cur_pos
        end_pos = self.cur_pos + b*c + 1
        add_data = -1
        if end_pos > len(self.token):
            add_data = end_pos - len(self.token)
            end_pos = len(self.token)
        d = self.token[start_pos:end_pos]
        if add_data != -1:
            d = tf.concat([d,self.token[:add_data]],axis=0)
        x = tf.reshape(d[:-1],(b,c))
        y = tf.reshape(d[1:],(b,c))
        self.cur_pos += b*c
        if self.cur_pos > len(self.token) - 1:
            self.cur_pos=0
        return x,y

In [18]:
TRAIN_BATCH = 16
EVAL_BATCH = 8
CONTEXT_SIZE = 256
train_split = 0.8

In [19]:
train_data = Data[:int(len(Data)*train_split)]
eval_data = Data[int(len(Data)*train_split):]

In [20]:
train_loader = DataLoader(train_data,TRAIN_BATCH,CONTEXT_SIZE)
eval_loader = DataLoader(eval_data,EVAL_BATCH,CONTEXT_SIZE)

In [21]:
xb,xc = train_loader.get_batch()

In [22]:
xb.shape,xc.shape

(TensorShape([16, 256]), TensorShape([16, 256]))

In [23]:
train_loader.get_batch()

(<tf.Tensor: shape=(16, 256), dtype=int32, numpy=
 array([[63, 68, 80, ..., 85, 34,  7],
        [11, 39, 85, ..., 13, 80, 85],
        [34, 40, 41, ...,  7, 10, 34],
        ...,
        [ 0, 42, 54, ..., 63, 85, 16],
        [39, 44, 34, ...,  0, 83, 63],
        [25, 63, 78, ..., 85, 16, 34]])>,
 <tf.Tensor: shape=(16, 256), dtype=int32, numpy=
 array([[68, 80,  0, ..., 34,  7, 11],
        [39, 85, 63, ..., 80, 85, 34],
        [40, 41, 80, ..., 10, 34, 85],
        ...,
        [42, 54, 68, ..., 85, 16, 39],
        [44, 34, 13, ..., 83, 63, 25],
        [63, 78, 44, ..., 16, 34, 85]])>)

## Model

### Positional Encoding

In [24]:
class PositionalEncoding(tf.keras.Model):
    def __init__(self,CONTEXT_SIZE,d_model):
        super().__init__()
        self.pe = np.zeros((CONTEXT_SIZE,d_model))
        self.pos = np.arange(0,CONTEXT_SIZE).reshape(-1,1)
        self.div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        self.pe[:, 0::2] = np.sin(self.pos * self.div_term)
        self.pe[:, 1::2] = np.cos(self.pos * self.div_term)
        self.pe = np.expand_dims(self.pe, axis=0)
        self.pos_enc = tf.constant(self.pe, dtype=tf.float32)
        print(self.pos_enc.shape)
    def call(self,x)->tf.Tensor:
        return x + self.pos_enc[:,:tf.shape(x)[1],:]

In [25]:
PositionalEncoding(256,512)

(1, 256, 512)


<__main__.PositionalEncoding at 0x1cfcea2fca0>

### Self-Attention

### MultiAttention

In [26]:
class MultiHeadAttention(tf.keras.Model):
    def __init__(self,d_model:int,n_heads:int):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // self.n_heads
        
        assert(self.n_heads * self.head_dim == d_model)
        
        self.query = tf.keras.layers.Dense(d_model)
        self.key = tf.keras.layers.Dense(d_model)
        self.value = tf.keras.layers.Dense(d_model)
        self.ff_out = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(0.2)
    def call(self,inputs):
        B,C,d_model = inputs.shape
        Q = tf.reshape(self.query(inputs), (B, C, self.n_heads, self.head_dim))
        Q = tf.transpose(Q, perm=[0, 2, 1, 3])
        K = tf.reshape(self.key(inputs), (B, C, self.n_heads, self.head_dim))
        K = tf.transpose(K, perm=[0, 2, 1, 3])
        V = tf.reshape(self.value(inputs), (B, C, self.n_heads, self.head_dim))
        V = tf.transpose(V, perm=[0, 2, 1, 3])
        
        attention_score = tf.matmul(Q,K,transpose_b=True)/tf.math.sqrt(float(self.head_dim))
        mask = tf.cast(tf.linalg.band_part(tf.ones((C,C)),0,-1),tf.bool)
        mask = tf.linalg.set_diag(mask,tf.zeros(C,dtype=tf.bool))
        attention_score = tf.where(mask,float('-inf'),attention_score)
        weighted_attention = tf.nn.softmax(attention_score,axis=-1)
        attention_output = tf.matmul(self.dropout(weighted_attention),V)
        attention_output = tf.transpose(attention_output,perm=[0,2,1,3])
        attention_output = tf.reshape(attention_output,shape=(B,C,d_model))
        out = self.ff_out(attention_output)
        return out

### Model

In [27]:
class DecoderBlock(tf.keras.Model):
    def __init__(self,d_model,n_heads):
        super().__init__()
        self.ff = tf.keras.models.Sequential([
            tf.keras.layers.Dense(4*d_model,activation="gelu"),
            tf.keras.layers.Dense(d_model)
        ])
        self.att = MultiHeadAttention(d_model,n_heads)
        self.ln1 = tf.keras.layers.LayerNormalization()
        self.ln2 = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(0.2)
    def call(self,logits):
        att_logits = self.att(logits)
        adn_logits = self.ln1(logits+att_logits)
        logits = self.dropout(adn_logits)
        logits = self.ff(logits)
        logits = self.ln2(logits+adn_logits)
        return logits

In [28]:
class GPT(tf.keras.Model):
    def __init__(self,vocab_size,d_model,n_heads,n_layers):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(vocab_size,d_model)
        self.wpe = PositionalEncoding(CONTEXT_SIZE,d_model)
        self.blocks = [DecoderBlock(d_model,n_heads) for _ in range(n_layers)]
        self.ff1 = tf.keras.models.Sequential([
            tf.keras.layers.Dense(vocab_size)
        ])
    def call(self,inputs,targets = None,training=False):
        logits = self.emb(inputs)
        logits = self.wpe.call(logits)
        for block in self.blocks:
            logits = block(logits)
        logits = self.ff1(logits)
        loss = 0
        if targets is not None:
            batch,seq,d_model = tf.shape(logits)[0],tf.shape(logits)[1],tf.shape(logits)[2]
            logits = tf.reshape(logits,[batch*seq,d_model])
            targets = tf.reshape(targets,[batch*seq])
            loss_fn = tf.keras.losses.sparse_categorical_crossentropy(targets,logits,from_logits=True)
            loss = tf.reduce_mean(loss_fn)
        return logits,loss
    def generate(self,inputs,max_token_num):
        output = tf.Variable(inputs)
        for _ in range(max_token_num):
            curr_seq_len = output.shape[1]
            if curr_seq_len > CONTEXT_SIZE:
                inputs = inputs[:,-CONTEXT_SIZE:]
            logits,_ = self.call(inputs)
            logits = logits[:,-1,:]
            probs = tf.keras.activations.softmax(logits,axis=-1)
            idx = tf.random.categorical(tf.math.log(probs),num_samples=1,dtype=tf.int32)
            inputs = tf.concat([inputs,idx],axis=-1)
            output = tf.concat([output,idx],axis=-1)
        return [out for out in output]

In [29]:
d_model=256
n_heads = 4
n_layers = 2

In [30]:
m = GPT(vocab_size,d_model,n_heads,n_layers)

(1, 256, 256)


In [31]:
m.build(input_shape=(16, 256))
m.summary()

Model: "gpt"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  22528     
                                                                 
 positional_encoding_1 (Posi  multiple                 0 (unused)
 tionalEncoding)                                                 
                                                                 
 decoder_block (DecoderBlock  multiple                 789760    
 )                                                               
                                                                 
 decoder_block_1 (DecoderBlo  multiple                 789760    
 ck)                                                             
                                                                 
 sequential_2 (Sequential)   (16, 256, 88)             22616     
                                                               

### Train

In [32]:
epochs = 100000
eval_steps = 2000
learning_rate=1e-3
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
for i in range(1,epochs+1):
    xb,xc = train_loader.get_batch()
    with tf.GradientTape() as tape:
        logit,loss = m.call(xb,xc,training=True)
        loss = tf.reduce_mean(loss)
    grads = tape.gradient(loss,m.trainable_variables)
    opt.apply_gradients(zip(grads,m.trainable_variables))
    if i%eval_steps == 0:
        yb,yc = eval_loader.get_batch()
        _,e_loss = m.call(yb,yc,training=False)
        e_loss = tf.reduce_mean(e_loss)
        print(f"Epoch: {i} \ttrain_loss: {loss.numpy()} \teval_loss: {e_loss.numpy()}")

Epoch: 2000 	train_loss: 1.665138602256775 	eval_loss: 1.7025821208953857
Epoch: 4000 	train_loss: 1.2841722965240479 	eval_loss: 1.381739616394043
Epoch: 6000 	train_loss: 1.2152273654937744 	eval_loss: 1.7053344249725342
Epoch: 8000 	train_loss: 0.9449632167816162 	eval_loss: 2.155064105987549
Epoch: 10000 	train_loss: 1.0582729578018188 	eval_loss: 2.0462427139282227
Epoch: 12000 	train_loss: 0.7053725719451904 	eval_loss: 1.3840622901916504
Epoch: 14000 	train_loss: 0.9804032444953918 	eval_loss: 1.7156836986541748
Epoch: 16000 	train_loss: 0.5309898853302002 	eval_loss: 3.0305087566375732
Epoch: 18000 	train_loss: 0.6560888886451721 	eval_loss: 4.168720245361328
Epoch: 20000 	train_loss: 0.40707501769065857 	eval_loss: 2.455336809158325
Epoch: 22000 	train_loss: 0.4803198277950287 	eval_loss: 2.6075375080108643
Epoch: 24000 	train_loss: 0.3484504520893097 	eval_loss: 2.27535080909729
Epoch: 26000 	train_loss: 0.36934056878089905 	eval_loss: 2.26998233795166
Epoch: 28000 	train_los

In [33]:
inputs = tf.convert_to_tensor(encode("Love"))
inputs = tf.expand_dims(inputs,axis=1)
print(inputs.shape)

(4, 1)


In [34]:
print(''.join(decode(m.generate(inputs,1000)[0].numpy().tolist())))

Likes to no-rist-ohead)
Flows, it's deal
If I told you I'm in your Aring all their giving the joke, no oh-oh, oh-oh!
You need it back now, it's just because of your choses, as songs specklues, inside she wastling on None innoce
You littee like the Car, lights before you (Grew push my!
Tellization)
And I sat u2019s so gone, I'm cay

Set the right tears back when you were like me, wide
I as I lock believe, and mark for a lie"
Tin' ain't not the go dirl shooding even known I'm who it to flame
Theed I thould something, I see fire, I feit lacking flictrees, that a finning an acheice from the song all the sunset ends
And I'll got see fireing and don't see
(You will blood, I was scream, and reason
Whrisping up, my farm room, and restimatuallate, in the aird shoulder and reason
Itu2019ll the same, he don't wanna him from him to time, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh

It we were right the wrap, weep your of the sick, and reason leave foxist pure I set think that I donu2019t keep me?
A han

In [35]:
word = tf.convert_to_tensor(encode("sweet"))
word = tf.expand_dims(word,axis=1)
print(word.shape)

(5, 1)


In [36]:
print(''.join(decode(m.generate(word,1000)[0].numpy().tolist())))

s,
I was tired and fell asleep beneath an oak tree,
I bet my mother's proud of me from each scar,
Upon my knuckle and each graze upon my knew moth-My,
my prives heart,
I don't wanna got dechood on the fam wide and he how let
We mad,
on not let their problem,
You so,
He's the top And by myself to fly on the fucky to pain interwatorner someo,
And if the night,
If I shouldn't
Bull only the I'll be enough
We people more Friends just on all over night far
Cause you are my one mornth.,
And I sat of high my soul
If you now,
You're happy on the flames be my brothers,
Tell me when it,
Write tinger around,
Tonight,
At give me that come back would you do,

Oh,
my,
my seem,
And I see fible faded it,
It's a hard,
And if you'll put me when you were hon thinkin'
And it's too so gorday someoke down,
Tryin' to go Eydoyes,
I to be sometimes I won't have faded,
But at the light we would wait,
There more times,
Stayin' help it celebelieve,
I lost you though,
I have to raina,
And I'll piet,
u2019s should r