In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential, Model
from tensorflow.keras.layers import Dense, Input, Embedding, LayerNormalization, Dropout
import numpy as np


#embedding_layer = Embedding(input_dim = 10, output_dim = 4)

# input_data =  np.array([[1,2,3],[3,4,2]])

# output_data = embedding_layer(input_data)

# print(output_data)

'''import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization

# Create a Layer Normalization layer
layer_norm = LayerNormalization(epsilon=1e-6)

# Example input tensor (batch_size=2, sequence_length=3, embedding_dim=4)
x = tf.constant([
    [[1.0, 2.0, 3.0, 4.0],
     [5.0, 6.0, 7.0, 8.0],
     [9.0, 10.0, 11.0, 12.0]],

    [[-1.0, -2.0, -3.0, -4.0],
     [-5.0, -6.0, -7.0, -8.0],
     [-9.0, -10.0, -11.0, -12.0]]
], dtype=tf.float32)

# Apply Layer Normalization
normalized_x = layer_norm(x)

print("Original Input:")
print(x.numpy())

print("\nLayer Normalized Output:")
print(normalized_x.numpy())''' #testing out how layer normalization works


'import tensorflow as tf\nfrom tensorflow.keras.layers import LayerNormalization\n\n# Create a Layer Normalization layer\nlayer_norm = LayerNormalization(epsilon=1e-6)\n\n# Example input tensor (batch_size=2, sequence_length=3, embedding_dim=4)\nx = tf.constant([\n    [[1.0, 2.0, 3.0, 4.0],\n     [5.0, 6.0, 7.0, 8.0],\n     [9.0, 10.0, 11.0, 12.0]],\n\n    [[-1.0, -2.0, -3.0, -4.0],\n     [-5.0, -6.0, -7.0, -8.0],\n     [-9.0, -10.0, -11.0, -12.0]]\n], dtype=tf.float32)\n\n# Apply Layer Normalization\nnormalized_x = layer_norm(x)\n\nprint("Original Input:")\nprint(x.numpy())\n\nprint("\nLayer Normalized Output:")\nprint(normalized_x.numpy())'

In [2]:
 from google.colab import files
 uploaded = files.upload()


f = open('training_data.txt','r')
data = f.read().replace('\n',' ')
print(len(data))

Saving training_data.txt to training_data.txt
1115394


In [3]:
characters=list(set(list(data))) #unordered set of unique characters we have
#print(len(characters))

character_to_integer_encoding={}
integer_to_character_encoding={}
for i in range(len(characters)):
    character_to_integer_encoding[characters[i]]=i+1
    integer_to_character_encoding[i+1]=characters[i]

# dictionaries to encode a each unique character with a number.

In [4]:
# functions to encode and decode

def encode(string):
  global character_to_integer_encoding
  return [ character_to_integer_encoding[char] for char in string]

def decode(lst):
  out = ""
  global integer_to_character_encoding
  for i in lst :
    out += integer_to_character_encoding[i]
  return out




In [5]:
input_data=encode(data)
train_data=input_data[:int(0.9*len(input_data))] # taking 90% of the encoded data to train
test_data=input_data[int(0.9*len(input_data)):] # rest 10% to train
print(len(train_data))

1003854


In [6]:
batch_size=32 #chunks of text processed in one forward pass
block_size=128 # Number of prev characters the model looks at to generate the next one
num_heads=8 # number of attention heads (self - attention layers)
num_transformer_blocks = 4 #number of transformer blocks (MHA + feed forward layers)
input_vocab_size=len(characters)+1
feed_forward_dim = 256 #number of hidden layers

In [7]:
def causal_attention_mask(batch_size, n_dest, n_src):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest #a bool matrix denoting which tokens should be visible to each other.
    mask = tf.cast(m, tf.bool)
    mask = tf.reshape(mask, [1, n_dest, n_src]) ### -> Fill this up
    return tf.tile(mask, [batch_size, 1, 1])

  #a casual attention mask ensures that the transformer doesn't see any future words while making predictions
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # Give code for an attention layer, feedforward layers, and normalization layers. The attention layer is first, then normalization and dropout, then forward the data passed through a non-linear function, and call the dropout layer again
        ###
        # Insert code here
        self.att = layers.MultiHeadAttention(num_heads, embed_dim) #creates multi-head attention layer
        self.ffn = Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        ) # creates two Dense layers in our feed-forward network

        ###
        self.normalization_layer_1 = LayerNormalization(epsilon=1e-6) #normalization layers
        self.normalization_layer_2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate) # dropout layers


    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        block_size = input_shape[1]
        ''' Insert the calling code here '''
        casual_mask = causal_attention_mask(batch_size, block_size, block_size)
        attention_output = self.att(inputs, inputs, attention_mask=casual_mask) #self-attention layer that can only see previous tokens while processing a token
        attention_output = self.dropout1(attention_output) # Dropout layer to prevent overfitting of the data
        out1 = self.normalization_layer_1(inputs + attention_output) #Normalization layer
        ffn_output = self.ffn(out1) #feed-forward layer(2 dense-layers one with relu activation).
        ffn_output = self.dropout2(ffn_output) #second dropout layer
        return self.normalization_layer_2(out1 + ffn_output) #returns the output of the normalization layer





In [8]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_embedding = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_embedding(positions)
        x = self.token_embedding(x)
        return x + positions
        ### -> Fill this up


In [9]:
# function based API
def get_transformer_model(
    maxlen,
    vocab_size,
    embed_dim,
    num_heads,
    feed_forward_dim,
    num_transformer_blocks=1
):
    inputs = Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    for _ in range(num_transformer_blocks):
        transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
        x = transformer_block(x)
    outputs = Dense(vocab_size)(x)
    model = Model(inputs=inputs, outputs=[outputs])
    return model


In [10]:
model = get_transformer_model(block_size, input_vocab_size, feed_forward_dim, num_heads, feed_forward_dim, num_transformer_blocks)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    "adam",
    loss=[loss_fn],
    metrics=["accuracy"]
)

In [11]:
model.summary()

In [12]:
inputs = [train_data[i:i+block_size] for i in range(0, len(train_data)-block_size-1)]
targets = [train_data[i+1:i+block_size+1] for i in range(0, len(train_data)-block_size-1)]

'''
Insert code here to preprocess the input data and the target data to send it to the model.
'''

inputs = np.array(inputs, dtype=np.int32)
targets = np.array(targets, dtype=np.int32)

dataset= tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size, drop_remainder=True)

In [None]:
dataset= tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset=dataset.shuffle(1000)
dataset = dataset.batch(batch_size, drop_remainder=True)
model.fit(dataset, epochs=10)

Epoch 1/10
[1m  383/31366[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m80:47:29[0m 9s/step - accuracy: 0.3901 - loss: 2.0866

In [13]:
def generate_text(model, start_index, num_generate=1):
    '''
        This function will generate text for num_generate characters, starting from start_index+batch_size.
    '''
    input_sequence = train_data[start_index:start_index + block_size]
    generated_text = decode(input_sequence)
    exact_sequence = decode(input_sequence)
    for i in range(num_generate):
        input_eval = tf.convert_to_tensor([input_sequence], dtype=tf.int32)
        predictions = model.predict(input_eval)
        probabilities = tf.nn.softmax(predictions[0, -1]).numpy()
        next_token = np.random.choice(len(probabilities), p=probabilities)
        input_sequence += [next_token]
        input_sequence = input_sequence[1:]
        exact_sequence += decode([np.argmax(probabilities)])
        generated_text += decode([next_token])

    return generated_text, exact_sequence

In [None]:
generate_text(model, start_index=0, num_generate=1000)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

("First Citizen: Before we proceed any further, hear me speak.  All: Speak, speak.  First Citizen: You are all resolved rather to allle thlet n: acay Cirave the:ized fffaallltfogangCuli tiirenooudnd tLe enithat tiind d ailllyithuizrilphgahiy Sienthangus t ciucwLg? thinoeitveCiis the thathod aDwe :ecinie busthhin: we thins Sit ; ther tis tfsi: aneparsggu tiSean il t te uniyrico Se Mhe tol f: thatpenalhithold gcinhiz uikigalntay3vehsthis wihizealanousidd tthinMaue re thenxchith- d thins the. aid in: as thays s aricmxyel?e u aluu gore re the C s aed tvxWe  wxidicsthus d thiobe. the the t then:g, thin thisnd Cez Dthhniurap: id ciththe uthatins wwwcokithe ousthe d pwanhilllhislenve CimenSedinould . atanI d ngknsoleal, the d chen .hithes tthecLcthi fox aa t d : ae , yhe thungo:then t irewe an  thzelurn: ths s thuwoasthoouthiznhan: lenoze thelln lCuli ors pustonouallnocinghe ae hize t thicitthend   gnn uk. thb yanalllith a3 ;ee anhedlLu ange athizthtoous ogzehangoc. ithatithea ganirit thinitw