# Project 4
## Students:
 > Abdurhman Bahour,
 > Coby White,
 > William C. Parham
 
 

In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.preprocessing.text import Tokenizer

import numpy as np
import re

In [18]:
# Display the installed TensorFlow version -- you may want to upgrade to 2.10.0.
print(tf.__version__)

2.12.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [19]:
class TransformerModel():
    """Factory for a small, causally masked Transformer language model.

    The model is assembled with the Keras functional API
    (https://keras.io/guides/functional_api/): a token + position embedding,
    followed by ``num_blocks`` transformer blocks, followed by a per-position
    softmax over the vocabulary.

    Args:
        vocab_size: Number of distinct tokens; sizes the token embedding and
            the final softmax layer.
        embed_dim: Width of the token/position embeddings, and therefore of
            every transformer block's input and output.
        num_heads: Number of attention heads in each block.
        num_blocks: Number of transformer blocks stacked after the embedding.
        ff_dim: Hidden width of the feed-forward sub-layer of each block.
        maxlen: Fixed sequence length of the model input.
        rate: Dropout rate used inside the transformer blocks.
    """

    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=64, rate=0.1):
        # Store the hyper-parameters; no Keras objects are created until create_model().
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate

    def TransformerBlock(self, inputs, block_index=0):
        """Append one transformer block to ``inputs`` and return its output tensor.

        Layout: causal self-attention -> dropout -> residual add -> layer norm
        -> dense(relu) -> dense -> dropout -> residual add -> layer norm.

        Args:
            inputs: Symbolic tensor whose last dimension is ``embed_dim``
                (the embedding output or the previous block's output).
            block_index: Position of this block in the stack. Block 0 keeps the
                historical layer names (``layer_1`` ... ``output_layer``); later
                blocks get a ``_block<N>`` suffix because Keras requires layer
                names to be unique within a model.

        Returns:
            The block's output tensor, with the same shape as ``inputs``.
        """
        suffix = '' if block_index == 0 else f'_block{block_index}'

        # Self-attention: query and value are both `inputs`. `use_causal_mask` is a
        # call-time argument (not a constructor argument) of MultiHeadAttention.
        # (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        attention = layers.MultiHeadAttention(
            num_heads=self.num_heads,
            key_dim=self.embed_dim,
            use_bias=False,
            dropout=self.rate,
            name='layer_1' + suffix,
        )(inputs, inputs, use_causal_mask=True)
        attention = layers.Dropout(rate=self.rate, name='layer_2' + suffix)(attention)

        # First residual connection. Add() must receive the two tensors as separate
        # list items; a single pre-summed tensor is rejected by older Keras versions.
        # (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        attention_residual = layers.Add()([attention, inputs])
        attention_out = layers.LayerNormalization(epsilon=1e-6, name='layer_3' + suffix)(attention_residual)

        # Position-wise feed-forward network: expand to ff_dim, then project back
        # to embed_dim so the residual addition below is shape-compatible even
        # when ff_dim != embed_dim.
        hidden = layers.Dense(units=self.ff_dim, name='layer_4' + suffix, activation='relu')(attention_out)
        projected = layers.Dense(units=self.embed_dim, name='layer_5' + suffix)(hidden)
        projected = layers.Dropout(rate=self.rate, name='layer_6' + suffix)(projected)

        # Second residual connection followed by the block's final normalization.
        ff_residual = layers.Add()([attention_out, projected])
        output_layer = layers.LayerNormalization(epsilon=1e-6, name='output_layer' + suffix)(ff_residual)

        return output_layer

    def EmbeddingLayer(self, inputs):
        """Embed token ids and add a learned embedding of each position.

        Args:
            inputs: Symbolic integer tensor of token ids.
                (assumes shape (batch, maxlen) as created in create_model)

        Returns:
            The sum of the token embedding and the position embedding; the
            position term is broadcast across the batch dimension.
        """
        # (https://keras.io/api/layers/core_layers/embedding/)
        token_embedding = layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embed_dim,
            name='token_embedding_layer',
        )(inputs)

        # One embedding row per position 0 .. maxlen-1.
        # NOTE(review): `positions` is a concrete tensor rather than a symbolic
        # Keras tensor, so this layer looks like it is evaluated once while the
        # graph is built and may not be tracked as trainable model weights --
        # confirm with model.summary(); if so, move the token + position
        # embedding into a custom Layer subclass.
        positions = tf.range(start=0, limit=self.maxlen, delta=1)
        position_embedding = layers.Embedding(
            input_dim=self.maxlen,
            output_dim=self.embed_dim,
            name='position_embedding_layer',
        )(positions)

        return token_embedding + position_embedding

    def create_model(self):
        """Build and compile the full model.

        Returns:
            A compiled Keras ``Model`` taking integer token ids of length
            ``maxlen`` and producing, for every position, a softmax
            distribution over ``vocab_size`` tokens.
        """
        # `shape` excludes the batch dimension and is given as a tuple.
        transformer_input = keras.Input(shape=(self.maxlen,), name='inputs')

        hidden = self.EmbeddingLayer(inputs=transformer_input)
        for block_index in range(self.num_blocks):
            hidden = self.TransformerBlock(inputs=hidden, block_index=block_index)

        output_dense = layers.Dense(units=self.vocab_size, activation='softmax', name='output_dense')(hidden)
        model = Model(inputs=transformer_input, outputs=output_dense)

        # 'sparse_categorical_crossentropy' is the string alias of the
        # SparseCategoricalCrossentropy loss; targets are integer token ids.
        # (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model

## Task 2

In [20]:
import re
import numpy as np

class DataSet:
    """Turns a text file into fixed-length next-word-prediction training pairs.

    Args:
        filename: Path of the text file to load.
        seq_len: Number of tokens in every input/target sequence.

    Attributes set by the methods below:
        text: The raw file contents; replaced by the list of cleaned,
            lower-cased words once prep_text() has run.
        vocab: Sorted array of the unique words (None until tokenize_text()).
        tokenized_text: The words as integer indices into ``vocab``
            (None until tokenize_text()).
    """

    def __init__(self, filename, seq_len):
        # prep_text() discards every character outside [a-zA-Z\s], so bytes that
        # cannot be decoded are replaced instead of aborting the load; this also
        # makes the result independent of the platform's default encoding.
        with open(filename, 'r', encoding='utf-8', errors='replace') as f:
            self.text = f.read()
        self.seq_len = seq_len
        self.vocab = None
        self.tokenized_text = None

    def prep_text(self):
        """Clean ``self.text`` and split it into a list of lower-case words.

        Safe to call more than once: when the text has already been split the
        call is a no-op.
        """
        if not isinstance(self.text, str):
            return  # already converted to a word list by an earlier call

        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', self.text)  # keep letters and whitespace only
        collapsed = re.sub(r'\s+', ' ', letters_only.lower())  # tabs/newlines/runs -> single space
        self.text = collapsed.split()

    def tokenize_text(self):
        """Build the vocabulary and map every word to its vocabulary index."""
        self.prep_text()
        # np.unique returns the sorted vocabulary and, with return_inverse, the
        # index of every word in that vocabulary in a single pass.
        self.vocab, token_ids = np.unique(self.text, return_inverse=True)
        self.tokenized_text = np.ravel(token_ids).tolist()

    def create_dataset(self):
        """Create the training arrays.

        Returns:
            A tuple ``(x, y, vocab)`` where ``x`` and ``y`` are integer arrays of
            shape ``(num_seq, seq_len)`` and ``y`` is ``x`` shifted one token to
            the left (the next word at every position), and ``vocab`` is the
            sorted array of unique words.

        Raises:
            ValueError: If ``seq_len`` is smaller than 1.
        """
        if self.seq_len < 1:
            raise ValueError(f"seq_len must be at least 1, got {self.seq_len}")

        # Preprocess the text
        self.prep_text()
        self.tokenize_text()

        tokens = np.asarray(self.tokenized_text, dtype=np.int64)

        # Every target row needs one token beyond its input row, so only
        # len(tokens) - 1 tokens are available as inputs. Without this, a corpus
        # whose length is an exact multiple of seq_len yields a short last row.
        num_seq = max(len(tokens) - 1, 0) // self.seq_len
        print(num_seq)

        span = num_seq * self.seq_len
        x = tokens[:span].reshape(num_seq, self.seq_len).copy()
        y = tokens[1:span + 1].reshape(num_seq, self.seq_len).copy()

        return x, y, self.vocab

## Task 3

In [21]:
class GenerateText():
    """Generates text either from a trained model or at random from the vocabulary.

    Args:
        model: Object with a Keras-style ``predict(x, verbose=0)`` method.
            generate_text() assumes it returns, for a batch of token-id
            sequences, an array of shape (batch, sequence, vocabulary) holding
            a probability distribution per position (as the softmax output
            layer built by TransformerModel.create_model does).
        vocab: Sequence of words; the index of a word is its token id.
    """

    def __init__(self, model, vocab):
        self.model = model
        self.vocab = vocab
        # Token id -> word, and the inverse word -> token id mapping.
        self.int_to_vocab = dict(enumerate(self.vocab))
        self.vocab_to_int = {word: idx for idx, word in self.int_to_vocab.items()}

    def generate_text(self, start_string, num_generate=100, maxlen=64):
        """Extend ``start_string`` with ``num_generate`` words sampled from the model.

        Args:
            start_string: Space-separated seed words; each must be in the vocabulary.
            num_generate: Number of words to generate.
            maxlen: Sequence length the model expects; must match the length the
                model was built with.

        Returns:
            ``start_string`` followed by the generated words, separated by single spaces.

        Raises:
            KeyError: If ``start_string`` contains a word that is not in the vocabulary.
            ValueError: If ``start_string`` contains no words.
        """
        seed_words = start_string.split()
        unknown_words = [word for word in seed_words if word not in self.vocab_to_int]
        if unknown_words:
            raise KeyError(f"start_string contains words outside the vocabulary: {unknown_words}")
        if not seed_words:
            raise ValueError("start_string must contain at least one word")

        tokens = [self.vocab_to_int[word] for word in seed_words]
        generated_words = []

        for _ in range(num_generate):
            # Condition on the most recent `maxlen` tokens so the context keeps
            # moving forward once the text grows beyond the model's input length.
            window = tokens[-maxlen:]
            sample_index = len(window) - 1
            # Right-pad with token id 0 up to the fixed input length.
            # NOTE(review): presumably harmless because the model is causally
            # masked, so positions <= sample_index do not see the padding.
            padded = window + [0] * (maxlen - len(window))

            predictions = np.asarray(
                self.model.predict(np.array([padded]), verbose=0), dtype=np.float64
            )
            # Distribution for the word that follows the last real token.
            probabilities = predictions[0, sample_index]
            # Renormalize: float32 softmax outputs rarely sum to exactly 1,
            # which np.random.choice requires.
            probabilities = probabilities / probabilities.sum()

            # The values are probabilities, not logits, so sample from them
            # directly rather than with a logits-based sampler.
            predicted_id = int(np.random.choice(len(probabilities), p=probabilities))

            tokens.append(predicted_id)
            generated_words.append(str(self.int_to_vocab[predicted_id]))

        return ' '.join([start_string] + generated_words)

    def generate_random_text(self, start_string='', num_generate=100):
        """Return ``start_string`` followed by words drawn uniformly from the vocabulary.

        The model is not consulted.

        Args:
            start_string: Text placed before the random words.
            num_generate: Number of random words to append.

        Returns:
            ``start_string``, a space, then the random words joined by spaces.
        """
        random_words = [np.random.choice(self.vocab) for _ in range(num_generate)]
        return start_string + ' ' + ' '.join(random_words)


In [22]:
# model = train_model(model,x[:64,:],y[:64,:],vocab,epochs=50)

## Task 4: Model Training and Testing

In [23]:
#Train the model while periodically generating text to show progress
def train_model(model, x, y, vocab, epochs=50, batch_size=64, start_text="zapped", sample_every=10):
    """Train ``model`` one epoch at a time, periodically printing a text sample.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        x: Training inputs passed to ``model.fit``.
        y: Training targets passed to ``model.fit``.
        vocab: Vocabulary handed to GenerateText.
        epochs: Number of single-epoch ``fit`` calls to make.
        batch_size: Batch size forwarded to ``model.fit``.
        start_text: Seed text placed in front of each printed sample.
        sample_every: A sample is printed on epochs whose zero-based index is
            a multiple of this value; must be a positive integer.

    Returns:
        The same ``model`` object after training.
    """
    GT = GenerateText(model, vocab)

    # Stays None when no sample was produced (e.g. epochs == 0) so the final
    # print below cannot hit an unbound variable.
    random_text = None

    for e in range(epochs):
        print(f"Epoch {e+1}")
        # Train the model for exactly one more epoch.
        model.fit(x, y, epochs=1, batch_size=batch_size, verbose=1)

        if e % sample_every == 0:
            # NOTE: generate_random_text draws words uniformly from the
            # vocabulary, so this sample does not reflect the model's progress.
            random_text = GT.generate_random_text(start_text)
            print("random_text: ", random_text)

    if random_text is not None:
        print("Betal Song:", random_text)
    return model

In [24]:
# Sequence length shared by the dataset windows and the model's fixed input
# length; the two must agree or model.fit() rejects the data.
SEQ_LEN = 64

# Seed the Python, NumPy and TensorFlow random generators so that
# "Restart Kernel -> Run All" reproduces the same initial weights and samples.
keras.utils.set_random_seed(42)

# Clean and tokenize the lyrics, then slice them into input/target sequences.
x, y, vocab = DataSet("beatles.txt", SEQ_LEN).create_dataset()

# Build the model with an input length matching the dataset sequences.
model = TransformerModel(len(vocab), maxlen=SEQ_LEN).create_model()

# Train the model
model = train_model(model, x, y, vocab, epochs=50)


564
Epoch 1
random_text:  zapped slowly kids diverted live problems tight trolly mornin tired chains queue negotiations wasting learn wisdom lizzie heard a coffee message loretta almost problems jo yellow saved sky reject daily work fever mich lighten class board girl vont log to follow bone lovely honey reason zapped marvel style hear thinks chasing climb built arrives taking overnight suitcase hoped lullabye drehtest rain expert tell denied tied licks other sweater realize benefit pillow most memories octopus share meander wall cuts tie moscow jai rigby box performs tremember make special caught maid dan mystery pulled heart ten welcome window minds knees bout gib pretend
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
random_text:  zapped could bloody waltz church direction sho robbing charity bother m weren sight joan yi helps your supposed slither nasty opened though glass hold turing lala pride sir bra log climbing mack submarine martin his trigg


# Report

## Introduction

## Results

## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.