# Project 4
## Students:
 > Abdurhman Bahour,
 > Coby White,
 > William C. Parham
 
 

In [99]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
import numpy as np
import re

In [100]:
print(tf.__version__)# you may want to upgrade to 2.10.0 

2.12.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [101]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=64, rate=0.1):
        #initailize variables
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads    #Use with transformerblock function
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate              #Use with dropout layer creation

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, use the Keras functional API (https://keras.io/guides/functional_api/)
        #add the inputs which should be a positional embedding and token embedding
        block_input = inputs[0] + inputs[1]
        
        #create the input tensor:
        input_tensor = keras.Input(shape=(block_input.shape))
        
        #MultiHeadAttention layer, specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        layer_1 = layers.MultiHeadAttention(num_heads=self.num_heads, use_casual_mask=True)(input_tensor)
        
        #Use the rate variable for the dropout layers
        layer_2 = layers.Dropout(rate=self.rate)(layer_1)
        
        #adder layer, output of prev dropout and input tensor
        layer_adder = layers.add()([layer_2, input_tensor])
        
        #LayerNormalization layer, specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        layer_3 = layers.LayerNormalization(epsilon=1e-6)(layer_adder)
        
        #first dense layer
        layer_4 = layers.Dense(units=256)(layer_3)
        
        #second dense layer
        layer_5 = layers.Dense(units=256)(layer_4)
        
        #dropout layer
        layer_6 = layers.Dropout(rate=self.rate)(layer_5)
        
        #adder layer, output of previous dropout and layer layer normalization layer
        layer_adder_2 = layers.add()([layer_3, layer_6])
        
        #output layer, final layer normalization layer
        output_layer = layers.LayerNormalization(epsilon=1e-6)(layer_adder_2)
        
        model = Model(inputs=input_tensor, outputs=output_layer)
        
        return model
        
    def EmbeddingLayer(self, inputs):
            
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to enocde positions
        #add (1) and (2) and return the layer

        # Define the token embedding layer
        token_embedding_layer = layers.Embedding(self.vocab_size, self.embed_dim)(inputs)

        # Define the positional embedding layer
        position_embedding_layer = layers.Embedding(self.maxlen, self.embed_dim)(tf.range(start=0, limit=self.maxlen, delta=1))

        # Combine the token embedding and positional embedding layers
        combined_embedding_layer = layers.Add()([token_embedding_layer, position_embedding_layer])

        return combined_embedding_layer
    
    def create_model(self):
        
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, use the Keras functional API (https://keras.io/guides/functional_api/)
        #See the section on the functional API link "All models are callabe, just like layers" for code refernce
        transformer_input = keras.Input(shape=self.maxlen, )
        embed_block = self.EmbeddingLayer(inputs=transformer_input)
        transformer_block = self.TransformerBlock(inputs=embed_block)

        model = Model(inputs=transformer_input, outputs=transformer_block)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model
        #use the SparseCategoricalCrossentropy loss function (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)

## Task 2

In [102]:
class DataSet():
    def __init__(self, filename, len):
        #load the text from the file
        
        # open file and read text
        t_file = open(filename, "r")
        self.f_string = t_file.read()

        self.vocab_size = 0
        self.vocab = []
        self.int_string = []
        self.len = len

        pass
        
    def prep_text(self):
        #remove all punctuation, set to lowercase, remove duplicate spaces and other whitespace (keep newlines)

        # Remove all punctuation
        self.f_string = re.sub(r'[^\w\s]', '', self.f_string)
        # Set to lowercase
        self.f_string = self.f_string.lower()
        # Remove duplicate spaces and other whitespace
        self.f_string = re.sub(r'\s+', ' ', self.f_string)

        # Remove leading and trailing whitespace
        self.f_string = self.f_string.strip()        
        
    def tokenize_text(self):
        #seperate into words, create a vocab and convert the text to a list of numbers using the vocab such that each unique word is represented by its own number number
        
        vocab, self.text = np.unique(self.f_string, return_inverse=True)
        self.vocab_size = len(vocab)
        # # seperate into words
        # self.f_string = self.f_string.split()
        # # create a vocab
        # self.vocab = sorted(set(self.f_string))
        # # convert the text to a list of numbers using the vocab
        # self.f_string = [self.vocab.index(i) for i in self.f_string]
        # # each word is represented by its own number number
        # self.vocab_size = len(self.vocab)
        # # make vocab_size unique numbers
        # self.int_string = np.unique(self.vocab_size)

    def create_dataset(self):
        #split the tokenized data into sequences of length len, return the sequences and vocab
        # The create dataset method will call prep text, tokenize text, and then create and
        # return the x, y, and vocabulary used to train your model. Here, each element of x
        # is a sequence of integers (representing words) and y is offset forward by one, such
        # that x and y have the same length.

        self.prep_text()
        self.tokenize_text()

        # create x and y
        x = []
        y = []
        for i in range(len(self.f_string) - self.len):
            x.append(self.f_string[i:i + self.len])
            y.append(self.f_string[i + self.len])
        

        return x, y, self.vocab


## Task 3

In [103]:
class GenerateText():
    def __init__(self, model, vocab):
    # The init method will instantiates the reference to the model and vocabulary. It
    # also create a mapping from the integer representation of tokens/words into a
    # human-readable format.
        self.model = model
        self.vocab = vocab
        self.int_to_vocab = dict(enumerate(self.vocab))
        self.vocab_to_int = {self.int_to_vocab[i]: i for i in self.int_to_vocab}
    
    def generate_text(self, start_string, num_generate=100):
        #generate text using the model and vocab, start with the start_string and generate num_generate words
        # The generate text method will use a start string and generate a number of addi-tional words. 
        # The start string should take in at least one word to initialize the beginning of the return sequence
        tokenized_start_string = [self.vocab_to_int[i] for i in start_string.split()]

        # Number of words to generate
        for i in range(num_generate):
            # Tokenize the start string
            # Pad the start string
            tokenized_start_string = tf.keras.preprocessing.sequence.pad_sequences(tokenized_start_string, maxlen=100, padding='post')
            # Predict the next word
            predicted_word = self.model.predict(tokenized_start_string)
            # Add the predicted word to the start string
            start_string += self.int_to_vocab[predicted_word]

    def generate_random_text(self, num_words):
        # Generate random start string
        start_string = " ".join(np.random.choice(self.vocab, size=np.random.randint(1, 10)))

        # Call generate_text method with random start string
        return self.generate_text(start_string, num_words)

## Task 4: Model Traning and Testing

In [104]:
#Train the model while periodically generating text to show progress
def train_model(model, vocab, x, y, epochs=50):
    # Generate text
    for e in range(epochs):
        print(f"Epoch {e}")
        # Train the model
        model.fit(x, y, epochs=1, batch_size=128, verbose=1)

        # Generate text
        text = GenerateText.generate_text(model, vocab)
        random_text = GenerateText.generate_random_text(100)

    
    return model

In [105]:
t_file = open("beatles.txt", "r")
f_string = t_file.read()
print(len(f_string))

# clean the data
clean_data = DataSet("beatles.txt", 100).create_dataset()
model = TransformerModel(len(f_string)).create_model()

# Train the model
model = train_model(model, clean_data.vocab, clean_data.x, clean_data.y, epochs=50)

169389


TypeError: MultiHeadAttention.__init__() missing 1 required positional argument: 'key_dim'


# Report

## Introduction

## Results

## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.