# Project 4
## Students:
 > Abdurhman Bahour,
 > Coby White,
 > William C. Parham
 
 

In [249]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.preprocessing.text import Tokenizer

import numpy as np
import re

In [250]:
print(tf.__version__)# you may want to upgrade to 2.10.0 

2.10.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [251]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=64, rate=0.1):
        #initailize variables
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads    #Use with transformerblock function
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate              #Use with dropout layer creation

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, use the Keras functional API (https://keras.io/guides/functional_api/)
        #add the inputs which should be a positional embedding and token embedding
       
        #MultiHeadAttention layer, specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        # layer_1 = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=, use_casual_mask=True)(input_tensor)
        layer_1 = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim, use_bias=False, dropout=self.rate, name='layer_1')(inputs, inputs, use_causal_mask=True)

        # layer_1 = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim, use_bias=False, use_casual_mask=True, dropout=self.rate, name='multi_head_attention')(input_tensor)
        #Use the rate variable for the dropout layers
        layer_2 = layers.Dropout(rate=self.rate, name='layer_2')(layer_1)
        
        #adder layer, output of prev dropout and input tensor
        layer_adder = layers.Add()([layer_2, inputs])
        
        #LayerNormalization layer, specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        layer_3 = layers.LayerNormalization(epsilon=1e-6, name='layer_3')(layer_adder)
        
        #first dense layer
        layer_4 = layers.Dense(units=self.ff_dim, name='layer_4', activation = 'relu')(layer_3)
        
        #second dense layer
        layer_5 = layers.Dense(units=self.ff_dim, name='layer_5')(layer_4)
        
        #dropout layer
        layer_6 = layers.Dropout(rate=self.rate, name='layer_6')(layer_5)
        
        #adder layer, output of previous dropout and layer layer normalization layer
        layer_adder_2 = layers.Add()([layer_3 + layer_6])
        
        #output layer, final layer normalization layer
        output_layer = layers.LayerNormalization(epsilon=1e-6, name='output_layer')(layer_adder_2)
        
        return output_layer
        
    def EmbeddingLayer(self, inputs):
            
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to enocde positions
        #add (1) and (2) and return the layer

        # return layers.Add(name='embedding')([layers.Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim, name='token_embedding')(inputs) + layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim, name='pos_embedding')(tf.range(start=0, limit=self.maxlen, delta=1))])

        # Define the token embedding layer
        token_embedding_layer = layers.Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim, name='token_embedding_layer')(inputs)

        # Define the positional embedding layer
        position_embedding_layer = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim, name='position_embedding_layer')(tf.range(start=0, limit=self.maxlen, delta=1))

        # Combine the token embedding and positional embedding layers
        combined_embedding_layer = layers.Add()([token_embedding_layer + position_embedding_layer])

        return combined_embedding_layer
    
    def create_model(self):
        
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, use the Keras functional API (https://keras.io/guides/functional_api/)
        #See the section on the functional API link "All models are callabe, just like layers" for code refernce
        transformer_input = keras.Input(shape=self.maxlen,  name='inputs')
        embed_block = self.EmbeddingLayer(inputs=transformer_input)
        transformer_block = embed_block
        for _ in range(self.num_blocks):
            transformer_block = self.TransformerBlock(inputs=transformer_block)

        output_dense = layers.Dense(units=self.vocab_size, activation='softmax', name='output_dense')(transformer_block)
        model = Model(inputs=transformer_input, outputs=output_dense)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        # model.summary()
        return model
        #use the SparseCategoricalCrossentropy loss function (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)

## Task 2

In [252]:
import re
import numpy as np

class DataSet:
    def __init__(self, filename, seq_len):
        with open(filename, 'r') as f:
            self.text = f.read()
        self.seq_len = seq_len
        self.vocab = None
        self.tokenized_text = None

    def prep_text(self):
        self.text = re.sub(r'[^a-zA-Z\s]', ' ', self.text)  # remove everything except letters and spaces
        self.text = self.text.lower()
        self.text = re.sub(r'\s+', ' ', self.text)  # remove duplicate spaces
        self.text = self.text.replace('\t', ' ')  # replace tabs with spaces
        self.text = self.text.split()
        
    def tokenize_text(self):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts([self.text])
        self.vocab = np.unique(self.text)
        self.tokenized_text  = [np.where(self.vocab == word)[0][0] for word in self.text]

    def create_dataset(self):
        # Preprocess the text
        self.prep_text()
        self.tokenize_text()

        # Split the tokenized data into sequences of length len
        num_seq = int(len(self.text) // self.seq_len)
        print(num_seq)
        x = [self.tokenized_text[i*self.seq_len:(i+1)*self.seq_len] for i in range(num_seq)]
        y = [self.tokenized_text[(i*self.seq_len)+1:((i+1)*self.seq_len)+1] for i in range(num_seq)]

        x = np.array(x)
        y = np.array(y)
        
        print("x: ", x.shape)
        print("y: ", y.shape)
        print("vocab: ", self.vocab)

        return x, y, self.vocab

## Task 3

In [253]:
class GenerateText():
    def __init__(self, model, vocab):
    # The init method will instantiates the reference to the model and vocabulary. It
    # also create a mapping from the integer representation of tokens/words into a
    # human-readable format.
        self.model = model
        self.vocab = vocab
        self.int_to_vocab = dict(enumerate(self.vocab))
        self.vocab_to_int = {self.int_to_vocab[i]: i for i in self.int_to_vocab}
    
        print("vocab: ", self.vocab)
        print("int_to_vocab: ", self.int_to_vocab)
        print("vocab_to_int: ", self.vocab_to_int)

    def generate_text(self, start_string, num_generate=100):
        #start_tokens = [_ for _ in start_string]
        print('this is start_string: ', start_string)
        start_tokens = [self.vocab_to_int[word] for word in start_string.split()]

        print('this is start_tokens: ', start_tokens)
        
        maxlen = 64
        num_tokens_generated = 0
        tokens_generated = []
        txt = start_string + ' '
        while num_tokens_generated <= num_generate:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens

            x = np.array([x])
            print("this is x: ", x)
            # print("shape of x: ", x.shape)
            # y,_ = self.model.predict(x)  #THE ERROR IS HAPPENING HERE -- github is saying it may be a dimensionality error in pytorch ... we are using keras.
            y = self.model.predict(x)
            # print('this is the shape: ', y.shape)
            # print('this is y[0][sample_index]: \n', y[0][sample_index])
            #sample_token = self.generate_random_text(y[0][sample_index])
            # sample_token = self.sample_from(y[0][sample_index])
            y = np.array(y)
            
            #look through each row and column and argmax the row then find the corresponding value
            #the position in each list will reflect 

            argmaxes = []
            for i in range(0, len(y)):
                for j in range(0, len(y[i])):
                    argmaxes.append(np.argmax(y[i][j]))
            
            print('argmaxes: \n', argmaxes)
            #print('argmax of argmaxes: ', np.argmax(np.array(argmaxes)))
            print('argmaxes[np.argmax(np.array(argmaxes))]:', argmaxes[np.argmax(np.array(argmaxes))])
            best_token = self.vocab[argmaxes[np.argmax(np.array(argmaxes))]]
            tokens_generated.append(best_token)
            start_tokens.append(self.vocab_to_int[best_token])
            print('best token: ', best_token, '\n',
                  'start_tokens: ', start_tokens, '\n', 
                  'token_generated: ', tokens_generated)
            num_tokens_generated = len(tokens_generated)
            txt = txt + best_token + ' '

            # print(f"generated text:\n{txt}\n")
            print('generated text: ', txt)
        # # Initialize the start sequence
        # input_eval = [self.vocab_to_int[word] for word in start_string.split()]
        # # input_eval = tf.expand_dims(input_eval, 0)
        
        # # Initialize the generated text
        # generated_text = []
        
        # X = np.array(input_eval)
        
        # # Loop through the specified number of words to generate
        # for i in range(num_generate):
        #     print("GenerateText.generate_text() input_eval: ", [input_eval])
        #     # Predict the next word using the trained model
        #     predictions = self.model.predict(X)
        #     # predictions = tf.squeeze(predictions, 0)
        #     predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            
        #     # Add the predicted word to the generated text
        #     generated_text.append(self.reverse_vocab[predicted_id])
            
        #     # Update the input sequence for the next iteration
        #     input_eval = tf.expand_dims([predicted_id], 0)
            
        # # Join the generated text into a single string and return it
        # return ' '.join(generated_text)

        
    # def generate_text(self, start_string, num_generate=100):
    #     # preprocess start string
    #     print("GenerateText.generate_text() Start String: ", start_string)
    #     tokenized_start_string = [self.vocab_to_int[i] for i in start_string.split()]

    #     # initialize generated text
    #     generated_text = []
        
     
    #     # for i in range(num_generate):
    #     #     # get predictions from model
    #     #     predictions = self.model(tokenized_start_string)
    #     #     predictions = tf.squeeze(predictions, 0)
            
    #     #     print("predictions: ", predictions)

    #     #     # apply softmax to predictions and sample index from distribution
    #     #     predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            
    #     #     # add predicted word to generated text and update input_eval
    #     #     generated_text.append(self.vocab_to_int[predicted_id])
    #     #     input_eval = tf.expand_dims([predicted_id], 0)
    #     for i in range(num_generate):
    #         # Tokenize the start string
    #         # Pad the start string
            
    #         print("tokenized_start_string: ", tokenized_start_string)
            
    #         np_tokenized = np.array(tokenized_start_string)
    #         print("np_tokenized: ", np_tokenized)

    #         tokenized_reshaped = np.reshape(np_tokenized, (np_tokenized.shape[0],1))
    #         print("tokenized_reshaped: ", tokenized_reshaped)

    #         tokenized_start_string = tf.keras.preprocessing.sequence.pad_sequences(np_tokenized, maxlen=64, padding='post')
    #         # Predict the next word
    #         predicted_word = self.model.predict(tokenized_start_string)
    #         # Add the predicted word to the start string
    #         start_string += self.int_to_vocab[predicted_word]   

    #         # print("generated_text: ", generated_text)
    #         # print("input_eval: ", input_eval)
    #         # print("predicted_id: ", predicted_id)
    #         # print("predictions: ", predictions)
    #         # print("start_string: ", start_string)
    #     return start_string + ' ' + ' '.join(generated_text)
    
    def generate_random_text(self, start_string='', num_generate=100):
        # initialize generated text
        generated_text = []
        
        # loop to generate text
        for i in range(num_generate):
            # get random word from vocab
            predicted_word = np.random.choice(self.vocab)
            
            # add predicted word to generated text
            generated_text.append(predicted_word)
            
        # print("generated_text: ", ' '.join(generated_text))
        word = ' '.join(generated_text)
        
        print('This is start string and len: ', len(start_string), ' ', start_string, '\n', 'This is word and len: ',len(word), word)
        return start_string + ' ' + word
        #return ' ' + word


In [254]:
# model = train_model(model,x[:64,:],y[:64,:],vocab,epochs=50)

## Task 4: Model Traning and Testing

In [255]:
#Train the model while periodically generating text to show progress
def train_model(model, x, y, vocab, epochs=50):
    # Generate text
    GT = GenerateText(model, vocab)
    random_text = "zapped"
    for e in range(epochs):
        print(f"Epoch {e+1}")
        # Train the model
        model.fit(x, y, epochs=1, batch_size=64, verbose=1)

        # # # Generate text
        # random_text = GT.generate_random_text(random_text)
        # print("random_text: ", random_text)
        random_text = GT.generate_text(random_text)  #doenst get past here
        print("random_text: ", random_text)

        # random_text = GenerateText.generate_random_text(100)

    print("Betal Song:", random_text)
    return model

In [256]:
# clean the data
x,y,vocab = DataSet("beatles.txt", 64).create_dataset()
# print(vocab[-10:])
# print(x[:,-1])
# print(y[:,-1])
print("len(vocab):", len(vocab))
# print("Voab: ", vocab)
model = TransformerModel(len(vocab)).create_model()

# Train the model
model = train_model(model,x[:,:],y[:,:],vocab,epochs=50)


564
x:  (564, 64)
y:  (564, 64)
vocab:  ['a' 'aaaaaaaahhhh' 'aaaaaahhhhhh' ... 'zapped' 'zoo' 'zu']
len(vocab): 2491
vocab:  ['a' 'aaaaaaaahhhh' 'aaaaaahhhhhh' ... 'zapped' 'zoo' 'zu']
Epoch 1
this is start_string:  zapped
this is start_tokens:  [2488]
this is x:  [[2488    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
argmaxes: 
 [1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005]
argmax

AttributeError: 'NoneType' object has no attribute 'split'


# Report

## Introduction

## Results

## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.