# Project 4
## Students:
 > [Tanner Mengel]
 > [Ian Cox]
 
 

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re

In [9]:
print(tf.__version__)# you may want to upgrade to 2.10.0 

2.12.0


### Please Use Markdown
> for markdown, see here: https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Task 1

In [10]:
class TransformerModel():
    def __init__(self, vocab_size, embed_dim=256, num_heads=2, num_blocks=1, ff_dim=256, maxlen=80, rate=0.1):
        #initailize variables
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.ff_dim = ff_dim
        self.maxlen = maxlen
        self.rate = rate

    def TransformerBlock(self, inputs):
        #create the transformer block as discribed in the writeup, use the Keras functional API (https://keras.io/guides/functional_api/)
        #MultiHeadAttention layer, specifiy 'use_causal_mask=True' (https://keras.io/api/layers/attention_layers/multi_head_attention/)
        MHA_Layer = layers.MultiHeadAttention(num_heads=self.num_heads,  key_dim=self.embed_dim,  dropout=self.rate)(inputs, inputs, use_causal_mask=True)
        
        #Dropout layer
        dropout_1 = layers.Dropout(rate=self.rate)(MHA_Layer)
        #LayerNormalization layer, specifiy 'epsilon=1e-6' (https://keras.io/api/layers/normalization_layers/layer_normalization/)
        normlayer_1 = layers.LayerNormalization(epsilon=1e-6)(inputs + dropout_1)
        #Use the rate variable for the dropout layers and remember to use two dense layers
        # Feed-Forward Layer
        dense_1 = layers.Dense(units=self.ff_dim, activation='relu')(normlayer_1)
        # Second Dense layer
        dense_2 = layers.Dense(units=self.ff_dim)(dense_1)
        # Second Dropout layer
        dropout_2 = layers.Dropout(rate=self.rate)(dense_2)
        # Output Layer norm
        transformer_output = layers.LayerNormalization(epsilon=1e-6)(normlayer_1 + dropout_2)
        #See assignment and its figures for more details.
        return transformer_output

    
    def EmbeddingLayer(self, inputs):
        #create the embedding layer
        #create (1) an embedding for the tokens and (2) an embedding for the positions
        #you can use https://keras.io/api/layers/core_layers/embedding/ Embedding class
        #you can use tf.range to enocde positions
        #add (1) and (2) and return the layer
        embedding_tokens = layers.Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim)(inputs)
        embedding_positions = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim)(tf.range(start=0, limit=self.maxlen, delta=1))
        embeddings = embedding_tokens + embedding_positions
        return embeddings

    
    def create_model(self):
        #combine the EmbeddingLayer and num_blocks TransformerBlocks to create the model, use the Keras functional API (https://keras.io/guides/functional_api/)
        #use the SparseCategoricalCrossentropy loss function (https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-class)
         # Model Input
        input = layers.Input(shape=(self.maxlen,))
        # Embedding Layer
        embeddings = self.EmbeddingLayer(input)
        # Transformer Blocks
        for i in range(self.num_blocks):
            embeddings = self.TransformerBlock(embeddings)
        # Output Layer
        model_output = layers.Dense(units=self.vocab_size, activation='softmax')(embeddings)
        # Create Model
        model = tf.keras.Model(inputs=input, outputs=model_output)
        # Compile Model
        model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
        return model

## Task 2

In [11]:
class DataSet():
    def __init__(self, filename, len):
        #load the text from the file
        with open(filename, 'r') as f:
            self.text = f.read()
        self.len = len
        self.filename = filename

    def prep_text(self):
        #remove all punctuation, set to lowercase, remove duplicate spaces and other whitespace (keep newlines)
        self.text = re.sub(r'[^\w\s]', '', self.text).lower()
        # self.text = re.sub(r'\s+', ' ', self.text)
        self.text = re.sub(r'[  ]', ' ', self.text)
        
        
    def tokenize_text(self):
        #seperate into words, create a vocab and convert the text to a list of numbers using the vocab such that each unique word is represented by its own number number
        self.words = self.text.split(' ')
        self.vocab = np.unique(self.words)
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx2word = {i: word for i, word in enumerate(self.vocab)}
        self.tokens = np.array([self.word2idx[word] for word in self.words])
        

    def create_dataset(self):
        #split the tokenized data into sequences of length len, return the sequences and vocab
        self.prep_text()
        self.tokenize_text()
        
        x, y = [], []
        for i in range(0, len(self.tokens) - self.len, self.len):
            seq_x = self.tokens[i:i+self.len]
            seq_y = self.tokens[i+1:i+self.len+1]
            x.append(seq_x)
            y.append(seq_y)
        self.x = np.array(x)
        self.y = np.array(y)
        
        return self.x, self.y, self.vocab

## Task 3

In [12]:
class GenerateText():
    def __init__(self, model, vocab):
        self.model = model
        self.vocab = {word: i for i, word in enumerate(vocab)}
        self.idx_to_word = {i: word for i, word in enumerate(self.vocab)}

    def generate_text(self, start_string, num_generate=100):
        input_eval = [self.vocab[word] for word in start_string.split(' ') if word in self.vocab]
        input_eval = tf.expand_dims(input_eval, 0)
        text_generated = []
        
        # Here, batch size == 1
        self.model.reset_states()
        for i in range(num_generate):
            predictions = self.model(input_eval)
            # Remove the batch dimension
            predictions = tf.squeeze(predictions, 0)
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            input_eval = tf.expand_dims([predicted_id], 0)
            text_generated.append(self.idx_to_word[predicted_id])

        return ' '.join(text_generated)
    
    def generate_random_text(self, num_generate=100):
        start_string = np.random.choice(list(self.vocab.keys()))
        return self.generate_text(start_string, num_generate)

## Task 4: Model Traning and Testing

In [13]:
#Train the model while periodically generating text to show progress
def train_model(model, vocab, x, y, epochs=50):
    for i in range(epochs):
        model.fit(x, y, epochs=1, batch_size=64, verbose=1)
        print("Epoch:", i + 1)
        if((i+1)%int(epochs)==0):
            # Generate text after each epoch
            generate_text = GenerateText(model, vocab)
            print(generate_text.generate_text("love", num_generate=50))
    
    return model

In [14]:
ds = DataSet('beatles.txt', len=80)
x, y, vocab = ds.create_dataset()
model = TransformerModel(vocab_size=len(vocab), embed_dim=256, num_heads=2, maxlen=80, num_blocks=2, ff_dim=256, rate=0.1).create_model()
model = train_model(model, vocab, x, y, epochs=1)

Epoch: 1
tonight
shes bent chance dose minute somebody
who vain
so beat kissin man
how oh
in eye
crabalocker go
when whisper collapsed home
get ease
everyone you
think weh on
a lagoon
didnt son
mr yeah
ive dark
everyone you
tomorrow oh
closer
let oh
well boy
four see
all years
sgt love
but day
good gone
cause paper komm portrait somebody
help alone
i do
shes moonlight
you that poop
from are
sexy rain
bullfrog
doing seen
youre next ooh
my way act down
out


In [15]:
ds = DataSet('beatles.txt', len=80)
x, y, vocab = ds.create_dataset()
model = TransformerModel(vocab_size=len(vocab), embed_dim=256, num_heads=2, maxlen=80, num_blocks=2, ff_dim=256, rate=0.1).create_model()
model = train_model(model, vocab, x, y, epochs=50)

Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
out
try nun
mondays garden
id me
ive mine
will dream
i need
nothing too blue
tell lost you
just doctor pounds lit truththen time
boy behind
and against cry
make you
sha ballad yeah
there pictures
joaoaoaoan
but warm
im hills continuing denis you
one lonely
would dinner
with do
whenever am
you mind
you south mama today
from teeshirt rival military kindness dear
martha driver out
shes be
when minute
you mojo hill
the undertake way
cause never


In [16]:
ds = DataSet('beatles.txt', len=80)
x, y, vocab = ds.create_dataset()
model = TransformerModel(vocab_size=len(vocab), embed_dim=256, num_heads=2, maxlen=80, num_blocks=2, ff_dim=256, rate=0.1).create_model()
model = train_model(model, vocab, x, y, epochs=100)

Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Epoch: 56
Epoch: 57
Epoch: 58
Epoch: 59
Epoch: 60
Epoch: 61
Epoch: 62
Epoch: 63
Epoch: 64
Epoch: 65
Epoch: 66
Epoch: 67
Epoch: 68
Epoch: 69
Epoch: 70
Epoch: 71
Epoch: 72
Epoch: 73
Epoch: 74
Epoch: 75
Epoch: 76
Epoch: 77
Epoch: 78
Epoch: 79
Epoch: 80
Epoch: 81
Epoch: 82
Epoch: 83
Epoch: 84
Epoch: 85
Epoch: 86
Epoch: 87
Epoch: 88
Epoch: 89
Epoch: 90
Epoch: 91
Epoch: 92
Epoch: 93
Epoch: 94
Epoch: 95
Epoch: 96
Epoch: 97
Epoch: 98
Epoch: 99
Epoch: 100
gone
and

In [19]:
ds = DataSet('beatles.txt', len=100)
x, y, vocab = ds.create_dataset()
model = TransformerModel(vocab_size=len(vocab), embed_dim=256, num_heads=10, maxlen=100, num_blocks=10, ff_dim=256, rate=0.1).create_model()
model = train_model(model, vocab, x, y, epochs=1000)

Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Epoch: 56
Epoch: 57
Epoch: 58
Epoch: 59
Epoch: 60
Epoch: 61
Epoch: 62
Epoch: 63
Epoch: 64
Epoch: 65
Epoch: 66
Epoch: 67
Epoch: 68
Epoch: 69
Epoch: 70
Epoch: 71
Epoch: 72
Epoch: 73
Epoch: 74
Epoch: 75
Epoch: 76
Epoch: 77
Epoch: 78
Epoch: 79
Epoch: 80
Epoch: 81
Epoch: 82
Epoch: 83
Epoch: 84
Epoch: 85
Epoch: 86
Epoch: 87
Epoch: 88
Epoch: 89
Epoch: 90
Epoch: 91
Epoch: 92
Epoch: 93
Epoch: 94
Epoch: 95
Epoch: 96
Epoch: 97
Epoch: 98
Epoch: 99
Epoch: 100
Epoch: 1


# Report

## Introduction
The goal of this project is to create neural networks which utilize the transformer architecture to create lyrics for a song which might be from the Beatles.  This is done by training the network using the lyrics from all of the previous Beatles songs.  All of the lyrics are input as a list of words which have removed the punctutation, while keeping the newlines.  In terms of song lyrics, it is important to keep the newlines to retain the structure of a song.
Once a desired length of words is input into the model, it is first embedded using both token and positional embedding.  The token embedding allows for each of the words to be given a certain value so that the model can identify what it is and how to output it, similar to a one-hot encoding of all of the words.  The positional embedding is then used to relate where the words are located in the input.  
Following the embedding is the TransformerBlock, which is a multi-headed attention layer followed by simple dense layers.  Attention used the embedding from earlier to find how each word relates to the others in the phrase.  This model allows for the user to choose the number of TransformerBlocks in the model before training, increasing or decreasing the complexity.  After the TransformerBlocks is another dense layer to the outputs, which are also a prechosen length.  
After training, the network can take in a word or phrase to then create an output which would be the potential lyrics to a Beatles like song.  

## Results

## Conclusion

## How to Run Code

Please include any special libraries and list your tf version here.