# Follows Chapter 16

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare numeric (major, minor) components: comparing the raw version strings
# is lexicographic, so e.g. "0.9" would wrongly pass as >= "0.20".
assert tuple(int(part) for part in sklearn.__version__.split(".")[:2]) >= (0, 20)

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare the numeric major version: a plain string comparison is
# lexicographic and would reject e.g. "10.0" as being < "2.0".
assert int(tf.__version__.split(".")[0]) >= 2
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to IMAGES_PATH.

    The file is named "<fig_id>.<fig_extension>". When tight_layout is true,
    plt.tight_layout() is applied first. resolution is passed to savefig as
    the dpi.
    """
    filename = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)


# Character RNN
   splitting a sequence into batches of shuffled windows
   
   "For example, let's split the sequence 0 to 14 into windows of length 5, each shifted by 2 (e.g.,`[0, 1, 2, 3, 4]`, `[2, 3, 4, 5, 6]`, etc.), then shuffle them, and split them into inputs (the first 4 steps) and targets (the last 4 steps) (e.g., `[2, 3, 4, 5, 6]` would be split into `[[2, 3, 4, 5], [3, 4, 5, 6]]`), then create batches of 3 such input/target pairs:"

In [2]:
np.random.seed(42)
tf.random.set_seed(42)

#### visualization of splitting the batches

In [3]:

# Toy demonstration: sequence 0..14 -> windows of 5 steps, each shifted by 2.
n_steps = 5
dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))
# window() yields a dataset of nested window datasets; drop_remainder=True
# discards trailing windows shorter than n_steps.
dataset = dataset.window(n_steps, shift=2, drop_remainder=True)
# Flatten: batching each window by its full length turns it into one tensor.
dataset = dataset.flat_map(lambda window: window.batch(n_steps))
# Shuffle the windows, then split each into inputs (all but the last step)
# and targets (all but the first step, i.e. the inputs shifted by one).
dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))
# Group 3 input/target pairs per batch.
dataset = dataset.batch(3).prefetch(1)
for index, (X_batch, Y_batch) in enumerate(dataset):
    print("_" * 20, "Batch", index, "\nX_batch")
    print(X_batch.numpy())
    print("=" * 5, "\nY_batch")
    print(Y_batch.numpy())

____________________ Batch 0 
X_batch
[[6 7 8 9]
 [2 3 4 5]
 [4 5 6 7]]
===== 
Y_batch
[[ 7  8  9 10]
 [ 3  4  5  6]
 [ 5  6  7  8]]
____________________ Batch 1 
X_batch
[[ 0  1  2  3]
 [ 8  9 10 11]
 [10 11 12 13]]
===== 
Y_batch
[[ 1  2  3  4]
 [ 9 10 11 12]
 [11 12 13 14]]


#### Shakespeare text

In [4]:
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

In [5]:
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)

In [6]:
with open(filepath) as f:
    text = f.read()

In [7]:
print(text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [8]:
"".join(sorted(set(text.lower()))) #shows all characters w/in text

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [9]:
#finds all characters used in text and maps to a different character id
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True) #transform text to number


In [10]:
tokenizer.fit_on_texts(text)

In [11]:
tokenizer.texts_to_sequences(['First']) #word first to character id

[[20, 6, 9, 8, 3]]

In [None]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]]) #reverse

['f i r s t']

In [None]:
max_id = len(tokenizer.word_index) #number of distinct characters

In [None]:
dataset_size = tokenizer.document_count #total number of characters

In [None]:
#encoded text array; -1 is to start array at 0
[encoded] = np.array(tokenizer.texts_to_sequences([text])) - 1

In [None]:
train_size = dataset_size * 90 // 100 #take 90% of the dataset for training
#slice dataset for training
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

creates small windows of text pg 530

In [None]:
n_steps = 100
window_length = n_steps + 1 # target = input shifted 1 character ahead
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

In [None]:
# window() produced a dataset of nested window datasets; batching each window
# by its full length and flattening turns it into a flat dataset of tensors,
# one tensor of window_length character ids per window
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [None]:
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [None]:
dataset = dataset.prefetch(1)

In [None]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape,  Y_batch.shape)

(32, 100, 39) (32, 100)


### Create and Train the model

In [None]:
# Keep only the model with the lowest training loss seen so far.
# save_freq='epoch' evaluates the monitored loss once per epoch; an integer
# save_freq counts batches, so save_freq=1 would check (and possibly rewrite
# the file) after every single training step.
checkpoint = keras.callbacks.ModelCheckpoint('best_model.hdf5', monitor='loss',
                                            save_best_only=True, mode='auto',
                                            save_freq='epoch')

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                   activation='softmax'))
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# NOTE(review): this runs before model.fit(), so 'backup_save.h5' holds the
# freshly compiled, untrained model. Keras' model.save() is not documented to
# return a value, so backup_save_model is presumably always None — confirm.
backup_save_model = model.save('backup_save.h5')

### Model below can take anywhere from 7-14 hours for training

In [None]:
history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=10, 
                   callbacks=[checkpoint])


Train for 31370 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10

In [None]:
# Reload the best checkpoint written by the ModelCheckpoint callback during
# training. ('backup_save.h5' is saved before model.fit() runs, so loading it
# here would replace the trained model with untrained weights.)
model = keras.models.load_model('best_model.hdf5')


In [None]:
def preprocess(texts):
    """One-hot encode a list of strings at character level.

    Each string is converted to tokenizer character ids, shifted down by one
    so ids start at 0, then one-hot encoded with depth max_id.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    zero_based_ids = char_ids - 1
    return tf.one_hot(zero_based_ids, max_id)

In [None]:
#predict the last letter of the sentence
X_new = preprocess(['How are yo'])
# Sequential.predict_classes() was deprecated and later removed from
# TensorFlow; taking the argmax over the class axis is the equivalent.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
# +1 undoes the "- 1" shift applied in preprocess(); keep only the last character
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

In [None]:
# tf.random.categorical() takes unnormalised log-probabilities (logits), hence
# the np.log(): this draws 40 class indices (0, 1 or 2) from a distribution
# with probabilities 0.5 / 0.4 / 0.1. next_char() uses the same call to sample
# a character instead of always picking the most likely one.
tf.random.set_seed(42)

tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples= 40).numpy()

In [None]:
#predict next character in sequence
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    The probabilities predicted for the last time step are converted to logs
    and divided by `temperature` before one character id is drawn with
    tf.random.categorical. The id is shifted back by +1 and decoded with the
    tokenizer.
    """
    encoded_text = preprocess([text])
    probas = model.predict(encoded_text)
    # keep a [1, n_classes] slice: first sequence, last time step only
    last_step_proba = probas[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    return tokenizer.sequences_to_texts((sampled_id + 1).numpy())[0]

In [None]:
tf.random.set_seed(42)

next_char('How are yo')

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#predict sentence based on first letter
#temperature closer to 0 will predict characters in words that are common
#while a temperature further from 0 will predict less likely character words
def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters, one next_char() sample at a time.

    Each sampled character is appended before the next one is drawn, so every
    prediction sees the full text generated so far.
    """
    for _step in range(n_chars):
        text = text + next_char(text, temperature)
    return text

In [None]:
print(complete_text('t', temperature=0.2))

In [None]:
print(complete_text('t', temperature=1))

In [None]:
print(complete_text('t', temperature=0.1))

In [None]:
print(complete_text('t', temperature=3))

# Stateful RNN 
  keeps state for the model to learn long term patterns

In [None]:
tf.random.set_seed(42)

In [None]:
# Stateful-RNN input pipeline, single-sequence version (batch size 1).
# shift=n_steps (rather than 1) makes the windows consecutive: each window of
# window_length = n_steps + 1 characters starts on the last character of the
# previous one. There is no shuffle() here, so windows keep their order.
# NOTE(review): `df` is rebuilt from scratch in the batched cell below before
# anything consumes it, so this version appears to be illustrative only.

df = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
df = df.window(window_length, shift=n_steps, drop_remainder=True)
df = df.flat_map(lambda window: window.batch(window_length))
df = df.repeat().batch(1)
df = df.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
df = df.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
df = df.prefetch(1)

In [None]:
batch_size = 32

In [None]:
# Batched stateful pipeline: split the training text into batch_size
# consecutive parts and window each part separately, in order.
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
    df  = tf.data.Dataset.from_tensor_slices(encoded_part)
    df = df.window(window_length, shift=n_steps, drop_remainder=True)
    df = df.flat_map(lambda window: window.batch(window_length))
    datasets.append(df)
# zip + stack: the k-th batch holds the k-th window of every part, so row i of
# one batch is continued (within part i) by row i of the next batch.
df = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
# inputs = all but the last character, targets = all but the first
df = df.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))
df = df.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = df.prefetch(1)

In [None]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True, 
                     dropout=0.2, recurrent_dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                    dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax'))
])

In [None]:
#reset state every epoch
class Reset_States_cb(keras.callbacks.Callback):
    """Callback that calls reset_states() on the model when an epoch begins."""
    def on_epoch_begin(self, epoch, logs=None):
        # logs defaults to None to match keras.callbacks.Callback.on_epoch_begin
        self.model.reset_states()

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
steps_per_epoch = train_size // batch_size // n_steps

In [None]:
# Keep only the model with the lowest training loss seen so far.
# save_freq='epoch' evaluates the monitored loss once per epoch; an integer
# save_freq counts batches, so save_freq=1 would check (and possibly rewrite
# the file) after every single training step.
checkpoint = keras.callbacks.ModelCheckpoint('best_stateful.hdf5', monitor='loss',
                                            save_best_only=True, mode='auto',
                                            save_freq='epoch')

In [None]:
# Reset the RNN states at each epoch start and checkpoint the best weights.
# (`che` was an undefined name; the callback object is `checkpoint`.)
history = model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=50, 
                    callbacks=[Reset_States_cb(), checkpoint])

In [None]:
#for different batch sizes, create stateless copy. 
# dropout can be removed since it's only used for training

stateless_model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

In [None]:
stateless_model.build(tf.TensorShape([None, None, max_id]))

In [None]:
stateless_model.set_weights(model.get_weights())
model = stateless_model

In [None]:
tf.random.set_seed(42)

print(complete_text('t'))

# Sentiment Analysis

In [None]:
tf.random.set_seed(42)

Using the IMDB dataset

In [None]:
# load_data() returns (train inputs, train labels), (test inputs, test labels);
# each split gets its own names so the training labels are not overwritten.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

In [None]:
#each review is preprocessed and assigned an integer for each word
X_train[0][:10]

Ids 0, 1 and 2 are reserved for special tokens: 0 = padding, 1 = start of sequence, 2 = unknown word.

In [None]:
#build the id -> word lookup; word_index ids are shifted by 3 because ids
#0, 1 and 2 are mapped to the special tokens <pad>, <sos> and <unk> below
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(('<pad>','<sos>','<unk>')):
    id_to_word[id_] = token
    
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

##### Building models with preprocessing built in

In [None]:
#load as bytes
import tensorflow_datasets as tfds

datasets, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

In [None]:
datasets.keys()

In [None]:
train_size = info.splits['train'].num_examples
test_size = info.splits['test'].num_examples

In [None]:
train_size, test_size

In [None]:
for X_batch, y_batch in datasets['train'].batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("review:", review.decode('utf-8')[:200], "...")
        print('label:', label, "= Positive" if label else "= Negative")
        print()

In [None]:
def preprocess(X_batch, y_batch):
    """Tokenize a batch of raw review strings into padded word tensors.

    Each review is cut to its first 300 characters, <br> tags and every
    character other than letters and apostrophes are replaced by spaces, and
    the result is split on whitespace. The ragged word lists are padded with
    b"<pad>" into a dense tensor. Labels are returned unchanged.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [None]:
preprocess(X_batch, y_batch)

In [None]:
from collections import Counter

vocab = Counter()

for X_batch, y_batch in datasets['train'].batch(32).map(preprocess):
    for review in X_batch:
        vocab.update(list(review.numpy()))

In [None]:
vocab.most_common()[:3] #top 3 most common words

In [None]:
len(vocab) #all words in dictionary

In [None]:
#get the 10k most common words 
vocab_size = 10000
truncated_vocab = [
    word for word, count in vocab.most_common()[:vocab_size]
]

In [None]:
word_to_id = {word: index for index, word in enumerate(truncated_vocab)}

In [None]:
for word in b'This movie was faaaaaantastic'.split():
    # Use the default argument of get(): `get(word) or vocab_size` would also
    # replace the valid id 0 (the most common token) by the out-of-vocab id.
    print(word_to_id.get(word, vocab_size))

In [None]:
#preprocessing to replace each word with an ID
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000 #out-of-vocab buckets for lookup
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [None]:
#now able to look up the ids of words
table.lookup(tf.constant([b'This movie was faaaaaantastic'.split()]))

In [None]:
#encode the words
def encode_words(X_batch, y_batch):
    """Replace every word in X_batch by its id from the lookup `table`.

    Labels are passed through untouched.
    """
    word_id_batch = table.lookup(X_batch)
    return word_id_batch, y_batch

In [None]:
train_set = datasets['train'].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [None]:
embedded_size = 128 #neurons per layer

In [None]:
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embedded_size,
                          input_shape=[None], mask_zero=True), #remove the padding: mask_zero
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam',
             metrics=['accuracy'])

In [None]:
checkpoint = keras.callbacks.ModelCheckpoint('sentiment_model.h5', monitor='loss',
                                            save_best_only=True, mode='auto')

In [None]:
#steps_per_epoch makes the model run out of data, even though the batch size is identical

In [None]:
history = model.fit(train_set, epochs=5, callbacks=[checkpoint])

Manual implementation of masking (removing padding)

In [None]:
K = keras.backend
embed_size = 128
inputs = keras.layers.Input(shape=[None])
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.models.Model(inputs=[inputs], outputs=[outputs])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# No steps_per_epoch: train_set is a finite dataset without repeat(), so
# forcing a step count makes Keras run out of data after the first epoch.
# Letting fit() iterate over the whole dataset each epoch matches the
# Embedding(mask_zero=True) model trained above.
history = model.fit(train_set, epochs=5)

#  Reusing pretrained embeddings

In [None]:
tf.random.set_seed(42)

pretrained components called modules; located at https://tfhub.dev

In [None]:
%ls

In [None]:
TFHub_cache_dir = os.path.join(os.curdir, 'tfhub_cache')
# tensorflow_hub reads the upper-case TFHUB_CACHE_DIR variable; environment
# variable names are case-sensitive on most platforms, so the mixed-case
# spelling would be ignored and modules would go to the default cache.
os.environ['TFHUB_CACHE_DIR'] = TFHub_cache_dir

In [None]:
import tensorflow_hub as hub

#first layer is a sentence encoder, taking strings as input and encodes each as a single
# vector
model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#view directory
# for dirpath, dirnames, filenames in os.walk(TFHub_cache_dir):
#     for filename in filenames:
#         print(os.path.join(dirpath, filename))

In [None]:
import tensorflow_datasets as tfds

datasets, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)
train_size = info.splits['train'].num_examples

batch_size = 32
train_set = datasets['train'].repeat().batch(batch_size).prefetch(1)

history = model.fit(train_set, steps_per_epoch=train_size//batch_size, epochs=5)

In [None]:
model.save('pretrained_embeddings.h5')

In [None]:
test_set = datasets['test'].repeat().batch(batch_size).prefetch(1)
test_size = info.splits['test'].num_examples

In [None]:
model.evaluate(test_set, steps= test_size//batch_size)

# automatic translation
-comeback, still iffy, https://homl.info/103

In [None]:
tf.random.set_seed(42)

In [None]:
vocab_size = 100
embed_size = 10

In [None]:
import tensorflow_addons as tfa

pg 543-546
- ex: English sentences are fed into the encoder, while the French translations are fed into the decoder shifted back by one step (at each step the decoder is given as input the word it should have output at the previous step)
- the English sentence is reversed before it is fed to the encoder
- masking is used to handle varying sentence lengths; useful function: `tf.data.experimental.bucket_by_sequence_length`
- outputs after 'end of sequence' will be ignored from loss



In [None]:
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

sequence_length = keras.layers.Input(shape=[], dtype=np.int32)

In [None]:
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

In [None]:
#return_state true to get the final hidden state for passing to decoder
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

In [None]:
sampler = tfa.seq2seq.sampler.TrainingSampler()

In [None]:
decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer=output_layer)

In [None]:
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_length)

In [None]:
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

In [None]:
model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_length],
    outputs=[Y_proba])

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
X = np.random.randint(100, size=10*1000).reshape(1000, 10)
Y = np.random.randint(100, size=15*1000).reshape(1000, 15)
X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]
seq_lengths = np.full([1000], 15)

In [None]:
history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)

# Bidirectional Recurrent Layers

pretty simple implementation of bidirectional layers, which allows the model to look into the future, frequently used for translation.

In [None]:
model = keras.models.Sequential([
    keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),
    #pretty simple 
    keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True))
])

In [None]:
#the Bidirectional layer creates a copy of the GRU layer running in the reverse direction
model.summary()

#### Mock beam search implementation

In [None]:
beam_width = 10 #number of sentence possibilities

In [None]:
decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
    cell= decoder_cell, beam_width=beam_width, output_layer=output_layer)

In [None]:
decoder_initial_state = tfa.seq2seq.beam_search_decoder.tile_batch(encoder_state, 
                                                                      multiplier=beam_width)

In [None]:
# outputs, _, _ = decoder(
#     decoder, start_tokens= start_tokens, end_token= end_token, 
#     initial_state= decoder_initial_state)

##### adding a luong attention  to encoder-decoder
- attention mechanisms beneficial for natural language translation, generating image captions, and for explainability

In [None]:
# attention_mechanism = tfa.seq2seq.attention_wrapper.LuongAttention(
#     units, encoder_State, memory_sequence_length=encoder_sequence_length)
# attention_decoder_cell = tfa.seq2seq.attention_wrapper.AttentionWrapper(
#     decoder_cell, attention_mechanism, attention_layer_size=n_units)

# Positional Encoding 


In [None]:
class Positional_Encoding(keras.layers.Layer):
    """Layer that adds a fixed sine/cosine positional table to its inputs.

    The table has shape (1, max_steps, max_dims): even feature indices hold
    sines and odd indices hold cosines of position / 10000**(2*i/max_dims).
    At call time the table is cropped to the inputs' last two dimensions.
    """
    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        # The table is filled in sin/cos pairs, so round an odd width up to even.
        if max_dims % 2 == 1:
            max_dims += 1
        positions, pair_index = np.meshgrid(np.arange(max_steps),
                                            np.arange(max_dims // 2))
        angles = positions / 10000**(2 * pair_index / max_dims)
        table = np.empty((1, max_steps, max_dims))
        table[0, :, 0::2] = np.sin(angles).T
        table[0, :, 1::2] = np.cos(angles).T
        self.positional_embedding = tf.constant(table.astype(self.dtype))
    def call(self, inputs):
        input_shape = tf.shape(inputs)
        steps_in, dims_in = input_shape[-2], input_shape[-1]
        return inputs + self.positional_embedding[:, :steps_in, :dims_in]

In [None]:
max_steps = 201 
max_dims = 512
position_embedding = Positional_Encoding(max_steps, max_dims)
PE = position_embedding(np.zeros((1, max_steps, max_dims), np.float32))[0].numpy()

### Visualization of positional encoding

In [None]:
i1, i2, crop_i = 100, 101, 150
p1, p2, p3 = 22, 60, 35
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(9, 5))
ax1.plot([p1, p1], [-1, 1], "k--", label="$p = {}$".format(p1))
ax1.plot([p2, p2], [-1, 1], "k--", label="$p = {}$".format(p2), alpha=0.5)
ax1.plot(p3, PE[p3, i1], "bx", label="$p = {}$".format(p3))
ax1.plot(PE[:,i1], "b-", label="$i = {}$".format(i1))
ax1.plot(PE[:,i2], "r-", label="$i = {}$".format(i2))
ax1.plot([p1, p2], [PE[p1, i1], PE[p2, i1]], "bo")
ax1.plot([p1, p2], [PE[p1, i2], PE[p2, i2]], "ro")
ax1.legend(loc="center right", fontsize=14, framealpha=0.95)
ax1.set_ylabel("$P_{(p,i)}$", rotation=0, fontsize=16)
ax1.grid(True, alpha=0.3)
ax1.hlines(0, 0, max_steps - 1, color="k", linewidth=1, alpha=0.3)
ax1.axis([0, max_steps - 1, -1, 1])
ax2.imshow(PE.T[:crop_i], cmap="gray", interpolation="bilinear", aspect="auto")
ax2.hlines(i1, 0, max_steps - 1, color="b")
cheat = 2 # need to raise the red line a bit, or else it hides the blue one
ax2.hlines(i2+cheat, 0, max_steps - 1, color="r")
ax2.plot([p1, p1], [0, crop_i], "k--")
ax2.plot([p2, p2], [0, crop_i], "k--", alpha=0.5)
ax2.plot([p1, p2], [i2+cheat, i2+cheat], "ro")
ax2.plot([p1, p2], [i1, i1], "bo")
ax2.axis([0, max_steps - 1, 0, crop_i])
ax2.set_xlabel("$p$", fontsize=16)
ax2.set_ylabel("$i$", rotation=0, fontsize=16)
plt.savefig("positional_embedding_plot")
plt.show()

In [None]:
embed_size = 512; max_steps = 500; vocab_size= 10000

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

# A single Embedding layer is shared by the encoder and decoder inputs.
embeddings= keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

positional_encoding = Positional_Encoding(max_steps, max_dims=embed_size)

# Apply the layer built just above (500 steps x 512 dims), not
# `position_embedding`, the 201-step instance created earlier for the plot.
encoder_in = positional_encoding(encoder_embeddings)
decoder_in = positional_encoding(decoder_embeddings)

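Simplified Transformer built from plain `keras.layers.Attention` layers (the real architecture uses multi-head attention, implemented in the next cell)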

In [None]:
# Encoder: 6 stacked self-attention layers.
Z = encoder_in 
for N in range(6):
    Z = keras.layers.Attention(use_scale=True)([Z, Z])
    
# Must be named encoder_outputs: the decoder loop below reads that name, and
# with the misspelt `encoder_outps` it silently picked up the LSTM encoder
# output left over from the seq2seq section instead of this stack's output.
encoder_outputs = Z
# Decoder: causal self-attention followed by attention over the encoder output.
Z = decoder_in
for N in range(6):
    Z = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
    Z = keras.layers.Attention(use_scale=True)([Z, encoder_outputs])
    
outputs = keras.layers.TimeDistributed(
    keras.layers.Dense(vocab_size, activation='softmax'))(Z)

In [None]:
K = keras.backend

class MultiHeadAttention(keras.layers.Layer):
    """Multi-head attention built on top of keras.layers.Attention.

    Called on a list [query, value] or [query, value, key]; when no key is
    given, the value tensor is used as the key. Queries, values and keys are
    each projected, split into n_heads heads that are folded into the batch
    axis, attended with a single Attention layer, then merged back and
    projected to the original feature size.
    """
    def __init__(self, n_heads, causal=False, use_scale=False, **kwargs):
        # causal and use_scale are forwarded to keras.layers.Attention in build()
        self.n_heads = n_heads
        self.causal = causal
        self.use_scale = use_scale
        super().__init__(**kwargs)
    def build(self, batch_input_shape):
        # Feature size is read from the last dimension of the first input (the query).
        self.dims = batch_input_shape[0][-1]
        self.q_dims, self.v_dims, self.k_dims = [self.dims // self.n_heads] * 3 # could be hyperparameters instead
        # Conv1D with kernel_size=1 and no bias is used as the projection layer
        # for queries, values and keys (all heads are computed in one layer).
        self.q_linear = keras.layers.Conv1D(self.n_heads * self.q_dims, kernel_size=1, use_bias=False)
        self.v_linear = keras.layers.Conv1D(self.n_heads * self.v_dims, kernel_size=1, use_bias=False)
        self.k_linear = keras.layers.Conv1D(self.n_heads * self.k_dims, kernel_size=1, use_bias=False)
        self.attention = keras.layers.Attention(causal=self.causal, use_scale=self.use_scale)
        # Final projection back to the input feature size.
        self.out_linear = keras.layers.Conv1D(self.dims, kernel_size=1, use_bias=False)
        super().build(batch_input_shape)
    def _multi_head_linear(self, inputs, linear):
        """Project `inputs` with `linear` and fold the heads into the batch axis.

        Assumes rank-3 inputs (batch, steps, features); the result has shape
        (batch * n_heads, steps, features_per_head).
        """
        # Target shape: (batch, steps, n_heads, -1)
        shape = K.concatenate([K.shape(inputs)[:-1], [self.n_heads, -1]])
        projected = K.reshape(linear(inputs), shape)
        # Move the head axis next to the batch axis: (batch, n_heads, steps, -1)
        perm = K.permute_dimensions(projected, [0, 2, 1, 3])
        return K.reshape(perm, [shape[0] * self.n_heads, shape[1], -1])
    def call(self, inputs):
        """Apply multi-head attention to [query, value] or [query, value, key]."""
        q = inputs[0]
        v = inputs[1]
        k = inputs[2] if len(inputs) > 2 else v
        shape = K.shape(q)
        q_proj = self._multi_head_linear(q, self.q_linear)
        v_proj = self._multi_head_linear(v, self.v_linear)
        k_proj = self._multi_head_linear(k, self.k_linear)
        # Heads live in the batch axis here, so one Attention call covers them all.
        multi_attended = self.attention([q_proj, v_proj, k_proj])
        shape_attended = K.shape(multi_attended)
        # Undo the folding: (batch * n_heads, steps, d) -> (batch, n_heads, steps, d)
        reshaped_attended = K.reshape(multi_attended, [shape[0], self.n_heads, shape_attended[1], shape_attended[2]])
        perm = K.permute_dimensions(reshaped_attended, [0, 2, 1, 3])
        # Concatenate the heads along the feature axis: (batch, steps, n_heads * d)
        concat = K.reshape(perm, [shape[0], shape_attended[1], -1])
        return self.out_linear(concat)

In [None]:
# Smoke test of MultiHeadAttention with 8 heads: 50 query steps attend over
# 80 value steps. (A stray `ignore.warning` expression was dropped here:
# `ignore` is not defined, so the cell raised NameError before running.)
Q = np.random.rand(2, 50, 512)
V = np.random.rand(2, 80, 512)
multi_attn = MultiHeadAttention(8)
multi_attn([Q, V]).shape