In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
import re
import string
import random

2023-02-22 13:36:25.223457: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-22 13:36:25.877372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/include:/usr/local/cuda-11.2/lib64:
2023-02-22 13:36:25.877426: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/include:/usr/local/cuda-11.2/lib64:


In [2]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [3]:
# embedding layer

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [4]:
# miniature GPT model

vocab_size = 25000  # Only consider the top 20k words
maxlen = 40  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 6  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer


def create_model():
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    x = transformer_block(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    
    return model

In [5]:
# data 

batch_size = 128
filenames = []
directories = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]
for dir in directories:
    for f in os.listdir(dir):
        filenames.append(os.path.join(dir, f))

print(f"{len(filenames)} files")

# Create a dataset from text files
random.shuffle(filenames)
text_ds = tf.data.TextLineDataset(filenames)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)


def custom_standardization(input_string):
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1")


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices


def prepare_lm_inputs_labels(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

50000 files


2023-02-22 13:36:37.032568: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 13:36:37.037482: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 13:36:37.037741: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-22 13:36:37.038238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [6]:
model = create_model()
model.summary()

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile("adam", loss=loss_fn,) 
model.fit(text_ds, epochs=30)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 40)]              0         
                                                                 
 token_and_position_embeddin  (None, 40, 256)          6410240   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 40, 256)          1710336   
 merBlock)                                                       
                                                                 
 dense_2 (Dense)             (None, 40, 25000)         6425000   
                                                                 
Total params: 14,545,576
Trainable params: 14,545,576
Non-trainable params: 0
_________________________________________________

2023-02-22 13:36:51.593997: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-02-22 13:36:51.626338: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7fabbc224150 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-02-22 13:36:51.626352: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2023-02-22 13:36:51.629359: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-02-22 13:36:51.722854: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fae7c0d0e80>

In [8]:

# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "this movie is"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 30
self_max_tokens = 30
top_k=10



def sample_from(logits):
    logits, indices = tf.math.top_k(logits, k=top_k, sorted=True)
    indices = np.asarray(indices).astype("int32")
    preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
    preds = np.asarray(preds).astype("float32")
    return np.random.choice(indices, p=preds)

def detokenize(number):
    return vocab[number]

def generate(start_tokens):
    start_tokens = [_ for _ in start_tokens]
    num_tokens_generated_local = 0
    tokens_generated = []
    while num_tokens_generated_local <= self_max_tokens:
        pad_len = maxlen - len(start_tokens)
        sample_index = len(start_tokens) - 1
        if pad_len > 0:
            x = start_tokens + [0] * pad_len
        else:
            x = start_tokens
        x = np.array([x])
        y = model.predict(x)
        sample_token = sample_from(y[0][sample_index])
        tokens_generated.append(sample_token)
        start_tokens.append(sample_token)
        num_tokens_generated_local = len(tokens_generated)
    txt = " ".join(
        [detokenize(_) for _ in start_tokens + tokens_generated]
    )
    print(f"generated text:\n{txt}\n")
    
generate(start_tokens)

generated text:
this movie is absolutely horrible . if you watch the movie , the acting is average , and the script is sub -par . it does not take the slightest in the film . absolutely horrible . if you watch the movie , the acting is average , and the script is sub -par . it does not take the slightest in the film .



In [9]:
# TextGenerator

class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
    ):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y = self.model.predict(x)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")


# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "this movie is"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 30
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

In [10]:
model = create_model()
model.summary()

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile("adam", loss=loss_fn,) 
model.fit(text_ds, verbose=2, epochs=25, callbacks=[text_gen_callback])

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 40)]              0         
                                                                 
 token_and_position_embeddin  (None, 40, 256)          6410240   
 g_1 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_1 (Transf  (None, 40, 256)          1710336   
 ormerBlock)                                                     
                                                                 
 dense_5 (Dense)             (None, 40, 25000)         6425000   
                                                                 
Total params: 14,545,576
Trainable params: 14,545,576
Non-trainable params: 0
_______________________________________________

generated text:
this movie is so bad i can 't say that i 'm actually not going to be a bad movie .i don 't know much about a lot of people are doing . there

391/391 - 17s - loss: 3.3507 - 17s/epoch - 45ms/step
Epoch 10/25
generated text:
this movie is not a masterpiece of both movies . its about sports and a story line , and not only is [UNK] . the music is set in the u .s . marshal

391/391 - 18s - loss: 3.2364 - 18s/epoch - 45ms/step
Epoch 11/25
generated text:
this movie is a classic . it was supposed to be about three teenage boys and [UNK] [UNK] ) . i 'm going to say the [UNK] comedy team and i 've seen all

391/391 - 17s - loss: 3.1347 - 17s/epoch - 45ms/step
Epoch 12/25
generated text:
this movie is terrible . i 've got a lot of people who saw this movie in the u .s . . carrot top is so -so [UNK] . and i can 't believe

391/391 - 17s - loss: 3.0443 - 17s/epoch - 45ms/step
Epoch 13/25
generated text:
this movie is so cool . i 've tried to watch it with a movie that i have seen t

generated text:
this movie is one of the worst i 've seen in a long time . not to mention bad acting , bad direction [UNK] . . . .i just just have just seen every

391/391 - 18s - loss: 2.6621 - 18s/epoch - 45ms/step
Epoch 19/25
generated text:
this movie is about a group of friends with a young boy named luther who wears a mountain and his bachelor party , keaton ) he marries her , (matt [UNK] ) he [UNK]

391/391 - 18s - loss: 2.6158 - 18s/epoch - 45ms/step
Epoch 20/25
generated text:
this movie is not a great movie , it 's a waste of time on the plot and the actors do not make any sense . if you 've seen this movie , you

391/391 - 18s - loss: 2.5743 - 18s/epoch - 45ms/step
Epoch 21/25
generated text:
this movie is the best movie i have ever seen . it has a lot to say about it , it is not a film about people in india . i am not saying

391/391 - 17s - loss: 2.5350 - 17s/epoch - 45ms/step
Epoch 22/25


generated text:
this movie is just a bit about the plot of dubious business (albeit transformed in a middle class couple research about the time already 15 minutes ) . it 's not supposed to be

391/391 - 18s - loss: 2.4982 - 18s/epoch - 45ms/step
Epoch 23/25
generated text:
this movie is so bad that it 's hard to describe it . there is no plot that 's just so stupid i don 't recall a film that you are watching and will

391/391 - 18s - loss: 2.4655 - 18s/epoch - 45ms/step
Epoch 24/25
generated text:
this movie is so bad i can 't believe it 's actually been better made than the first . i 've seen the three or four or two of the worst movies ever made

391/391 - 18s - loss: 2.4344 - 18s/epoch - 45ms/step
Epoch 25/25
generated text:
this movie is a classic . it is so hard to believe that someone could give him a good job at the top of a movie , and the acting is good . the

391/391 - 18s - loss: 2.4047 - 18s/epoch - 45ms/step


<keras.callbacks.History at 0x7fae0c27bca0>