In [None]:
# A good reference to compare results: https://towardsdatascience.com/sentiment-analysis-a-benchmark-903279cab44a

In [1]:
#install addons for 
!pip install tensorflow-addons
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import csv 
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Dense, Input, Dropout, LayerNormalization, GlobalAveragePooling1D, Flatten
from tensorflow_addons.layers import MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [2]:
# Download the data from Kaggle https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/tasks?taskId=588 
# Load the 1mdb datasets from Kaggle
kaggle_imdb_file = 'datasets/IMDB_Dataset.csv' 
data_x = []
data_y = []
with open(kaggle_imdb_file, 'r') as csvfile: 
  filereader = csv.reader(csvfile, delimiter=',', dialect='excel')
  next(filereader)
  for row in filereader:
    data_x.append(row[0]) 
    label = 1 if row[1] == 'positive' else 0 
    data_y.append(label)

In [3]:
# Prepare the data into trainable format
data_x = np.array(data_x)
data_y = np.array(data_y)
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=29)

In [5]:
# Show some sample reviews 
ids = np.random.randint(X_train.shape[0], size=10)
for i in ids:
  print(y_train[i], ':',  X_train[i])
  print("\n=====================\n")

1 : I guess this is the first time I have seen a Roscoe 'Fatty' Arbuckle movie. I really liked him in his (title) role as a butcher boy. The way he moves is very funny in my opinion, for example how he handles his knife and the way he rolls a cigarette. I think he is a good actor; his facial expressions really suit the role he plays, for example how he winks at the audience in the end. But one might add that that was probably not too difficult. Anyway I think he would have deserved a longer career. As you probably know it was ruined by greedy journalists who made money by printing false accusations that said he was involved in a scandal.<br /><br />The plot is not very important. In the first half, Fatty and Alum are employees at a store and rivals for Almondine's affection. After a heavy food fight, Almondine is sent to a girls' school by her father, the store owner. (This is the beginning of the second half). Both Fatty and Alum enter the school in drag, and the fight for Almondine c

In [6]:
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 400
MAX_NB_WORDS = 20000
# Tokenize the data 
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_x)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_train_padded = pad_sequences(X_train_seq, padding='post', truncating='post',  maxlen=MAX_SEQUENCE_LENGTH)
X_test_padded = pad_sequences(X_test_seq, padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)

print(X_train_padded.shape)
print(X_test_padded.shape)
inverted_word_index = dict((i, word) for (word, i) in word_index.items())
inverted_word_index[0] = ' '


Found 124252 unique tokens.
(40000, 400)
(10000, 400)


In [6]:
# check the decoded sequences
idx = 21
decoded_sequence = " ".join(inverted_word_index[i] for i in X_train_padded[idx])
print(y_train[idx], ":", X_train[idx])
print("\n===============\n")
print(len(X_train_seq[idx]), decoded_sequence)


0 : I watched "Fuckland" a long time ago. I lied if I'd tell that I remember it in detail; what I remember most vividly is the irritation it provoked me and the feeling of a total waste of precious money and time, not only my time and money invested in watching the movie but also the director's.<br /><br />Supposedly, "Fuckland" is a critic of Argentinians, presenting us (I'm an Argentinian too) as little people who take credit for and even boast about petty, ridiculous victories, and think we're the best thing that God (who is also an Argentinian) created. I'm not going to argue that. It's probably a true statement about a quite big part of the population (the part I despise, by the way). And even if this weren't true, that's not my point. The worst sin "Fuckland" committed was to express such a statement about its own director.<br /><br />The continuous impression I received was that the director was too busy trying to impress us for sneaking a camera inside the islands to worry abou

In [7]:
# Loading Word Embedding Index 
# Download Glove Data from https://nlp.stanford.edu/data/glove.6B.zip and place the data under datasets/
embeddings_index = {}
GLOVE_DIR = "datasets"
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
print('d_model: %s', embeddings_index['hi'].shape)

Found 400000 word vectors.
d_model: %s (100,)


In [9]:
# Make the word embedding layer that can take X_train_padded as input 
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)

(124253, 100)


In [10]:
# Train a FNN to fine tune the embeddings 
# FNN with embedding 
inputs = layers.Input(shape=(MAX_SEQUENCE_LENGTH,)) 
ffn_embedding_layer =  Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            trainable=True)
outputs_fnn = ffn_embedding_layer(inputs)
outputs_fnn = Flatten()(outputs_fnn) 
outputs_fnn = Dropout(0.1)(outputs_fnn)
outputs_fnn = Dense(40, activation='relu')(outputs_fnn)
outputs_fnn = Dropout(0.1)(outputs_fnn)
outputs_fnn = Dense(20, activation='relu')(outputs_fnn)
outputs_fnn = Dense(1, activation='sigmoid')(outputs_fnn) 

model_fnn = keras.Model(inputs=inputs, outputs=outputs_fnn) 
model_fnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model_fnn.summary())

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 400)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 400, 100)          12425300  
_________________________________________________________________
flatten (Flatten)            (None, 40000)             0         
_________________________________________________________________
dropout (Dropout)            (None, 40000)             0         
_________________________________________________________________
dense (Dense)                (None, 40)                1600040   
_________________________________________________________________
dropout_1 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)               

In [11]:
history_fnn = model_fnn.fit(
    X_train_padded, y_train, batch_size=40, epochs=2, validation_data=(X_test_padded, y_test)
)

Epoch 1/2
Epoch 2/2


In [12]:
# Extract out the embedding from the fully conncted model as input to the Transformer
ffn_embeddings = ffn_embedding_layer.get_weights()[0] 

In [13]:
# Prepare to Train a Transformer 
# Helper functions with padding_masks and positional encodings
# Now let's handle positional encoding
def positional_encoding(positions, d):
    """
    Precomputes a matrix with all the positional encodings 
    
    Arguments:
        positions (int) -- Maximum number of positions to be encoded 
        d (int) -- Encoding size 
    
    Returns:
        pos_encoding -- (1, position, d_model) A matrix with the positional encodings
    """

    # initialize a matrix angle_rads of all the angles 
    angle_rads = np.arange(positions)[:, np.newaxis] / np.power(10000, (2 * (np.arange(d)[np.newaxis, :]//2)) / np.float32(d))
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
    pos_encoding = angle_rads[np.newaxis, ...]
    
    return tf.cast(pos_encoding, dtype=tf.float32)


# Now we need to create masks for padded words
def create_padding_mask_deprecated(decoder_token_ids):
    """Deprecated This No Longer Works"""
    """
    Creates a matrix mask for the padding cells
    
    Arguments:
        decoder_token_ids -- (n, m) matrix
    
    Returns:
        mask -- (n, 1, 1, m) binary tensor
    """    
    seq = 1 - tf.cast(tf.math.equal(decoder_token_ids, 0), tf.float32)
  
    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, :] 

def create_padding_mask(decoder_token_ids):
    """This works with tensorflow_addons"""
    """
    Creates a matrix mask for the padding cells
    
    Arguments:
        decoder_token_ids -- (n, m) matrix
    
    Returns:
        mask -- (n, m, m) binary tensor
    Discussions at https://github.com/tensorflow/tensorflow/issues/49237 
    """    
    def outer_product(a):
      return tf.tensordot(a, a, 0)
    
    seq = 1 - tf.cast(tf.math.equal(decoder_token_ids, 0), tf.float32)
    
    res = tf.cast(tf.vectorized_map(outer_product, seq), dtype=bool) 
    # add extra dimensions to add the padding
    # to the attention logits.
    return res




In [14]:
# Now we need to build a decoder only Transformer for sentiment classification

class DecoderLayer(tf.keras.layers.Layer):
    """
    The decoder layer is composed by a multi-head self-attention mechanism,
    followed by a simple, positionwise fully connected feed-forward network. 
    This archirecture includes a residual connection around each of the two 
    sub-layers, followed by layer normalization.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()

        self.mha = MultiHeadAttention(num_heads=num_heads,
                                      head_size=embedding_dim, #key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = tf.keras.Sequential([
          tf.keras.layers.Dense(fully_connected_dim, activation='relu'),  # (batch_size, seq_len, dff)
          tf.keras.layers.Dense(embedding_dim)  # (batch_size, seq_len, d_model)
        ])

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)
    
    def call(self, x, training, mask):
        """
        Forward pass for the Encoder Layer
        
        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim )
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Boolean mask to ensure that the padding is not 
                    treated as part of the input
        Returns:
            decoder_layer_out -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        # START CODE HERE
        # calculate self-attention using mha(~1 line). Dropout will be applied during training
        attn_output = self.mha([x, x, x], mask=mask) # not sure how to apply mask here. 
        
        # apply layer normalization on sum of the input and the attention output to get the  
        # output of the multi-head attention layer (~1 line)
        out1 = self.layernorm1(x+attn_output)  # (batch_size, input_seq_len, fully_connected_dim)

        # pass the output of the multi-head attention layer through a ffn (~1 line)
        ffn_output = self.ffn(out1)   # (batch_size, input_seq_len, fully_connected_dim)
        
        # apply dropout layer to ffn output during training (~1 line)
        ffn_output =  self.dropout_ffn(ffn_output, training=training)
        
        # apply layer normalization on sum of the output from multi-head attention and ffn output to get the
        # output of the encoder layer (~1 line)
        decoder_layer_out = self.layernorm2(out1+ffn_output)  # (batch_size, input_seq_len, fully_connected_dim)
        # END CODE HERE
        
        return decoder_layer_out
    

In [15]:
class Transformer(tf.keras.layers.Layer):
    """
    The entire Encoder is starts by passing the target input to an embedding layer 
    and using positional encoding to then pass the output through a stack of
    decoder Layers
        
    """ 
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, 
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            embeddings_initializer=tf.keras.initializers.Constant(ffn_embeddings),
                            trainable=False)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.dec_layers = [DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)]
        self.dropout = Dropout(dropout_rate)
        self.final_layer = tf.keras.Sequential([GlobalAveragePooling1D(), 
                                                Dense(fully_connected_dim, activation="relu"), 
                                                Dense(1, activation='sigmoid')])
        
    
    def call(self, x, training):
        """
        Forward  pass for the Decoder
        
        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            
        Returns:
            x -- Tensor of shape (batch_size, 1), probability of positive sentiments 
            
        """

        seq_len = tf.shape(x)[1]
        mask = create_padding_mask(x)
        attention_weights = {}
        
        # START CODE HERE
        # create word embeddings 
        ec = np.sqrt(self.embedding_dim) 
        x = ec*self.embedding(x)
        x += self.pos_encoding 

        # apply a dropout layer to x
        x = self.dropout(x, training = training)

        # use a for loop to pass x through a stack of decoder layers and update attention_weights (~4 lines total)
        for i in range(self.num_layers):
            # pass x and the encoder output through a stack of decoder layers and save the attention weights
            # of block 1 and 2 (~1 line)
            x = self.dec_layers[i](x, training, mask)

         
        x = self.final_layer(x)
        return x

In [16]:
NUM_HEADS = 5  # Number of attention heads
NUM_LAYERS = 1 # 1 layer of transformer block 
FF_DIM = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
transformer_1 = Transformer(NUM_LAYERS, EMBEDDING_DIM, NUM_HEADS, FF_DIM, MAX_SEQUENCE_LENGTH)

outputs = transformer_1(inputs)

model_transformer_1 = keras.Model(inputs=inputs, outputs=outputs)
model_transformer_1.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 400)]             0         
_________________________________________________________________
transformer (Transformer)    (None, 1)                 12635597  
Total params: 12,635,597
Trainable params: 210,297
Non-trainable params: 12,425,300
_________________________________________________________________


In [17]:
model_transformer_1.compile("adam", "binary_crossentropy", metrics=["accuracy"]) 
historytransformer_1 = model_transformer_1.fit(
    X_train_padded, y_train, batch_size=32, epochs=2, validation_data=(X_test_padded, y_test)
)


Epoch 1/2
Epoch 2/2


In [18]:
NUM_HEADS = 3 # Number of attention heads
NUM_LAYERS = 2 # 2 layers of decoder block 
FF_DIM = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
transformer_2 = Transformer(NUM_LAYERS, EMBEDDING_DIM, NUM_HEADS, FF_DIM, MAX_SEQUENCE_LENGTH)

outputs2 = transformer_2(inputs)

model_transformer_2 = keras.Model(inputs=inputs, outputs=outputs2)
model_transformer_2.summary()


Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 400)]             0         
_________________________________________________________________
transformer_1 (Transformer)  (None, 1)                 12682629  
Total params: 12,682,629
Trainable params: 257,329
Non-trainable params: 12,425,300
_________________________________________________________________


In [None]:
model_transformer_2.compile("adam", "binary_crossentropy", metrics=["accuracy"]) 
historytransformer_2 = model_transformer_2.fit(
    X_train_padded, y_train, batch_size=32, epochs=2, validation_data=(X_test_padded, y_test)
)


Epoch 1/2
 247/1250 [====>.........................] - ETA: 15:33 - loss: 0.3548 - accuracy: 0.8406