## Sentiment Analysis with Transformer-based BERT model

We used Transformer-based BERT model for sentiment analysis (movie reviews).

In [1]:
# !pip install kaggle
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from transformers import BertTokenizer, BertModel, TFAutoModel
import torch
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import os
api = KaggleApi()
api.authenticate()
for file in ['train.tsv', 'test.tsv']:
    api.competition_download_file('sentiment-analysis-on-movie-reviews', f'{file}.zip', path='./')
    with zipfile.ZipFile(f'{file}.zip', 'r') as zip_ref:
        zip_ref.extractall('./')
    os.remove(f'{file}.zip')

Downloading train.tsv.zip to .


100%|██████████| 1.28M/1.28M [00:00<00:00, 12.6MB/s]







Downloading test.tsv.zip to .


100%|██████████| 494k/494k [00:00<00:00, 56.2MB/s]







In [3]:
df = pd.read_csv('train.tsv', sep='\t')
print(df.shape)
df.sample(4)

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
76167,76168,3907,unwavering and,2
82936,82937,4283,-LRB- Hayek -RRB- throws herself into this dre...,4
95349,95350,4977,Tautou,2
4770,4771,182,are powerful and moving without stooping to ba...,4


In [4]:
df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [5]:
dff=[len(i.split(" ")) for i in df.Phrase[:10]]         
max(dff)

37

### Create tensor

In [6]:
#  Create two input tensor (input IDs and attention mask)
import numpy as np
seq_len = 40               # Sequence length of tokenized sequences for BERT
num_samples = len(df)        # Number of samples in our dataset is 156060   

# Initialize 
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Tokenize 
tokens = tokenizer(df['Phrase'].tolist(), max_length=seq_len, truncation=True,
                   padding=True, add_special_tokens=True,
                   return_tensors='np')   # Return Numpy tensors
tokens.keys()      # Three numpy arrays: input_ids, token_type_ids and attention_mask.

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [7]:
tokens['input_ids'][:5]

array([[  101,   138,  1326,  1104, 13936, 25265, 16913, 15107,  1103,
         8050,  2553,  1115,  1184,  1110,  1363,  1111,  1103, 20398,
         1110,  1145,  1363,  1111,  1103,   176,  9900,   117,  1199,
         1104,  1134,  5411,  1821, 14225,  1133,  3839,  1104,  1134,
         7919,  1106,  1277,   102],
       [  101,   138,  1326,  1104, 13936, 25265, 16913, 15107,  1103,
         8050,  2553,  1115,  1184,  1110,  1363,  1111,  1103, 20398,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0],
       [  101,   138,  1326,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0],
       [  101,   138,   102,     0,  

In [8]:
tokens['attention_mask'][:5]            # 1 means the value, 0 means none.

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
# Save as binary files (Numpy)
with open('movie-xids.npy', 'wb') as f:
    np.save(f, tokens['input_ids'])
with open('movie-xmask.npy', 'wb') as f:
    np.save(f, tokens['attention_mask'])    
# Free memory.
del tokens

### Target (labels)

In [10]:
# One-hot encoded labels array
arr = df['Sentiment'].values

labels = np.zeros((num_samples, arr.max()+1))
print(labels.shape)

labels[np.arange(num_samples), arr] = 1
# labels = labels.to(device)
print(labels)
with open('movie-labels.npy', 'wb') as f:
    np.save(f, labels)

(156060, 5)
[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [11]:
# Load datasets
with open('movie-xids.npy', 'rb') as f:
    Xids = np.load(f, allow_pickle=True)
with open('movie-xmask.npy', 'rb') as f:
    Xmask = np.load(f, allow_pickle=True)
with open('movie-labels.npy', 'rb') as f:
    labels = np.load(f, allow_pickle=True)

In [12]:
# Create TF dataset 
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(40,), dtype=tf.int32, name=None), TensorSpec(shape=(40,), dtype=tf.int32, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [13]:
# Rearange the dataset format
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

# Apply map for transformation function.
dataset = dataset.map(map_func)
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(40,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(40,), dtype=tf.int32, name=None)}, TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [14]:
batch_size = 64
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None)}, TensorSpec(shape=(64, 5), dtype=tf.float64, name=None))>

In [15]:
ratio = 0.8
size = int((Xids.shape[0] / batch_size) * ratio)
train_ds = dataset.take(size)
val_ds = dataset.skip(size)
# val_ds.element_spec == train_ds.element_spec

tf.data.experimental.save(train_ds, 'train')
tf.data.experimental.save(val_ds, 'val')
del dataset                   # Fresh memory

### Pretrained Bert model

In [16]:
bert = TFAutoModel.from_pretrained('bert-base-cased')
bert.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Two input layers
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# Access the transformer model
embeddings = bert.bert(input_ids, attention_mask=mask)[1]  # [1] final activations
# Convert into five classes
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)
# Initialize 
model = Model(inputs=[input_ids, mask], outputs=y)
model.layers[2].trainable = False     # (optional) freeze bert layer to save time.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 40)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 40)]         0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 40,                                            

In [18]:
# Customized model
optimizer = tf.keras.optimizers.Adam(lr=1e-1, decay=1e-1)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

  super(Adam, self).__init__(name, **kwargs)


In [19]:
element_spec = ({'input_ids': tf.TensorSpec(shape=(batch_size, seq_len), dtype='int32', name=None),
                 'attention_mask': tf.TensorSpec(shape=(batch_size, seq_len), dtype='int32', name=None)},
                tf.TensorSpec(shape=(batch_size, 5), dtype=tf.float64, name=None))

# load the training and validation sets
train_ds = tf.data.experimental.load('train', element_spec=element_spec)
val_ds = tf.data.experimental.load('val', element_spec=element_spec)
# view format
train_ds.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None)}, TensorSpec(shape=(64, 5), dtype=tf.float64, name=None))>

In [20]:
val_ds.element_spec

({'input_ids': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None)},
 TensorSpec(shape=(64, 5), dtype=tf.float64, name=None))

In [21]:
train_ds.element_spec

({'input_ids': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None)},
 TensorSpec(shape=(64, 5), dtype=tf.float64, name=None))

In [22]:
# Train model 
history = model.fit(train_ds, validation_data=val_ds, epochs=3)  # We use simple parameters to save time 

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
model.save('sentiment_model')



INFO:tensorflow:Assets written to: sentiment_model\assets


INFO:tensorflow:Assets written to: sentiment_model\assets
  return generic_utils.serialize_keras_object(obj)


In [4]:
model = tf.keras.models.load_model('sentiment_model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 40)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 40)]         0           []                               
                                                                                                  
 bert (Custom>TFBertMainLayer)  {'pooler_output': (  108310272   ['input_ids[0][0]',              
                                None, 768),                       'attention_mask[0][0]']         
                                 'last_hidden_state                                               
                                ': (None, 40, 768)}                                           

In [20]:
def prep_data(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    tokens = tokenizer.encode_plus(text, max_length=max_length,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_token_type_ids=False,
                                   return_tensors='tf')
    # Convert int32 tensors to float64
    return {'input_ids': tf.cast(tokens['input_ids'], tf.float64),
            'attention_mask': tf.cast(tokens['attention_mask'], tf.float64)}

In [21]:
# Test
max_length=40
probs = model.predict(prep_data("hello world"))[0]
print(probs)
# np.argmax(probs)

[0.02690142 0.08788886 0.56382006 0.27686545 0.04452418]


In [22]:
pd.set_option('display.max_colwidth', None)
df = pd.read_csv('test.tsv', sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine effort .
1,156062,8545,An intermittently pleasing but mostly routine effort
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [25]:
df_test = df.drop_duplicates(subset=['SentenceId'], keep='first')[:50]    # save time [:50]
df_test['Sentiment'] = None
for i, row in df_test.iterrows():
    tokens = prep_data(row['Phrase'])
    probs = model.predict(tokens)
    pred = np.argmax(probs)
    df_test.at[i, 'Sentiment'] = pred
df_test.sample(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
839,156900,8569,"A fitfully amusing romp that , if nothing else , will appeal to fans of Malcolm in the Middle and its pubescent star , Frankie Muniz .",3
546,156607,8560,"Chilling but uncommercial look into the mind of Jeffrey Dahmer , serial killer .",2
476,156537,8557,"For all its failed connections , Divine Secrets of the Ya-Ya Sisterhood is nurturing , in a gauzy , dithering way .",3
1541,157602,8594,"And when you 're talking about a slapstick comedy , that 's a pretty big problem .",1
1229,157290,8582,What Jackson has accomplished here is amazing on a technical level .,3


###  LSTM model

In [64]:
bert = TFAutoModel.from_pretrained('bert-base-cased')
input_ids = tf.keras.layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(max_length,), name='attention_mask', dtype='int32')
embeddings = bert.bert(input_ids, attention_mask=mask)[0]  # access final activations with [0]

# convert bert embeddings into 5 output classes
x = tf.keras.layers.LSTM(32, dropout=.3, recurrent_dropout=.3, return_sequences=True)(embeddings)
x = tf.keras.layers.LSTM(16, dropout=.4, recurrent_dropout=.4, return_sequences=False)(x)
# normalize
x = tf.keras.layers.BatchNormalization()(x)
# output
x = tf.keras.layers.Dense(64, activation='relu')(x)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [67]:
# initialize model
model2 = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
# (optional) freeze bert layer
model2.layers[2].trainable = False
model2.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 40)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 40)]         0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 40,                                          

In [68]:
optimizer = tf.keras.optimizers.Adam(lr=1e-1, decay=1e-1)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
mod  el2.compile(optimizer=optimizer, loss=loss, metrics=[acc])

  super(Adam, self).__init__(name, **kwargs)


In [70]:
element_spec = ({'input_ids': tf.TensorSpec(shape=(64, 40), dtype='int32', name=None),
                 'attention_mask': tf.TensorSpec(shape=(64, 40), dtype='int32', name=None)},
                tf.TensorSpec(shape=(64, 5), dtype=tf.float64, name=None))

# load the training and validation sets
train_ds = tf.data.experimental.load('train', element_spec=element_spec)
val_ds = tf.data.experimental.load('val', element_spec=element_spec)

# view the input format
train_ds.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(64, 40), dtype=tf.int32, name=None)}, TensorSpec(shape=(64, 5), dtype=tf.float64, name=None))>

In [None]:
history = model2.fit(train_ds, validation_data=val_ds, epochs=2)        # Save time
model2.save('sentiment_model2')

In [None]:
# Acknowledge
# References:
        1. Kaggle
        2. Udemy
        3. http://web.stanford.edu/class/cs224n/       