# TensorFlow 2 - BERT: Financial News Sentiment Analysis for TCS Stock Movement

**Dataset**

The dataset, which I built, is the result of scraping 196,875 finance-related news items from MoneyControl published between 12-05-2017 and 26-02-2021, combined with the TCS:NSE stock movement over the same period. The dataset file is named `final.csv`.

**Problem**

We need to predict whether a given news item will have a positive or negative effect on the TCS stock movement of the next day.

In [1]:
# Install the required package
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/a1/acb891630749c56901e770a34d6bac8a509a367dd74a05daf7306952e910/bert-for-tf2-0.14.9.tar.gz (41kB)
[K     |████████                        | 10kB 26.6MB/s eta 0:00:01[K     |████████████████                | 20kB 16.7MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 12.0MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 12.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.1MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/aa/e0/4f663d8abf83c8084b75b995bd2ab3a9512ebc5b97206fde38cef906ab07/py-params-0.10.2.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [2]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

# Let pandas show up to 1000 characters per cell so long text columns are not cut off
pd.set_option('display.max_colwidth', 1000)

# Report library versions and GPU availability for this runtime
gpu_devices = tf.config.list_physical_devices('GPU')
print("TensorFlow Version:", tf.__version__)
print("Hub version: ", hub.__version__)
print("Num GPUs Available: ", len(gpu_devices))


TensorFlow Version: 2.4.1
Hub version:  0.11.0
Num GPUs Available:  1


In [3]:
#tf.debugging.set_log_device_placement(False)

## Data preprocessing

In [6]:
# Mount Google Drive so files stored there can be read from this runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
# Read the scraped news dataset (final.csv) from the mounted Drive into a pandas DataFrame
DATASET_PATH = '/content/drive/MyDrive/BERT/final.csv'
df = pd.read_csv(DATASET_PATH)

In [8]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,News,Date,Up
0,"0% returns for 10 years, but here's why you should look at GMDC stock now",12-05-2017,0
1,3 Years of Modi Govt: Effect in the Employee Benefit Industry,12-05-2017,0
2,AAP govt's minimum wages too little & inadequate: HC,12-05-2017,0
3,ADF Foods: Outcome of board meeting,12-05-2017,0
4,AIESL completes maintenance check of Jet Airways A330 plane,12-05-2017,0


In [9]:
# (rows, columns) of the loaded DataFrame
n_rows_and_cols = df.shape
print("The number of rows and columns in the dataset is: {}".format(n_rows_and_cols))

The number of rows and columns in the dataset is: (196875, 3)


In [10]:
# Count missing values in each column
df.isnull().sum()

News    0
Date    0
Up      0
dtype: int64

In [11]:
# Target class balance: number of rows per value of `Up`
up_counts = df["Up"].value_counts()
up_counts

1    122660
0     74215
Name: Up, dtype: int64

In [12]:
# Helpers below build the three BERT input arrays: input_ids, input_masks and input_segments.
# Every encoded sequence is padded to this fixed length, [CLS] and [SEP] included.
MAX_SEQ_LEN = 500

def get_masks(tokens):
    """Build the attention mask for one tokenised sequence.

    Returns a list with a 1 for every entry of ``tokens`` followed by 0s that
    pad the list out to ``MAX_SEQ_LEN``.
    """
    real_count = len(tokens)
    pad_count = MAX_SEQ_LEN - real_count
    return [1] * real_count + [0] * pad_count
 
def get_segments(tokens):
    """Build the segment ids for one tokenised sequence.

    Tokens up to and including the first ``"[SEP]"`` get segment 0, every
    token after it gets segment 1, and the result is padded with 0s up to
    ``MAX_SEQ_LEN``.
    """
    segment_ids = []
    past_first_sep = False
    for tok in tokens:
        segment_ids.append(1 if past_first_sep else 0)
        # The separator itself still belongs to the segment it closes.
        past_first_sep = past_first_sep or tok == "[SEP]"
    padding = [0] * (MAX_SEQ_LEN - len(tokens))
    return segment_ids + padding

def get_ids(tokens, tokenizer):
    """Map tokens to vocabulary ids and pad with 0s up to ``MAX_SEQ_LEN``.

    ``tokenizer`` must provide ``convert_tokens_to_ids(tokens)`` returning a list.
    """
    vocab_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad_count = MAX_SEQ_LEN - len(vocab_ids)
    return vocab_ids + [0] * pad_count

def create_single_input(sentence, tokenizer, max_len):
    """Encode one sentence into BERT features.

    The sentence is tokenised, cut to at most ``max_len`` tokens, and wrapped
    in ``[CLS]`` ... ``[SEP]`` before being converted.

    Returns:
        A tuple ``(ids, masks, segments)`` as produced by ``get_ids``,
        ``get_masks`` and ``get_segments``.
    """
    wordpieces = tokenizer.tokenize(sentence)[:max_len]
    framed = ["[CLS]", *wordpieces, "[SEP]"]
    return get_ids(framed, tokenizer), get_masks(framed), get_segments(framed)
 
def convert_sentences_to_features(sentences, tokenizer):
    """Encode an iterable of sentences into BERT input arrays.

    Each sentence is encoded with ``create_single_input`` using
    ``MAX_SEQ_LEN - 2`` as the token budget (leaving room for [CLS] and
    [SEP]); progress is reported through tqdm.

    Returns:
        A list ``[input_ids, input_masks, input_segments]`` of int32 NumPy
        arrays, one row per sentence and ``MAX_SEQ_LEN`` columns each.
    """
    all_ids, all_masks, all_segments = [], [], []

    for text in tqdm(sentences, position=0, leave=True):
        ids, masks, segments = create_single_input(text, tokenizer, MAX_SEQ_LEN - 2)
        # Every feature must already be padded to the fixed sequence length.
        for feature in (ids, masks, segments):
            assert len(feature) == MAX_SEQ_LEN
        all_ids.append(ids)
        all_masks.append(masks)
        all_segments.append(segments)

    return [
        np.asarray(column, dtype=np.int32)
        for column in (all_ids, all_masks, all_segments)
    ]

def create_tonkenizer(bert_layer):
    """Build a ``FullTokenizer`` from the assets carried by a BERT hub layer.

    The vocabulary file path and the lower-casing flag are both read from
    ``bert_layer.resolved_object``.
    """
    resolved = bert_layer.resolved_object
    vocab_path = resolved.vocab_file.asset_path.numpy()
    lower_case = resolved.do_lower_case.numpy()
    return bert.bert_tokenization.FullTokenizer(vocab_path, lower_case)

## Modelling

In [13]:
def nlp_model(callable_object):
    """Build a two-class classifier on top of a trainable BERT hub layer.

    Args:
        callable_object: Handle given to ``hub.KerasLayer`` (this notebook
            passes a tfhub.dev URL).

    Returns:
        An uncompiled Keras ``Model`` that takes ``[input_ids, input_masks,
        segment_ids]`` (each ``MAX_SEQ_LEN`` int32 values per example) and
        outputs a 2-way softmax.
    """
    # Pre-trained encoder; trainable=True so its weights are updated during training.
    encoder = hub.KerasLayer(handle=callable_object, trainable=True)

    # One int32 input per BERT feature: ids, masks and segments.
    # NOTE: the tokenizer is later built from model.layers[3], so the three
    # Input layers must keep preceding the hub layer in the model.
    input_names = ("input_ids", "input_masks", "segment_ids")
    inputs = [
        Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=input_name)
        for input_name in input_names
    ]

    # Only the pooled output feeds the classifier head; the per-token
    # sequence output is not used.
    pooled_output, _ = encoder(inputs)

    # Hidden layer with light dropout
    hidden = Dense(units=768, activation='relu')(pooled_output)
    hidden = Dropout(0.1)(hidden)

    # Output layer: one probability per class
    outputs = Dense(2, activation="softmax")(hidden)

    return Model(inputs=inputs, outputs=outputs)

# TF Hub handle of the uncased English BERT (L-12, H-768, A-12) encoder
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"

model = nlp_model(BERT_HUB_URL)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_masks[0][0]            

## Model training


In [14]:
# Create examples for training and testing
SEED = 42            # fixed so the shuffle, and therefore the split, is reproducible
TRAIN_SIZE = 175000  # rows used for training; the remaining rows form the test set

df = df.sample(frac=1, random_state=SEED)  # Shuffle the dataset

# Locate the hub layer by type instead of relying on its position in model.layers.
bert_layer = next(layer for layer in model.layers if isinstance(layer, hub.KerasLayer))
tokenizer = create_tonkenizer(bert_layer)

# NOTE(review): rows look like they share one `Up` label per date, so a random
# split may put news from the same day in both train and test - consider a
# chronological split; confirm against how final.csv was built.
X_train = convert_sentences_to_features(df['News'].iloc[:TRAIN_SIZE], tokenizer)
X_test = convert_sentences_to_features(df['News'].iloc[TRAIN_SIZE:], tokenizer)

# One-hot encode the binary `Up` target, in the same shuffled row order as the features
one_hot_encoded = to_categorical(df['Up'].values)
y_train = one_hot_encoded[:TRAIN_SIZE]
y_test = one_hot_encoded[TRAIN_SIZE:]

100%|██████████| 175000/175000 [00:41<00:00, 4247.56it/s]
100%|██████████| 21875/21875 [00:05<00:00, 4345.93it/s]


In [15]:
# Training hyper-parameters
BATCH_SIZE = 8
EPOCHS = 1

# Adam with a small learning rate, minimising categorical cross-entropy
# over the one-hot targets
opt = Adam(learning_rate=2e-5)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Fit on the training split, evaluating on the held-out split after each epoch
fit_options = dict(
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# Persist the trained model to disk
model.save('nlp_model.h5')



## Analysis of model performance

In [16]:
# Reload the saved model from disk; hub.KerasLayer is supplied as a custom
# object under the name 'KerasLayer' for deserialisation
from tensorflow.keras.models import load_model

hub_custom_objects = {'KerasLayer': hub.KerasLayer}
new_model = load_model('nlp_model.h5', custom_objects=hub_custom_objects)

In [17]:
# Predict on the test dataset: take the class with the highest score per row
from sklearn.metrics import classification_report

test_scores = new_model.predict(X_test)
pred_test = np.argmax(test_scores, axis=1)

In [18]:
# Turn the one-hot test targets back into class indices before scoring
true_test = np.argmax(y_test, axis=1)
print(classification_report(true_test, pred_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      8282
           1       0.62      1.00      0.77     13593

    accuracy                           0.62     21875
   macro avg       0.31      0.50      0.38     21875
weighted avg       0.39      0.62      0.48     21875



  _warn_prf(average, modifier, msg_start, len(result))
