# Predicting Movie Review Sentiment with BERT on TF Hub

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import time
import gc

import utils
import tokenization as tz

# Data

First, let's download the dataset, which is hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this TensorFlow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).

In [None]:
from tensorflow import keras
import os
import re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read every review file in ``directory`` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``. The rating
    captured from the name is stored, as a string, in the ``sentiment``
    column and the file's text in the ``sentence`` column. Directory entries
    whose name does not start with that pattern are skipped rather than
    aborting the whole load.

    Args:
        directory: Path of a local directory containing the review files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence`` and
        ``sentiment``, one row per matching file.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    for file_name in os.listdir(directory):
        matched = name_pattern.match(file_name)
        if matched is None:
            # Not a review file (e.g. a stray hidden file); ignore it.
            continue
        # os.listdir already restricts this function to local paths, so the
        # built-in open() is sufficient. newline="" keeps the text exactly as
        # stored on disk (no newline translation).
        file_path = os.path.join(directory, file_name)
        with open(file_path, "r", encoding="utf-8", newline="") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(matched.group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Build one shuffled DataFrame from the ``pos`` and ``neg`` subfolders.

    Each subfolder is loaded with ``load_directory_data`` and tagged with a
    ``polarity`` column (1 for ``pos``, 0 for ``neg``). The two frames are
    concatenated, all rows are shuffled, and the index is renumbered.

    Args:
        directory: Directory that contains the ``pos`` and ``neg`` subfolders.

    Returns:
        The combined, shuffled ``pandas.DataFrame``.
    """
    labelled_frames = []
    for subfolder, polarity in (("pos", 1), ("neg", 0)):
        frame = load_directory_data(os.path.join(directory, subfolder))
        frame["polarity"] = polarity
        labelled_frames.append(frame)

    combined = pd.concat(labelled_frames)
    # frac=1 draws every row once, i.e. a full shuffle.
    shuffled = combined.sample(frac=1)
    return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the IMDB review archive and load its train and test splits.

    The archive is obtained (and extracted) through Keras' ``get_file``; the
    ``aclImdb/train`` and ``aclImdb/test`` folders located next to the
    returned archive path are then read with ``load_dataset``.

    Args:
        force_download: Accepted by the signature but not read anywhere in
            this function.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    archive_path = tf.compat.v1.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)

    # The extracted "aclImdb" folder sits beside the downloaded archive.
    dataset_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df, test_df = (
        load_dataset(os.path.join(dataset_root, split))
        for split in ("train", "test")
    )
    return train_df, test_df


In [None]:
# Download/extract the IMDB data and load the train and test DataFrames.
train, test = download_and_load_datasets()

To keep training fast, we'll take a random sample of 5,000 examples from each of the train and test sets.

In [None]:
# Keep a random subset of 5000 rows from each split. No random_state is
# passed, so a different subset is drawn on every run.
train = train.sample(5000)
test = test.sample(5000)

In [None]:
# Show the column names of the sampled training DataFrame.
train.columns

Our input data is the 'sentence' column and our label is the 'polarity' column (0 and 1 for negative and positive, respectively).

In [None]:
# Name of the DataFrame column holding the review text.
DATA_COLUMN = 'sentence'
# Name of the DataFrame column holding the 0/1 class label.
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

# Data Preprocessing

We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample` objects using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case is the `sentence` column in our DataFrame.
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example (here the `polarity` value, 0 or 1).

In [None]:
# Use the InputExample class from BERT's run_classifier code to create examples
# from the data (that code was copied into the local `utils` module).

def _row_to_input_example(row):
    """Wrap one DataFrame row as a single-sentence ``utils.InputExample``."""
    return utils.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=row[DATA_COLUMN],
        text_b=None,  # no second sentence for plain classification
        label=row[LABEL_COLUMN],
    )


# axis=1 applies the helper row by row; the result is one InputExample per row.
train_InputExamples = train.apply(_row_to_input_example, axis=1)

test_InputExamples = test.apply(_row_to_input_example, axis=1)

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):


1. Lowercase our text (if we're using a BERT lowercase model)
2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
4. Map our words to indexes using a vocab file that BERT provides
5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

Happily, we don't have to worry about most of these details.




To start, we'll need to load a vocabulary file and build a tokenizer:


In [None]:
# Build the tokenizer from the local vocabulary file 'vocab.txt';
# do_lower_case=True is set because the BERT module used below is an uncased one.
tokenizer=tz.FullTokenizer(
      vocab_file='vocab.txt', do_lower_case=True)

Great--we've loaded BERT's vocab file and configured the tokenizer to lowercase its input (`do_lower_case=True`), which is what the uncased BERT model we're using expects. The tokenizer breaks words into word pieces:

In [None]:
# Quick demo: show the tokens the tokenizer produces for a sample sentence.
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

Using our tokenizer, we'll call `convert_examples_to_features` on our InputExamples to convert them into features BERT understands.

In [None]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
# Both splits use the same label list, length limit and tokenizer.
train_features = utils.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = utils.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)


# Creating a model

Now that we've prepared our data, let's focus on building a model. The `BERT` class below first loads the BERT TF Hub module as a Keras layer. It then adds a dropout layer and a single new dense output layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of starting from a pre-trained model and continuing to train it on a new task is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

In [None]:
class BERT(tf.keras.Model):
    """BERT encoder (TF Hub module) followed by dropout and a sigmoid output unit.

    Args:
        para: Optional dict of hyper-parameters. Recognised key:
            ``'drop_rate'`` - dropout rate applied to BERT's pooled output.
            When ``para`` is ``None`` or has no ``'drop_rate'`` entry, a
            rate of 0.1 is used.
    """

    def __init__(self,para=None):

        super().__init__()

        # The signature allows para=None, so it must not be subscripted
        # unconditionally; fall back to a default rate instead.
        drop_rate=(para or {}).get('drop_rate',0.1)

        # The module is loaded from a local directory. To fetch it from TF Hub
        # instead, pass
        # "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
        # as the first argument.
        self.bert_layer = hub.KerasLayer("bert_en_uncased_L-12_H-768_A-12_2",
                            trainable=True,name='bert_layer')

        self.dp_layer=tf.keras.layers.Dropout(drop_rate)

        # Single sigmoid unit: the output is one value in (0, 1) per example.
        self.task_output=tf.keras.layers.Dense(1,activation=tf.nn.sigmoid,name='task_specific_output_layer')

    def call(self,ft,training=False):
        """Run the model on one batch of features.

        Args:
            ft: Mapping with the entries ``'input_ids'``, ``'input_mask'``
                and ``'segment_ids'``, which are passed to the BERT layer.
            training: Forwarded to the dropout layer; dropout is only
                applied when this is true.

        Returns:
            The output of the sigmoid dense layer applied to BERT's
            (dropout-regularised) pooled output.
        """

        # Only the pooled output is used for classification; the per-token
        # sequence output is ignored.
        pooled_output, sequence_output = self.bert_layer([ft['input_ids'], ft['input_mask'], ft['segment_ids']])

        pooled_output=self.dp_layer(pooled_output,training=training)

        return self.task_output(pooled_output)

In [None]:
# training
para={}
para['drop_rate']=0.1
model=BERT(para)

# build trainset from train_features
input_params={}
input_params['batch_size']=32
train_input_fn=utils.input_fn_builder(train_features,MAX_SEQ_LENGTH,is_training=True,drop_remainder=False)
trainset=train_input_fn(input_params)

step=0
epoch_num=3
# Batches per epoch, rounded up. `n//batch_size+1` would run one batch too
# many whenever n is an exact multiple of the batch size.
epoch_size=(len(train_features)+input_params['batch_size']-1)//input_params['batch_size']

#set loss,optimizer and metric for training
loss=tf.keras.losses.BinaryCrossentropy()
opt=tf.keras.optimizers.Adam(learning_rate=0.00002)
mt=tf.keras.metrics.AUC()  # NOTE(review): created but never updated in this loop

loss_=[]    # history of the mean loss, one entry per `ob` steps
mt_=[]      # NOTE(review): never filled in this loop
ob=3        # report the mean loss every `ob` steps
loss_sum=0

print('Start TRAINING...')
now=time.time()
currdate=time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(now))
print(currdate)
print("Epoch Size: %d"%epoch_size)
for i in range(epoch_num):
    # Start every epoch with a clean step counter and loss window; otherwise
    # losses left over from an incomplete window at the end of one epoch leak
    # into (and inflate) the first average reported in the next epoch.
    step=0
    loss_sum=0
    for ft in trainset:
        with tf.GradientTape() as tape:
            # training=True switches dropout on for fine-tuning; the model's
            # call() defaults to training=False.
            pred=model(ft,training=True)
            cur_loss=loss(ft['label_ids'],pred)
        grads=tape.gradient(cur_loss,model.trainable_variables)
        opt.apply_gradients(zip(grads,model.trainable_variables))

        # Accumulate as a plain Python float so the history holds numbers
        # rather than tensors.
        loss_sum+=float(cur_loss)

        if (step+1)%ob==0:
            loss_.append(loss_sum/ob)
            loss_sum=0
            print("Step %d of Epoch %d......"%(step,i))
            print("Logloss: %.4f"%loss_[-1])

        step+=1

        # The epoch is ended explicitly after epoch_size batches.
        # NOTE(review): presumably the is_training=True dataset repeats
        # indefinitely - confirm against utils.input_fn_builder.
        if step>=epoch_size:
            break
    #save model weights of each epoch
    model.save_weights('save/sw_epoch%d'%i)

print('TRAINING is done!')
now=time.time()
currdate=time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(now))
print(currdate)

Now let's use our test data to see how well our model did:

In [None]:
# Calculate evaluation metrics. 
def metric_fn(label_ids, predicted_labels):
    """Compute binary-classification metrics for a set of predictions.

    Args:
        label_ids: Ground-truth labels.
        predicted_labels: Model outputs for the same examples, passed to the
            Keras metrics unchanged.

    Returns:
        A dict with the keys ``eval_accuracy``, ``auc``, ``precision``,
        ``recall`` and ``f1_score``. ``f1_score`` is 0.0 when precision and
        recall are both zero (instead of the nan produced by 0/0).
    """
    accuracy = tf.keras.metrics.BinaryAccuracy()
    accuracy.update_state(label_ids, predicted_labels)

    auc = tf.keras.metrics.AUC()
    auc.update_state(label_ids, predicted_labels)

    recall = tf.keras.metrics.Recall()
    recall.update_state(label_ids, predicted_labels)
    r=recall.result().numpy()

    precision = tf.keras.metrics.Precision()
    precision.update_state(label_ids, predicted_labels)
    p=precision.result().numpy()

    # F1 is the harmonic mean of precision and recall; guard the degenerate
    # case where both are zero so the result is a number, not nan.
    pr_sum=p+r
    f1=2*p*r/pr_sum if pr_sum>0 else 0.0

    return {
        "eval_accuracy": accuracy.result().numpy(),
        "auc": auc.result().numpy(),
        "precision": p,
        "recall": r,
        "f1_score": f1

    }

In [None]:
#build testset from test_features
input_params={'batch_size':32}
test_input_fn=utils.input_fn_builder(
    test_features,
    MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
testset=test_input_fn(input_params)

# Model outputs and ground-truth labels, collected batch by batch.
predicted_labels=[]
label_ids=[]

#make predictions on testset
step=0
for ft in testset:
    # Progress message every 10 batches.
    if step%10==0:
        print("Step %d ..." % step)
    batch_outputs=model(ft).numpy().tolist()
    batch_labels=ft['label_ids'].numpy().tolist()
    predicted_labels.extend(batch_outputs)
    label_ids.extend(batch_labels)
    step+=1

#evaluation
result=metric_fn(label_ids, predicted_labels)

# Report each metric with four decimals.
print("auc: %.4f"%result['auc'])
print("eval_accuracy: %.4f"%result['eval_accuracy'])
print("f1_score: %.4f"%result['f1_score'])
print("precision: %.4f"%result['precision'])
print("recall: %.4f"%result['recall'])


Now let's write code to make predictions on new sentences:

In [None]:
def getPrediction(in_sentences):
    """Classify new sentences with the trained model.

    Args:
        in_sentences: Iterable of sentence strings to score.

    Returns:
        A list with one ``(sentence, probability, label)`` tuple per input
        sentence, where ``label`` is ``"Positive"`` when the model output is
        greater than 0.5 and ``"Negative"`` otherwise. An empty input yields
        an empty list.
    """
    labels = ["Negative", "Positive"]
    # Materialise once: the sentences are needed both to build the examples
    # and to pair them with the predictions afterwards.
    in_sentences = list(in_sentences)
    if not in_sentences:
        return []

    # guid="" and label=0 are placeholders; the true label is unknown at
    # prediction time and the label is not used to compute the output.
    input_examples = [utils.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences]
    input_features = utils.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

    predict_input_fn = utils.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
    input_params={}
    input_params['batch_size']=32
    newset=predict_input_fn(input_params)

    predicted_probs=[]
    for ft in newset:
        # reshape(-1) instead of squeeze(): squeeze() collapses a batch that
        # holds a single example to a 0-d array, whose tolist() is a bare
        # float and cannot be appended to the list.
        predicted_probs.extend(model(ft).numpy().reshape(-1).tolist())
    return [(sentence, prediction, labels[1] if prediction>0.5 else labels[0]) for sentence, prediction in zip(in_sentences, predicted_probs)]

In [None]:
# Hand-written example sentences used to try out the classifier.
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

In [None]:
# Score the example sentences; each result is a (sentence, probability, label) tuple.
predictions = getPrediction(pred_sentences)

Voila! We have a sentiment classifier!

In [None]:
# Display the predictions as the cell output.
predictions