## Install Requirements and Import Libraries

In [1]:
# Install requirements

! pip install -r requirements.txt



In [2]:
# Import libraries
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Read cleaned data
df = pd.read_csv("../../data/curated/reviews/yiting_cleaned_reviews.csv")

# Display 5 random samples
df.sample(5)

Unnamed: 0,Sentiment,Time,Text,processed_text
3976,0,4/7/21,My 4 border collies expect one of these each n...,border collie expect one night bed learn impor...
3880,0,4/7/21,The beans came well packaged and in time. We h...,bean come well package time never problem bean...
90,0,25/5/21,"Ironically, I am an Amazon seasonal worker. I...",ironically amazon seasonal worker discover tas...
5,0,7/7/21,"Like the other people mentioned, this coffee h...",like people mention coffee great taste try dif...
2715,0,10/8/18,This particular flavor of Folger's gourmet cof...,particular flavor folger gourmet coffee turn f...


In [4]:
# Re-labelling of columns headers
df.rename(columns = {'Sentiment' : 'labels', 'processed_text' : 'text'}, inplace = True)

# Extracting out the necessary columns
df = df[['text','labels']]

# Display the current dataframe
df.head()

Unnamed: 0,text,labels
0,healthy dog food good digestion also good smal...,0
1,pleased natural balance dog food dog issue dog...,0
2,educate feline nutrition allow cat become addi...,0
3,holistic vet recommend along brand try cat pre...,0
4,buy coffee much cheaper ganocafe organic reish...,0


In [5]:
# Extract labels values to get the size
arr = df['labels'].values
arr.size

5444

In [6]:
# Creating 2D array to indicate which row of data the label belongs to
labels = np.zeros((arr.size, arr.max() + 1), dtype=int)
print(f"Label Shape: {labels.shape}")

# Indicating the label (0 or 1) of the respective row of data
labels[np.arange(arr.size), arr] = 1
print(f"Label Shape: {labels.shape}")

Label Shape: (5444, 2)
Label Shape: (5444, 2)


In [7]:
# Specify max seq length of the model
MAX_LEN = 512

Some of the sentences in the text column are too long. When these sentences are converted to tokens and sent inside the model, they exceed the 512 seq_length limit of the model. This is a problem as the embedding of the model used in the sentiment-analysis task was trained on 512 tokens embedding.

To fix this issue we can either: 
 
1. Filter out the long sentences and keep only smaller ones (with token length < 512)

2. Truncate the sentences with truncating = True
```
sentiment = classifier(data.iloc[i,0], truncation=True)
```

## BertTokenizer

In [8]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  from .autonotebook import tqdm as notebook_tqdm


Loading BERT tokenizer...


## Preparing data inputs for the model

Article: https://betterprogramming.pub/build-a-natural-language-classifier-with-bert-and-tensorflow-4770d4442d41

In [9]:
# Initialise two arrays for input tensors
Xids = np.zeros((len(df), MAX_LEN))
Xmask = np.zeros((len(df), MAX_LEN))
Xids.shape

(5444, 512)

In [10]:
import tensorflow as tf

# For each text in the dataframe...
for i, sequence in enumerate(df['text']):
    
    # Return a dictionary containing the encoded sentence
    tokens = tokenizer.encode_plus(str(sequence), max_length = MAX_LEN, 
                                   truncation = True,               # Needed since there are text seq > 512
                                   padding = "max_length",          # For sentence < 512, padding is applied to reach a length of 512
                                   add_special_tokens = True,       # Mark the start and end of sequences
                                   return_token_type_ids = False, 
                                   return_attention_mask = True, 
                                   return_tensors = 'tf')           # Return TensorFlow object
    
    # Retrieve input_ids and attention_mask
    ### input_ids : list of integers uniquely tied to a specific word
    ### attention_mask : binary tokens indicating which tokens are the actual input tokens and which are padding tokens
    Xids[i, :], Xmask[i, :] = tokens['input_ids'], tokens['attention_mask']

2023-03-27 09:55:40.069375: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-27 09:55:55.852188: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
Xids

array([[  101.,  7965.,  3899., ...,     0.,     0.,     0.],
       [  101.,  7537.,  3019., ...,     0.,     0.,     0.],
       [  101., 16957., 10768., ...,     0.,     0.,     0.],
       ...,
       [  101.,  4067.,  2643., ...,     0.,     0.,     0.],
       [  101.,  4031.,  2204., ...,     0.,     0.,     0.],
       [  101., 11498.,  7446., ...,     0.,     0.,     0.]])

In [12]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [13]:
# Combine arrays into tensorflow object
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# Display one training example
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(512,), dtype=float64, numpy=
array([  101.,  7965.,  3899.,  2833.,  2204., 17886.,  3258.,  2036.,
        2204.,  2235., 17022.,  3899., 20323.,  5478.,  3815.,  2296.,
        8521.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0

In [14]:
# Create function to restructure the dataset
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks},labels

# Apply map method to apply our function above to the dataset
dataset = dataset.map(map_func)
for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(512,), dtype=float64, numpy=
array([  101.,  7965.,  3899.,  2833.,  2204., 17886.,  3258.,  2036.,
        2204.,  2235., 17022.,  3899., 20323.,  5478.,  3815.,  2296.,
        8521.,   102.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,

In [15]:
# Shuffle the data
dataset = dataset.shuffle(100000, reshuffle_each_iteration = False)

# Display one training example
for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(512,), dtype=float64, numpy=
array([  101.,  2293.,  5510.,  2224.,  3769.,  9781.,  2191., 24593.,
       12136.,  2100.,  2139.,  8566.,  6593.,  2732.,  2131.,  4658.,
        7053.,  2143., 13675.,  2483.,  3597.,  2066.,  2460.,  7406.,
        4412.,  2677.,  5223.,  9099.,  6638.,   102.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,

In [16]:
# Obtain length of dataset
DS_LEN = len(list(dataset))
DS_LEN

5444

## Splitting into train, test and validation set

In [17]:
# Specify test-train split
SPLIT = .8

# Take or skip the specified number of batches to split by factor
test = dataset.skip(round(DS_LEN * SPLIT)).batch(32)
trainevalu = dataset.take(round(DS_LEN * SPLIT)) # 282

DS_LEN2 = len(list(trainevalu))

train = trainevalu.take(round(DS_LEN2 * SPLIT)).batch(32)
evalu = trainevalu.skip(round(DS_LEN2 * SPLIT)).batch(32)

# Uncomment the code below to delete dataset and free up disk space
# del dataset

In [18]:
print(f"Test data: {len(test)}")
print(f"Train data: {len(train)}")
print(f"Train evaluation data: {len(evalu)}")

Test data: 35
Train data: 109
Train evaluation data: 28


## Model

In [19]:
from transformers import BertConfig

# Initialise BERT Model
bertConfig = BertConfig.from_pretrained('bert-base-uncased', 
                                        output_hidden_states = True,
                                        num_labels = 2,
                                        max_length = MAX_LEN
                                        )

In [20]:
from transformers import TFBertForSequenceClassification

tranformersPreTrainedModelName = 'bert-base-uncased'
bert = TFBertForSequenceClassification.from_pretrained(tranformersPreTrainedModelName, config = bertConfig)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Build 2 input layers to Bert Model where name needs to match the input values in the dataset
input_ids = tf.keras.Input(shape = (MAX_LEN, ), name = 'input_ids', dtype = 'int32')
mask = tf.keras.Input(shape = (MAX_LEN, ), name = 'attention_mask', dtype = 'int32')

# Consume the last_hidden_state from BERT
embedings = bert.layers[0](input_ids, attention_mask=mask)[0]

# Original Author: Ferry Djaja
# https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
X = tf.keras.layers.Flatten()(embedings)
X = tf.keras.layers.Dropout(0.5)(X)
y = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids,mask], outputs=y)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [22]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc, tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [23]:
history = model.fit(train, validation_data = evalu, epochs = 2, shuffle=True)

Epoch 1/2


: 

: 

tf.keras.models.save_model(model, 'SA')

In [24]:
loaded_model = tf.keras.models.load_model("../../model/BERT_SA")


## Model Parameters

Finding the best model parameters

To refine manual searches

In [None]:
history_2 = model.fit(train, validation_data = evalu, epochs = 3, shuffle=True)

In [None]:
history_3 = model.fit(train, validation_data = evalu, epochs = 4, shuffle=True)