In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# read cleaned data
df = pd.read_csv("../../data/curated/reviews/yiting_cleaned_reviews.csv")

In [3]:
df.sample(5)

Unnamed: 0,Sentiment,Time,Text,processed_text
3400,0,20/8/21,For the price you cant beet it. It is not to s...,price cant beet sweet enough flavor really tas...
4826,1,24/4/21,Formula itself is great. Works wonders for a b...,formula great work wonder baby spit wtf expens...
4840,1,20/2/21,"It says Habenero peppers on the ingredients, b...",say habenero pepper ingredients guess wave clo...
1084,0,7/3/20,These Quaker Banana Bread bars are so tasty an...,quaker banana bread bar tasty love low calorie...
2074,0,10/8/18,"This was a good purchase. Tastes good, pretty ...",good purchase taste good pretty color seem go ...


some of the sentences in text column are too long. when these sentences are converted to tokens and sent inside the model they are exceeding the 512 seq_length limit of the model, the embedding of the model used in the sentiment-analysis task was trained on 512 tokens embedding.

to fix this issue we can either: 
 
1. filter out the long sentences and keep only smaller ones (with token length < 512 )

2. you can truncate the sentences with truncating = True
```
sentiment = classifier(data.iloc[i,0], truncation=True)
```

In [3]:
df.rename(columns = {'Sentiment' : 'labels', 'processed_text' : 'text'}, inplace = True)
df = df[['text','labels']]
df.head()

Unnamed: 0,text,labels
0,healthy dog food good digestion also good smal...,0
1,please natural balance dog food dog issue dog ...,0
2,educate feline nutrition allow cat become addi...,0
3,holistic vet recommend along brand try cat pre...,0
4,buy coffee much cheaper ganocafe organic reish...,0


In [4]:
arr = df['labels'].values
arr.size

5444

In [5]:
import numpy as np
labels = np.zeros((arr.size, arr.max() + 1), dtype=int)
labels.shape

(5444, 2)

In [6]:
labels[np.arange(arr.size), arr] = 1
labels.shape

(5444, 2)

In [7]:
MAX_LEN = 512

## BertTokenizer

In [8]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#tokens = tokenizer.encode_plus('hello world', max_length=SEQ_LEN, truncation=True, padding="max_length", add_special_tokens=True, return_token_type_ids=False, return_attention_mask=True, return_tensors='tf')
#tokens

Loading BERT tokenizer...


## Preparing data inputs for the model

In [9]:
Xids = np.zeros((len(df), MAX_LEN))
Xmask = np.zeros((len(df), MAX_LEN))
Xids.shape

(5444, 512)

In [10]:
import tensorflow as tf
for i, sequence in enumerate(df['text']):
    #print(sequence, type(sequence)) 
    tokens = tokenizer.encode_plus(str(sequence), max_length=MAX_LEN, truncation=True, padding="max_length", add_special_tokens=True, return_token_type_ids=False, return_attention_mask=True, return_tensors='tf')
    Xids[i, :], Xmask[i, :] = tokens['input_ids'], tokens['attention_mask']

2023-03-19 22:50:09.838318: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-19 22:50:14.331708: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
Xids

array([[  101.,  7965.,  3899., ...,     0.,     0.,     0.],
       [  101.,  3531.,  3019., ...,     0.,     0.,     0.],
       [  101., 16957., 10768., ...,     0.,     0.,     0.],
       ...,
       [  101.,  4067.,  2643., ...,     0.,     0.,     0.],
       [  101.,  4031.,  2204., ...,     0.,     0.,     0.],
       [  101., 11498.,  7446., ...,     0.,     0.,     0.]])

In [12]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [11]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(512,), dtype=float64, numpy=
array([  101.,  7965.,  3899.,  2833.,  2204., 17886.,  3258.,  2036.,
        2204.,  2235., 26781., 13046.,  3899.,  4521.,  5478.,  3815.,
        2296.,  5438.,   102.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0

In [12]:
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks},labels

dataset = dataset.map(map_func)
for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(512,), dtype=float64, numpy=
array([  101.,  7965.,  3899.,  2833.,  2204., 17886.,  3258.,  2036.,
        2204.,  2235., 26781., 13046.,  3899.,  4521.,  5478.,  3815.,
        2296.,  5438.,   102.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,

In [13]:
dataset = dataset.shuffle(100000, reshuffle_each_iteration=False)
for i in dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(512,), dtype=float64, numpy=
array([  101.,  4031.,  2295.,  6450.,  2247.,  2911.,  2190.,  5510.,
        2512.,  7107.,  5510.,  5587.,  2210., 21161.,  5699.,  4392.,
        3085.,  2882.,  3211.,  5104.,  2655.,  3894.,  6501., 12087.,
        2028.,  7427.,  4374.,  3929.,  7744.,  2986.,  9898.,   102.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,

In [14]:
DS_LEN = len(list(dataset))
DS_LEN

5444

## Splitting into train, test and validation set

In [15]:
SPLIT = .8

# take or skip the specified number of batches to split by factor
test = dataset.skip(round(DS_LEN * SPLIT)).batch(64)
trainevalu = dataset.take(round(DS_LEN * SPLIT)) #282

DS_LEN2 = len(list(trainevalu))

train = trainevalu.take(round(DS_LEN2 * SPLIT)).batch(64)
evalu = trainevalu.skip(round(DS_LEN2 * SPLIT)).batch(64)

#del dataset

In [16]:
print (f"test data: {len(test)}")
print (f"train data: {len(train)}, train evalu data: {len(evalu)}")

test data: 35
train data: 109, train evalu data: 28


## Model

In [17]:
from transformers import BertConfig
bertConfig = BertConfig.from_pretrained('bert-base-uncased'
                                        , output_hidden_states=True
                                        , num_labels=2
                                        , max_length=MAX_LEN
                                        )


In [18]:
from transformers import TFBertForSequenceClassification
tranformersPreTrainedModelName = 'bert-base-uncased'
bert = TFBertForSequenceClassification.from_pretrained(tranformersPreTrainedModelName, config=bertConfig)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# build 2 input layers to Bert Model where name needs to match the input values in the dataset
input_ids = tf.keras.Input(shape=(MAX_LEN,), name = 'input_ids', dtype='int32')
mask = tf.keras.Input(shape=(MAX_LEN,), name = 'attention_mask', dtype='int32')

embedings = bert.layers[0](input_ids, attention_mask=mask)[0]

#Original Author: Ferry Djaja
#https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
X = tf.keras.layers.Flatten()(embedings)
X = tf.keras.layers.Dropout(0.5)(X)
y = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids,mask], outputs=y)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [20]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc, tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [21]:
history = model.fit(train, validation_data = evalu, epochs = 2, shuffle=True)

Epoch 1/2
Epoch 2/2


In [23]:
tf.keras.models.save_model(model, 'SA')




INFO:tensorflow:Assets written to: SA/assets


INFO:tensorflow:Assets written to: SA/assets


In [24]:
loaded_model = tf.keras.models.load_model("../../model/BERT_SA")
