In [35]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
df = pd.read_csv("amazon_reviews_3.csv")
df.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'RATING',
       'VERIFIED_PURCHASE', 'REVIEW_TITLE', 'REVIEW_TEXT', 'NUM_NOUNS',
       'NUM_VERBS', 'NUM_ADJECTIVES', 'NUM_ADVERBS', 'REVIEW_LENGTH',
       'SENTIMENT_SCORE', 'TITLE_LENGTH', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'NUM_REVIEWS', 'READABILITY_FRE', 'SENTIMENT_CATEGORY_ENCODED',
       'RATING_CATEGORY_ENCODED', 'COHERENT_ENCODED', 'AVG_WORD_LENGTH',
       'LABEL_ENCODED', 'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT',
       'PUNCTUATION_COUNT', 'PREPROCESSED_REVIEW_TEXT'],
      dtype='object')

In [22]:
text_features = df['REVIEW_TEXT']
numerical_features = df[[
        'RATING',
       'VERIFIED_PURCHASE', 'NUM_NOUNS',
       'NUM_VERBS', 'NUM_ADJECTIVES', 'NUM_ADVERBS', 'REVIEW_LENGTH',
       'SENTIMENT_SCORE', 'TITLE_LENGTH', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'NUM_REVIEWS', 'READABILITY_FRE', 'SENTIMENT_CATEGORY_ENCODED',
       'RATING_CATEGORY_ENCODED', 'COHERENT_ENCODED', 'AVG_WORD_LENGTH',
       'LABEL_ENCODED', 'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT',
       'PUNCTUATION_COUNT', 'PREPROCESSED_REVIEW_TEXT'
]]
labels = df['LABEL_ENCODED']

In [7]:
#LOAD BERT TOKENIZER
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
#TOKENIZE AND ENCODE THE TEXTUAL FEATURES
text_tokens = tokenizer.batch_encode_plus(
    text_features,
    truncation=True,
    padding=True,
    return_tensors='tf'
)

In [10]:
#CONVERT TOKENISED INPUTSINTO TENSORFLOW TENSORS
input_ids = text_tokens['input_ids']
attention_mask = text_tokens['attention_mask']

In [12]:
num_numerical_features = len(numeric_features)
max_sequence_length = 0
for text in text_features:
    if len(text.split()) > max_sequence_length:
        max_sequence_length = len(text.split())

In [13]:
# DEEP LEARNING MODEL ARCHITECTURE

# INPUT LAYERS FOR NUMERICAL FEATURES AND BERT INPUTS
numerical_input = Input(shape=(num_numerical_features,))
bert_input_ids = Input(shape=(max_sequence_length,), dtype=tf.int32)
bert_attention_mask = Input(shape=(max_sequence_length,), dtype=tf.int32)

In [14]:
# LOAD THE BERT MODEL
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [15]:
# RETRIEVE THE BERT EMBEDDINGS
bert_embeddings = bert_model(bert_input_ids, attention_mask=bert_attention_mask)[0]

In [16]:
#FLATTEN THE BERT EMBEDDINGS
flattened_bert = tf.keras.layers.Flatten()(bert_embeddings)

In [17]:
#CONCATENATE THE NUMERICAL FEATURES AND FLATTENED BERT EMBEDDING
concatenated_features = Concatenate()([numerical_input, flattened_bert])

In [18]:
#ADDITIONAL LAYERS
dense_layer = Dense(128, activation='relu')(concatenated_features)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

In [19]:
#CREATE THE MODEL
model = Model(inputs=[numerical_input, bert_input_ids, bert_attention_mask], outputs=output_layer)

In [20]:
#COMPILE THE MODEL
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
num_epochs = 50
batch_size = 16
#FLOATS ARE COMPATIBLE WITH TENSORFLOW; NOT INTEGERS
labels = np.array(labels, dtype=np.float32)
numerical_features = np.array(labels, dtype=np.float32)

In [36]:
input_ids = pad_sequences(input_ids, maxlen=max_sequence_length, padding='post')
attention_mask = pad_sequences(attention_mask, maxlen=max_sequence_length, padding='post')

In [37]:
#TRAIN THE MODEL WITH: NUMERICAL FEATURES, BERT INPUT AND LABELS
model.fit([numerical_features, input_ids, attention_mask], labels, epochs=num_epochs, batch_size=batch_size, validation_split=0.2)


Epoch 1/50


ValueError: in user code:

    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/backend.py", line 3581, in concatenate
        return tf.concat([to_dense(x) for x in tensors], axis)

    ValueError: Exception encountered when calling layer 'concatenate' (type Concatenate).
    
    Shape must be rank 1 but is rank 2 for '{{node model/concatenate/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](IteratorGetNext, model/flatten/Reshape, model/concatenate/concat/axis)' with input shapes: [16], [16,1038336], [].
    
    Call arguments received by layer 'concatenate' (type Concatenate):
      • inputs=['tf.Tensor(shape=(16,), dtype=float32)', 'tf.Tensor(shape=(16, 1038336), dtype=float32)']
