In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#READ THE DATA FILE
df = pd.read_csv("Datasets/amazon_reviews_3.csv")

In [7]:
textual_features = df['PREPROCESSED_REVIEW_TEXT']
numerical_features = df[['REVIEW_LENGTH', 'VERIFIED_PURCHASE']]
label = df['LABEL_ENCODED']

In [8]:
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    df['PREPROCESSED_REVIEW_TEXT'],
    df[['REVIEW_LENGTH', 'VERIFIED_PURCHASE']],
    df['LABEL_ENCODED'],
    test_size=0.2,
    random_state=42
)

In [9]:
#SCALE NUMERICAL FEATURES
sc = StandardScaler()
X_num_train = sc.fit_transform(X_num_train)
X_num_test = sc.transform(X_num_test)

In [10]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

2023-06-27 13:47:50.498570: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
#LOAD BERT MODEL AND TOKENIZER
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
#INPUT LAYERS OF NEURAL NETWORK
text_input = Input(shape=(None,), dtype=tf.int32, name='text_input')
num_input = Input(shape=(1,), dtype=tf.float32, name='num_input')

In [13]:
#BERT LAYER
bert_output = bert_model(text_input)[0]  # Extract the last hidden state from BERT
text_features = tf.reduce_mean(bert_output, axis=1)  # Average pooling over the tokens

In [14]:
# Concatenate BERT features with numerical features
merged_features = Concatenate()([text_features, num_input])

In [15]:
output = Dense(1, activation='sigmoid')(merged_features)  # Binary classification example


In [16]:
model = Model(inputs=[text_input, num_input], outputs=output)

In [17]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [20]:
model.fit([X_text_train, X_num_train], y_train, epochs=10, batch_size=32)

Epoch 1/10


ValueError: in user code:

    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/kpandey/anaconda3/lib/python3.10/site-packages/keras/engine/input_spec.py", line 280, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'model' (type Functional).
    
    Input 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 769, but received input with shape (32, 770)
    
    Call arguments received by layer 'model' (type Functional):
      • inputs=('tf.Tensor(shape=(32, 1), dtype=string)', 'tf.Tensor(shape=(32, 2), dtype=float32)')
      • training=True
      • mask=None


In [21]:
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [22]:
df = pd.read_csv("Datasets/amazon_reviews_3.csv")

In [23]:
#PREPROCESS THE TEXTUAL DATA
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_reviews = df['REVIEW_TEXT'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
max_length = max(map(len, tokenized_reviews))
padded_reviews = tf.keras.preprocessing.sequence.pad_sequences(tokenized_reviews, maxlen=max_length, padding='post')

Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 512). Running this sequence through the model will result in indexing errors


In [25]:
#PREPROCESS THE NUMERICAL FEATURES
normalized_features = df[['REVIEW_LENGTH', 'VERIFIED_PURCHASE']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [26]:
X_text = padded_reviews
X_num = normalized_features.values
y = df['LABEL_ENCODED']

X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_text, X_num, y, test_size=0.2, random_state=42
)

In [27]:
#BERT MODEL
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [28]:
#INPUT LAYERS
text_input = Input(shape=(max_length,), dtype=tf.int32, name='text_input')
num_input = Input(shape=(2,), dtype=tf.float32, name='num_input')


In [29]:
#BERT EMBEDDING LAYER
bert_output = bert_model(text_input)[0]  # Extract the last hidden state from BERT
text_features = tf.reduce_mean(bert_output, axis=1)  # Average pooling over the tokens

In [30]:
#CONCATENATE TEXTUAL AND NUMERICAL FEATURES
merged_features = Concatenate()([text_features, num_input])

In [35]:
output = Dense(1, activation='softmax')(merged_features)  # Binary classification example


In [36]:
model = Model(inputs=[text_input, num_input], outputs=output)


In [37]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit([X_text_train, X_num_train], y_train, epochs=10, batch_size=32, validation_data=([X_text_test, X_num_test], y_test))


Epoch 1/10


2023-06-27 13:57:16.923362: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: indices[0,512] = 512 is not in [0, 512)
	 [[{{node model_2/tf_bert_model_1/bert/embeddings/Gather_1}}]]
