In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification

In [21]:
df = pd.read_csv("amazon_reviews_3.csv")

In [22]:
labels = df['LABEL_ENCODED'].values

In [23]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [24]:
# Tokenize the review texts
tokenized_inputs = tokenizer(
    df['REVIEW_TEXT'].tolist(),
    truncation=True,
    padding=True,
    max_length=256,  # Adjust the maximum sequence length as needed
    return_tensors='tf'
)

In [25]:
# Create the BERT model
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Set up the model inputs
input_ids = Input(shape=(256,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(256,), dtype=tf.int32, name='attention_mask')

In [27]:
# Call the BERT model on the inputs
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)

# Create the final model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=bert_outputs.logits)

In [28]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [31]:
# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    tokenized_inputs,
    labels,
    test_size=0.2,
    random_state=42
)
# Create TensorFlow Datasets from the training and validation sets
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_labels)).shuffle(100).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels)).batch(32)

ValueError: Found input variables with inconsistent numbers of samples: [3, 21000]

In [29]:
# Convert tokenized inputs to NumPy arrays
input_ids_np = tokenized_inputs['input_ids'].numpy()
attention_mask_np = tokenized_inputs['attention_mask'].numpy()

In [30]:
# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    {'input_ids': input_ids_np, 'attention_mask': attention_mask_np},
    labels,
    test_size=0.2,
    random_state=42
)

# Create TensorFlow Datasets from the training and validation sets
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_labels)).shuffle(100).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels)).batch(32)


ValueError: Found input variables with inconsistent numbers of samples: [2, 21000]

In [None]:
# Train the model
model.fit(
    train_dataset,
    epochs=5,  # Adjust the number of epochs as needed
    validation_data=val_dataset
)