In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TFBertModel, DataCollatorWithPadding

In [2]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
df = pd.read_csv('review.csv')
df = df.dropna(axis=0, subset=['reviewText'])
df['sentiment'] = df['overall'].map({1:0, 2:0, 3:1, 4:1, 5:1})
df = df[["sentiment", "reviewText"]]

In [4]:
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

In [6]:
def tokenization(review):
    return tokenizer(review["reviewText"], truncation=True)

In [7]:
tok_data = dataset.map(tokenization, batched=True)

  0%|          | 0/50 [00:03<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [8]:
tf_train_data = tok_data["train"].to_tf_dataset(
                            columns=['attention_mask', 'input_ids', 'token_type_ids'],
                            label_cols=['sentiment'],
                            shuffle=False,
                            collate_fn=data_collator,
                            batch_size=8)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
model.compile(optimizer="adam", loss=SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"],)

In [None]:
model.fit(tf_train_data)

   3/6189 [..............................] - ETA: 2220:32:13 - loss: 0.5525 - accuracy: 0.7083

In [15]:
tok_data['train']

Dataset({
    features: ['sentiment', 'reviewText', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 49512
})

In [None]:
sequences = ["I love apostle's message, it is always spirit filled", 
             "Landmark university has good discipline, but the staffs make the school unbearable"]