In [1]:
!pip3 install transformers
!pip3 install datasets



In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures

# Location of the IMDB movie reviews CSV inside the Kaggle input directory.
IMDB_CSV_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"

# Read the reviews into a DataFrame; later cells use its 'review' and
# 'sentiment' columns.
df = pd.read_csv(IMDB_CSV_PATH)



In [3]:
# Map a sentiment label to its numeric class id
def cat2num(value):
    """Return 1 when ``value`` equals the string 'positive', otherwise 0.

    Any other input (including 'negative' and differently-cased spellings)
    is mapped to 0.
    """
    return 1 if value == 'positive' else 0

# Encode the string labels as 0/1, but only while the column is still
# non-numeric. Without this guard, re-running the cell would feed the
# already-encoded 0/1 values back through cat2num, which maps everything that
# is not the string 'positive' to 0 and would silently erase all positive labels.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].apply(cat2num)

# Positional split: the first 45,000 rows are used for training and the
# remaining rows are held out for evaluation.
train = df[:45000]
test = df[45000:]

# Load the BERT sequence-classification model and its tokenizer from the same
# pretrained checkpoint so that vocabulary and weights match.
BERT_CHECKPOINT = "bert-base-uncased"
model = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [4]:
# Turn each row of a DataFrame into an InputExample
def convert_data_to_examples(data, review_col, sentiment_col):
    """Build one ``InputExample`` per row of ``data``.

    Args:
        data: Table whose rows are converted; it is traversed row-wise with
            ``data.apply(..., axis=1)``.
        review_col: Name of the column holding the text (stored as ``text_a``).
        sentiment_col: Name of the column holding the label (stored as ``label``).

    Returns:
        The result of the row-wise ``apply``, i.e. one ``InputExample`` per row.
        Every example is created with ``guid=None``.
    """
    def row_to_example(row):
        return InputExample(guid=None, text_a=row[review_col], label=row[sentiment_col])

    return data.apply(row_to_example, axis=1)

# Convert the training split and the held-out split (in that order) into
# InputExamples using the same text/label columns.
train_examples, validation_examples = (
    convert_data_to_examples(split, 'review', 'sentiment')
    for split in (train, test)
)

In [5]:
# Define a function to convert InputExamples to a TF dataset
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    All examples are tokenized eagerly and kept in memory; the returned
    dataset replays them through a Python generator.

    Args:
        examples: Iterable of objects exposing ``text_a`` (the text to encode)
            and ``label`` attributes.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        An unbatched ``tf.data.Dataset`` whose elements are
        ``(features, label)`` pairs, where ``features`` is a dict with the
        int32 vectors ``input_ids``, ``attention_mask`` and ``token_type_ids``
        and ``label`` is an int64 scalar.
    """
    features = []

    for example in examples:
        input_dict = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated in transformers;
            # `padding="max_length"` is its documented replacement.
            padding="max_length",
            truncation=True,
        )
        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=example.label,
            )
        )

    def generator():
        # Replay the pre-tokenized features in their original order.
        for feature in features:
            yield (
                {
                    "input_ids": feature.input_ids,
                    "attention_mask": feature.attention_mask,
                    "token_type_ids": feature.token_type_ids,
                },
                feature.label,
            )

    # `output_signature` replaces the deprecated positional
    # output_types/output_shapes arguments; dtypes and shapes are unchanged.
    token_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            {
                "input_ids": token_spec,
                "attention_mask": token_spec,
                "token_type_ids": token_spec,
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )

In [6]:
# Build the input pipelines for training and validation.
# Training data: shuffled with a 100-element buffer, grouped into batches of
# 32, and the batched sequence is repeated twice.
# NOTE(review): `fit` is later called with epochs=2 on this already-repeated
# dataset, so presumably each example is seen four times in total - confirm
# this is intended.
train_data = (
    convert_examples_to_tf_dataset(train_examples, tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation data: only batched, so evaluation order stays fixed.
validation_data = convert_examples_to_tf_dataset(validation_examples, tokenizer).batch(32)

# Configure the model for training.
# Adam with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# The loss is told to expect raw logits (from_logits=True) and integer labels.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Registered under the name 'accuracy'.
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

# Position of the accuracy value in the list returned by `model.evaluate`
# (index 1 corresponds to Accuracy in the metrics list).
ACCURACY_INDEX = 1

# Baseline: score the model on the validation set before any fine-tuning.
scores_before_training = model.evaluate(validation_data)
accuracy_before_training = scores_before_training[ACCURACY_INDEX]

# Fine-tune, reporting validation metrics after every epoch.
model.fit(train_data, epochs=2, validation_data=validation_data)

# Score the fine-tuned model on the same validation set.
scores_after_training = model.evaluate(validation_data)
accuracy_after_training = scores_after_training[ACCURACY_INDEX]



Epoch 1/2
Epoch 2/2


In [7]:
# Report validation accuracy before and after fine-tuning, one value per line.
print(
    f'Accuracy Before Training: {accuracy_before_training}',
    f'Accuracy After Training: {accuracy_after_training}',
    sep='\n',
)

Accuracy Before Training: 0.5052000284194946
Accuracy After Training: 0.8820000290870667


In [8]:
# Classify a few unseen sentences with the fine-tuned model
pred_sentences = [
    'worst movie of my life, will never watch movies from this series',
    'Wow, blew my mind, what a movie by Marvel, animation and story is amazing',
]
# Class id -> display name; matches the training encoding (1 = positive, else 0).
labels = ['Negative', 'Positive']

# Tokenize the whole batch at once, padding to the longest sentence in it.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Softmax over the last axis of the first model output (presumably the logits)
# gives per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
# Most probable class id for each sentence, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, predicted in zip(pred_sentences, label):
    print(sentence, ": ", labels[predicted])

worst movie of my life, will never watch movies from this series :  Negative
Wow, blew my mind, what a movie by Marvel, animation and story is amazing :  Positive


In [9]:
# Persist the fine-tuned model under a directory name relative to the current
# working directory.
# NOTE(review): only `model` is saved here; the tokenizer is not written to
# this path, so it has to be obtained separately when the model is reloaded.
model_path = "bert_sentiment_model"
model.save(model_path)