In [86]:
from transformers import TFBertForSequenceClassification, BertTokenizer, create_optimizer
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the labelled examples from train.csv, then hold out 20% of the rows
# for validation with scikit-learn (the remaining 80% are used for training).
# The fixed random_state makes the split repeatable between runs.
df = pd.read_csv("train.csv")
all_texts = df["text"].tolist()
all_labels = df["target"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer for the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def create_dataset(texts, labels, tokenizer, batch_size=16, *, shuffle=False):
    """Tokenize ``texts`` and pack them with ``labels`` into a batched dataset.

    Args:
        texts: List of strings to tokenize.
        labels: One label per entry of ``texts``.
        tokenizer: Called as ``tokenizer(texts, truncation=True, padding=True)``;
            the result must provide ``"input_ids"`` and ``"attention_mask"``
            and may provide ``"token_type_ids"``.
        batch_size: Number of examples per batch.
        shuffle: If true, examples are shuffled before batching and reshuffled
            on every pass over the dataset. Defaults to ``False``, which keeps
            the examples in their input order.

    Returns:
        A batched ``tf.data.Dataset`` whose elements are dicts with the keys
        ``"input_ids"``, ``"attention_mask"``, ``"token_type_ids"`` and
        ``"labels"``.
    """
    tokens = tokenizer(texts, truncation=True, padding=True)
    input_ids = tf.convert_to_tensor(tokens["input_ids"])

    # Only build the all-zero segment ids when the tokenizer did not supply
    # them; zeros_like mirrors the shape and dtype of input_ids directly
    # instead of materialising a nested Python list first.
    token_type_ids = tokens.get("token_type_ids")
    if token_type_ids is None:
        token_type_ids = tf.zeros_like(input_ids)
    else:
        token_type_ids = tf.convert_to_tensor(token_type_ids)

    inputs = {
        "input_ids": input_ids,
        "attention_mask": tf.convert_to_tensor(tokens["attention_mask"]),
        "token_type_ids": token_type_ids,
        "labels": tf.convert_to_tensor(labels),
    }
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    num_examples = len(texts)
    if shuffle and num_examples > 0:
        # A buffer covering every example gives a full shuffle; it has to
        # happen before batching so that batch membership changes as well.
        dataset = dataset.shuffle(
            buffer_size=num_examples, reshuffle_each_iteration=True
        )

    return dataset.batch(batch_size)

# Number of passes over the training data. Defined once because the
# learning-rate schedule length and model.fit() must agree on it.
EPOCHS = 2

# Create TensorFlow datasets
train_dataset = create_dataset(train_texts, train_labels, tokenizer)
val_dataset = create_dataset(val_texts, val_labels, tokenizer)

# Load pretrained BERT model for sequence classification (two output labels)
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# The schedule spans exactly the optimizer steps that fit() will take:
# batches per epoch times the number of epochs, with no warmup.
steps_per_epoch = len(train_dataset)
num_train_steps = steps_per_epoch * EPOCHS
optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=num_train_steps)

# Compile the model. No loss is passed here; the datasets carry a "labels"
# entry, which the model presumably uses to compute its own loss
# (NOTE(review): confirm against the transformers TF model documentation).
model.compile(optimizer=optimizer, metrics=["accuracy"])

# Train the model with validation. Keras ignores fit(shuffle=...) for
# tf.data inputs, so the batch order is reshuffled per epoch explicitly;
# shuffle() keeps the dataset's length, so steps per epoch are unchanged.
model.fit(
    train_dataset.shuffle(max(steps_per_epoch, 1)),
    epochs=EPOCHS,
    validation_data=val_dataset,
)

# Evaluate on validation set
loss, accuracy = model.evaluate(val_dataset)
print(f"Validation Accuracy: {accuracy:.4f}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2
Validation Accuracy: 0.8293
