In [None]:
# Import necessary libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset registered under the name 'imdb'.
IMDB_DATASET_NAME = 'imdb'
dataset = load_dataset(IMDB_DATASET_NAME)

# Build the BERT tokenizer from the 'bert-base-uncased' checkpoint.
TOKENIZER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module-level tokenizer.

    Truncation is enabled and padding is set to ``'max_length'`` with a
    maximum length of 256, so the tokenizer is asked for fixed-length
    encodings. The function is passed to ``dataset.map(..., batched=True)``
    elsewhere in this script.

    Args:
        examples: Mapping that provides the raw text under the ``'text'`` key.

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 256,  # Reduced sequence length from 512 to 256
    }
    return tokenizer(examples['text'], **encoding_options)

# Run the tokenization function over the dataset in batches.
# NOTE(review): this maps over every split in full even though only 1000 rows
# of 'train' and 'test' are kept below; subsampling before tokenizing would
# likely be cheaper, but would change what `tokenized_datasets` contains.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Shuffle each split with a fixed seed, then keep the first 1000 rows so that
# training and evaluation finish quickly.
shuffled_train_split = tokenized_datasets['train'].shuffle(seed=42)
shuffled_test_split = tokenized_datasets['test'].shuffle(seed=42)
small_train_dataset = shuffled_train_split.select(range(1000))
small_test_dataset = shuffled_test_split.select(range(1000))

# Instantiate BERT with a sequence-classification head configured for two labels,
# starting from the 'bert-base-uncased' checkpoint.
MODEL_CHECKPOINT = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
)

# Set up training arguments with optimizations.
#
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy` and deprecate `evaluation_strategy`; older releases only
# know the latter. Use whichever name the installed TrainingArguments
# dataclass actually declares so the script runs on both.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',                     # Directory to save results
    learning_rate=2e-5,                        # Learning rate
    per_device_train_batch_size=32,            # Increased batch size for faster processing
    per_device_eval_batch_size=128,            # Increased evaluation batch size
    num_train_epochs=3,                        # Number of training epochs
    weight_decay=0.01,                         # Weight decay for regularization
    # Mixed precision speeds up training on a GPU, but requesting it without
    # a CUDA device makes TrainingArguments raise, so enable it only when
    # CUDA is available (CPU-only runs fall back to full precision).
    fp16=torch.cuda.is_available(),
    save_total_limit=1,                        # Limit saved checkpoints
    logging_dir='./logs',                      # Directory for storing logs
    **{_eval_strategy_arg: 'epoch'},           # Evaluate model after each epoch
)

# Wire the model, the training configuration and the two 1000-row subsets
# into a Trainer.
trainer_inputs = {
    'model': model,
    'args': training_args,
    'train_dataset': small_train_dataset,
    'eval_dataset': small_test_dataset,
}
trainer = Trainer(**trainer_inputs)

# Run the training loop.
trainer.train()

# Run evaluation on the evaluation subset and report the returned results.
results = trainer.evaluate()
print("Evaluation Results:", results)

# Persist the model first, then the tokenizer, into the same directory.
SENTIMENT_MODEL_DIR = './sentiment_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(SENTIMENT_MODEL_DIR)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4689 [00:00<?, ?it/s]