In [None]:
!pip install datasets

In [None]:
import inspect

import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

In [None]:
# Fetch the IMDB dataset from the Hugging Face Hub; the result is indexed
# by split name ("train" / "test") in the cells that follow.
dataset = load_dataset(path="imdb")

In [None]:
# Shuffle each split with a fixed seed so the subsets are reproducible,
# then keep only a small slice to make the demo run quickly.
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)

train_dataset = shuffled_train.select(range(200))  # first 200 shuffled training rows
test_dataset = shuffled_test.select(range(50))  # first 50 shuffled test rows

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"

# Tokenizer and model are loaded independently from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # two output classes
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens
    (``padding="max_length"``), using the module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever ``tokenizer`` returns for that batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

In [None]:
# Apply the tokenizer to both subsets; batched=True hands preprocess_function
# a batch of rows at a time instead of one row per call.
train_dataset = train_dataset.map(
    function=preprocess_function,
    batched=True,
)
test_dataset = test_dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Expose only the columns the model consumes, as torch tensors, on both subsets.
for split in (train_dataset, test_dataset):
    split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "label"],
    )

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
#
# transformers 4.41 renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old name. Pick whichever keyword the installed version
# accepts so the notebook runs on both old and new installs.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most two checkpoints on disk
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    metric_for_best_model="eval_loss",
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

In [None]:
# Wire the model, arguments and data into a Trainer.
#
# transformers 4.46 introduced `processing_class` as the replacement for the
# deprecated `tokenizer` keyword. Use the new name when the installed Trainer
# accepts it and fall back to `tokenizer` on older versions.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the 50-row test subset doubles as the evaluation set
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Run fine-tuning with the settings in `training_args`; as the last expression
# of the cell, the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (the test subset) and print the metrics.
# No compute_metrics callback was given to the Trainer, so expect loss/runtime
# style entries rather than accuracy.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
save_dir = "./full_finetuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Reload the tokenizer and classifier from the directory written by save_pretrained().
finetuned_dir = "./full_finetuned_model"
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(finetuned_dir)

In [None]:
# Three hand-written reviews used as a quick sanity check of the classifier.
positive_review = "The movie was absolutely fantastic! The storyline and acting were top-notch."
negative_review = "I did not enjoy this film. It was boring and poorly made."
mixed_review = "An average movie. Some parts were good, but it lacked overall depth."

examples = [positive_review, negative_review, mixed_review]

In [None]:
# Encode the sample reviews as one batch of PyTorch tensors, padded to the
# longest review in the batch and capped at 128 tokens.
inputs = fine_tuned_tokenizer(
    examples,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

In [None]:
# Forward pass over the tokenized sample reviews; `outputs` is read by the
# following cell via `outputs.logits`.
fine_tuned_model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation for inference
    outputs = fine_tuned_model(**inputs)

In [None]:
# Turn raw logits into per-class probabilities, then take the most likely class per row.
predicted_logits = outputs.logits
predicted_probabilities = predicted_logits.softmax(dim=-1)
predicted_classes = predicted_probabilities.argmax(dim=1)

In [None]:
# Report each review with its predicted sentiment and the probability of that class.
for i, example in enumerate(examples):
    class_id = predicted_classes[i]
    if class_id == 1:
        label = "POSITIVE"
    else:
        label = "NEGATIVE"
    confidence = predicted_probabilities[i][class_id].item()
    print(f"Text: {example}")
    print(f"Prediction: {label} with confidence {confidence:.4f}\n")
