📌 Step 1: Import Required Libraries


In [1]:
%pip install datasets --q
%pip install transformers --q
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


 📌 **Step 2: Select and Load Dataset**  
 In this step, we:
 - Choose a dataset suitable for our binary classification task
 - Load the dataset using the `datasets` library



In [2]:
# Load a smaller dataset for faster training
ds = load_dataset("stanfordnlp/imdb")

In [3]:
df_train = pd.DataFrame(ds["train"])
print(df_train.head())



                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0


📌 **Step 3: Preprocess Data & Handle Class Imbalance**  
 - Check for class imbalance in labels  
 - Tokenize text using a AutoTokenizer from HF  
 

In [4]:
#check df if the classes are balanced
df_train['label'].value_counts()


label
0    12500
1    12500
Name: count, dtype: int64

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function with a reduced max_length
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Shuffle and select a random subset
small_train_dataset = ds["train"].shuffle(seed=42).select(range(12000))
small_test_dataset = ds["test"].shuffle(seed=42).select(range(8000))

# Remove unnecessary columns and format dataset
train_dataset = small_train_dataset.map(tokenize_function, batched=True)
test_dataset = small_test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format("torch")
test_dataset.set_format("torch")


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [6]:
# Extract labels as lists
train_labels = small_train_dataset["label"]
test_labels = small_test_dataset["label"]

# Convert lists to Pandas Series
train_labels_series = pd.Series(train_labels)
test_labels_series = pd.Series(test_labels)

# Use .value_counts() to check class distribution
print("Train Label Distribution:\n", train_labels_series.value_counts())
print("\nTest Label Distribution:\n", test_labels_series.value_counts())


Train Label Distribution:
 0    6017
1    5983
Name: count, dtype: int64

Test Label Distribution:
 1    4012
0    3988
Name: count, dtype: int64


📌 **Step 4: Training Hyperparameters**
Train with HuggingFace Transformers Trainer (PyTorch)

In [7]:
# Load BERT model with dropout enabled
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)


# Define faster training arguments

training_args = TrainingArguments(
    output_dir="./fast_results",
    evaluation_strategy="epoch",  # Evaluates at the end of each epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    fp16=True,  # Enable mixed precision for faster training
    logging_steps=50  # Log training loss every 50 steps
)



# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3152,0.304129
2,0.1949,0.427558
3,0.0405,0.571896


TrainOutput(global_step=2250, training_loss=0.22279253461625842, metrics={'train_runtime': 320.9693, 'train_samples_per_second': 112.16, 'train_steps_per_second': 7.01, 'total_flos': 2368063282176000.0, 'train_loss': 0.22279253461625842, 'epoch': 3.0})

In [8]:
# Prediction function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()  # Convert logits to class predictions (0 or 1)
    
    # No need to convert labels to numpy, it's already a NumPy array
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Assign compute_metrics to Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  
)

# Run model evaluation
eval_results = trainer.evaluate()

# Print results in a readable format
print("\nEvaluation Metrics:")
print(f"Loss: {eval_results['eval_loss']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")  
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")
print(f"F1-score: {eval_results['eval_f1']:.4f}")




Evaluation Metrics:
Loss: 0.5719
Accuracy: 0.8774
Precision: 0.8647
Recall: 0.8956
F1-score: 0.8799


In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define model save path
save_directory = "./my_fine_tuned_model"

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print("Model and tokenizer saved successfully!")


Model and tokenizer saved successfully!


In [14]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
from huggingface_hub import HfApi

repo_name = "Tryfonas/fine-tuned-bert-classifier-bds24"  
api = HfApi()

api.create_repo(repo_name, exist_ok=True)  # Create repo if it doesn't exist

# Upload model
from huggingface_hub import upload_folder

upload_folder(
    folder_path=save_directory,
    repo_id=repo_name,
    commit_message="Uploading fine-tuned model"
)

print(f"Model uploaded to: https://huggingface.co/{repo_name}")


- empty or missing yaml metadata in repo card
