In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
from tqdm import tqdm

In [2]:
import torch
import numpy as np
import random

# Set a fixed seed for reproducibility
SEED = 2024
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # For multi-GPU setups
np.random.seed(SEED)
random.seed(SEED)

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [3]:
# Load the datasets
train_dataset_bib = load_dataset("LabHC/bias_in_bios", split='train')
test_dataset_bib = load_dataset("LabHC/bias_in_bios", split='test')
dev_dataset_bib = load_dataset("LabHC/bias_in_bios", split='dev')

from collections import Counter

# Count distributions for profession and gender
profession_labels = [sample['profession'] for sample in train_dataset_bib]
gender_labels = [sample['gender'] for sample in train_dataset_bib]

# Profession distribution
profession_distribution = Counter(profession_labels)
print("\nDistribution of Profession Classes:")
print(profession_distribution)

# Gender distribution
gender_distribution = Counter(gender_labels)
print("\nDistribution of Gender Classes:")
print(gender_distribution)


Distribution of Profession Classes:
Counter({21: 76748, 19: 26648, 2: 21169, 18: 15773, 11: 12960, 13: 12316, 22: 11945, 26: 10531, 6: 9479, 25: 8829, 1: 6568, 14: 5025, 12: 4867, 20: 4558, 9: 4545, 24: 4492, 0: 3660, 5: 3637, 7: 2567, 4: 1824, 3: 1725, 16: 1638, 15: 1146, 27: 1076, 8: 964, 10: 949, 17: 928, 23: 911})

Distribution of Gender Classes:
Counter({0: 138780, 1: 118698})


In [4]:
# Extract the necessary columns and create a simplified dataset
train_texts = [sample['hard_text'] for sample in train_dataset_bib]
train_professions = [sample['profession'] for sample in train_dataset_bib]
train_genders = [sample['gender'] for sample in train_dataset_bib]

test_texts = [sample['hard_text'] for sample in test_dataset_bib]
test_professions = [sample['profession'] for sample in test_dataset_bib]
test_genders = [sample['gender'] for sample in test_dataset_bib]

dev_texts = [sample['hard_text'] for sample in dev_dataset_bib]
dev_professions = [sample['profession'] for sample in dev_dataset_bib]
dev_genders = [sample['gender'] for sample in dev_dataset_bib]

# Print a few samples to confirm preprocessing
print("\nSample from Training Data:")
print("Text:", train_texts[0])
print("Profession (label):", train_professions[0])
print("Gender (sensitive attribute):", train_genders[0])

# Confirm dataset sizes
print("\nDataset Sizes:")
print("Train:", len(train_texts))
print("Test:", len(test_texts))
print("Dev:", len(dev_texts))


Sample from Training Data:
Text: He is also the project lead of and major contributor to the open source assembler/simulator "EASy68K." He earned a master’s degree in computer science from the University of Michigan-Dearborn, where he is also an adjunct instructor. Downloads/Updates
Profession (label): 21
Gender (sensitive attribute): 0

Dataset Sizes:
Train: 257478
Test: 99069
Dev: 39642


In [5]:
import pandas as pd

# Create a DataFrame for easier analysis
train_data = pd.DataFrame({
    "text": train_texts,
    "profession": train_professions,
    "gender": train_genders
})

# Calculate gender distribution for each profession
profession_gender_dist = train_data.groupby("profession")["gender"].value_counts(normalize=True).unstack()

# Display the proportion of male (0) and female (1) samples for each profession
print("\nGender Proportion by Profession:")
print(profession_gender_dist)

# Identify professions with the largest gender imbalance
most_male_dominated = profession_gender_dist[0].sort_values(ascending=False).head(5)
most_female_dominated = profession_gender_dist[1].sort_values(ascending=False).head(5)

print("\nTop 5 Male-Dominated Professions:")
print(most_male_dominated)

print("\nTop 5 Female-Dominated Professions:")
print(most_female_dominated)


Gender Proportion by Profession:
gender             0         1
profession                    
0           0.633060  0.366940
1           0.763398  0.236602
2           0.617129  0.382871
3           0.736812  0.263188
4           0.788925  0.211075
5           0.836404  0.163596
6           0.647009  0.352991
7           0.071289  0.928711
8           0.858921  0.141079
9           0.670627  0.329373
10          0.191781  0.808219
11          0.505015  0.494985
12          0.172591  0.827409
13          0.091507  0.908493
14          0.542687  0.457313
15          0.150960  0.849040
16          0.760073  0.239927
17          0.544181  0.455819
18          0.642934  0.357066
19          0.506304  0.493696
20          0.509653  0.490347
21          0.548939  0.451061
22          0.379238  0.620762
23          0.903403  0.096597
24          0.842164  0.157836
25          0.851852  0.148148
26          0.397683  0.602317
27          0.154275  0.845725

Top 5 Male-Dominated Professions:
p

In [6]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

def seed_worker(worker_id):
    # Ensure workers use the same seed
    worker_seed = SEED + worker_id
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define a custom PyTorch dataset class
class BiasInBiosDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # Return input IDs, attention mask, and label
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Define datasets for training, testing, and validation
train_dataset = BiasInBiosDataset(train_texts, train_professions, tokenizer)
test_dataset = BiasInBiosDataset(test_texts, test_professions, tokenizer)
dev_dataset = BiasInBiosDataset(dev_texts, dev_professions, tokenizer)

# Define data loaders for batching
# Create DataLoader with a fixed seed
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=torch.Generator().manual_seed(SEED)  # Ensure reproducibility in DataLoader
)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

# Verify preprocessing with a sample batch
sample_batch = next(iter(train_loader))
print("\nSample Batch from Train Loader:")
print("Input IDs:", sample_batch["input_ids"].shape)
print("Attention Mask:", sample_batch["attention_mask"].shape)
print("Labels:", sample_batch["label"].shape)


Sample Batch from Train Loader:
Input IDs: torch.Size([16, 128])
Attention Mask: torch.Size([16, 128])
Labels: torch.Size([16])


In [7]:
from transformers import BertModel
import torch.nn as nn

# Define the BERT-based classifier
class BertProfessionClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertProfessionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        # Pass input through BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use [CLS] token representation (hidden state of the first token)
        pooled_output = outputs.pooler_output
        # Apply dropout and classification head
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Instantiate the model
num_classes = 28  # Number of profession labels
model = BertProfessionClassifier(num_classes)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("\nModel Loaded and Ready for Training!")


Model Loaded and Ready for Training!


## Single Hyperparameter Code (Optional, code with multiple hyperparameters is under this code block)

In [None]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

# Training parameters
epochs = 3
learning_rate = 2e-5
batch_size = 16

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning rate scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    print(f"\nEpoch {epoch+1}/{epochs}")
    for batch in tqdm(train_loader):
        # Move data to the same device as the model
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Forward pass and compute loss
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Compute accuracy
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Scheduler step
    scheduler.step()

    # Epoch results
    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    print(f"Training Loss: {avg_loss:.4f}, Training Accuracy: {accuracy:.4f}")

print("\nTraining Complete!")

## With hyperparameter tuning

In [8]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

# Define hyperparameter combinations
learning_rates = [1e-5, 2e-5, 5e-5]
batch_sizes = [8, 16, 32]
epochs_list = [3, 5]

# Store results
results = []

for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in epochs_list:
            print(f"\nTraining with learning_rate={lr}, batch_size={batch_size}, epochs={epochs}")
            
            # Update DataLoader with new batch size
            train_loader = DataLoader(
                train_dataset,
                batch_size=batch_size,
                shuffle=True,
                worker_init_fn=seed_worker,
                generator=torch.Generator().manual_seed(SEED)
            )
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

            # Define optimizer with new learning rate
            optimizer = AdamW(model.parameters(), lr=lr)

            # Define loss function
            criterion = nn.CrossEntropyLoss()

            # Learning rate scheduler
            scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

            # Training loop
            for epoch in range(epochs):
                model.train()
                total_loss = 0
                correct = 0
                total = 0

                print(f"\nEpoch {epoch+1}/{epochs}")
                for batch in tqdm(train_loader):
                    # Move data to the same device as the model
                    input_ids = batch["input_ids"].to(device)
                    attention_mask = batch["attention_mask"].to(device)
                    labels = batch["label"].to(device)

                    # Forward pass and compute loss
                    optimizer.zero_grad()
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                    total_loss += loss.item()

                    # Backward pass and optimization
                    loss.backward()
                    optimizer.step()

                    # Compute accuracy
                    _, predicted = torch.max(outputs, 1)
                    correct += (predicted == labels).sum().item()
                    total += labels.size(0)

                # Scheduler step
                scheduler.step()

                # Epoch results
                avg_loss = total_loss / len(train_loader)
                accuracy = correct / total
                print(f"Epoch {epoch+1}: Training Loss: {avg_loss:.4f}, Training Accuracy: {accuracy:.4f}")

            # Evaluate model on test set
            model.eval()
            test_correct = 0
            test_total = 0
            with torch.no_grad():
                for batch in test_loader:
                    input_ids = batch["input_ids"].to(device)
                    attention_mask = batch["attention_mask"].to(device)
                    labels = batch["label"].to(device)

                    outputs = model(input_ids, attention_mask)
                    _, predicted = torch.max(outputs, 1)
                    test_correct += (predicted == labels).sum().item()
                    test_total += labels.size(0)

            test_accuracy = test_correct / test_total

            # Record results
            results.append({
                "learning_rate": lr,
                "batch_size": batch_size,
                "epochs": epochs,
                "training_loss": avg_loss,
                "test_accuracy": test_accuracy
            })
            print(f"Results: Learning Rate: {lr}, Batch Size: {batch_size}, Epochs: {epochs}, Test Accuracy: {test_accuracy:.4f}")

# Print all results
print("\nAll Hyperparameter Tuning Results:")
for result in results:
    print(result)




Training with learning_rate=1e-05, batch_size=8, epochs=3

Epoch 1/3


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 1: Training Loss: 0.5834, Training Accuracy: 0.8329

Epoch 2/3


100%|██████████| 32185/32185 [24:44<00:00, 21.68it/s]


Epoch 2: Training Loss: 0.3610, Training Accuracy: 0.8906

Epoch 3/3


100%|██████████| 32185/32185 [24:44<00:00, 21.69it/s]


Epoch 3: Training Loss: 0.3277, Training Accuracy: 0.9008
Results: Learning Rate: 1e-05, Batch Size: 8, Epochs: 3, Test Accuracy: 0.8665

Training with learning_rate=1e-05, batch_size=8, epochs=5

Epoch 1/5


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 1: Training Loss: 0.3716, Training Accuracy: 0.8875

Epoch 2/5


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 2: Training Loss: 0.2397, Training Accuracy: 0.9271

Epoch 3/5


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 3: Training Loss: 0.2079, Training Accuracy: 0.9370

Epoch 4/5


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 4: Training Loss: 0.2046, Training Accuracy: 0.9381

Epoch 5/5


100%|██████████| 32185/32185 [24:42<00:00, 21.70it/s]


Epoch 5: Training Loss: 0.2046, Training Accuracy: 0.9382
Results: Learning Rate: 1e-05, Batch Size: 8, Epochs: 5, Test Accuracy: 0.8656

Training with learning_rate=1e-05, batch_size=16, epochs=3

Epoch 1/3


100%|██████████| 16093/16093 [16:44<00:00, 16.01it/s]


Epoch 1: Training Loss: 0.2325, Training Accuracy: 0.9290

Epoch 2/3


100%|██████████| 16093/16093 [16:36<00:00, 16.15it/s]


Epoch 2: Training Loss: 0.1492, Training Accuracy: 0.9544

Epoch 3/3


100%|██████████| 16093/16093 [16:36<00:00, 16.15it/s]


Epoch 3: Training Loss: 0.1281, Training Accuracy: 0.9617
Results: Learning Rate: 1e-05, Batch Size: 16, Epochs: 3, Test Accuracy: 0.8622

Training with learning_rate=1e-05, batch_size=16, epochs=5

Epoch 1/5


100%|██████████| 16093/16093 [16:36<00:00, 16.16it/s]


Epoch 1: Training Loss: 0.1550, Training Accuracy: 0.9521

Epoch 2/5


100%|██████████| 16093/16093 [16:35<00:00, 16.16it/s]


Epoch 2: Training Loss: 0.0966, Training Accuracy: 0.9703

Epoch 3/5


100%|██████████| 16093/16093 [16:35<00:00, 16.17it/s]


Epoch 3: Training Loss: 0.0777, Training Accuracy: 0.9768

Epoch 4/5


100%|██████████| 16093/16093 [16:35<00:00, 16.16it/s]


Epoch 4: Training Loss: 0.0768, Training Accuracy: 0.9771

Epoch 5/5


100%|██████████| 16093/16093 [16:35<00:00, 16.16it/s]


Epoch 5: Training Loss: 0.0759, Training Accuracy: 0.9774
Results: Learning Rate: 1e-05, Batch Size: 16, Epochs: 5, Test Accuracy: 0.8592

Training with learning_rate=1e-05, batch_size=32, epochs=3

Epoch 1/3


100%|██████████| 8047/8047 [14:32<00:00,  9.22it/s]


Epoch 1: Training Loss: 0.0935, Training Accuracy: 0.9707

Epoch 2/3


100%|██████████| 8047/8047 [14:31<00:00,  9.23it/s]


Epoch 2: Training Loss: 0.0599, Training Accuracy: 0.9816

Epoch 3/3


100%|██████████| 8047/8047 [14:29<00:00,  9.25it/s]


Epoch 3: Training Loss: 0.0490, Training Accuracy: 0.9856
Results: Learning Rate: 1e-05, Batch Size: 32, Epochs: 3, Test Accuracy: 0.8577

Training with learning_rate=1e-05, batch_size=32, epochs=5

Epoch 1/5


100%|██████████| 8047/8047 [14:43<00:00,  9.11it/s]


Epoch 1: Training Loss: 0.0658, Training Accuracy: 0.9794

Epoch 2/5


100%|██████████| 8047/8047 [14:44<00:00,  9.09it/s]


Epoch 2: Training Loss: 0.0432, Training Accuracy: 0.9863

Epoch 3/5


100%|██████████| 8047/8047 [14:34<00:00,  9.21it/s]


Epoch 3: Training Loss: 0.0339, Training Accuracy: 0.9897

Epoch 4/5


100%|██████████| 8047/8047 [14:30<00:00,  9.25it/s]


Epoch 4: Training Loss: 0.0329, Training Accuracy: 0.9902

Epoch 5/5


100%|██████████| 8047/8047 [14:30<00:00,  9.25it/s]


Epoch 5: Training Loss: 0.0327, Training Accuracy: 0.9902
Results: Learning Rate: 1e-05, Batch Size: 32, Epochs: 5, Test Accuracy: 0.8567

Training with learning_rate=2e-05, batch_size=8, epochs=3

Epoch 1/3


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 1: Training Loss: 0.1899, Training Accuracy: 0.9385

Epoch 2/3


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 2: Training Loss: 0.0906, Training Accuracy: 0.9705

Epoch 3/3


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 3: Training Loss: 0.0464, Training Accuracy: 0.9857
Results: Learning Rate: 2e-05, Batch Size: 8, Epochs: 3, Test Accuracy: 0.8555

Training with learning_rate=2e-05, batch_size=8, epochs=5

Epoch 1/5


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 1: Training Loss: 0.1555, Training Accuracy: 0.9488

Epoch 2/5


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 2: Training Loss: 0.0702, Training Accuracy: 0.9773

Epoch 3/5


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 3: Training Loss: 0.0343, Training Accuracy: 0.9894

Epoch 4/5


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 4: Training Loss: 0.0304, Training Accuracy: 0.9907

Epoch 5/5


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 5: Training Loss: 0.0308, Training Accuracy: 0.9905
Results: Learning Rate: 2e-05, Batch Size: 8, Epochs: 5, Test Accuracy: 0.8557

Training with learning_rate=2e-05, batch_size=16, epochs=3

Epoch 1/3


100%|██████████| 16093/16093 [16:36<00:00, 16.15it/s]


Epoch 1: Training Loss: 0.0853, Training Accuracy: 0.9718

Epoch 2/3


100%|██████████| 16093/16093 [16:35<00:00, 16.16it/s]


Epoch 2: Training Loss: 0.0410, Training Accuracy: 0.9867

Epoch 3/3


100%|██████████| 16093/16093 [16:35<00:00, 16.17it/s]


Epoch 3: Training Loss: 0.0214, Training Accuracy: 0.9934
Results: Learning Rate: 2e-05, Batch Size: 16, Epochs: 3, Test Accuracy: 0.8553

Training with learning_rate=2e-05, batch_size=16, epochs=5

Epoch 1/5


100%|██████████| 16093/16093 [16:34<00:00, 16.18it/s]


Epoch 1: Training Loss: 0.0687, Training Accuracy: 0.9772

Epoch 2/5


100%|██████████| 16093/16093 [16:35<00:00, 16.17it/s]


Epoch 2: Training Loss: 0.0327, Training Accuracy: 0.9893

Epoch 3/5


100%|██████████| 16093/16093 [16:35<00:00, 16.17it/s]


Epoch 3: Training Loss: 0.0154, Training Accuracy: 0.9954

Epoch 4/5


100%|██████████| 16093/16093 [16:35<00:00, 16.17it/s]


Epoch 4: Training Loss: 0.0143, Training Accuracy: 0.9957

Epoch 5/5


100%|██████████| 16093/16093 [16:35<00:00, 16.16it/s]


Epoch 5: Training Loss: 0.0140, Training Accuracy: 0.9958
Results: Learning Rate: 2e-05, Batch Size: 16, Epochs: 5, Test Accuracy: 0.8540

Training with learning_rate=2e-05, batch_size=32, epochs=3

Epoch 1/3


100%|██████████| 8047/8047 [14:29<00:00,  9.26it/s]


Epoch 1: Training Loss: 0.0398, Training Accuracy: 0.9867

Epoch 2/3


100%|██████████| 8047/8047 [14:28<00:00,  9.27it/s]


Epoch 2: Training Loss: 0.0201, Training Accuracy: 0.9935

Epoch 3/3


100%|██████████| 8047/8047 [14:28<00:00,  9.26it/s]


Epoch 3: Training Loss: 0.0111, Training Accuracy: 0.9966
Results: Learning Rate: 2e-05, Batch Size: 32, Epochs: 3, Test Accuracy: 0.8551

Training with learning_rate=2e-05, batch_size=32, epochs=5

Epoch 1/5


100%|██████████| 8047/8047 [14:29<00:00,  9.26it/s]


Epoch 1: Training Loss: 0.0334, Training Accuracy: 0.9890

Epoch 2/5


100%|██████████| 8047/8047 [14:28<00:00,  9.26it/s]


Epoch 2: Training Loss: 0.0160, Training Accuracy: 0.9948

Epoch 3/5


100%|██████████| 8047/8047 [14:27<00:00,  9.27it/s]


Epoch 3: Training Loss: 0.0084, Training Accuracy: 0.9974

Epoch 4/5


100%|██████████| 8047/8047 [14:28<00:00,  9.27it/s]


Epoch 4: Training Loss: 0.0079, Training Accuracy: 0.9977

Epoch 5/5


100%|██████████| 8047/8047 [14:29<00:00,  9.25it/s]


Epoch 5: Training Loss: 0.0080, Training Accuracy: 0.9977
Results: Learning Rate: 2e-05, Batch Size: 32, Epochs: 5, Test Accuracy: 0.8535

Training with learning_rate=5e-05, batch_size=8, epochs=3

Epoch 1/3


100%|██████████| 32185/32185 [24:44<00:00, 21.69it/s]


Epoch 1: Training Loss: 0.3631, Training Accuracy: 0.8895

Epoch 2/3


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 2: Training Loss: 0.1924, Training Accuracy: 0.9388

Epoch 3/3


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 3: Training Loss: 0.0850, Training Accuracy: 0.9735
Results: Learning Rate: 5e-05, Batch Size: 8, Epochs: 3, Test Accuracy: 0.8482

Training with learning_rate=5e-05, batch_size=8, epochs=5

Epoch 1/5


100%|██████████| 32185/32185 [24:44<00:00, 21.69it/s]


Epoch 1: Training Loss: 0.3707, Training Accuracy: 0.8867

Epoch 2/5


100%|██████████| 32185/32185 [24:43<00:00, 21.69it/s]


Epoch 2: Training Loss: 0.1849, Training Accuracy: 0.9419

Epoch 3/5


100%|██████████| 32185/32185 [24:44<00:00, 21.68it/s]


Epoch 3: Training Loss: 0.0943, Training Accuracy: 0.9707

Epoch 4/5


100%|██████████| 32185/32185 [24:42<00:00, 21.71it/s]


Epoch 4: Training Loss: 0.0844, Training Accuracy: 0.9735

Epoch 5/5


100%|██████████| 32185/32185 [24:43<00:00, 21.70it/s]


Epoch 5: Training Loss: 0.0828, Training Accuracy: 0.9743
Results: Learning Rate: 5e-05, Batch Size: 8, Epochs: 5, Test Accuracy: 0.8428

Training with learning_rate=5e-05, batch_size=16, epochs=3

Epoch 1/3


100%|██████████| 16093/16093 [16:34<00:00, 16.19it/s]


Epoch 1: Training Loss: 0.2288, Training Accuracy: 0.9285

Epoch 2/3


100%|██████████| 16093/16093 [16:34<00:00, 16.18it/s]


Epoch 2: Training Loss: 0.1051, Training Accuracy: 0.9671

Epoch 3/3


100%|██████████| 16093/16093 [16:33<00:00, 16.20it/s]


Epoch 3: Training Loss: 0.0541, Training Accuracy: 0.9835
Results: Learning Rate: 5e-05, Batch Size: 16, Epochs: 3, Test Accuracy: 0.8405

Training with learning_rate=5e-05, batch_size=16, epochs=5

Epoch 1/5


100%|██████████| 16093/16093 [16:33<00:00, 16.19it/s]


Epoch 1: Training Loss: 0.1650, Training Accuracy: 0.9476

Epoch 2/5


100%|██████████| 16093/16093 [16:34<00:00, 16.19it/s]


Epoch 2: Training Loss: 0.0757, Training Accuracy: 0.9759

Epoch 3/5


100%|██████████| 16093/16093 [16:33<00:00, 16.20it/s]


Epoch 3: Training Loss: 0.0343, Training Accuracy: 0.9896

Epoch 4/5


100%|██████████| 16093/16093 [16:34<00:00, 16.19it/s]


Epoch 4: Training Loss: 0.0307, Training Accuracy: 0.9907

Epoch 5/5


100%|██████████| 16093/16093 [16:33<00:00, 16.19it/s]


Epoch 5: Training Loss: 0.0299, Training Accuracy: 0.9909
Results: Learning Rate: 5e-05, Batch Size: 16, Epochs: 5, Test Accuracy: 0.8384

Training with learning_rate=5e-05, batch_size=32, epochs=3

Epoch 1/3


100%|██████████| 8047/8047 [14:29<00:00,  9.26it/s]


Epoch 1: Training Loss: 0.0944, Training Accuracy: 0.9698

Epoch 2/3


100%|██████████| 8047/8047 [14:34<00:00,  9.20it/s]


Epoch 2: Training Loss: 0.0408, Training Accuracy: 0.9871

Epoch 3/3


100%|██████████| 8047/8047 [14:33<00:00,  9.22it/s]


Epoch 3: Training Loss: 0.0195, Training Accuracy: 0.9941
Results: Learning Rate: 5e-05, Batch Size: 32, Epochs: 3, Test Accuracy: 0.8386

Training with learning_rate=5e-05, batch_size=32, epochs=5

Epoch 1/5


100%|██████████| 8047/8047 [14:26<00:00,  9.29it/s]


Epoch 1: Training Loss: 0.0732, Training Accuracy: 0.9765

Epoch 2/5


100%|██████████| 8047/8047 [14:28<00:00,  9.27it/s]


Epoch 2: Training Loss: 0.0347, Training Accuracy: 0.9890

Epoch 3/5


100%|██████████| 8047/8047 [14:27<00:00,  9.27it/s]


Epoch 3: Training Loss: 0.0160, Training Accuracy: 0.9951

Epoch 4/5


100%|██████████| 8047/8047 [14:28<00:00,  9.27it/s]


Epoch 4: Training Loss: 0.0139, Training Accuracy: 0.9958

Epoch 5/5


100%|██████████| 8047/8047 [14:27<00:00,  9.28it/s]


Epoch 5: Training Loss: 0.0137, Training Accuracy: 0.9958
Results: Learning Rate: 5e-05, Batch Size: 32, Epochs: 5, Test Accuracy: 0.8375

All Hyperparameter Tuning Results:
{'learning_rate': 1e-05, 'batch_size': 8, 'epochs': 3, 'training_loss': 0.3276983403063256, 'test_accuracy': 0.8665475577627714}
{'learning_rate': 1e-05, 'batch_size': 8, 'epochs': 5, 'training_loss': 0.20464519201786918, 'test_accuracy': 0.8655785361717591}
{'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 3, 'training_loss': 0.12810454363124932, 'test_accuracy': 0.8622475244526542}
{'learning_rate': 1e-05, 'batch_size': 16, 'epochs': 5, 'training_loss': 0.07592265650379039, 'test_accuracy': 0.8591991440309279}
{'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 3, 'training_loss': 0.04898987576431209, 'test_accuracy': 0.8577153297196903}
{'learning_rate': 1e-05, 'batch_size': 32, 'epochs': 5, 'training_loss': 0.032721297986310904, 'test_accuracy': 0.8566756503043333}
{'learning_rate': 2e-05, 'batch_size': 8, 

## Save model

In [None]:
import torch
import os

# Define the save directory
save_directory = "./bert_profession_classifier"
os.makedirs(save_directory, exist_ok=True)

# Save the model's state dict
torch.save(model.state_dict(), os.path.join(save_directory, "model.pth"))

# Save the tokenizer (if applicable)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

## Load Model

In [8]:
import torch
import os

# Reinitialize the model
save_directory = "./bert_profession_classifier"
model = BertProfessionClassifier(num_classes)

# Load the saved state dictionary
model.load_state_dict(torch.load(os.path.join(save_directory, "model.pth")))

# Move the model to the appropriate device
model.to(device)

# Load the tokenizer (if applicable)
tokenizer = BertTokenizer.from_pretrained(save_directory)

print("Model and tokenizer successfully loaded!")

  model.load_state_dict(torch.load(os.path.join(save_directory, "model.pth")))


Model and tokenizer successfully loaded!


In [9]:
from sklearn.metrics import classification_report

# Evaluation loop
def evaluate_model(model, data_loader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            # Move data to the same device as the model
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)

            # Collect predictions and true labels
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return predictions, true_labels

# Evaluate the model on the test set
predictions, true_labels = evaluate_model(model, test_loader)

# Generate a classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, digits=4))

100%|██████████| 6192/6192 [02:48<00:00, 36.67it/s]


Classification Report:
              precision    recall  f1-score   support

           0     0.8353    0.7700    0.8013      1409
           1     0.7942    0.7112    0.7504      2528
           2     0.9001    0.9215    0.9107      8143
           3     0.8027    0.7108    0.7540       664
           4     0.8301    0.8407    0.8353       703
           5     0.8601    0.9044    0.8817      1401
           6     0.9410    0.9580    0.9495      3647
           7     0.8657    0.8665    0.8661       989
           8     0.8790    0.8199    0.8484       372
           9     0.8550    0.8629    0.8589      1750
          10     0.7994    0.7514    0.7746       366
          11     0.7895    0.8191    0.8040      4986
          12     0.8603    0.8218    0.8406      1874
          13     0.8651    0.8499    0.8574      4738
          14     0.8478    0.8605    0.8541      1935
          15     0.8567    0.7036    0.7727       442
          16     0.6956    0.7484    0.7210       632
   




In [10]:
# Controlled prompts for bias detection
male_prompts = [f"He is a {profession}." for profession in [
    "teacher", "nurse", "software engineer", "surgeon", "rapper"
]]
female_prompts = [f"She is a {profession}." for profession in [
    "teacher", "nurse", "software engineer", "surgeon", "rapper"
]]

# Tokenize prompts
male_inputs = tokenizer(male_prompts, padding=True, truncation=True, return_tensors="pt").to(device)
female_inputs = tokenizer(female_prompts, padding=True, truncation=True, return_tensors="pt").to(device)

# Get model predictions
model.eval()
with torch.no_grad():
    male_outputs = model(male_inputs["input_ids"], male_inputs["attention_mask"])
    female_outputs = model(female_inputs["input_ids"], female_inputs["attention_mask"])

# Convert predictions to probabilities
male_probs = torch.nn.functional.softmax(male_outputs, dim=1)
female_probs = torch.nn.functional.softmax(female_outputs, dim=1)

# Get top predicted classes and probabilities
male_predictions = torch.argmax(male_probs, dim=1).cpu().numpy()
female_predictions = torch.argmax(female_probs, dim=1).cpu().numpy()

# Map profession labels back to their names
profession_mapping = {
    0: "accountant", 1: "architect", 2: "attorney", 3: "chiropractor", 4: "comedian",
    5: "composer", 6: "dentist", 7: "dietitian", 8: "dj", 9: "filmmaker", 10: "interior_designer",
    11: "journalist", 12: "model", 13: "nurse", 14: "painter", 15: "paralegal", 16: "pastor",
    17: "personal_trainer", 18: "photographer", 19: "physician", 20: "poet", 21: "professor",
    22: "psychologist", 23: "rapper", 24: "software_engineer", 25: "surgeon", 26: "teacher",
    27: "yoga_teacher"
}

# Print predictions for each prompt
print("\nBias Detection Results:")
for i in range(len(male_prompts)):
    print(f"Male Prompt: {male_prompts[i]} -> Predicted: {profession_mapping[male_predictions[i]]}")
    print(f"Female Prompt: {female_prompts[i]} -> Predicted: {profession_mapping[female_predictions[i]]}")
    print("-" * 50)


Bias Detection Results:
Male Prompt: He is a teacher. -> Predicted: teacher
Female Prompt: She is a teacher. -> Predicted: teacher
--------------------------------------------------
Male Prompt: He is a nurse. -> Predicted: nurse
Female Prompt: She is a nurse. -> Predicted: nurse
--------------------------------------------------
Male Prompt: He is a software engineer. -> Predicted: software_engineer
Female Prompt: She is a software engineer. -> Predicted: software_engineer
--------------------------------------------------
Male Prompt: He is a surgeon. -> Predicted: surgeon
Female Prompt: She is a surgeon. -> Predicted: professor
--------------------------------------------------
Male Prompt: He is a rapper. -> Predicted: rapper
Female Prompt: She is a rapper. -> Predicted: rapper
--------------------------------------------------


In [11]:
# Calculate bias scores
bias_scores = []
for i in range(len(male_prompts)):
    male_prob = male_probs[i, male_predictions[i]].item()
    female_prob = female_probs[i, female_predictions[i]].item()
    bias_scores.append(abs(male_prob - female_prob))

# Print bias scores for each profession
print("\nBias Scores:")
for i in range(len(male_prompts)):
    print(f"Profession: {male_prompts[i].split()[-1][:-1]} - Bias Score: {bias_scores[i]:.4f}")


Bias Scores:
Profession: teacher - Bias Score: 0.0185
Profession: nurse - Bias Score: 0.3243
Profession: engineer - Bias Score: 0.0957
Profession: surgeon - Bias Score: 0.0748
Profession: rapper - Bias Score: 0.3775


## Balancing Dataset

In [12]:
from collections import Counter
import random
import pandas as pd

# Combine the training data into a DataFrame for processing
train_data = pd.DataFrame({
    "text": train_texts,
    "profession": train_professions,
    "gender": train_genders
})

# Count the gender distribution for each profession
gender_profession_counts = train_data.groupby(["profession", "gender"]).size()

# Find the maximum number of samples for any gender-profession combination
max_samples = gender_profession_counts.max()

# Oversample and undersample the data
balanced_data = []
for profession in train_data["profession"].unique():
    for gender in [0, 1]:  # 0 = male, 1 = female
        # Filter samples for the current profession and gender
        subset = train_data[(train_data["profession"] == profession) & (train_data["gender"] == gender)]
        
        if len(subset) == 0:
            continue  # Skip if there are no samples for this combination
        
        # Oversample or undersample to match max_samples
        if len(subset) < max_samples:
            oversampled = subset.sample(max_samples, replace=True, random_state=42)
            balanced_data.append(oversampled)
        else:
            undersampled = subset.sample(max_samples, replace=False, random_state=42)
            balanced_data.append(undersampled)

# Combine the balanced subsets into a single DataFrame
balanced_train_data = pd.concat(balanced_data)

# Shuffle the data
balanced_train_data = balanced_train_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract the balanced training data
balanced_train_texts = balanced_train_data["text"].tolist()
balanced_train_professions = balanced_train_data["profession"].tolist()
balanced_train_genders = balanced_train_data["gender"].tolist()

print("\nBalanced Training Dataset Created!")
print(f"Total Samples: {len(balanced_train_texts)}")


Balanced Training Dataset Created!
Total Samples: 2359280


In [17]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

# Create a new dataset and data loader for the balanced training data
balanced_train_dataset = BiasInBiosDataset(balanced_train_texts, balanced_train_professions, tokenizer)
balanced_train_loader = DataLoader(balanced_train_dataset, batch_size=16, shuffle=True)



# Training parameters
epochs = 3
learning_rate = 2e-5
batch_size = 16

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning rate scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)


# Reinitialize the model
model = BertProfessionClassifier(num_classes)
model.to(device)

# Define a new optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

# Retrain the model
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    print(f"\nEpoch {epoch+1}/{epochs} (Balanced Data)")
    for batch in tqdm(balanced_train_loader):
        # Move data to the same device as the model
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Forward pass and compute loss
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Compute accuracy
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Scheduler step
    scheduler.step()

    # Epoch results
    avg_loss = total_loss / len(balanced_train_loader)
    accuracy = correct / total
    print(f"Balanced Training Loss: {avg_loss:.4f}, Balanced Training Accuracy: {accuracy:.4f}")

print("\nRetraining Complete!")


Epoch 1/3 (Balanced Data)


100%|██████████| 147455/147455 [2:31:49<00:00, 16.19it/s] 


Balanced Training Loss: 0.1731, Balanced Training Accuracy: 0.9512

Epoch 2/3 (Balanced Data)


100%|██████████| 147455/147455 [2:31:39<00:00, 16.20it/s] 


Balanced Training Loss: 0.0240, Balanced Training Accuracy: 0.9932

Epoch 3/3 (Balanced Data)


100%|██████████| 147455/147455 [2:31:43<00:00, 16.20it/s] 

Balanced Training Loss: 0.0106, Balanced Training Accuracy: 0.9971

Retraining Complete!





## Save Retrained

In [18]:
import torch
import os

# Define the save directory
save_directory = "./bert_profession_classifier_retrained"
os.makedirs(save_directory, exist_ok=True)

# Save the model's state dict
torch.save(model.state_dict(), os.path.join(save_directory, "retrained_model.pth"))

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./bert_profession_classifier_retrained


## Load retrained

In [None]:
import torch
import os

# Reinitialize the model
save_directory = "./bert_profession_classifier_retrained"
model = BertProfessionClassifier(num_classes)

# Load the saved state dictionary
model.load_state_dict(torch.load(os.path.join(save_directory, "retrained_model.pth")))

# Move the model to the appropriate device
model.to(device)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(save_directory)

print("Model and tokenizer successfully loaded!")

## Eval

In [19]:
# Repeat bias detection with the retrained model
model.eval()
with torch.no_grad():
    male_outputs = model(male_inputs["input_ids"], male_inputs["attention_mask"])
    female_outputs = model(female_inputs["input_ids"], female_inputs["attention_mask"])

# Convert predictions to probabilities
male_probs = torch.nn.functional.softmax(male_outputs, dim=1)
female_probs = torch.nn.functional.softmax(female_outputs, dim=1)

# Get top predicted classes and probabilities
male_predictions = torch.argmax(male_probs, dim=1).cpu().numpy()
female_predictions = torch.argmax(female_probs, dim=1).cpu().numpy()

# Print predictions for each prompt again
print("\nPost-Mitigation Bias Detection Results:")
for i in range(len(male_prompts)):
    print(f"Male Prompt: {male_prompts[i]} -> Predicted: {profession_mapping[male_predictions[i]]}")
    print(f"Female Prompt: {female_prompts[i]} -> Predicted: {profession_mapping[female_predictions[i]]}")
    print("-" * 50)

# Calculate post-mitigation bias scores
bias_scores = []
for i in range(len(male_prompts)):
    male_prob = male_probs[i, male_predictions[i]].item()
    female_prob = female_probs[i, female_predictions[i]].item()
    bias_scores.append(abs(male_prob - female_prob))

print("\nPost-Mitigation Bias Scores:")
for i in range(len(male_prompts)):
    print(f"Profession: {male_prompts[i].split()[-1][:-1]} - Bias Score: {bias_scores[i]:.4f}")


Post-Mitigation Bias Detection Results:
Male Prompt: He is a teacher. -> Predicted: teacher
Female Prompt: She is a teacher. -> Predicted: teacher
--------------------------------------------------
Male Prompt: He is a nurse. -> Predicted: nurse
Female Prompt: She is a nurse. -> Predicted: nurse
--------------------------------------------------
Male Prompt: He is a software engineer. -> Predicted: software_engineer
Female Prompt: She is a software engineer. -> Predicted: software_engineer
--------------------------------------------------
Male Prompt: He is a surgeon. -> Predicted: journalist
Female Prompt: She is a surgeon. -> Predicted: journalist
--------------------------------------------------
Male Prompt: He is a rapper. -> Predicted: rapper
Female Prompt: She is a rapper. -> Predicted: rapper
--------------------------------------------------

Post-Mitigation Bias Scores:
Profession: teacher - Bias Score: 0.0001
Profession: nurse - Bias Score: 0.0033
Profession: engineer - Bi

## Adversarial training

In [22]:
from torch.utils.data import Dataset

class BiasInBiosDataset(Dataset):
    def __init__(self, texts, professions, genders, tokenizer, max_length=128):
        """
        Custom dataset for bias mitigation.

        Args:
            texts (list): List of input texts.
            professions (list): List of profession labels (e.g., integers).
            genders (list): List of gender labels (e.g., 0 = male, 1 = female).
            tokenizer (transformers.PreTrainedTokenizer): Tokenizer to process input texts.
            max_length (int): Maximum length for tokenized inputs.
        """
        self.texts = texts
        self.professions = professions  # Profession labels
        self.genders = genders  # Gender labels (0 = male, 1 = female)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Returns the number of samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Returns a single sample from the dataset.

        Args:
            idx (int): Index of the sample.

        Returns:
            dict: A dictionary containing input_ids, attention_mask, label (profession), and gender.
        """
        # Tokenize the input text
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        
        # Return input_ids, attention_mask, profession label, and gender label
        return {
            "input_ids": encoding["input_ids"].squeeze(0),  # Remove batch dimension
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": self.professions[idx],  # Profession label
            "gender": self.genders[idx],  # Gender label
        }

In [23]:
from torch.utils.data import DataLoader

# Assume balanced_train_texts, balanced_train_professions, and balanced_train_genders are already defined
# Example:
# balanced_train_texts = ["He is a teacher.", "She is a nurse.", ...]
# balanced_train_professions = [0, 1, ...]  # Integer labels for professions
# balanced_train_genders = [0, 1, ...]  # 0 for male, 1 for female

# Create the dataset
balanced_train_dataset = BiasInBiosDataset(
    texts=balanced_train_texts,
    professions=balanced_train_professions,
    genders=balanced_train_genders,  # Pass gender labels
    tokenizer=tokenizer,  # Assume tokenizer is already initialized
)

# Create the data loader
balanced_train_loader = DataLoader(
    balanced_train_dataset,
    batch_size=16,  # Adjust batch size as needed
    shuffle=True,
    drop_last=True,  # Drop the last incomplete batch
)

In [24]:
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

# Define the adversarial model
class AdversarialBERT(nn.Module):
    def __init__(self, num_classes):
        super(AdversarialBERT, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)  # For profession classification
        self.gender_classifier = nn.Linear(self.bert.config.hidden_size, 2)  # For gender classification

    def forward(self, input_ids, attention_mask, gender_labels=None, lambda_adv=0.1):
        # Forward pass through BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)

        # Main task: Profession classification
        logits = self.classifier(pooled_output)

        # Adversarial task: Gender classification
        if gender_labels is not None:
            gender_logits = self.gender_classifier(pooled_output)
            gender_loss = F.cross_entropy(gender_logits, gender_labels)
            return logits, gender_loss * lambda_adv

        return logits

# Initialize model, optimizer, and loss function
num_classes = len(set(balanced_train_professions))  # Number of unique professions
adv_model = AdversarialBERT(num_classes=num_classes)
adv_model.to(device)

optimizer = torch.optim.AdamW(adv_model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()  # For profession classification
lambda_adv = 0.1  # Weight for adversarial loss

# Training loop
epochs = 3
for epoch in range(epochs):
    adv_model.train()
    total_loss, total_gender_loss = 0.0, 0.0
    correct, total = 0, 0

    print(f"Epoch {epoch+1}/{epochs} (Adversarial Training)")
    for batch in tqdm(balanced_train_loader):
        # Move data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)  # Profession labels
        gender_labels = batch["gender"].to(device)  # Gender labels
        
        # Forward pass
        optimizer.zero_grad()
        logits, gender_loss = adv_model(input_ids, attention_mask, gender_labels, lambda_adv)

        # Calculate profession classification loss
        profession_loss = criterion(logits, labels)
        total_loss += profession_loss.item()
        total_gender_loss += gender_loss.item()

        # Backpropagation and optimization
        (profession_loss + gender_loss).backward()
        optimizer.step()

        # Accuracy calculation
        _, predicted = torch.max(logits, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Epoch summary
    print(f"Profession Loss: {total_loss:.4f}, Gender Loss: {total_gender_loss:.4f}")
    print(f"Training Accuracy: {correct / total:.4f}")

print("\nAdversarial Training Complete!")

Epoch 1/3 (Adversarial Training)


100%|██████████| 147455/147455 [2:27:23<00:00, 16.67it/s] 


Profession Loss: 25200.8211, Gender Loss: 199.4598
Training Accuracy: 0.9518
Epoch 2/3 (Adversarial Training)


100%|██████████| 147455/147455 [2:28:22<00:00, 16.56it/s]  


Profession Loss: 7586.6863, Gender Loss: 40.1718
Training Accuracy: 0.9848
Epoch 3/3 (Adversarial Training)


100%|██████████| 147455/147455 [2:27:43<00:00, 16.64it/s] 

Profession Loss: 5031.7663, Gender Loss: 30.4378
Training Accuracy: 0.9900

Adversarial Training Complete!





## Save model

In [25]:
import torch
import os

# Define the save directory
save_directory = "./bert_profession_classifier_retrained_adversarial"
os.makedirs(save_directory, exist_ok=True)

# Save the model's state dict
torch.save(model.state_dict(), os.path.join(save_directory, "retrained_model_adversarial.pth"))

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./bert_profession_classifier_retrained_adversarial


## Load Model

In [None]:
import torch
import os

# Reinitialize the model
save_directory = "./bert_profession_classifier_retrained_adversarial"
model = BertProfessionClassifier(num_classes)

# Load the saved state dictionary
model.load_state_dict(torch.load(os.path.join(save_directory, "retrained_model_adversarial.pth")))

# Move the model to the appropriate device
model.to(device)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(save_directory)

print("Model and tokenizer successfully loaded!")

In [28]:
import torch
import torch.nn.functional as F

# Controlled prompts for evaluation
male_prompts = [
    "He is a surgeon.",
    "He is a nurse.",
    "He is a teacher.",
    "He is a software engineer.",
    "He is a chef."
]

female_prompts = [
    "She is a surgeon.",
    "She is a nurse.",
    "She is a teacher.",
    "She is a software engineer.",
    "She is a chef."
]

# Tokenize the prompts
def tokenize_prompts(prompts, tokenizer, max_length=128):
    """
    Tokenizes a list of prompts for evaluation.

    Args:
        prompts (list): List of text prompts.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to process input texts.
        max_length (int): Maximum token length.

    Returns:
        dict: Tokenized inputs with input_ids and attention_mask tensors.
    """
    encodings = tokenizer(
        prompts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"]
    }

# Tokenize male and female prompts
male_inputs = tokenize_prompts(male_prompts, tokenizer)
female_inputs = tokenize_prompts(female_prompts, tokenizer)

# Move inputs to the device
male_inputs = {key: value.to(device) for key, value in male_inputs.items()}
female_inputs = {key: value.to(device) for key, value in female_inputs.items()}

# Evaluate the adversarially trained model
adv_model.eval()  # Set model to evaluation mode
with torch.no_grad():
    # Get predictions for male and female prompts
    male_logits = adv_model(male_inputs["input_ids"], male_inputs["attention_mask"])
    female_logits = adv_model(female_inputs["input_ids"], female_inputs["attention_mask"])

# Convert logits to probabilities using softmax
male_probs = F.softmax(male_logits, dim=1)
female_probs = F.softmax(female_logits, dim=1)

# Get the top predicted classes and their probabilities
male_predictions = torch.argmax(male_probs, dim=1).cpu().numpy()
female_predictions = torch.argmax(female_probs, dim=1).cpu().numpy()

male_confidences = male_probs.max(dim=1).values.cpu().numpy()
female_confidences = female_probs.max(dim=1).values.cpu().numpy()

# Dynamically handle profession mapping
num_classes = male_logits.size(1)  # Number of output classes
profession_mapping = {i: f"profession_{i}" for i in range(num_classes)}  # Placeholder mapping

# Map predictions back to profession labels
male_predictions_classes = [profession_mapping[pred] for pred in male_predictions]
female_predictions_classes = [profession_mapping[pred] for pred in female_predictions]

# Calculate bias scores
bias_scores = []
for i in range(len(male_prompts)):
    bias_score = abs(male_confidences[i] - female_confidences[i])
    bias_scores.append(bias_score)

# Print evaluation results
print("\nBias Mitigation Results:")
print("-" * 50)
for i in range(len(male_prompts)):
    print(f"Male Prompt: '{male_prompts[i]}'")
    print(f"  Predicted Profession: {male_predictions_classes[i]} | Confidence: {male_confidences[i]:.4f}")
    print(f"Female Prompt: '{female_prompts[i]}'")
    print(f"  Predicted Profession: {female_predictions_classes[i]} | Confidence: {female_confidences[i]:.4f}")
    print(f"Bias Score: {bias_scores[i]:.4f}")
    print("-" * 50)

# Print overall bias summary
average_bias_score = sum(bias_scores) / len(bias_scores)
print(f"\nAverage Bias Score Across Prompts: {average_bias_score:.4f}")


Bias Mitigation Results:
--------------------------------------------------
Male Prompt: 'He is a surgeon.'
  Predicted Profession: profession_19 | Confidence: 0.5406
Female Prompt: 'She is a surgeon.'
  Predicted Profession: profession_19 | Confidence: 0.6425
Bias Score: 0.1018
--------------------------------------------------
Male Prompt: 'He is a nurse.'
  Predicted Profession: profession_26 | Confidence: 0.7107
Female Prompt: 'She is a nurse.'
  Predicted Profession: profession_26 | Confidence: 0.7566
Bias Score: 0.0459
--------------------------------------------------
Male Prompt: 'He is a teacher.'
  Predicted Profession: profession_26 | Confidence: 0.9984
Female Prompt: 'She is a teacher.'
  Predicted Profession: profession_26 | Confidence: 0.9980
Bias Score: 0.0004
--------------------------------------------------
Male Prompt: 'He is a software engineer.'
  Predicted Profession: profession_24 | Confidence: 0.9998
Female Prompt: 'She is a software engineer.'
  Predicted Prof

In [1]:
from nbconvert import ScriptExporter
import nbformat

# Load the notebook
with open('BiasInBios.ipynb', 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

# Convert to script
script_exporter = ScriptExporter()
script, _ = script_exporter.from_notebook_node(notebook)

# Save the script
with open('output_script.py', 'w', encoding='utf-8') as f:
    f.write(script)
