In [3]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss, matthews_corrcoef, cohen_kappa_score
from transformers import EvalPrediction, TrainingArguments, Trainer
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

# Load and preprocess data
df = pd.read_csv('drive/MyDrive/preprocessed.csv')
rng_seed = 100
df_randomized_order = df.sample(frac=1, random_state=rng_seed)
df_randomized_order = df_randomized_order[df_randomized_order["section_code"] != "-"]
df_randomized_order['section_code'] = df_randomized_order['section_code'].str.split(',')

# Multi-label binarization
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df_randomized_order['section_code']).astype('float32')
texts = df_randomized_order['abstracted_heading_plus_content'].tolist()

# Balancing the dataset
def balance_dataset(texts, labels):
    total_instances = len(texts)
    balanced_texts = []
    balanced_labels = []

    for i in range(labels.shape[1]):
        class_indices = np.where(labels[:, i] == 1)[0]
        class_texts = [texts[j] for j in class_indices]
        class_labels = labels[class_indices]

        num_dup = total_instances // len(class_texts)
        balanced_class_texts = class_texts * num_dup
        balanced_class_labels = np.tile(class_labels, (num_dup, 1))

        num_add = total_instances % len(class_texts)
        resampled_texts = resample(class_texts, n_samples=num_add, random_state=0)
        resampled_labels = resample(class_labels, n_samples=num_add, random_state=0)

        balanced_class_texts.extend(resampled_texts)
        # Pass a tuple of arrays to vstack
        balanced_class_labels = np.vstack((balanced_class_labels, resampled_labels))

        balanced_texts.extend(balanced_class_texts)
        balanced_labels.append(balanced_class_labels)

    balanced_labels = np.vstack(balanced_labels)
    return balanced_texts, balanced_labels

balanced_texts, balanced_labels = balance_dataset(texts, labels)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(balanced_texts, balanced_labels, test_size=0.3, random_state=42)

# Load tokenizer and model
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=labels.shape[1])

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

from sklearn.metrics import matthews_corrcoef, cohen_kappa_score

def multi_labels_metrics(predictions, labels, threshold=0.3):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels

    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    roc_auc_macro = roc_auc_score(y_true, y_pred, average='macro')
    roc_auc_micro = roc_auc_score(y_true, y_pred, average='micro')
    hamming = hamming_loss(y_true, y_pred)

    # Label-wise weighted MCC calculation
    num_labels = y_true.shape[1]
    mcc_weighted = 0
    total_weight = 0

    for i in range(num_labels):
        y_true_label = y_true[:, i]
        y_pred_label = y_pred[:, i]

        # Weight by the proportion of positive instances for the label
        weight = float(y_true_label.sum()) / y_true_label.shape[0]
        total_weight += weight
        mcc_weighted += weight * matthews_corrcoef(y_true_label, y_pred_label)

    # Normalize by total weight
    if total_weight > 0:
        mcc_weighted /= total_weight

    # Weighted Kappa calculation (if needed)
    y_true_flat = y_true.flatten()
    y_pred_flat = y_pred.flatten()
    kappa_weighted = cohen_kappa_score(y_true_flat, y_pred_flat, weights="quadratic")

    metrics = {
        "roc_auc_macro": roc_auc_macro,
        "roc_auc_micro": roc_auc_micro,
        "hamming_loss": hamming,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "mcc_weighted": mcc_weighted,
        "kappa_weighted": kappa_weighted
    }
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_labels_metrics(predictions=preds, labels=p.label_ids)
    return result

peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['query'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


# Training Arguments without saving the model
args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./drive/MyDrive/fit',
    num_train_epochs=6,  # Do not save the model
    evaluation_strategy="epoch",  # Evaluate after each epoch
    logging_dir='./drive/MyDrive/logs',
    save_strategy="epoch"
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 79,111 || all params: 109,566,734 || trainable%: 0.0722


In [2]:
pip install peft

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13.0->peft)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.13.0->peft)
  Using cached nvidia_cufft_cu12-11.

In [4]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc Macro,Roc Auc Micro,Hamming Loss,F1 Macro,F1 Micro,Mcc Weighted,Kappa Weighted,Runtime,Samples Per Second,Steps Per Second
1,0.3109,0.282984,0.808035,0.804599,0.140781,0.652784,0.642554,0.556838,0.556173,282.8636,31.376,3.924
2,0.2431,0.209151,0.878195,0.872953,0.098592,0.76891,0.747099,0.692035,0.68661,283.4602,31.31,3.916
3,0.2156,0.174019,0.899093,0.892574,0.0814,0.80879,0.786273,0.738613,0.736373,284.3772,31.209,3.903
4,0.1913,0.155686,0.910161,0.904653,0.078133,0.826271,0.797902,0.759472,0.749974,283.787,31.273,3.911
5,0.185,0.146509,0.915537,0.910046,0.073819,0.834554,0.808149,0.770816,0.762875,282.2682,31.442,3.932
6,0.1816,0.143072,0.916724,0.910993,0.071662,0.83804,0.812547,0.775125,0.768608,282.3195,31.436,3.932


TrainOutput(global_step=15534, training_loss=0.2394851997918691, metrics={'train_runtime': 10800.9987, 'train_samples_per_second': 11.503, 'train_steps_per_second': 1.438, 'total_flos': 3.2721105674686464e+16, 'train_loss': 0.2394851997918691, 'epoch': 6.0})