In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss, matthews_corrcoef, cohen_kappa_score
from transformers import EvalPrediction, TrainingArguments, Trainer
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

# Load and preprocess data
df = pd.read_csv('drive/MyDrive/preprocessed.csv')
rng_seed = 100
df_randomized_order = df.sample(frac=1, random_state=rng_seed)
df_randomized_order = df_randomized_order[df_randomized_order["section_code"] != "-"]
df_randomized_order['section_code'] = df_randomized_order['section_code'].str.split(',')

# Multi-label binarization
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df_randomized_order['section_code']).astype('float32')
texts = df_randomized_order['abstracted_heading_plus_content'].tolist()

# Balancing the dataset
def balance_dataset(texts, labels):
    total_instances = len(texts)
    balanced_texts = []
    balanced_labels = []

    for i in range(labels.shape[1]):
        class_indices = np.where(labels[:, i] == 1)[0]
        class_texts = [texts[j] for j in class_indices]
        class_labels = labels[class_indices]

        num_dup = total_instances // len(class_texts)
        balanced_class_texts = class_texts * num_dup
        balanced_class_labels = np.tile(class_labels, (num_dup, 1))

        num_add = total_instances % len(class_texts)
        resampled_texts = resample(class_texts, n_samples=num_add, random_state=0)
        resampled_labels = resample(class_labels, n_samples=num_add, random_state=0)

        balanced_class_texts.extend(resampled_texts)
        balanced_class_labels = np.vstack((balanced_class_labels, resampled_labels))

        balanced_texts.extend(balanced_class_texts)
        balanced_labels.append(balanced_class_labels)

    balanced_labels = np.vstack(balanced_labels)
    return balanced_texts, balanced_labels

balanced_texts, balanced_labels = balance_dataset(texts, labels)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(balanced_texts, balanced_labels, test_size=0.3, random_state=42)

# Load tokenizer and model
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=labels.shape[1])

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

# Multi-label metrics
def multi_labels_metrics(predictions, labels, threshold=0.3):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels

    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    roc_auc_macro = roc_auc_score(y_true, y_pred, average='macro')
    roc_auc_micro = roc_auc_score(y_true, y_pred, average='micro')
    hamming = hamming_loss(y_true, y_pred)

    # Label-wise weighted MCC calculation
    num_labels = y_true.shape[1]
    mcc_weighted = 0
    total_weight = 0

    for i in range(num_labels):
        y_true_label = y_true[:, i]
        y_pred_label = y_pred[:, i]

        # Weight by the proportion of positive instances for the label
        weight = float(y_true_label.sum()) / y_true_label.shape[0]
        total_weight += weight
        mcc_weighted += weight * matthews_corrcoef(y_true_label, y_pred_label)

    # Normalize by total weight
    if total_weight > 0:
        mcc_weighted /= total_weight

    # Weighted Kappa calculation (if needed)
    y_true_flat = y_true.flatten()
    y_pred_flat = y_pred.flatten()
    kappa_weighted = cohen_kappa_score(y_true_flat, y_pred_flat, weights="quadratic")

    metrics = {
        "roc_auc_macro": roc_auc_macro,
        "roc_auc_micro": roc_auc_micro,
        "hamming_loss": hamming,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "mcc_weighted": mcc_weighted,
        "kappa_weighted": kappa_weighted
    }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_labels_metrics(predictions=preds, labels=p.label_ids)
    return result

# PEFT configuration
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=['query']
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Training Arguments
args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./drive/MyDrive/fit',
    num_train_epochs=6,  # Number of epochs
    evaluation_strategy="epoch",  # Evaluate after each epoch
    logging_dir='./drive/MyDrive/logs',
    save_strategy="epoch"
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start training



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 669,703 || all params: 125,320,718 || trainable%: 0.5344


In [1]:
pip install peft

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13.0->peft)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.13.0->peft)
  Using cached nvidia_cufft_cu12-11.

In [3]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc Macro,Roc Auc Micro,Hamming Loss,F1 Macro,F1 Micro,Mcc Weighted,Kappa Weighted,Runtime,Samples Per Second,Steps Per Second
1,0.1932,0.160108,0.904428,0.899595,0.072322,0.821777,0.806177,0.762345,0.76191,265.0268,33.487,4.188
2,0.1522,0.117995,0.933695,0.92912,0.051525,0.874804,0.859771,0.828152,0.828285,264.4039,33.566,4.198
3,0.1345,0.100818,0.946264,0.941963,0.04462,0.892094,0.878836,0.851334,0.851562,264.6236,33.538,4.195


Epoch,Training Loss,Validation Loss,Roc Auc Macro,Roc Auc Micro,Hamming Loss,F1 Macro,F1 Micro,Mcc Weighted,Kappa Weighted,Runtime,Samples Per Second,Steps Per Second
1,0.1932,0.160108,0.904428,0.899595,0.072322,0.821777,0.806177,0.762345,0.76191,265.0268,33.487,4.188
2,0.1522,0.117995,0.933695,0.92912,0.051525,0.874804,0.859771,0.828152,0.828285,264.4039,33.566,4.198
3,0.1345,0.100818,0.946264,0.941963,0.04462,0.892094,0.878836,0.851334,0.851562,264.6236,33.538,4.195
4,0.1187,0.09209,0.952604,0.949505,0.042946,0.899885,0.884682,0.861087,0.8584,264.4508,33.56,4.197
5,0.1142,0.086383,0.954708,0.951326,0.038873,0.906812,0.894297,0.870714,0.870539,266.0888,33.354,4.172
6,0.1107,0.084631,0.955094,0.951575,0.037392,0.909712,0.897751,0.874461,0.874915,266.0445,33.359,4.172


TrainOutput(global_step=15534, training_loss=0.15300700080368654, metrics={'train_runtime': 9832.1551, 'train_samples_per_second': 12.636, 'train_steps_per_second': 1.58, 'total_flos': 3.294651776432947e+16, 'train_loss': 0.15300700080368654, 'epoch': 6.0})