In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.utils import resample
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction, TrainingArguments, Trainer
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score

# Load the DataFrame from the CSV file
df = pd.read_csv('drive/MyDrive/preprocessed.csv')
df.head()
rng_seed = 100
df_randomized_order = df.sample(frac=1, random_state=rng_seed)
df_randomized_order['section_code'] = df_randomized_order['section_code'].replace('-', 'excluded')
df_randomized_order['section_code'] = df_randomized_order['section_code'].str.split(',')
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df_randomized_order['section_code']).astype('float32')
texts = df_randomized_order['abstracted_heading_plus_content'].tolist()

# Balancing the dataset
def balance_dataset(texts, labels):
    total_instances = len(texts)
    balanced_texts = []
    balanced_labels = []

    for i in range(labels.shape[1]):
        class_indices = np.where(labels[:, i] == 1)[0]
        class_texts = [texts[j] for j in class_indices]
        class_labels = labels[class_indices]

        num_dup = total_instances // len(class_texts)
        balanced_class_texts = class_texts * num_dup
        balanced_class_labels = np.tile(class_labels, (num_dup, 1))

        num_add = total_instances % len(class_texts)
        resampled_texts = resample(class_texts, n_samples=num_add, random_state=0)
        resampled_labels = resample(class_labels, n_samples=num_add, random_state=0)

        balanced_class_texts.extend(resampled_texts)
        balanced_class_labels = np.vstack((balanced_class_labels, resampled_labels))

        balanced_texts.extend(balanced_class_texts)
        balanced_labels.append(balanced_class_labels)

    balanced_labels = np.vstack(balanced_labels)
    return balanced_texts, balanced_labels

balanced_texts, balanced_labels = balance_dataset(texts, labels)
train_texts, val_texts, train_labels, val_labels = train_test_split(balanced_texts, balanced_labels, test_size=0.3, random_state=42, stratify=balanced_labels)

# Update the checkpoint for RoBERTa
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=labels.shape[1], problem_type="multi_label_classification")

# Custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

def multi_labels_metrics(predictions, labels, threshold=0.3):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels

    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    roc_auc_macro = roc_auc_score(y_true, y_pred, average='macro')
    roc_auc_micro = roc_auc_score(y_true, y_pred, average='micro')
    hamming = hamming_loss(y_true, y_pred)

    num_labels = y_true.shape[1]
    mcc_weighted = 0
    total_weight = 0

    for i in range(num_labels):
        y_true_label = y_true[:, i]
        y_pred_label = y_pred[:, i]

        weight = float(y_true_label.sum()) / y_true_label.shape[0]
        total_weight += weight
        mcc_weighted += weight * matthews_corrcoef(y_true_label, y_pred_label)

    if total_weight > 0:
        mcc_weighted /= total_weight

    y_true_flat = y_true.flatten()
    y_pred_flat = y_pred.flatten()
    kappa_weighted = cohen_kappa_score(y_true_flat, y_pred_flat, weights="quadratic")

    metrics = {
        "roc_auc_macro": roc_auc_macro,
        "roc_auc_micro": roc_auc_micro,
        "hamming_loss": hamming,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "mcc_weighted": mcc_weighted,
        "kappa_weighted": kappa_weighted
    }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_labels_metrics(predictions=preds, labels=p.label_ids)
    return result

# Training Arguments without saving the model
args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./drive/MyDrive/fit',
    num_train_epochs=2,  # Do not save the model
    evaluation_strategy="epoch",  # Evaluate after each epoch
    logging_dir='./drive/MyDrive/logs',
    save_strategy="epoch"
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc Macro,Roc Auc Micro,Hamming Loss,F1 Macro,F1 Micro,Mcc Weighted,Kappa Weighted,Runtime,Samples Per Second,Steps Per Second
1,0.0522,0.039949,0.982084,0.980514,0.011094,0.96666,0.963357,0.955891,0.95682,358.0557,32.537,4.069
2,0.0125,0.012151,0.995512,0.995251,0.003337,0.989921,0.988967,0.98678,0.987002,361.5111,32.226,4.03


TrainOutput(global_step=6796, training_loss=0.06373457441335569, metrics={'train_runtime': 5970.8218, 'train_samples_per_second': 9.105, 'train_steps_per_second': 1.138, 'total_flos': 1.4304539980234752e+16, 'train_loss': 0.06373457441335569, 'epoch': 2.0})