In [1]:
import pandas as pd
from pathlib import Path
import os
from datasets import Dataset

In [2]:
# Location of the strategies CSV, relative to the user's home directory.
# The "~" is kept literal here; it is not expanded by this assignment.
data = "~/data/spancat/strategies-df.csv"

In [3]:
# Fixed seed so the train/test partition (and therefore the reported metrics)
# is the same on every Restart & Run All.
SEED = 42

# Rows with any missing value are discarded before labels are derived.
df = pd.read_csv(data).dropna()

# Every column after the first is treated as one class of the multi-label target.
# NOTE(review): this assumes the first column is 'text' — confirm against the CSV.
classes = list(df.columns[1:])
class2id = {class_: idx for idx, class_ in enumerate(classes)}
id2class = {idx: class_ for idx, class_ in enumerate(classes)}

# One label vector per row, built from the class columns. The values are cast to
# float because the multi-label loss used downstream expects floating-point
# targets; integer-typed CSV columns would otherwise fail at training time.
labels = df.iloc[:, 1:].astype(float).values.tolist()
df['labels'] = labels

# Keep only the model inputs and hold out 20% of the rows for evaluation.
ds = Dataset.from_pandas(df[['text', 'labels']], preserve_index=False).train_test_split(
    test_size=0.2,
    seed=SEED,
)

In [5]:
from transformers import AutoTokenizer

model_path = 'microsoft/deberta-v3-small'
# Truncation is a per-call tokenizer option, so it is set in tokenize_text
# rather than passed to from_pretrained (the previous misspelt `trucation`
# keyword had no effect on tokenization).
tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenize_text(example):
    """Tokenize the ``text`` field without truncating long inputs.

    Args:
        example: A mapping with a ``text`` entry. Because the tokenizer accepts
            either a string or a list of strings, this works for a single
            example as well as for a batch.

    Returns:
        The tokenizer's output for ``example['text']``; no padding is applied here.
    """
    return tokenizer(example['text'], truncation=False)

# batched=True hands the tokenizer many texts per call instead of one at a time;
# the resulting columns are the same.
ds = ds.map(tokenize_text, batched=True)

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/244 [00:00<?, ? examples/s]

In [6]:
import evaluate
import numpy as np

# One combined evaluator, so a single compute() call reports all four scores.
clf_metrics = evaluate.combine(
    [
        "accuracy",
        "f1",
        "precision",
        "recall",
    ]
)

def sigmoid(x):
    """Apply the logistic function element-wise, mapping logits to [0, 1].

    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    strongly negative logits do not overflow ``np.exp`` (and emit a
    RuntimeWarning) the way the direct ``1 / (1 + exp(-x))`` form does.

    Args:
        x: A scalar or NumPy array of logits.

    Returns:
        A NumPy scalar or array with the same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0, -x))

def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined classification metrics.

    ``eval_pred`` unpacks into ``(logits, labels)``. The logits are passed
    through ``sigmoid`` and thresholded at 0.5. Predictions and labels are then
    both cast to int and flattened to 1-D, so the metrics are computed over all
    entries pooled together.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened arrays.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    binary_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=binary_predictions,
        references=flat_references,
    )

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

# Collator that pads the tokenized examples when batches are assembled
# (the texts were tokenized without padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification head with one output per class column. The id/label
# maps are stored on the model config, and the multi-label problem type is
# requested explicitly.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    problem_type="multi_label_classification",
    num_labels=len(classes),
    label2id=class2id,
    id2label=id2class,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    # Checkpoints and logs are written under this directory.
    output_dir="testing",
    # Optimisation schedule.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the two dataset splits, the padding collator and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    compute_metrics=compute_metrics,
)

# Kept as the last expression of the cell so the returned TrainOutput is displayed.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtiedaar1[0m ([33mai-aloe[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.324738,0.882172,0.479638,0.834646,0.336508
2,0.356800,0.306562,0.890369,0.54661,0.821656,0.409524
3,0.356800,0.270415,0.897541,0.561404,0.907801,0.406349
4,0.282700,0.266738,0.900102,0.592902,0.865854,0.450794


TrainOutput(global_step=1300, training_loss=0.30443372286283055, metrics={'train_runtime': 106.9677, 'train_samples_per_second': 36.385, 'train_steps_per_second': 12.153, 'total_flos': 119480122574256.0, 'train_loss': 0.30443372286283055, 'epoch': 4.0})