In [1]:
import datasets
from transformers import (Trainer, TrainingArguments, DataCollatorWithPadding,
                          AutoTokenizer, AutoModelForSequenceClassification)
import pandas as pd

In [2]:
# --- Paths -----------------------------------------------------------------
DATA = '../../../data/spancat/'
dataset_path = DATA + 'strategies-ds.hf'
# NOTE(review): this resolves to '<DATA>spancat/results/...', i.e. 'spancat'
# appears twice in the final path -- confirm that is intended.
output_dir = DATA + 'spancat/results/metacognitive-cls'
# NOTE(review): home-relative, unlike the DATA-relative paths above -- confirm.
dataframe = '~/data/spancat/strategies-df.csv'

# --- Model -----------------------------------------------------------------
model_name_or_path = 'microsoft/deberta-v3-large'
# NOTE(review): 2056 is an unusual limit (2048 is the nearby power of two) --
# confirm this is not a typo.
model_max_length = 2056

In [3]:
# Read the label table and drop every row that contains a missing value.
df = pd.read_csv(dataframe).dropna()

# Every column after the first is treated as a class name.
# (presumably the first column holds the text -- confirm against the CSV)
classes = df.columns[1:].tolist()

# Forward and reverse lookups between class names and integer ids,
# numbered in column order.
class2id = {name: index for index, name in enumerate(classes)}
id2class = dict(enumerate(classes))

In [4]:
def load_dataset(dataset_path):
    """Load a dataset from disk.

    Thin wrapper around ``datasets.load_from_disk``.

    Args:
        dataset_path: Location handed straight to ``datasets.load_from_disk``.

    Returns:
        Whatever ``datasets.load_from_disk`` returns for that location.
    """
    return datasets.load_from_disk(dataset_path)


ds = load_dataset(dataset_path)

In [5]:
import evaluate
import numpy as np

# One combined evaluator so a single compute() call reports all four metrics.
clf_metrics = evaluate.combine(
    ["accuracy", "f1", "precision", "recall"]
)

def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: A number or NumPy array.

    Returns:
        The logistic transform of ``x``, with the same shape as the input.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def compute_metrics(eval_pred):
    """Score multi-label predictions with the combined classification metrics.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions are treated as raw logits: a sigmoid is applied and
            values above 0.5 count as positive.
            (presumably both are arrays shaped (n_examples, n_labels) -- confirm)

    Returns:
        The result of ``clf_metrics.compute``. Both arrays are flattened
        first, so each individual label decision is scored as one sample.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    flat_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=flat_predictions, references=flat_references)

In [6]:
# The tokenizer's length limit is configured through `model_max_length`;
# passing `max_length` here does not set that limit, so the configured value
# was previously never applied.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=model_max_length,
)
# Pads each batch to a multiple of 16 tokens and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16, return_tensors='pt')



In [7]:
# Optimisation hyperparameters passed to TrainingArguments in the training cell.
batch_size = 16  # used for both the train and eval per-device batch size
epochs = 12
learning_rate = 9.946303722432942e-06
weight_decay = 0.01
# NOTE(review): the recorded run ended at global_step=732, so 500 warmup steps
# cover most of training -- confirm this is intended.
warmup_steps = 500

In [8]:

# The main training loop.
# wandb.init()

# config = wandb.config

# Build the classifier head for multi-label classification. The number of
# labels is derived from the class list rather than hard-coded, so it cannot
# drift from the id2label / label2id mappings built from the same list.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

# Evaluate, checkpoint and log once per epoch.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    optim='adamw_torch',
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
)

# Train on the 'train' split and report metrics on the 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtiedaar1[0m ([33mai-aloe[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111352268813385, max=1.0)…

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.692,0.660293,0.634221,0.215385,0.164154,0.313099
2,0.5269,0.379402,0.871926,0.505929,0.663212,0.408946
3,0.3652,0.340934,0.877049,0.473684,0.755245,0.345048
4,0.3282,0.308118,0.893955,0.558635,0.839744,0.41853
5,0.2936,0.275621,0.902152,0.589247,0.901316,0.4377
6,0.2529,0.255029,0.91291,0.680451,0.826484,0.578275
7,0.2141,0.238818,0.922131,0.736111,0.806084,0.677316
8,0.1802,0.205888,0.927766,0.741284,0.87069,0.645367
9,0.1494,0.206372,0.922131,0.753247,0.765677,0.741214
10,0.1179,0.178622,0.940061,0.809756,0.824503,0.795527


TrainOutput(global_step=732, training_loss=0.27466803170292753, metrics={'train_runtime': 602.0839, 'train_samples_per_second': 19.393, 'train_steps_per_second': 1.216, 'total_flos': 4128159774807552.0, 'train_loss': 0.27466803170292753, 'epoch': 12.0})

In [22]:
from huggingface_hub import notebook_login

In [31]:
# NOTE(review): the first positional argument of Trainer.push_to_hub is the
# commit message, not a repository name (the recorded output of this cell shows
# the passed string as `commit_message`, and the push landed in a different
# repository than the tokenizer upload). If a specific repo is wanted, set
# `hub_model_id` in TrainingArguments -- confirm the intended destination.
trainer.push_to_hub(commit_message='metacognitive_cls')

CommitInfo(commit_url='https://huggingface.co/tiedaar/training1/commit/b306f94e30d959e60793d90d0c7d003832bdb4b0', commit_message='metacognitive_strategy_cls', commit_description='', oid='b306f94e30d959e60793d90d0c7d003832bdb4b0', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
# Upload the tokenizer files; here the string is the target repository id.
tokenizer.push_to_hub(repo_id='metacognitive_cls')

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tiedaar/metacognitive_strategy_cls/commit/c06842978b11b5f08f1d1e8c68e1e42c4f770dbb', commit_message='Upload tokenizer', commit_description='', oid='c06842978b11b5f08f1d1e8c68e1e42c4f770dbb', pr_url=None, pr_revision=None, pr_num=None)

In [120]:
def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: A number or NumPy array.

    Returns:
        The logistic transform of ``x``, with the same shape as the input.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
    
def generate_output(sequence):
    """Predict the multi-label vector for a single piece of text.

    Args:
        sequence: The text to classify; it is passed to the tokenizer as-is.

    Returns:
        A 1-D integer NumPy array with one 0/1 entry per output logit, where
        1 means the sigmoid of that logit exceeded 0.5.
    """
    # Local import: torch is not imported at the top of this notebook.
    import torch

    # Keep the inputs on the same device as the model; after Trainer.train()
    # the model may live on an accelerator while fresh tensors are on CPU.
    encoded = tokenizer(sequence, return_tensors='pt').to(model.device)

    # Inference only: switch off dropout and skip autograd bookkeeping so
    # predictions are deterministic and no graph is kept in memory.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    outputs = logits.reshape(-1).cpu().numpy()
    predictions = sigmoid(outputs)
    return (predictions > 0.5).astype(int)

# Put the text and reference labels of the test split in a DataFrame, then add
# the model's prediction for every row (one forward pass per text).
test_df = pd.DataFrame(ds['test'])[['text', 'labels']]
test_df['preds'] = test_df['text'].apply(generate_output)
test_df

Unnamed: 0,text,labels,preds
0,"For this exam, I started by reading the assign...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 0, 0, 1]"
1,To prepare for this exam I carefully read the ...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 0, 0, 0]"
2,I read through the textbook using the study gu...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0, 0, 0, 0, 0, 0, 0, 0]"
3,I began studying for the exam by reviewing my ...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 0, 0, 1]"
4,I read the textbook after lectures. I wrote do...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 0, 0, 0, 0, 1]"
...,...,...,...
239,Took notes on each lecture while reviewing the...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0, 0, 0, 0, 0, 0, 0, 0]"
240,"Prior to the classes, I read all of the requir...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 0, 0, 1]"
241,To study for this exam I read over all of the ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1, 0, 0, 0, 0, 0, 0, 0]"
242,I went through the slides and filled out the s...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 0, 0, 1]"


In [122]:
# Show the full text of the first test example.
test_df['text'].iat[0]

'For this exam, I started by reading the assigned textbook chapters. I took note of vocabulary words and their definitions. I also took note of concepts that I wasn’t already familiar with with. Later, I went through the study guide and answered the questions listed. Finally, I used the practice exam to test my knowledge.'

In [115]:
import pandas as pd

# `scores` is not defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): reconstructed as the per-example fraction of labels predicted
# correctly. The recorded output (0.9431...) equals 1841/1952, which is
# consistent with label-wise accuracy -- confirm this is the intended metric.
scores = [
    float(np.mean(np.asarray(row_labels) == np.asarray(row_preds)))
    for row_labels, row_preds in zip(test_df['labels'], test_df['preds'])
]
pd.Series(scores).mean()

0.9431352459016393

In [101]:
# Index of the test-split example inspected in the rest of this cell.
i = 1

def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: A number or NumPy array.

    Returns:
        The logistic transform of ``x``, with the same shape as the input.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def compute_metrics(eval_pred):
    """Score multi-label predictions with the combined classification metrics.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions are treated as raw logits: a sigmoid is applied and
            values above 0.5 count as positive.
            (presumably both are arrays shaped (n_examples, n_labels) -- confirm)

    Returns:
        The result of ``clf_metrics.compute``. Both arrays are flattened
        first, so each individual label decision is scored as one sample.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    flat_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=flat_predictions, references=flat_references)


# Pull one test example (text and reference labels) and show the labels.
sequence = ds['test']['text'][i]
labels = ds['test']['labels'][i]
print(labels)

# Despite its name, `input_ids` holds the whole tokenizer output; only its
# 'input_ids' tensor is passed to the model.
input_ids = tokenizer(sequence, return_tensors='pt')
outputs = model(input_ids['input_ids']).logits

# Flatten the logits, squash them with a sigmoid and threshold at 0.5.
flat_logits = outputs.detach().reshape(-1)
predictions = (sigmoid(np.array(flat_logits)) > 0.5).astype(int)
predictions

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]


array([0, 0, 0, 1, 0, 0, 0, 0])