In [1]:
import pandas as pd

In [2]:
# One training CSV per language code (en, fr, de, ja), read in that order.
train_csv_paths = (
    "train/ntcir17_mednlp-sc_sm_en_train_26_06_23.csv",
    "train/ntcir17_mednlp-sc_sm_fr_train_26_06_23.csv",
    "train/ntcir17_mednlp-sc_sm_de_train_26_06_23.csv",
    "train/ntcir17_mednlp-sc_sm_ja_train_26_06_23.csv",
)
en_df, fr_df, de_df, ja_df = map(pd.read_csv, train_csv_paths)

In [3]:
def remove_common_name(x):
    """Return the part of a column label that precedes the first ``:``.

    Args:
        x: Column label as a string.

    Returns:
        Everything before the first colon; the label unchanged when it
        contains no colon.
    """
    return x.split(":")[0]

In [4]:
# Strip the ":<suffix>" part from every column label of each language frame.
en_df, fr_df, de_df, ja_df = (
    frame.rename(columns=remove_common_name)
    for frame in (en_df, fr_df, de_df, ja_df)
)

In [5]:
# The 22 label columns, in the order used for the label vectors.
# NOTE(review): identifiers look like UMLS concept IDs — confirm against the dataset docs.
ade_columns = """
    C0027497 C0011991 C0015672 C0042963 C0003123
    C0018681 C0015967 C0206062 C0023895 C0012833
    C0030193 C0002170 C0004096 C0022658 C0020517
    C0917801 C0009806 C0005956 C0000737 C0010692
    C0015230 C0149745
""".split()

In [6]:
# Fixed seed so the shuffle below gives the same row order on every run.
SHUFFLE_SEED = 42

# Stack the four language frames; ignore_index avoids repeated index labels
# (each per-language frame otherwise brings its own 0..n-1 index).
comb_train = pd.concat([en_df, fr_df, de_df, ja_df], ignore_index=True)

# Cast the label columns to float and pack each row's values into one list
# stored under "labels".
comb_train[ade_columns] = comb_train[ade_columns].astype(float)
comb_train["labels"] = comb_train[ade_columns].values.tolist()

# Drop the per-label columns and the "train_id" column, then shuffle all rows
# and renumber the index.
xtrain_ds = comb_train.drop(columns=ade_columns + ["train_id"])
xtrain_ds = xtrain_ds.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [7]:
# Position -> label name, and the inverse mapping, both in ade_columns order.
id2label = dict(enumerate(ade_columns))
label2id = {name: position for position, name in id2label.items()}

In [8]:
from transformers import AutoTokenizer
import numpy as np
# Tokenizer loaded from the "xlm-roberta-large" checkpoint.
checkpoint_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [9]:
import datasets
from datasets import Dataset

In [10]:
# Wrap the shuffled frame as a Dataset without carrying the pandas index along.
ds = Dataset.from_pandas(df=xtrain_ds, preserve_index=False)

In [11]:
def preprocess_data(examples, max_length=128):
    """Tokenize a batch of texts and carry their label vectors along.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides a "text" entry (passed to the
            tokenizer) and a "labels" entry (copied through unchanged).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128.

    Returns:
        The tokenizer's encoding for the batch with an added "labels" entry.
    """
    encoding = tokenizer(
        examples["text"],
        padding="max_length",  # pad every sequence up to max_length
        truncation=True,       # cut sequences longer than max_length
        max_length=max_length,
    )
    encoding["labels"] = examples["labels"]
    return encoding

In [12]:
# Tokenize in batches; the dataset's original columns are removed so only the
# entries returned by preprocess_data remain.
original_columns = ds.column_names
encoded_dataset = ds.map(
    preprocess_data,
    batched=True,
    remove_columns=original_columns,
)

Map:   0%|          | 0/31856 [00:00<?, ? examples/s]

In [13]:
# Switch the dataset's output format to "torch".
encoded_dataset.set_format(type="torch")

In [14]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification model on the "xlm-roberta-large" checkpoint, set up
# for multi-label classification with one output per entry of ade_columns.
num_ade_labels = len(ade_columns)
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=num_ade_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [15]:
from transformers import TrainingArguments, Trainer
# Same per-device batch size for training and evaluation.
batch_size = 32

# Training configuration; the first positional argument is the output directory
# and a checkpoint is saved at the end of every epoch.
args = TrainingArguments(
    "robert-finetuned-pharma",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
    # NOTE(review): load_best_model_at_end is not enabled and the Trainer in this
    # notebook gets no eval dataset or metric function, so this setting
    # presumably has no effect — confirm before relying on it.
    metric_for_best_model="f1",
)

In [16]:
# Sanity check: run one encoded example through the model and print the result.
# Fetch the row once instead of materialising the whole "input_ids" column.
sample = encoded_dataset[0]
outputs = model(
    input_ids=sample["input_ids"].unsqueeze(0),
    # Sequences are padded to a fixed length, so pass the mask along with the ids.
    attention_mask=sample["attention_mask"].unsqueeze(0),
    labels=sample["labels"].unsqueeze(0),
)
print(outputs)

SequenceClassifierOutput(loss=tensor(0.7224, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.6097,  0.2221,  0.6396, -0.0841, -0.2875,  0.0645, -0.1703, -0.1534,
         -0.1157, -0.0084,  0.0229,  0.1980, -0.4429,  0.4388,  0.5379, -0.3154,
          0.3499,  0.3215, -0.2606,  0.1387,  0.0212,  0.1068]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [17]:
# Training-only setup: no evaluation dataset or metric function is supplied.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the Trainer built above; the call's return value is
# left as the cell's last expression.
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.087
1000,0.0352
1500,0.0235
2000,0.0195


In [19]:
# Save the model into the Trainer's configured output directory.
trainer.save_model(trainer.args.output_dir)