In [1]:
import pandas as pd

In [2]:
def _read_train_split(lang_code):
    """Read the training CSV for one language code (e.g. ``"en"``) into a DataFrame.

    The file is looked up in the current working directory under the name
    ``ntcir17_mednlp-sc_sm_<lang_code>_train_26_06_23.csv``.
    """
    return pd.read_csv("ntcir17_mednlp-sc_sm_" + lang_code + "_train_26_06_23.csv")


# One frame per language: English, French, German, Japanese.
en_df, fr_df, de_df, ja_df = map(_read_train_split, ("en", "fr", "de", "ja"))

In [3]:
def remove_common_name(x):
    """Return the part of a column label that precedes its first ``":"``.

    Labels without a ``":"`` are returned unchanged.

    Args:
        x: Column label as a string.

    Returns:
        The substring of ``x`` before the first colon.
    """
    # maxsplit=1: only the first separator matters, so stop scanning after it.
    return x.split(":", 1)[0]

In [4]:
# Relabel the columns of every language frame with `remove_common_name`,
# so all four frames end up sharing the same column labels.
en_df, fr_df, de_df, ja_df = (
    frame.rename(columns=remove_common_name)
    for frame in (en_df, fr_df, de_df, ja_df)
)

In [5]:
# Label columns used as classification targets. The order matters: it fixes
# the position of each label in the target vector and in the model output.
# NOTE(review): the values look like UMLS concept identifiers (CUIs) - confirm
# against the dataset documentation.
ade_columns = [
    "C0027497",
    "C0011991",
    "C0015672",
    "C0042963",
    "C0003123",
    "C0018681",
    "C0015967",
    "C0206062",
    "C0023895",
    "C0012833",
    "C0030193",
    "C0002170",
    "C0004096",
    "C0022658",
    "C0020517",
    "C0917801",
    "C0009806",
    "C0005956",
    "C0000737",
    "C0010692",
    "C0015230",
    "C0149745",
]

In [6]:
# Stack the four language frames into one training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame keeps its own row labels, so the combined index contains repeated
# values and label-based selection/alignment becomes ambiguous.
comb_train = pd.concat([en_df, fr_df, de_df, ja_df], ignore_index=True)

In [7]:
# Seed for the row shuffle below, so Restart & Run All reproduces the same order.
SHUFFLE_SEED = 42

# Cast the label columns to float and pack them, in `ade_columns` order, into
# one list-valued "labels" column per row. Float targets are what the
# multi-label (binary cross-entropy) loss used later in this notebook consumes.
comb_train[ade_columns] = comb_train[ade_columns].astype(float)
comb_train["labels"] = comb_train[ade_columns].values.tolist()

# Drop the individual label columns and the row identifier now that "labels" exists.
xtrain_ds = comb_train.drop(columns=ade_columns + ["train_id"])

# Shuffle all rows deterministically and renumber them 0..n-1.
xtrain_ds = xtrain_ds.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [8]:
# Two-way lookup between a label's position in `ade_columns` and its name.
id2label = dict(enumerate(ade_columns))
label2id = {name: position for position, name in id2label.items()}

In [9]:
from transformers import AutoTokenizer
import numpy as np
# Tokenizer for the same checkpoint name the classifier is loaded from later.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
)

In [10]:
import datasets
from datasets import Dataset

In [11]:
# Wrap the shuffled frame as a `datasets.Dataset`; the pandas index is not
# carried over as a column.
ds = Dataset.from_pandas(
    df=xtrain_ds,
    preserve_index=False,
)

In [12]:
def preprocess_data(examples, max_length=128):
    """Tokenize a batch of texts and attach the batch's label vectors.

    Intended as the function passed to ``Dataset.map(..., batched=True)``.
    A different ``max_length`` can be supplied through ``fn_kwargs``.

    Args:
        examples: Batch mapping that provides a ``"text"`` entry (the texts to
            encode) and a ``"labels"`` entry (one label vector per text).
        max_length: Number of tokens every text is truncated or padded to.
            Defaults to 128.

    Returns:
        The encoding produced by the module-level ``tokenizer`` for the batch,
        with the input ``"labels"`` stored under the ``"labels"`` key.
    """
    encoding = tokenizer(
        examples["text"],
        padding="max_length",  # every example is padded to exactly `max_length`
        truncation=True,
        max_length=max_length,
    )
    encoding["labels"] = examples["labels"]
    return encoding

In [13]:
# Tokenize in batches; the original columns are dropped so only what
# `preprocess_data` returns remains in the mapped dataset.
train_dataset = ds.map(
    function=preprocess_data,
    batched=True,
    remove_columns=ds.column_names,
)

Map:   0%|          | 0/31856 [00:00<?, ? examples/s]

In [14]:
# Switch the dataset's output format (in place) so items come back as torch tensors.
train_dataset.set_format(type="torch")

In [15]:
from transformers import AutoModelForSequenceClassification
# Classification-head settings: one output per entry of `ade_columns`,
# configured for multi-label classification.
_classifier_kwargs = dict(
    problem_type="multi_label_classification",
    num_labels=len(ade_columns),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    **_classifier_kwargs,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [17]:
from transformers import TrainingArguments, Trainer
# Training hyper-parameters. Checkpoints are written under `output_dir`
# according to `save_strategy` (here: "epoch").
batch_size = 16
args = TrainingArguments(
    output_dir="bert-finetuned",  # plain literal: the f-string had no placeholders
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
)

In [20]:
# Sanity check: run one example through the model before training.
# Index the row first (train_dataset[0]) instead of the column
# (train_dataset['input_ids'][0]), which would load every row just to read one.
sample = train_dataset[0]

# Add a batch dimension to every field and pass them all. Inputs were padded
# to a fixed length during preprocessing, so the attention mask produced by
# the tokenizer is forwarded here too rather than only input_ids and labels.
single_batch = {name: tensor.unsqueeze(0) for name, tensor in sample.items()}
outputs = model(**single_batch)
print(outputs)

SequenceClassifierOutput(loss=tensor(0.6956, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0857,  0.0210,  0.0811, -0.0081, -0.1899,  0.0681,  0.1964, -0.1105,
          0.0055,  0.0085,  0.1192, -0.1962,  0.0741,  0.0489,  0.0142,  0.0802,
          0.0788,  0.0079, -0.0598, -0.1201, -0.0390,  0.0621]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [22]:
# Bundle the model, the training arguments and the tokenized training set.
# No evaluation dataset is supplied.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

In [23]:
# Run fine-tuning; the returned summary is shown as the cell's output.
train_result = trainer.train()
train_result

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.1415
1000,0.0824
1500,0.0684
2000,0.0579
2500,0.0472
3000,0.0415
3500,0.0352
4000,0.0317
4500,0.0253
5000,0.0234


TrainOutput(global_step=11946, training_loss=0.030632380742274715, metrics={'train_runtime': 4787.4651, 'train_samples_per_second': 39.924, 'train_steps_per_second': 2.495, 'total_flos': 1.257475633717248e+16, 'train_loss': 0.030632380742274715, 'epoch': 6.0})

In [24]:
# Persist the fine-tuned model to the trainer's configured output directory
# (the location `save_model()` falls back to when no path is given).
trainer.save_model(output_dir=trainer.args.output_dir)