In [1]:
import json
from transformers import AutoTokenizer

# Fixed sequence length: prepare_data truncates longer examples to this many
# positions and pads shorter ones up to it.
max_length = 128
# Supplies the token -> id lookup and the padding id used by prepare_data.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# BIO tag name -> integer class id. Ids follow the order of the tuple below:
# "O" first, then a B-/I- pair for each entity type.
label2id = {
    tag: index
    for index, tag in enumerate((
        "O",
        "B-DRUG_NAME", "I-DRUG_NAME",
        "B-DOSAGE", "I-DOSAGE",
        "B-FORM", "I-FORM",
        "B-WARNINGS", "I-WARNINGS",
        "B-INDICATIONS", "I-INDICATIONS",
        "B-USAGE_INSTRUCTIONS", "I-USAGE_INSTRUCTIONS",
    ))
}

def prepare_data(file_path):
    """Load a JSON NER dataset and convert it to fixed-length model inputs.

    The file must hold a JSON list of objects, each with a ``tokens`` list and
    a parallel ``labels`` list of tag names present in ``label2id``.

    Every example is truncated to ``max_length`` positions and then
    right-padded: ``input_ids`` with ``tokenizer.pad_token_id``,
    ``attention_mask`` with 0 and ``labels`` with -100.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A dict with keys ``input_ids``, ``attention_mask`` and ``labels``;
        each value is a list holding one ``max_length``-long list of ints per
        example.

    Raises:
        ValueError: If an example has a different number of tokens and labels.
        KeyError: If an example lacks ``tokens``/``labels`` or uses a tag that
            is not in ``label2id``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    for index, item in enumerate(raw_data):
        tokens = item['tokens']
        labels = item['labels']

        # The padding length below is derived from the tokens only, so a
        # mismatch would silently yield a label row of the wrong length.
        if len(tokens) != len(labels):
            raise ValueError(
                f"Example {index} in {file_path!r} has {len(tokens)} tokens "
                f"but {len(labels)} labels"
            )

        # NOTE(review): tokens are looked up in the vocabulary as-is, with no
        # sub-word splitting and no special tokens added. Presumably the data
        # already contains tokenizer pieces; confirm against the data files.
        input_ids = tokenizer.convert_tokens_to_ids(tokens)[:max_length]
        # Map every tag before truncating so an unknown tag anywhere in the
        # example still raises KeyError.
        label_ids = [label2id[label] for label in labels][:max_length]

        # pad_len is never negative because of the truncation above.
        pad_len = max_length - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * pad_len
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
        label_ids = label_ids + [-100] * pad_len

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(label_ids)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): this cell repeats the prepare_data definition from the first
# cell; after Run All this later definition is the one in effect. Keep only one.
def prepare_data(file_path):
    """Load a JSON NER dataset and convert it to fixed-length model inputs.

    The file must hold a JSON list of objects, each with a ``tokens`` list and
    a parallel ``labels`` list of tag names present in ``label2id``.

    Every example is truncated to ``max_length`` positions and then
    right-padded: ``input_ids`` with ``tokenizer.pad_token_id``,
    ``attention_mask`` with 0 and ``labels`` with -100.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A dict with keys ``input_ids``, ``attention_mask`` and ``labels``;
        each value is a list holding one ``max_length``-long list of ints per
        example.

    Raises:
        ValueError: If an example has a different number of tokens and labels.
        KeyError: If an example lacks ``tokens``/``labels`` or uses a tag that
            is not in ``label2id``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    input_ids_list = []
    attention_mask_list = []
    labels_list = []
    for index, item in enumerate(raw_data):
        tokens = item['tokens']
        labels = item['labels']

        # The padding length below is derived from the tokens only, so a
        # mismatch would silently yield a label row of the wrong length.
        if len(tokens) != len(labels):
            raise ValueError(
                f"Example {index} in {file_path!r} has {len(tokens)} tokens "
                f"but {len(labels)} labels"
            )

        # NOTE(review): tokens are looked up in the vocabulary as-is, with no
        # sub-word splitting and no special tokens added. Presumably the data
        # already contains tokenizer pieces; confirm against the data files.
        input_ids = tokenizer.convert_tokens_to_ids(tokens)[:max_length]
        # Map every tag before truncating so an unknown tag anywhere in the
        # example still raises KeyError.
        label_ids = [label2id[label] for label in labels][:max_length]

        # pad_len is never negative because of the truncation above.
        pad_len = max_length - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * pad_len
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
        label_ids = label_ids + [-100] * pad_len

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(label_ids)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }

In [6]:
# Prepare Dataset for Evaluation
# Encode both held-out test files into fixed-length input_ids /
# attention_mask / labels lists.
test_sequence_data = prepare_data('finetuning_data/test.json')
test_shuffled_data = prepare_data('shuffled_finetuning_data/test.json')

# Prepare the data
# Wrap each encoded dict in a datasets.Dataset.
from datasets import Dataset
test_sequence_dataset = Dataset.from_dict(test_sequence_data)
test_shuffled_dataset = Dataset.from_dict(test_shuffled_data)

In [7]:
# BIO tag names in class-id order: the tag at position i has class id i.
label_list = [
    "O",
    "B-DRUG_NAME", "I-DRUG_NAME",
    "B-DOSAGE", "I-DOSAGE",
    "B-FORM", "I-FORM",
    "B-WARNINGS", "I-WARNINGS",
    "B-INDICATIONS", "I-INDICATIONS",
    "B-USAGE_INSTRUCTIONS", "I-USAGE_INSTRUCTIONS",
]

# Reverse lookup (class id -> tag name) used when decoding predictions.
id2label = dict(enumerate(label_list))

In [8]:
# Function to compute metrics
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score

def compute_metrics(p):
    """Score token-classification predictions with seqeval plus token accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is arg-maxed over axis 2; ``labels`` holds the gold
            class ids, with -100 marking positions to skip.

    Returns:
        A dict with ``precision``, ``recall`` and ``f1`` (computed by seqeval
        on the decoded tag sequences) and ``accuracy``, the fraction of
        non-skipped positions whose predicted tag equals the gold tag.
        ``accuracy`` is 0.0 when there are no non-skipped positions.
    """
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(preds, labels):
        # Keep only positions that carry a real label, then decode both sides
        # from class ids to tag names.
        kept = [(p_, l_) for p_, l_ in zip(pred_row, label_row) if l_ != -100]
        true_predictions.append([id2label[p_] for p_, _ in kept])
        true_labels.append([id2label[l_] for _, l_ in kept])

    # precision, recall, f1
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    # accuracy (custom): token-level agreement over the kept positions
    total = sum(len(gold_seq) for gold_seq in true_labels)
    correct = sum(
        pred_tag == gold_tag
        for pred_seq, gold_seq in zip(true_predictions, true_labels)
        for pred_tag, gold_tag in zip(pred_seq, gold_seq)
    )
    # Guard against an empty / fully masked batch instead of dividing by zero.
    accuracy = correct / total if total else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [10]:
from transformers import Trainer, TrainingArguments

# Training configuration. Evaluation and checkpointing both run once per epoch,
# and load_best_model_at_end asks the Trainer to reload the best checkpoint
# when training finishes.
args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./ner-roberta-shuffled",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation / checkpoint policy.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

## 💬 XLM-RoBERTa-Base (Sequence Dataset)

In [12]:
# Use a pipeline as a high-level helper
from transformers import pipeline
# Load the published checkpoint wrapped in a token-classification pipeline.
pipe = pipeline(
    task="token-classification",
    model="waterondaway/xlm-roberta-base",
)

Device set to use mps:0


In [14]:
# Prepare the data
# Encode the training and evaluation splits into fixed-length model inputs.
# NOTE(review): this cell sits under the "Sequence Dataset" heading but reads
# from shuffled_finetuning_data/ — confirm the intended directory.
train_data = prepare_data('shuffled_finetuning_data/train.json')
eval_data = prepare_data('shuffled_finetuning_data/eval.json')

In [15]:
from datasets import Dataset
# Wrap the encoded train / eval dicts in datasets.Dataset objects for the Trainer.
train_dataset = Dataset.from_dict(train_data)
eval_dataset = Dataset.from_dict(eval_data)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Evaluation and checkpointing both run once per epoch,
# and load_best_model_at_end asks the Trainer to reload the best checkpoint
# when training finishes.
args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./ner-roberta-shuffled",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation / checkpoint policy.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer must be given the underlying model, not the inference pipeline that
# wraps it, so pass the model object held by `pipe`.
trainer = Trainer(
    model=pipe.model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the configuration in `args`.
trainer.train()

## 💬 XLM-RoBERTa-Base (Shuffled Dataset)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline
# Load the published checkpoint wrapped in a token-classification pipeline.
pipe = pipeline(
    task="token-classification",
    model="waterondaway/xlm-roberta-base-shuffled",
)