## 💬 Fine-Tuning Language Model

In [119]:
import json
from transformers import AutoTokenizer

# Fixed sequence length every example is truncated / padded to.
max_length = 128

# Tokenizer of the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# BIO tag vocabulary: each tag's id is its position in this tuple.
label2id = {
    tag: index
    for index, tag in enumerate(
        (
            "O",
            "B-DRUG_NAME", "I-DRUG_NAME",
            "B-DOSAGE", "I-DOSAGE",
            "B-FORM", "I-FORM",
            "B-WARNINGS", "I-WARNINGS",
            "B-INDICATIONS", "I-INDICATIONS",
            "B-USAGE_INSTRUCTIONS", "I-USAGE_INSTRUCTIONS",
        )
    )
}

def prepare_data(file_path, seq_len=None, tok=None, label_map=None):
    """Load a token-classification JSON file and encode it as fixed-length features.

    The file must hold a JSON list of objects, each with a ``tokens`` list and
    a parallel ``labels`` list. Every example is converted to ids, truncated
    to ``seq_len`` items and right-padded to exactly ``seq_len`` items.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.
        seq_len: Target sequence length. Defaults to the module-level
            ``max_length``.
        tok: Object exposing ``convert_tokens_to_ids(tokens)`` and
            ``pad_token_id``. Defaults to the module-level ``tokenizer``.
        label_map: Mapping from label string to integer id. Defaults to the
            module-level ``label2id``.

    Returns:
        A dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``; each value is a list with one ``seq_len``-long list per
        example. Padded positions use ``tok.pad_token_id`` for ids, ``0`` for
        the attention mask and ``-100`` for labels.

    Raises:
        ValueError: If an example's ``tokens`` and ``labels`` differ in length.
        KeyError: If a label is missing from ``label_map``.
    """
    # Fall back to the notebook-level configuration only when not overridden,
    # so existing ``prepare_data(path)`` calls behave exactly as before.
    seq_len = max_length if seq_len is None else seq_len
    tok = tokenizer if tok is None else tok
    label_map = label2id if label_map is None else label_map

    with open(file_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for index, item in enumerate(raw_data):
        tokens = item['tokens']
        labels = item['labels']

        # A mismatch would silently shift labels against tokens (or produce
        # ragged label rows), so fail loudly and point at the bad example.
        if len(tokens) != len(labels):
            raise ValueError(
                f"Example {index} in {file_path!r} has {len(tokens)} tokens "
                f"but {len(labels)} labels"
            )

        # NOTE(review): ids come from a direct vocabulary lookup and no special
        # tokens are added here - confirm the JSON already holds tokenizer
        # sub-word pieces.
        input_ids = tok.convert_tokens_to_ids(tokens)[:seq_len]
        # Map every label before truncating so unknown labels are always reported.
        label_ids = [label_map[label] for label in labels][:seq_len]

        real_len = len(input_ids)
        pad_len = seq_len - real_len

        features["input_ids"].append(input_ids + [tok.pad_token_id] * pad_len)
        features["attention_mask"].append([1] * real_len + [0] * pad_len)
        # -100 marks padded positions; compute_metrics skips labels equal to -100.
        features["labels"].append(label_ids + [-100] * pad_len)

    return features

In [120]:
# Encode the training and evaluation splits (train first, then eval).
train_data, eval_data = (
    prepare_data(path)
    for path in ('finetuning_data/train.json', 'finetuning_data/eval.json')
)

# Show the first encoded training example, one feature per line.
print("Train sample:")
print(
    *(
        f"{caption} {train_data[field][0]}"
        for caption, field in (
            ("Input IDs:", 'input_ids'),
            ("Attention Mask:", 'attention_mask'),
            ("Labels:", 'labels'),
        )
    ),
    sep="\n",
)

Train sample:
Input IDs: [6, 141635, 11425, 24230, 6, 238521, 137039, 226623, 160918, 190, 65040, 10289, 1372, 142323, 744, 8821, 48229, 95506, 16602, 95122, 2543, 97423, 2101, 10035, 6, 29869, 468, 119477, 76924, 37151, 9513, 387, 3768, 44727, 2588, 116124, 141131, 70006, 11471, 6549, 6, 85070, 16879, 9250, 32181, 1037, 6, 56156, 160228, 6, 139155, 9373, 193965, 164432, 201, 71127, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [121]:
# Tag names ordered by class id: "O" first, then a B-/I- pair per entity type.
label_list = [
    "O",
    "B-DRUG_NAME", "I-DRUG_NAME",
    "B-DOSAGE", "I-DOSAGE",
    "B-FORM", "I-FORM",
    "B-WARNINGS", "I-WARNINGS",
    "B-INDICATIONS", "I-INDICATIONS",
    "B-USAGE_INSTRUCTIONS", "I-USAGE_INSTRUCTIONS",
]

# Reverse lookup: class id -> tag name.
id2label = dict(enumerate(label_list))

In [122]:
# Function to compute metrics
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score

def compute_metrics(p):
    """Compute entity-level precision/recall/F1 and token-level accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is indexed as
            [example, token, class] (the arg-max is taken over axis 2);
            ``labels`` holds the gold label ids, with ``-100`` marking
            positions that must be ignored.

    Returns:
        A dict with ``"precision"``, ``"recall"`` and ``"f1"`` (computed by
        seqeval on the tag sequences) and ``"accuracy"``, the fraction of
        non-ignored tokens whose predicted tag equals the gold tag
        (``0.0`` when there are no such tokens).
    """
    predictions, labels = p
    pred_ids = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, labels):
        # Keep only positions carrying a real label (-100 marks ignored ones).
        kept = [
            (int(pred_id), int(label_id))
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[label_id] for _, label_id in kept])

    # precision, recall, f1 (from seqeval)
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    # accuracy (custom): token-level agreement over the kept positions
    total = sum(len(gold_seq) for gold_seq in true_labels)
    correct = sum(
        pred_tag == gold_tag
        for pred_seq, gold_seq in zip(true_predictions, true_labels)
        for pred_tag, gold_tag in zip(pred_seq, gold_seq)
    )
    # Guard against a batch in which every position is ignored.
    accuracy = correct / total if total else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }



In [123]:
# Prepare the data
from datasets import Dataset
# Wrap the encoded feature dicts as Dataset objects (train first, then eval).
train_dataset, eval_dataset = map(Dataset.from_dict, (train_data, eval_data))

In [124]:
from transformers import AutoModelForTokenClassification
# Load the base checkpoint with a token-classification head sized to the tag
# vocabulary; both label mappings are passed through to from_pretrained.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [125]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./ner-roberta",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluation and checkpointing both happen once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

In [126]:
# Assemble the Trainer from the model, training arguments, encoded datasets
# and the metric function defined above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated on Trainer.__init__ (it emitted a warning in
    # the recorded run); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Kept as the last bare expression so the notebook displays the TrainOutput.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0002,5.3e-05,1.0,1.0,1.0,1.0
2,0.0126,0.010388,0.957108,0.994222,0.975312,0.996161
3,0.0001,1.9e-05,1.0,1.0,1.0,1.0


TrainOutput(global_step=4500, training_loss=0.041867565016003534, metrics={'train_runtime': 1366.0482, 'train_samples_per_second': 26.353, 'train_steps_per_second': 3.294, 'total_flos': 2351904685056000.0, 'train_loss': 0.041867565016003534, 'epoch': 3.0})

In [129]:
from huggingface_hub import login
# Start the Hugging Face Hub login flow (rendered as a widget in the cell
# output); presumably required before the push_to_hub calls in the next cell.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [130]:
# NOTE(review): HfApi is imported but not used in the visible cells.
from huggingface_hub import HfApi

# Hub repository id that receives both the model and the tokenizer.
model_name = "waterondaway/xlm-roberta-base"
# Upload the model first, then the tokenizer, to the same repository.
model.push_to_hub(model_name)
# Last expression of the cell: its return value is what the notebook displays.
tokenizer.push_to_hub(model_name)

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/waterondaway/xlm-roberta-base/commit/c56915291027ac3d76a6f03eb790137e0f07cf20', commit_message='Upload tokenizer', commit_description='', oid='c56915291027ac3d76a6f03eb790137e0f07cf20', pr_url=None, repo_url=RepoUrl('https://huggingface.co/waterondaway/xlm-roberta-base', endpoint='https://huggingface.co', repo_type='model', repo_id='waterondaway/xlm-roberta-base'), pr_revision=None, pr_num=None)

In [134]:
# Evaluation on the held-out test split.
# Encode the test file with the same prepare_data pipeline used for train/eval.
test_data = prepare_data('finetuning_data/test.json')
test_dataset = Dataset.from_dict(test_data)
# Last expression of the cell: the returned metrics dict is displayed as output.
trainer.evaluate(test_dataset)

{'eval_loss': 1.8599264876684174e-05,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 11.7613,
 'eval_samples_per_second': 127.537,
 'eval_steps_per_second': 15.985,
 'epoch': 3.0}