In [1]:
# Setup Labels for Download XLM Roberta Base
labels_to_id = {
    "O": 0,
    "B-DRUG_NAME": 1, "I-DRUG_NAME": 2,
    "B-DOSAGE": 3, "I-DOSAGE": 4,
    "B-FORM": 5, "I-FORM": 6,
    "B-WARNINGS": 7, "I-WARNINGS": 8,
    "B-INDICATIONS": 9, "I-INDICATIONS": 10,
    "B-USAGE_INSTRUCTIONS": 11, "I-USAGE_INSTRUCTIONS": 12
}
id_to_labels = {v: k for k, v in labels_to_id.items()}

In [2]:
# XLM Roberta Base - Pretrained Model Download From HuggingFace
from transformers import XLMRobertaForTokenClassification
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(labels_to_id), id2label=id_to_labels, label2id=labels_to_id)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# XLM Roberta Base - Tokenizer Download From HuggingFace
from transformers import XLMRobertaTokenizer
xlm_roberta_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [3]:
# Import Custom Tokenizer Dataset to fine-tuning
import json
with open('../data/xlm_roberta_tokenizer_format.json', 'r', encoding='utf-8') as f:
    custom_tokenizer_dataset = json.load(f)

In [4]:
# Mapping dataset
from datasets import Dataset
custom_tokenizer_dataset = Dataset.from_dict({
    "tokens": [item["tokens"] for item in custom_tokenizer_dataset],
    "labels": [item["ner_tags"] for item in custom_tokenizer_dataset]
})
print(custom_tokenizer_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 2000
})


In [6]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlm_roberta_tokenizer(examples['tokens'], padding="longest", truncation=True, return_tensors="pt", is_split_into_words=True)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        padding_length = len(tokenized_inputs['input_ids'][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        aligned_labels.append(label_ids)
        
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [7]:
# Align labels of Custom Tokenizer Dataset
custom_tokenized_dataset = custom_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 2143.11 examples/s]


In [13]:
# Setup Traning Arguments
train_dataset = custom_tokenized_dataset.select(range(0, 80))
eval_dataset = custom_tokenized_dataset.select(range(80, 100))

from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/model/original_tokenizer',         
    num_train_epochs=3,            
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=xlm_roberta_model, 
    args=training_arguments, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=xlm_roberta_tokenizer
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.055734
2,No log,0.669925
3,No log,0.619236


TrainOutput(global_step=60, training_loss=1.123220698038737, metrics={'train_runtime': 263.4225, 'train_samples_per_second': 0.911, 'train_steps_per_second': 0.228, 'total_flos': 12861978746400.0, 'train_loss': 1.123220698038737, 'epoch': 3.0})

In [18]:
from transformers import pipeline

# สร้าง NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/original_tokenizer/checkpoint-60', tokenizer='../output/model/original_tokenizer/checkpoint-60', aggregation_strategy="simple")

# ทดสอบทำนาย
text = "Aspirin Erythromycin"
print(xlm_roberta_tokenizer.tokenize(text))
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


['▁A', 'spir', 'in', '▁Er', 'y', 'thro', 'my', 'cin']
[]
