## Fine-Tuning Language Model 💻

### Pretrained Model for Fine-Tuning Language Model 💻

In [1]:
# Setup Labels for Download XLM Roberta Base
# "O" takes id 0; every entity type then takes two consecutive ids,
# "B-<TYPE>" followed by "I-<TYPE>", in the order listed here.
entity_types = (
    "DRUG_NAME", "DOSAGE", "FORM", "DRUG_REG_NO", "MFG_DATE",
    "EXP_DATE", "WARNINGS", "INDICATIONS", "USAGE_INSTRUCTIONS",
)
bio_tags = ["O"] + [f"{prefix}-{entity}" for entity in entity_types for prefix in ("B", "I")]
# Forward (name -> id) and reverse (id -> name) lookups over the same tag order.
labels_to_id = {tag: index for index, tag in enumerate(bio_tags)}
id_to_labels = dict(enumerate(bio_tags))

In [2]:
# XLM Roberta Base - Pretrained Model Download From HuggingFace
from transformers import XLMRobertaForTokenClassification

# The number of labels and both label maps come from the label dictionaries
# defined in the setup cell, so the model config carries the same names/ids.
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(labels_to_id),
    id2label=id_to_labels,
    label2id=labels_to_id,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Display label of XLM Roberta Base Model
# Shows the label-name -> id mapping stored on the model config
# (the dictionary passed as `label2id` when the model was loaded).
xlm_roberta_model.config.label2id

{'O': 0,
 'B-DRUG_NAME': 1,
 'I-DRUG_NAME': 2,
 'B-DOSAGE': 3,
 'I-DOSAGE': 4,
 'B-FORM': 5,
 'I-FORM': 6,
 'B-DRUG_REG_NO': 7,
 'I-DRUG_REG_NO': 8,
 'B-MFG_DATE': 9,
 'I-MFG_DATE': 10,
 'B-EXP_DATE': 11,
 'I-EXP_DATE': 12,
 'B-INDICATIONS': 15,
 'I-INDICATIONS': 16,
 'B-USAGE_INSTRUCTIONS': 17,
 'I-USAGE_INSTRUCTIONS': 18}

In [4]:
# XLM Roberta Base - Tokenizer Download From HuggingFace
# Loaded from the same "xlm-roberta-base" checkpoint name as the model.
from transformers import XLMRobertaTokenizer
xlm_roberta_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [5]:
# PyThaiNLP - word tokenizer imported from the library
# NOTE(review): `pythainlp_tokenizer` is not referenced by any cell visible in
# this part of the notebook — confirm it is still needed.
from pythainlp.tokenize import word_tokenize as pythainlp_tokenizer

### Fine-Tuning Language Model with XLM Roberta Base - Tokenizer 💻

In [None]:
# Import XLM Roberta Tokenizer Dataset to fine-tuning
import json

# Parse the whole JSON file into memory; the file handle is closed as soon as
# the `with` block exits.
with open('../data/xlm_roberta_tokenizer_format.json', 'r', encoding='utf-8') as dataset_file:
    xlm_roberta_tokenizer_dataset = json.load(dataset_file)

In [None]:
# Mapping dataset
from datasets import Dataset
# Convert the raw JSON records into a Dataset only once. After the first run
# this name already refers to a Dataset whose rows have no "ner_tags" key, so
# rebuilding from it would raise KeyError when the cell is re-run.
if not isinstance(xlm_roberta_tokenizer_dataset, Dataset):
    xlm_roberta_tokenizer_dataset = Dataset.from_dict({
        "tokens": [item["tokens"] for item in xlm_roberta_tokenizer_dataset],
        # The "ner_tags" field of each record becomes the "labels" column.
        "labels": [item["ner_tags"] for item in xlm_roberta_tokenizer_dataset]
    })
print(xlm_roberta_tokenizer_dataset)

In [None]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    """Tokenize pre-split examples and build one label-id row per example.

    Args:
        examples: batch with a 'tokens' column (one list of tokens per example)
            and a 'labels' column (one list of label names per example, looked
            up in `labels_to_id`).

    Returns:
        The tokenizer output with an added 'labels' entry. Every label row has
        exactly the same length as the matching 'input_ids' row; positions
        without a label hold -100.

    When the tokenizer is a fast tokenizer, labels are placed using
    `word_ids()`: the first sub-token of each input token receives its label,
    and special, padding and continuation sub-tokens receive -100.

    Otherwise labels are copied positionally and then truncated or padded to
    the sequence length.
    NOTE(review): the positional fallback does not account for any special
    tokens or sub-word splits the tokenizer introduces, so labels can be
    shifted relative to `input_ids` — prefer a fast tokenizer.
    """
    tokenized_inputs = xlm_roberta_tokenizer(examples['tokens'], padding="max_length", max_length=128, truncation=True, is_split_into_words=True)
    # word_ids() is only available on fast tokenizers.
    can_map_words = getattr(xlm_roberta_tokenizer, "is_fast", False)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        sequence_length = len(tokenized_inputs['input_ids'][i])
        if can_map_words:
            row = []
            previous_word = None
            for word_index in tokenized_inputs.word_ids(batch_index=i):
                # None marks special/padding positions; a repeated index marks
                # a continuation sub-token of the same input token.
                if word_index is None or word_index == previous_word or word_index >= len(label_ids):
                    row.append(-100)
                else:
                    row.append(label_ids[word_index])
                previous_word = word_index
        else:
            # Truncate first so a long label list can never exceed input_ids,
            # then right-pad with -100 up to the sequence length.
            row = label_ids[:sequence_length]
            row += [-100] * (sequence_length - len(row))
        aligned_labels.append(row)
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [None]:
# Align labels of XLM Roberta Base Tokenizer Dataset
# batched=True: the alignment function receives whole batches of rows at once.
xlm_roberta_tokenized_dataset = xlm_roberta_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Display the columns and row count of the tokenized XLM Roberta Base - Tokenizer Dataset
print(xlm_roberta_tokenized_dataset)

In [None]:
# Setup Training Arguments
from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/xlm_roberta_base_tokenizer',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=5,
    # No eval dataset is passed to the Trainer. Evaluation is already off by
    # default, so the deprecated `evaluation_strategy="no"` keyword is omitted.
)
# NOTE(review): this trains the shared `xlm_roberta_model` object in place. The
# PyThaiNLP fine-tuning section passes the same object to its own Trainer, so
# running both sections in one kernel trains the same weights twice — confirm
# that is intended, or reload the pretrained model before each run.
trainer = Trainer(
    model=xlm_roberta_model,
    args=training_arguments,
    train_dataset=xlm_roberta_tokenized_dataset,
    tokenizer=xlm_roberta_tokenizer
)
trainer.train()

### Fine-Tuning Language Model with PyThaiNLP - Tokenizer 💻

In [6]:
# Import PyThaiNLP Tokenizer Dataset to fine-tuning
import json

# Parse the whole JSON file into memory; the file handle is closed as soon as
# the `with` block exits.
with open('../data/pythainlp_tokenizer_format.json', 'r', encoding='utf-8') as dataset_file:
    pythainlp_tokenizer_dataset = json.load(dataset_file)

In [7]:
# Mapping dataset
from datasets import Dataset
# Convert the raw JSON records into a Dataset only once. After the first run
# this name already refers to a Dataset whose rows have no "ner_tags" key, so
# rebuilding from it would raise KeyError when the cell is re-run.
if not isinstance(pythainlp_tokenizer_dataset, Dataset):
    pythainlp_tokenizer_dataset = Dataset.from_dict({
        "tokens": [item["tokens"] for item in pythainlp_tokenizer_dataset],
        # The "ner_tags" field of each record becomes the "labels" column.
        "labels": [item["ner_tags"] for item in pythainlp_tokenizer_dataset]
    })
print(pythainlp_tokenizer_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 2000
})


In [8]:
# Function to align dataset
# NOTE(review): a function with this same name is also defined in the XLM
# Roberta tokenizer section; presumably it is repeated here so this section can
# run on its own — consider defining it once near the top of the notebook.
def tokenize_and_align_labels(examples):
    """Tokenize pre-split examples and build one label-id row per example.

    Args:
        examples: batch with a 'tokens' column (one list of tokens per example)
            and a 'labels' column (one list of label names per example, looked
            up in `labels_to_id`).

    Returns:
        The tokenizer output with an added 'labels' entry. Every label row has
        exactly the same length as the matching 'input_ids' row; positions
        without a label hold -100.

    When the tokenizer is a fast tokenizer, labels are placed using
    `word_ids()`: the first sub-token of each input token receives its label,
    and special, padding and continuation sub-tokens receive -100.

    Otherwise labels are copied positionally and then truncated or padded to
    the sequence length.
    NOTE(review): the positional fallback does not account for any special
    tokens or sub-word splits the tokenizer introduces, so labels can be
    shifted relative to `input_ids` — prefer a fast tokenizer.
    """
    tokenized_inputs = xlm_roberta_tokenizer(examples['tokens'], padding="max_length", max_length=128, truncation=True, is_split_into_words=True)
    # word_ids() is only available on fast tokenizers.
    can_map_words = getattr(xlm_roberta_tokenizer, "is_fast", False)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        sequence_length = len(tokenized_inputs['input_ids'][i])
        if can_map_words:
            row = []
            previous_word = None
            for word_index in tokenized_inputs.word_ids(batch_index=i):
                # None marks special/padding positions; a repeated index marks
                # a continuation sub-token of the same input token.
                if word_index is None or word_index == previous_word or word_index >= len(label_ids):
                    row.append(-100)
                else:
                    row.append(label_ids[word_index])
                previous_word = word_index
        else:
            # Truncate first so a long label list can never exceed input_ids,
            # then right-pad with -100 up to the sequence length.
            row = label_ids[:sequence_length]
            row += [-100] * (sequence_length - len(row))
        aligned_labels.append(row)
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [9]:
# Align labels of PyThaiNLP Tokenizer Dataset
# batched=True: the alignment function receives whole batches of rows at once.
pythainlp_tokenized_dataset = pythainlp_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 2186.48 examples/s]


In [11]:
# Display the columns and row count of the tokenized PyThaiNLP Tokenizer Dataset
print(pythainlp_tokenized_dataset)

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2000
})


In [12]:
# Setup Training Arguments
from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/pythainlp_tokenizer',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=5,
    # No eval dataset is passed to the Trainer. Evaluation is already off by
    # default, so the deprecated `evaluation_strategy="no"` keyword is omitted.
)
# NOTE(review): this trains the shared `xlm_roberta_model` object in place. The
# XLM Roberta tokenizer section passes the same object to its own Trainer, so
# running both sections in one kernel trains the same weights twice — confirm
# that is intended, or reload the pretrained model before each run.
trainer = Trainer(
    model=xlm_roberta_model,
    args=training_arguments,
    train_dataset=pythainlp_tokenized_dataset,
    tokenizer=xlm_roberta_tokenizer
)
trainer.train()

  trainer = Trainer(


Step,Training Loss
5,2.9067
10,2.6802
15,2.0124
20,1.5834
25,1.4348
30,1.3275
35,1.1465
40,1.0429
45,1.0435
50,0.9306


TrainOutput(global_step=500, training_loss=0.6483626422882081, metrics={'train_runtime': 1960.6805, 'train_samples_per_second': 1.02, 'train_steps_per_second': 0.255, 'total_flos': 130668458496000.0, 'train_loss': 0.6483626422882081, 'epoch': 1.0})

In [17]:
# Save Fine-Tuning Model
# The model and the tokenizer are both written to the same directory.
model_dir = '../model'
xlm_roberta_model.save_pretrained(model_dir)
xlm_roberta_tokenizer.save_pretrained(model_dir)

('../model/tokenizer_config.json',
 '../model/special_tokens_map.json',
 '../model/sentencepiece.bpe.model',
 '../model/added_tokens.json')