## Fine-Tuning Language Model ðŸ’»

### Pretrained Model for Fine-Tuning Language Model ðŸ’»

In [1]:
# Setup Labels for Download XLM Roberta Base
labels_to_id = {
    "O": 0,
    "B-DRUG_NAME": 1, "I-DRUG_NAME": 2,
    "B-DOSAGE": 3, "I-DOSAGE": 4,
    "B-FORM": 5, "I-FORM": 6,
    "B-DRUG_REG_NO": 7, "I-DRUG_REG_NO": 8,
    "B-MFG_DATE": 9, "I-MFG_DATE": 10,
    "B-EXP_DATE": 11, "I-EXP_DATE": 12,
    "B-WARNINGS": 13, "I-WARNINGS": 14,
    "B-INDICATIONS": 15, "I-INDICATIONS": 16,
    "B-USAGE_INSTRUCTIONS": 17, "I-USAGE_INSTRUCTIONS": 18
}
id_to_labels = {v: k for k, v in labels_to_id.items()}

In [None]:
# XLM Roberta Base - Pretrained Model Download From HuggingFace
from transformers import XLMRobertaForTokenClassification
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(labels_to_id), id2label=id_to_labels, label2id=labels_to_id)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Display label of XLM Roberta Base Model
xlm_roberta_model.config.label2id

{'O': 0,
 'B-DRUG_NAME': 1,
 'I-DRUG_NAME': 2,
 'B-DOSAGE': 3,
 'I-DOSAGE': 4,
 'B-FORM': 5,
 'I-FORM': 6,
 'B-DRUG_REG_NO': 7,
 'I-DRUG_REG_NO': 8,
 'B-MFG_DATE': 9,
 'I-MFG_DATE': 10,
 'B-EXP_DATE': 11,
 'I-EXP_DATE': 12,
 'B-INDICATIONS': 15,
 'I-INDICATIONS': 16,
 'B-USAGE_INSTRUCTIONS': 17,
 'I-USAGE_INSTRUCTIONS': 18}

In [4]:
# Custom Tokenizer - Tokenizer Download From HuggingFace
from transformers import XLMRobertaTokenizer
custom_tokenizer = XLMRobertaTokenizer.from_pretrained("../output/tokenizer/custom_tokenizer")

In [None]:
# XLM Roberta Base - Tokenizer Download From HuggingFace
from transformers import XLMRobertaTokenizer
xlm_roberta_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
# PythaiNLP - Tokenizer Download From Library
from pythainlp.tokenize import word_tokenize as pythainlp_tokenizer

### Fine-Tuning Language Model with XLM Roberta Base - Tokenizer ðŸ’»

In [None]:
# Import XLM Roberta Tokenizer Dataset to fine-tuning
import json
with open('../data/xlm_roberta_tokenizer_format.json', 'r', encoding='utf-8') as f:
    xlm_roberta_tokenizer_dataset = json.load(f)

In [None]:
# Mapping dataset
from datasets import Dataset
xlm_roberta_tokenizer_dataset = Dataset.from_dict({
    "tokens": [item["tokens"] for item in xlm_roberta_tokenizer_dataset],
    "labels": [item["ner_tags"] for item in xlm_roberta_tokenizer_dataset]
})
print(xlm_roberta_tokenizer_dataset)

In [None]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlm_roberta_tokenizer(examples['tokens'], padding="max_length", max_length=128, truncation=True, is_split_into_words=True)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        padding_length = len(tokenized_inputs['input_ids'][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        aligned_labels.append(label_ids)
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [None]:
# Align labels of XLM Roberta Base Tokenizer Dataset
xlm_roberta_tokenized_dataset = xlm_roberta_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Display number of XLM Roberta Base - Tokenizer Dataset
print(xlm_roberta_tokenized_dataset)

In [None]:
# Setup Traning Arguments
from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/xlm_roberta_base_tokenizer',         
    num_train_epochs=1,            
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    warmup_steps=10, 
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=5,
    evaluation_strategy="no",
)
trainer = Trainer(
    model=xlm_roberta_model, 
    args=training_arguments, 
    train_dataset=xlm_roberta_tokenized_dataset,
    tokenizer=xlm_roberta_tokenizer
)
trainer.train()

### Fine-Tuning Language Model with PythaiNLP - Tokenizer ðŸ’»

In [None]:
# Import PythaiNLP Tokenizer Dataset to fine-tuning
import json
with open('../data/pythainlp_tokenizer_format.json', 'r', encoding='utf-8') as f:
    pythainlp_tokenizer_dataset = json.load(f)

In [None]:
# Mapping dataset
from datasets import Dataset
pythainlp_tokenizer_dataset = Dataset.from_dict({
    "tokens": [item["tokens"] for item in pythainlp_tokenizer_dataset],
    "labels": [item["ner_tags"] for item in pythainlp_tokenizer_dataset]
})
print(pythainlp_tokenizer_dataset)

In [None]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlm_roberta_tokenizer(examples['tokens'], padding="max_length", max_length=128, truncation=True, is_split_into_words=True)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        padding_length = len(tokenized_inputs['input_ids'][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        aligned_labels.append(label_ids)
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [None]:
# Align labels of PythaiNLP Tokenizer Dataset
pythainlp_tokenized_dataset = pythainlp_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Display number of PythaiNLP Tokenizer Dataset
print(pythainlp_tokenized_dataset)

In [None]:
# Setup Traning Arguments
from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/pythainlp_tokenizer',         
    num_train_epochs=1,            
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    warmup_steps=10, 
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=5,
    evaluation_strategy="no",
)
trainer = Trainer(
    model=xlm_roberta_model, 
    args=training_arguments, 
    train_dataset=pythainlp_tokenized_dataset,
    tokenizer=xlm_roberta_tokenizer
)
trainer.train()

In [None]:
# Save Fine-Tuning Model
xlm_roberta_model.save_pretrained('../model')
xlm_roberta_tokenizer.save_pretrained('../model')

### Fine-Tuning Language Model with Custom - Tokenizer ðŸ’»

In [5]:
# Resize Embeddings of Model
xlm_roberta_model.resize_token_embeddings(len(custom_tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(251327, 768, padding_idx=1)

In [6]:
# Import Custom Tokenizer Dataset to fine-tuning
import json
with open('../data/custom_tokenizer_format.json', 'r', encoding='utf-8') as f:
    custom_tokenizer_dataset = json.load(f)

In [7]:
# Mapping dataset
from datasets import Dataset
custom_tokenizer_dataset = Dataset.from_dict({
    "tokens": [item["tokens"] for item in custom_tokenizer_dataset],
    "labels": [item["ner_tags"] for item in custom_tokenizer_dataset]
})
print(custom_tokenizer_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 2000
})


In [8]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = custom_tokenizer(examples['tokens'], padding="longest", truncation=True, return_tensors="pt", is_split_into_words=True)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        padding_length = len(tokenized_inputs['input_ids'][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        aligned_labels.append(label_ids)
        
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [9]:
# Align labels of Custom Tokenizer Dataset
custom_tokenized_dataset = custom_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2000/2000 [00:01<00:00, 1961.22 examples/s]


In [10]:
# Display number of XLM Roberta Base - Tokenizer Dataset
print(custom_tokenized_dataset)

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2000
})


In [11]:
# Setup Traning Arguments
train_dataset = custom_tokenized_dataset.select(range(0, 10))
eval_dataset = custom_tokenized_dataset.select(range(10, 15))

from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/model/custom_tokenizer',         
    num_train_epochs=3,            
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=xlm_roberta_model, 
    args=training_arguments, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=custom_tokenizer
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.323858
2,No log,2.142396
3,No log,2.105615


TrainOutput(global_step=9, training_loss=2.274640613132053, metrics={'train_runtime': 143.9111, 'train_samples_per_second': 0.208, 'train_steps_per_second': 0.063, 'total_flos': 520632139320.0, 'train_loss': 2.274640613132053, 'epoch': 3.0})

In [12]:
# Save Fine-Tuning Model
xlm_roberta_model.save_pretrained('../nam/output/model/custom_tokenizer')
custom_tokenizer.save_pretrained('../nam/output/tokenizer/custom_tokenizer')

('../nam/output/tokenizer/custom_tokenizer/tokenizer_config.json',
 '../nam/output/tokenizer/custom_tokenizer/special_tokens_map.json',
 '../nam/output/tokenizer/custom_tokenizer/sentencepiece.bpe.model',
 '../nam/output/tokenizer/custom_tokenizer/added_tokens.json')