## Fine-Tuning Language Model üíª

In [1]:
# Setup Labels for Download XLM Roberta Base
labels_to_id = {
    "O": 0,
    "B-DRUG_NAME": 1, "I-DRUG_NAME": 2,
    "B-DOSAGE": 3, "I-DOSAGE": 4,
    "B-FORM": 5, "I-FORM": 6,
    "B-WARNINGS": 7, "I-WARNINGS": 8,
    "B-INDICATIONS": 9, "I-INDICATIONS": 10,
    "B-USAGE_INSTRUCTIONS": 11, "I-USAGE_INSTRUCTIONS": 12
}
id_to_labels = {v: k for k, v in labels_to_id.items()}

In [3]:
# XLM Roberta Base - Pretrained Model Download From HuggingFace
from transformers import XLMRobertaForTokenClassification
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(labels_to_id), id2label=id_to_labels, label2id=labels_to_id)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Custom Tokenizer - Tokenizer Download From HuggingFace
from transformers import XLMRobertaTokenizer
custom_tokenizer = XLMRobertaTokenizer.from_pretrained("../tokenizer")

In [5]:
# Resize Embeddings of Model
xlm_roberta_model.resize_token_embeddings(len(custom_tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(250063, 768, padding_idx=1)

In [6]:
# Import Custom Tokenizer Dataset to fine-tuning
import json
with open('../data/custom_tokenizer_format.json', 'r', encoding='utf-8') as f:
    custom_tokenizer_dataset = json.load(f)

In [7]:
# Mapping dataset
from datasets import Dataset
custom_tokenizer_dataset = Dataset.from_dict({
    "tokens": [item["tokens"] for item in custom_tokenizer_dataset],
    "labels": [item["ner_tags"] for item in custom_tokenizer_dataset]
})
print(custom_tokenizer_dataset)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 2000
})


In [8]:
# Function to align dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = custom_tokenizer(examples['tokens'], padding="longest", truncation=True, return_tensors="pt", is_split_into_words=True)
    aligned_labels = []
    for i, labels in enumerate(examples['labels']):
        label_ids = [labels_to_id[label] for label in labels]
        padding_length = len(tokenized_inputs['input_ids'][i]) - len(label_ids)
        label_ids += [-100] * padding_length
        aligned_labels.append(label_ids)
        
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [9]:
# Align labels of Custom Tokenizer Dataset
custom_tokenized_dataset = custom_tokenizer_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [00:00<00:00, 2418.98 examples/s]


In [10]:
# Display number of XLM Roberta Base - Tokenizer Dataset
print(custom_tokenized_dataset)

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2000
})


In [11]:
# Setup Traning Arguments
train_dataset = custom_tokenized_dataset.select(range(0, 800))
eval_dataset = custom_tokenized_dataset.select(range(800, 1000))

from transformers import Trainer, TrainingArguments
training_arguments = TrainingArguments(
    output_dir='../output/model/custom_tokenizer',         
    num_train_epochs=10,            
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=xlm_roberta_model, 
    args=training_arguments, 
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=custom_tokenizer
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5482,0.415683
2,0.3695,0.29285
3,0.2733,0.214964
4,0.1955,0.172715
5,0.1412,0.134123
6,0.118,0.161079
7,0.0842,0.105103
8,0.0584,0.095312


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 46016 vs 45908

In [12]:
# ‡πÉ‡∏ä‡πâ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡∏∞ tokenizer ‡∏à‡∏≤‡∏Å trainer ‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á
model = trainer.model
tokenizer = trainer.tokenizer

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [13]:
print(model.config.id2label)



In [None]:
from transformers import pipeline

# ‡∏™‡∏£‡πâ‡∏≤‡∏á NER pipeline
ner_pipeline = pipeline("ner", model='../output/model/custom_tokenizer/v1', tokenizer='../output/model/custom_tokenizer/v1', aggregation_strategy="simple")

# ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
text = "‡∏ò‡∏ô‡∏†‡∏ì ‡∏û‡∏£‡∏™‡∏µ‡∏°‡∏≤"
ner_results = ner_pipeline(text)
print(ner_results)

for entity in ner_results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.3f}")


Device set to use mps:0


26-08-18 ‡∏Ñ‡∏ß‡∏£ (INDICATIONS): 0.567
‡πÉ‡∏ä‡πâ‡∏ï‡∏≤‡∏°‡∏Ñ‡πç‡∏≤‡πÅ‡∏ô‡∏∞‡∏ô‡πç‡∏≤‡∏Ç‡∏≠‡∏á‡πÅ‡∏û‡∏ó‡∏¢‡πå ‡∏•‡∏î‡πÑ‡∏Ç‡πâ‡πÉ‡∏ô‡∏ú‡∏π‡πâ‡∏õ‡πà‡∏ß‡∏¢‡πÑ‡∏Ç‡πâ‡∏™‡∏π‡∏á ‡πÄ‡∏Ç (USAGE_INSTRUCTIONS): 0.960
‡∏¢‡πà‡∏≤‡∏Ç‡∏ß‡∏î‡πÉ‡∏´‡πâ‡πÄ‡∏Ç‡πâ‡∏≤‡∏Å‡∏±‡∏ô‡∏Å‡πà‡∏≠‡∏ô‡∏ó‡∏∏‡∏Å‡∏Ñ‡∏£‡∏±‡πâ‡∏á (INDICATIONS): 0.852
