# XLM-RoBERTa

In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("token-classification", model="xlm-roberta-base")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


In [2]:
text = "กรุงเทพมหานครเป็นเมืองหลวงของประเทศไทย"
results = pipe(text)
print(results)

[{'entity': 'LABEL_1', 'score': np.float32(0.5878626), 'index': 1, 'word': '▁กรุงเทพมหานคร', 'start': 0, 'end': 13}, {'entity': 'LABEL_1', 'score': np.float32(0.5905951), 'index': 2, 'word': 'เป็น', 'start': 13, 'end': 17}, {'entity': 'LABEL_1', 'score': np.float32(0.59489083), 'index': 3, 'word': 'เมือง', 'start': 17, 'end': 22}, {'entity': 'LABEL_1', 'score': np.float32(0.63031447), 'index': 4, 'word': 'หลวง', 'start': 22, 'end': 26}, {'entity': 'LABEL_1', 'score': np.float32(0.60539097), 'index': 5, 'word': 'ของ', 'start': 26, 'end': 29}, {'entity': 'LABEL_1', 'score': np.float32(0.6170814), 'index': 6, 'word': 'ประเทศไทย', 'start': 29, 'end': 38}]


In [3]:
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']}")

Entity: ▁กรุงเทพมหานคร, Label: LABEL_1, Score: 0.5878626108169556
Entity: เป็น, Label: LABEL_1, Score: 0.5905951261520386
Entity: เมือง, Label: LABEL_1, Score: 0.594890832901001
Entity: หลวง, Label: LABEL_1, Score: 0.6303144693374634
Entity: ของ, Label: LABEL_1, Score: 0.6053909659385681
Entity: ประเทศไทย, Label: LABEL_1, Score: 0.6170814037322998


# Bert-Base-NER

In [4]:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("token-classification", model="dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [5]:
text = "Barack Obama was born in Hawaii."
results = pipe(text)
print(results)

[{'entity': 'B-PER', 'score': np.float32(0.9993894), 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': np.float32(0.9991943), 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'B-LOC', 'score': np.float32(0.9997441), 'index': 6, 'word': 'Hawaii', 'start': 25, 'end': 31}]


In [6]:
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']}")

Entity: Barack, Label: B-PER, Score: 0.9993894100189209
Entity: Obama, Label: I-PER, Score: 0.999194324016571
Entity: Hawaii, Label: B-LOC, Score: 0.9997441172599792


In [7]:
text = "กรุงเทพมหานครเป็นเมืองหลวงของประเทศไทย"
results = pipe(text)
print(results)

[]


In [8]:
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']}")

# Data Preprocessing

In [9]:
data = {
    'tokens': [
        ['พาราเซตามอล', 'ใช้', 'บรรเทา', 'อาการ', 'ปวดหัว', 'รับประทาน', 'วันละ', '2', 'ครั้ง']
    ],
    'ner_tags': [
        [0, 1, 3, 3, 3, 1, 1, 2, 2]
    ]
}

In [10]:
from datasets import Dataset
dataset = Dataset.from_dict(data)

In [11]:
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification, Trainer, TrainingArguments
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=4)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_function(examples):
    return tokenizer(examples['tokens'], padding='max_length', truncation=True, is_split_into_words=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1/1 [00:00<00:00, 39.37 examples/s]


In [13]:
tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask'],
    num_rows: 1
})

In [16]:
training_args = TrainingArguments(
    output_dir='./results',          
    evaluation_strategy="no",     
    learning_rate=2e-5,               
    per_device_train_batch_size=8,   
    num_train_epochs=3,              
    weight_decay=0.01,               
)
trainer = Trainer(
    model=model,                     
    args=training_args,              
    train_dataset=tokenized_datasets,  
)

In [None]:
trainer.train()