## References
   - https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased
   - https://huggingface.co/docs/transformers/index
   - https://simpletransformers.ai/docs/ner-specifics/

In [2]:
import dill
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Prepare Data

In [5]:
path_name = "../../dataset/data/"

# Read the serialized corpus from disk.
# NOTE(review): dill.load can execute code embedded in the file, so only
# open data files that come from a trusted source.
with open(path_name + 'biased-pos.data', 'rb') as corpus_file:
    datatofile = dill.load(corpus_file)

# Reduce every (word, pos, label) triple to a (word, label) pair; a word that
# is empty or whitespace-only is written as '_'. The middle element is unused.
tagged_sents = [
    [('_' if word.strip() == '' else word, label) for word, _pos, label in sentence]
    for sentence in datatofile
]

# Split the sentences into train and test sets (20% held out, fixed seed).
train_sents, test_sents = train_test_split(
    tagged_sents, random_state=42, test_size=0.2
)
for split in (train_sents, test_sents):
    print(len(split))

3999
1000


In [6]:
# Convert the data into a form that Simple Transformers can consume
# format : ["sentence_id", "words", "labels"]
def convert_to_simple_transformer_format(sentences):
    """Flatten tagged sentences into a one-token-per-row DataFrame.

    Args:
        sentences: iterable of sentences, each an iterable of
            ``(word, label)`` pairs.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence_id`` (position of
        the sentence within ``sentences``), ``words`` and ``labels``. Labels
        are upper-cased and every ``-`` in them is replaced by ``_``.
    """
    rows = [
        (position, token, tag.upper().replace("-", "_"))
        for position, sentence in enumerate(sentences)
        for token, tag in sentence
    ]

    if rows:
        # Transpose the (id, word, label) rows into three parallel columns.
        ids, tokens, tags = map(list, zip(*rows))
    else:
        # Nothing to transpose: keep three empty columns.
        ids, tokens, tags = [], [], []

    return pd.DataFrame({"sentence_id": ids, "words": tokens, "labels": tags})

In [11]:
# Flatten both splits into the sentence_id / words / labels layout.
train_data, test_data = map(
    convert_to_simple_transformer_format, (train_sents, test_sents)
)
# Bare last expression so the notebook renders the training frame.
train_data

Unnamed: 0,sentence_id,words,labels
0,0,kevin,O
1,0,_,O
2,0,mitnick,O
3,0,_,O
4,0,เป็น,O
...,...,...,...
11054,399,และ,O
11055,399,ครู,O
11056,399,สอน,O
11057,399,ทฤษฎี,O


# Train

In [None]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Tag inventory handed to the model as `labels`.
_NER_TAGS = ["O", "B_D", "I_D", "B_E", "I_E"]

# Configure the model for training
ner_args = NERArgs()
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.train_batch_size = 32
ner_args.max_seq_length = 256
ner_args.num_train_epochs = 100  # alternative value noted by the author: 10
ner_args.save_model_every_epoch = False
# Pin the RNG seed so repeated runs are comparable; same value as the
# random_state used for the train/test split. Set before the model is built
# so it is part of the args the model receives.
ner_args.manual_seed = 42

model = NERModel(
    "camembert",
    "airesearch/wangchanberta-base-att-spm-uncased",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),  # use the GPU only when one is present
    labels=_NER_TAGS,
)

# Train the model
# NOTE(review): the held-out test split doubles as the evaluation set during
# training; consider a separate validation split if these evaluations are
# used to pick a checkpoint.
model.train_model(train_data, eval_data=test_data)

# Continue Training

In [None]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Tag inventory handed to the model as `labels`.
_NER_TAGS = ["O", "B_D", "I_D", "B_E", "I_E"]

# Same settings as the initial training cell, so the continued run is
# configured like the run that produced the checkpoint.
ner_args = NERArgs()
ner_args.train_batch_size = 32
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.max_seq_length = 256
ner_args.num_train_epochs = 100  # alternative value noted by the author: 10
# Matches the initial training cell, which also disables per-epoch saves.
ner_args.save_model_every_epoch = False
# Pin the RNG seed so repeated runs are comparable.
ner_args.manual_seed = 42

# Load the weights saved under the epoch-91 checkpoint directory.
# NOTE(review): presumably this resumes the interrupted 100-epoch run rather
# than starting 100 fresh epochs - confirm against Simple Transformers'
# checkpoint-resume behaviour.
model_continue = NERModel(
    "camembert",
    'outputs/checkpoint-1820-epoch-91',
    args=ner_args,
    use_cuda=torch.cuda.is_available(),  # use the GPU only when one is present
    labels=_NER_TAGS,
)
model_continue.train_model(train_data, eval_data=test_data)