In [1]:
import dill
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
path_name = "../../dataset/data/"

# อ่านข้อมูลจากไฟล์
with open(path_name + 'biased-pos.data', 'rb') as file:
    datatofile = dill.load(file)

# แบ่งข้อมูลเป็น train และ test
tagged_sents = []
for data in datatofile:
    text_inside = []
    for word, pos, label in data:
        if word.strip() == '': # ถ้า word เป็นช่องว่าง ให้แทนที่ด้วย _
            text_inside.append(('_', label))
        else:
            text_inside.append((word, label))
    tagged_sents.append(text_inside)

train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)
print(len(train_sents))
print(len(test_sents))

3999
1000


In [3]:
# ทำให้ข้อมูลอยู่ในรูปที่สามารถใช้กับ Simple transformers ได้
# format : ["sentence_id", "words", "labels"]
def convert_to_simple_transformer_format(sentences):
    sentence_id = []
    words = []
    labels = []

    for idx, sents in enumerate(sentences):
        for word, label in sents:
            label = label.upper().replace("-", "_")
            sentence_id.append(idx)
            words.append(word)
            labels.append(label)
    return pd.DataFrame(
        {"sentence_id": sentence_id, "words": words, "labels": labels}
    )    

In [4]:
train_ = convert_to_simple_transformer_format(train_sents)
test_ = convert_to_simple_transformer_format(test_sents)
train_

Unnamed: 0,sentence_id,words,labels
0,0,Michael,O
1,0,_,O
2,0,McDowell,O
3,0,_,O
4,0,เกิด,O
...,...,...,...
133451,3998,และ,O
133452,3998,ทำให้,O
133453,3998,ไดรฟ์,O
133454,3998,มีชีวิต,O


In [5]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Configure the model for evaluation
_NER_TAGS = ["O", "B_D", "B_E", "I_D", "I_E"]
ner_args = NERArgs()
# ner_args.max_seq_length = 256
test_ner = NERModel("camembert", "kittisak612/bias-tagger", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS)

  "use_multiprocessing automatically disabled as CamemBERT"


In [6]:
# predict word label from test
idx = 1
test_pred = (list(map(lambda word_lable: word_lable[0], test_sents[idx])))
predictions, raw_outputs = test_ner.predict([test_pred], split_on_space=False)
print(predictions[0]) 

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'ใน': 'O'}, {'วันที่': 'O'}, {'_': 'O'}, {'19': 'O'}, {'_': 'O'}, {'พฤษภาคม': 'O'}, {'_': 'O'}, {'2016': 'O'}, {'_': 'O'}, {'หนึ่ง': 'O'}, {'วันก่อน': 'O'}, {'ดร.': 'O'}, {'_': 'O'}, {'Tsai': 'O'}, {'_': 'O'}, {'Ing-Wen': 'O'}, {'_': 'O'}, {'สันนิษฐาน': 'O'}, {'ว่า': 'O'}, {'ประธานาธิบดี': 'O'}, {'ที่': 'O'}, {'ได้รับ': 'O'}, {'การเลือกตั้ง': 'O'}, {'จาก': 'O'}, {'ประชาธิปไตย': 'O'}, {'ของ': 'O'}, {'ไต้หวัน': 'O'}, {'สหรัฐอเมริกา': 'O'}, {'_': 'O'}, {'วุฒิสมาชิก': 'O'}, {'มาร': 'O'}, {'์': 'O'}, {'โก': 'O'}, {'รู': 'O'}, {'บิ': 'O'}, {'โอ': 'O'}, {'_': 'O'}, {'(': 'O'}, {'R-FL': 'O'}, {')': 'O'}, {'_': 'O'}, {'สมาชิก': 'O'}, {'ของ': 'O'}, {'คณะกรรมการ': 'O'}, {'ความสัมพันธ์': 'O'}, {'ระหว่างประเทศ': 'O'}, {'ของ': 'O'}, {'วุฒิสภา': 'O'}, {'และ': 'O'}, {'คณะกรรมการ': 'O'}, {'วุฒิสภา': 'O'}, {'เลือก': 'O'}, {'คณะ': 'O'}, {'ข่าวกรอง': 'O'}, {'และ': 'O'}, {'_': 'O'}, {'Bob': 'O'}, {'_': 'O'}, {'Menendez': 'O'}]


In [7]:
# predict word label from text input
from pythainlp.tokenize import word_tokenize

text = "ที่นั่นเขาอาศัยอยู่ในแฟลตห้องเดี่ยวที่ตลาด anarkaliที่มีชื่อเสียงลาฮอร์"
text_token = word_tokenize(text)
predictions, raw_outputs = test_ner.predict([text_token], split_on_space=False)
print(predictions[0]) 

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'ที่นั่น': 'O'}, {'เขา': 'O'}, {'อาศัย': 'O'}, {'อยู่': 'O'}, {'ใน': 'O'}, {'แฟลต': 'O'}, {'ห้อง': 'O'}, {'เดี่ยว': 'O'}, {'ที่': 'O'}, {'ตลาด': 'O'}, {' ': 'O'}, {'anarkali': 'O'}, {'ที่': 'B_D'}, {'มีชื่อเสียง': 'I_D'}, {'ลา': 'O'}, {'ฮอร์': 'O'}]


In [8]:
# get label test list
y_test = []
y_test_words = []
for sent in test_sents:
    labels = []
    words = []
    for word, label in sent:
        label = label.upper().replace("-", "_")
        labels.append(label)
        words.append(word)
    y_test.append(labels)
    y_test_words.append(words)
    
print(y_test[4])
print(y_test_words[4])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_D', 'O', 'O', 'O', 'O']
['Jerry', '_', 'Douglas', '_', '(', 'เกิด', 'ที่', '_', 'Warren', ',', '_', 'Ohio', '_', 'เมื่อ', 'วันที่', '_', '28', '_', 'พฤษภาคม', '_', '1956', ')', '_', 'เป็น', 'ผู้', 'เล่น', '_', 'Virtuoso', 'Dobro', '_', 'ชาว', 'อเมริกัน']


In [9]:
# get label pred list
test_list = []
for sent in test_sents:
    words = []
    for word, label in sent:
        words.append(word)
    test_list.append(words)

predictions, raw_outputs = test_ner.predict(test_list, split_on_space=False)

y_pred = []
for preds in predictions:
    y_pred.append([list(pred.items())[0][1] for pred in preds])

print(y_test[4])
print(y_test_words[4])
print(y_pred[4])

  0%|          | 0/1000 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/125 [00:00<?, ?it/s]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_D', 'O', 'O', 'O', 'O']
['Jerry', '_', 'Douglas', '_', '(', 'เกิด', 'ที่', '_', 'Warren', ',', '_', 'Ohio', '_', 'เมื่อ', 'วันที่', '_', '28', '_', 'พฤษภาคม', '_', '1956', ')', '_', 'เป็น', 'ผู้', 'เล่น', '_', 'Virtuoso', 'Dobro', '_', 'ชาว', 'อเมริกัน']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [11]:
# get label to list
y_pred_ = []
y_test_ = []
for i in range(len(y_test)):
    if len(y_pred[i]) != len(y_test[i]):
        continue;
    y_pred_.append(y_pred[i])
    y_test_.append(y_test[i])
print(y_test_[1])

['O', 'O', 'B_D', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [12]:
# convert label format 
#from B_D to B-d
def convert_format_lable(list_lable):
    result = []
    for list_ in list_lable:
        result.append(list(map(lambda lable: lable.lower().capitalize().replace("_", "-"), list_)))
    return result

y_test_ = convert_format_lable(y_test_)
y_pred_ = convert_format_lable(y_pred_)
print(y_test_[1])

['O', 'O', 'B-d', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [57]:
# การประเมินในระดับคำ
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

def pos_classification_report(y_true, y_pred):
 
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    print("accuracy",accuracy_score(y_true_combined, y_pred_combined))
    tagset = list(sorted(set(lb.classes_)))
    del tagset[len(tagset)-1] # del O
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset if cls!="O"],
        target_names = tagset,
        zero_division=0
    )
print(pos_classification_report(y_test_,y_pred_))

accuracy 0.8937643500127556
              precision    recall  f1-score   support

         B-d       0.36      0.34      0.35       488
         B-e       0.20      0.01      0.02       968
         I-d       0.32      0.41      0.36       394
         I-e       0.00      0.00      0.00       997

   micro avg       0.33      0.12      0.17      2847
   macro avg       0.22      0.19      0.18      2847
weighted avg       0.18      0.12      0.12      2847
 samples avg       0.01      0.01      0.01      2847



In [58]:
# การประเมินในระดับประโยค
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report

print("accuracy:" ,accuracy_score(y_test_, y_pred_))
print(classification_report(y_test_, y_pred_))

accuracy: 0.8937643500127556
              precision    recall  f1-score   support

           d       0.26      0.28      0.27       489
           e       0.10      0.01      0.01       969

   micro avg       0.25      0.10      0.14      1458
   macro avg       0.18      0.14      0.14      1458
weighted avg       0.15      0.10      0.10      1458

