In [1]:
import dill
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
path_name = "../../dataset/data/"

with open(path_name + 'biased-pos.data', 'rb') as file:
    datatofile = dill.load(file)

tagged_sents = []
for data in datatofile:
    text_inside = []
    for word, pos, label in data:
        if word.strip() == '':
            text_inside.append(('_', label))
        else:
            text_inside.append((word, label))
    tagged_sents.append(text_inside)

train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)
print(len(train_sents))
print(len(test_sents))

400
100


In [3]:
def convert_to_simple_transformer_format(sentences):
    sentence_id = []
    words = []
    labels = []

    for idx, sents in enumerate(sentences):
        for word, label in sents:
            label = label.upper().replace("-", "_")
            sentence_id.append(idx)
            words.append(word)
            labels.append(label)
    return pd.DataFrame(
        {"sentence_id": sentence_id, "words": words, "labels": labels}
    )    
            


In [4]:
train_ = convert_to_simple_transformer_format(train_sents)
test_ = convert_to_simple_transformer_format(test_sents)
train_

Unnamed: 0,sentence_id,words,labels
0,0,kevin,O
1,0,_,O
2,0,mitnick,O
3,0,_,O
4,0,เป็น,O
...,...,...,...
11054,399,และ,O
11055,399,ครู,O
11056,399,สอน,O
11057,399,ทฤษฎี,O


In [5]:
import torch
from simpletransformers.ner import NERModel, NERArgs

_NER_TAGS = ["O", "B_D", "B_E", "I_D", "I_E"]
ner_args = NERArgs()
ner_args.max_seq_length = 256
test_ner = NERModel("bert", 'outputs/best_model', args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS)

In [11]:
idx = 1
test_pred = (list(map(lambda word_lable: word_lable[0], test_sents[idx])))
predictions, raw_outputs = test_ner.predict([test_pred], split_on_space=False)
print(predictions[0]) 

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'หลังจาก': 'O'}, {'การ': 'O'}, {'ออกเดินทาง': 'O'}, {'จาก': 'O'}, {'แฮร์': 'O'}, {'วูด': 'O'}, {'_': 'O'}, {'ทัวร์': 'O'}, {'ได้': 'O'}, {'เดิน': 'O'}, {'ไป': 'O'}, {'ตาม': 'O'}, {'หุบเขา': 'O'}, {'ท่าเทียบเรือ': 'O'}, {'ผ่าน': 'O'}, {'ออ': 'O'}, {'ต': 'O'}, {'ลี': 'O'}, {'ย์': 'O'}, {'_': 'O'}, {'เบอร์': 'O'}, {'ลี': 'O'}, {'ย์': 'O'}, {'ใน': 'O'}, {'วอร์': 'O'}, {'เฟ': 'O'}, {'เด': 'O'}, {'ล': 'O'}, {'_': 'O'}, {'และ': 'O'}, {'บ้าน': 'O'}, {'ของ': 'O'}, {'วัว': 'O'}, {'และ': 'O'}, {'ลูก': 'O'}, {'วัว': 'O'}, {'ที่': 'O'}, {'มีชื่อเสียง': 'O'}, {'_': 'O'}, {'อิลค์': 'O'}, {'ลี': 'O'}, {'ย์': 'O'}]


In [7]:
from pythainlp.tokenize import word_tokenize

text = "ที่นั่นเขาอาศัยอยู่ในแฟลตห้องเดี่ยวที่ตลาด anarkaliที่มีชื่อเสียงลาฮอร์"
text_token = word_tokenize(text)
predictions, raw_outputs = test_ner.predict([text_token], split_on_space=False)
print(predictions[0]) 

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'ที่นั่น': 'O'}, {'เขา': 'O'}, {'อาศัย': 'O'}, {'อยู่': 'O'}, {'ใน': 'O'}, {'แฟลต': 'O'}, {'ห้อง': 'O'}, {'เดี่ยว': 'O'}, {'ที่': 'O'}, {'ตลาด': 'O'}, {' ': 'O'}, {'anarkali': 'O'}, {'ที่': 'B_D'}, {'มีชื่อเสียง': 'B_E'}, {'ลา': 'O'}, {'ฮอร์': 'O'}]


In [8]:
# get label test list
y_test = []
for sent in test_sents:
    labels = []
    for word, label in sent:
        label = label.upper().replace("-", "_")
        labels.append(label)
    y_test.append(labels)
    
print(y_test[1])


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [9]:
# get label pred list
test_list = []
for sent in test_sents:
    words = []
    for word, label in sent:
        words.append(word)
    test_list.append(words)

predictions, raw_outputs = test_ner.predict(test_list, split_on_space=False)

y_pred = []
for preds in predictions:
    y_pred.append([list(pred.items())[0][1] for pred in preds])

print(y_pred[1])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/13 [00:00<?, ?it/s]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [19]:
y_pred_ = []
y_test_ = []
for i in range(len(y_test)):
    if len(y_pred[i]) != len(y_test[i]):
        continue;
    y_pred_.append(y_pred[i])
    y_test_.append(y_test[i])

In [20]:
def convert_format_lable(list_lable):
    result = []
    for list_ in list_lable:
        result.append(list(map(lambda lable: lable.lower().capitalize().replace("_", "-"), list_)))
    return result

y_test_ = convert_format_lable(y_test_)
y_pred_ = convert_format_lable(y_pred_)



In [21]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

def pos_classification_report(y_true, y_pred):
 
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    print("accuracy",accuracy_score(y_true_combined, y_pred_combined))
    tagset = list(sorted(set(lb.classes_)))
    del tagset[len(tagset)-1] # del O
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset if cls!="O"],
        target_names = tagset,
        zero_division=0
    )
print(pos_classification_report(y_test_,y_pred_))

accuracy 0.9304647160068846
              precision    recall  f1-score   support

         B-d       0.49      0.31      0.38        80
         B-e       0.08      0.04      0.05        27
         I-d       0.00      0.00      0.00        79
         I-e       0.00      0.00      0.00        18

   micro avg       0.41      0.13      0.19       204
   macro avg       0.14      0.09      0.11       204
weighted avg       0.20      0.13      0.16       204
 samples avg       0.01      0.01      0.01       204



In [22]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report

print("accuracy:" ,accuracy_score(y_test_, y_pred_))
print(classification_report(y_test_, y_pred_))

accuracy: 0.9304647160068846
              precision    recall  f1-score   support

           d       0.10      0.06      0.08        80
           e       0.08      0.04      0.05        27

   micro avg       0.10      0.06      0.07       107
   macro avg       0.09      0.05      0.06       107
weighted avg       0.09      0.06      0.07       107

