In [86]:
#adapted from @bact at https://colab.research.google.com/drive/1hdtmwTXHLrqNmDhDqHnTQGpDVy1aJc4t
import csv
import json
import os
import re

import numpy as np
import pandas as pd
import pycrfsuite
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [79]:
# Load the raw corpus file: one record per line, kept in a single `text` column.
# QUOTE_NONE makes the parser treat quote characters as ordinary text, so a
# stray `"` cannot merge several corpus lines into one field.
orchid_raw = pd.read_csv('data/orchid_corpus/orchid97.crp.utf', sep='\t', header=None,
                         quoting=csv.QUOTE_NONE)
orchid_raw.columns = ['text']

# Drop records whose first character is '%' or '#'.
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of `orchid_raw` (avoids SettingWithCopyWarning).
starts_with_marker = orchid_raw.text.str[0].isin(['%', '#'])
orchid = orchid_raw.loc[~starts_with_marker, ['text']].copy()

# Split every record once on '/': "word/POS" gives two parts, "//" gives three.
parts = orchid.text.str.split('/')
# word: text before the first '/'; '<space>' and the empty string both become ' '.
orchid['word'] = [' ' if p[0] in ('<space>', '') else p[0] for p in parts]
# pos: only defined when the record has exactly the "word/POS" shape.
orchid['pos'] = [p[1] if len(p) == 2 else None for p in parts]

# labels: the "//" record is tagged 'E', every other record 'I'.
orchid['lab'] = orchid.text.map(lambda t: 'E' if t == '//' else 'I')
# Keep 'E' records plus records that carry a POS tag; everything else is dropped.
orchid = orchid[(orchid.lab == 'E') | orchid.pos.notna()].reset_index(drop=True)
orchid.shape

(365814, 4)

In [80]:
# Preview only the first rows instead of asking the notebook to render the whole frame.
orchid.head(10)

Unnamed: 0,text,word,pos,lab
0,การ/FIXN,การ,FIXN,I
1,ประชุม/VACT,ประชุม,VACT,I
2,ทาง/NCMN,ทาง,NCMN,I
3,วิชาการ/NCMN,วิชาการ,NCMN,I
4,<space>/PUNC,,PUNC,I
5,ครั้ง/CFQC,ครั้ง,CFQC,I
6,ที่ 1/DONM,ที่ 1,DONM,I
7,//,,,E
8,โครงการวิจัยและพัฒนา/NCMN,โครงการวิจัยและพัฒนา,NCMN,I
9,อิเล็กทรอนิกส์/NCMN,อิเล็กทรอนิกส์,NCMN,I


In [81]:
# Class balance of the 'E' / 'I' labels.
orchid['lab'].value_counts()

I    342689
E     23125
Name: lab, dtype: int64

In [82]:
# Pair every token with its label. Zipping the two columns yields the same
# (word, lab) tuples as looping over iterrows(), without materialising a
# Series object for each of the rows.
orchid_tuples = list(zip(orchid['word'], orchid['lab']))
len(orchid_tuples), orchid_tuples[:3]

(365814, [('การ', 'I'), ('ประชุม', 'I'), ('ทาง', 'I')])

In [83]:
# import pickle
# with open('orchid_tuples.pkl','wb') as f:
#     pickle.dump(orchid_tuples,f)

In [85]:
# Tokens listed here are tagged 'ender' by extract_features; all others 'normal'.
enders = ["ครับ","ค่ะ","คะ","นะคะ","นะ","จ้ะ","จ้า","จ๋า","ฮะ", #ending honorifics
          #enders
          "ๆ","ได้","แล้ว","ด้วย","เลย","มาก","น้อย","กัน","เช่นกัน","เท่านั้น",
          "อยู่","ลง","ขึ้น","มา","ไป","ไว้","เอง","อีก","ใหม่","จริงๆ",
          "บ้าง","หมด","ทีเดียว","เดียว",
          #demonstratives
          "นั้น","นี้","เหล่านี้","เหล่านั้น",
          #questions
          "อย่างไร","ยังไง","หรือไม่","มั้ย","ไหน","อะไร","ทำไม","เมื่อไหร่"]
# Tokens listed here are tagged 'starter' by extract_features; all others 'normal'.
starters = ["ผม","ฉัน","ดิฉัน","ชั้น","คุณ","มัน","เขา","เค้า",
            "เธอ","เรา","พวกเรา","พวกเขา", #pronouns
            #connectors
            "และ","หรือ","แต่","เมื่อ","ถ้า","ใน",
            "ด้วย","เพราะ","เนื่องจาก","ซึ่ง","ไม่",
            "ตอนนี้","ทีนี้","ดังนั้น","เพราะฉะนั้น","ฉะนั้น",
            "ตั้งแต่","ในที่สุด",
            #demonstratives
            "นั้น","นี้","เหล่านี้","เหล่านั้น"]

def extract_features(doc, window=2, max_n_gram=3):
    """Build one list of string features for every token in ``doc``.

    The token sequence is padded with ``window`` copies of ``'xxpad'`` on each
    side. For every real token, all n-grams (n = 1 .. ``max_n_gram``, capped
    at the window width ``2*window + 1``) that fit inside the window around the
    token are emitted three times: as the joined words, as the joined
    'ender'/'normal' tags and as the joined 'starter'/'normal' tags.

    Feature strings look like ``word_{n}_{start}_{stop}=w1|w2``, where
    ``start``/``stop`` are offsets relative to the current token. For each
    (n, start) pair the order is always word, ender, starter, and every list
    begins with ``'bias'``; other code indexes into the list, so the order is
    part of the contract.

    POS-tag n-grams and left/right verb counts existed only as commented-out
    experiments in the original cell and are not produced.

    Args:
        doc: sequence of token strings (any iterable; it is not modified).
        window: number of neighbouring tokens considered on each side.
        max_n_gram: largest n-gram size to emit.

    Returns:
        A list with one feature list per token of ``doc``, in input order.

    Raises:
        ValueError: if ``window`` is negative.
    """
    if window < 0:
        raise ValueError(f'window must be non-negative, got {window}')

    # Pad so that every real token has a full window on both sides.
    pad = ['xxpad'] * window
    tokens = pad + list(doc) + pad

    # Sets give constant-time membership tests; they are rebuilt on each call so
    # later edits to the module-level lists are always picked up.
    ender_vocab = set(enders)
    starter_vocab = set(starters)
    ender_tags = ['ender' if tok in ender_vocab else 'normal' for tok in tokens]
    starter_tags = ['starter' if tok in starter_vocab else 'normal' for tok in tokens]

    # Exclusive upper bound for n: never larger than the window itself.
    n_gram_stop = min(max_n_gram + 1, 2 + window * 2)

    doc_features = []
    for i in range(window, len(tokens) - window):
        #bias term
        word_features = ['bias']

        #ngram features
        for n_gram in range(1, n_gram_stop):
            # j is the n-gram start; the n-gram must end inside the window.
            for j in range(i - window, i + window + 2 - n_gram):
                span = slice(j, j + n_gram)
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                word_features.append(f'word_{feature_position}={"|".join(tokens[span])}')
                word_features.append(f'ender_{feature_position}={"|".join(ender_tags[span])}')
                word_features.append(f'starter_{feature_position}={"|".join(starter_tags[span])}')

        #append to feature per word
        doc_features.append(word_features)
    return doc_features

In [87]:
# Unzip the (word, label) pairs: words feed the feature extractor, labels are the target.
x_pre = [word for word, _ in orchid_tuples]
y = [label for _, label in orchid_tuples]
# One feature list per token, using a two-token window and n-grams up to size 3.
x = extract_features(x_pre, window=2, max_n_gram=3)

In [88]:
# Sanity check: there must be exactly one feature list per label.
(len(x), len(y))

(365814, 365814)

In [119]:
# Contiguous 80/20 split with no shuffling: the first 80% of the token stream
# is used for training and the remaining tail for testing.
idx = int(0.8 * len(x))
x_train, y_train = x[:idx], y[:idx]
x_test, y_test = x[idx:], y[idx:]

In [120]:
# Train model
# Make sure the output directory exists before the (long) optimisation starts;
# on a fresh checkout the model file presumably could not be written otherwise.
os.makedirs('models', exist_ok=True)

trainer = pycrfsuite.Trainer(verbose=True)
# The whole training slice is appended as a single sequence.
trainer.append(x_train, y_train)

trainer.set_params({
    'c1': 1,  # coefficient for L1 regularisation (per CRFsuite docs)
    'c2': 0,  # coefficient for L2 regularisation (per CRFsuite docs)
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('models/orchid-crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 909839
Seconds required: 7.814

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 127325.034399
Feature norm: 1.000000
Error norm: 128912.378896
Active features: 143351
Line search trials: 1
Line search step: 0.000001
Seconds required for this iteration: 0.840

***** Iteration #2 *****
Loss: 110864.992509
Feature norm: 0.875037
Error norm: 125337.966999
Active features: 109489
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.492

***** Iteration #3 *****
Loss: 68629.585051
Feature norm: 0.407951
Error norm: 98991.793282
Active features: 45167
Line search trials: 4
Line search step: 0.125000
Seconds requ

***** Iteration #47 *****
Loss: 18836.584331
Feature norm: 39.329193
Error norm: 516.587512
Active features: 20073
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.426

***** Iteration #48 *****
Loss: 18780.812985
Feature norm: 40.388845
Error norm: 418.190854
Active features: 19717
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.404

***** Iteration #49 *****
Loss: 18735.751858
Feature norm: 41.268594
Error norm: 1460.654828
Active features: 19366
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.404

***** Iteration #50 *****
Loss: 18696.857089
Feature norm: 41.757456
Error norm: 479.592113
Active features: 19030
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.401

***** Iteration #51 *****
Loss: 18659.363969
Feature norm: 42.301160
Error norm: 341.224781
Active features: 18488
Line search trials: 1
Line search step: 1.000000
Sec

***** Iteration #97 *****
Loss: 18345.401171
Feature norm: 49.698444
Error norm: 80.478661
Active features: 14552
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.433

***** Iteration #98 *****
Loss: 18345.038034
Feature norm: 49.711088
Error norm: 105.487015
Active features: 14554
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.451

***** Iteration #99 *****
Loss: 18344.477445
Feature norm: 49.738618
Error norm: 127.908167
Active features: 14519
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.423

***** Iteration #100 *****
Loss: 18344.107161
Feature norm: 49.742980
Error norm: 121.385798
Active features: 14515
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.452

***** Iteration #101 *****
Loss: 18343.747793
Feature norm: 49.752491
Error norm: 54.900155
Active features: 14505
Line search trials: 1
Line search step: 1.000000
Seco

***** Iteration #140 *****
Loss: 18337.559351
Feature norm: 50.005281
Error norm: 117.760792
Active features: 14181
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.406

***** Iteration #141 *****
Loss: 18337.391783
Feature norm: 50.015782
Error norm: 109.618730
Active features: 14168
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.412

***** Iteration #142 *****
Loss: 18337.287819
Feature norm: 50.023364
Error norm: 87.226299
Active features: 14170
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.397

***** Iteration #143 *****
Loss: 18337.211847
Feature norm: 50.027025
Error norm: 61.422328
Active features: 14180
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.386

***** Iteration #144 *****
Loss: 18337.111560
Feature norm: 50.033380
Error norm: 80.103150
Active features: 14174
Line search trials: 1
Line search step: 1.000000
Se

***** Iteration #185 *****
Loss: 18334.184176
Feature norm: 50.351961
Error norm: 38.320981
Active features: 14047
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.740

***** Iteration #186 *****
Loss: 18334.150662
Feature norm: 50.362425
Error norm: 98.975613
Active features: 14041
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.764

***** Iteration #187 *****
Loss: 18334.096792
Feature norm: 50.365831
Error norm: 41.215295
Active features: 14040
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.744

***** Iteration #188 *****
Loss: 18334.063840
Feature norm: 50.377800
Error norm: 93.450291
Active features: 14040
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.771

***** Iteration #189 *****
Loss: 18334.021917
Feature norm: 50.379466
Error norm: 57.501615
Active features: 14034
Line search trials: 2
Line search step: 0.500000
Seco

***** Iteration #231 *****
Loss: 18332.690881
Feature norm: 50.514655
Error norm: 87.388023
Active features: 13952
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.788

***** Iteration #232 *****
Loss: 18332.671970
Feature norm: 50.514750
Error norm: 25.300306
Active features: 13950
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.781

***** Iteration #233 *****
Loss: 18332.666753
Feature norm: 50.516832
Error norm: 112.097705
Active features: 13952
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.746

***** Iteration #234 *****
Loss: 18332.639661
Feature norm: 50.516584
Error norm: 21.791951
Active features: 13950
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.739

***** Iteration #235 *****
Loss: 18332.630124
Feature norm: 50.517601
Error norm: 56.215650
Active features: 13946
Line search trials: 3
Line search step: 0.250000
Sec

In [126]:
# Predict (using test set)
# Load the model file written by the training cell and tag the whole test
# slice in one call, i.e. as a single sequence.
tagger = pycrfsuite.Tagger()
tagger.open('models/orchid-crf.model')
y_pred = tagger.tag(x_test)

In [127]:
# Evaluate at word-level
# The string tags are mapped to integers here. `labels=` is passed explicitly
# so each row of the report is tied to the right name in `target_names`, even
# if one class happens to be absent from both the truths and the predictions.
labels = {'E': 0, "I": 1}
predictions = np.array([labels[tag] for tag in y_pred])
truths = np.array([labels[tag] for tag in y_test])

print(classification_report(
    truths, predictions,
    labels=[labels['E'], labels['I']],
    target_names=["E", "I"]))

              precision    recall  f1-score   support

           E       0.85      0.71      0.77      4520
           I       0.98      0.99      0.99     68643

    accuracy                           0.97     73163
   macro avg       0.91      0.85      0.88     73163
weighted avg       0.97      0.97      0.97     73163



In [128]:
# Name of the feature that carries the token itself (1-gram at offset 0).
CENTER_WORD_PREFIX = 'word_1_0_1='

def center_word(word_features):
    """Return the token text stored in the ``word_1_0_1=`` entry of a feature list.

    Looking the feature up by name avoids relying on its position in the list,
    and slicing off the prefix keeps tokens that themselves contain '=' intact.

    Raises:
        ValueError: if the feature list has no ``word_1_0_1=`` entry.
    """
    for feature in word_features:
        if feature.startswith(CENTER_WORD_PREFIX):
            return feature[len(CENTER_WORD_PREFIX):]
    raise ValueError(f'no {CENTER_WORD_PREFIX!r} feature in feature list')

# One record per test token: the token, its true label and the predicted label.
results = [{'word': center_word(token_features), 'y': truth, 'pred': guess}
           for token_features, truth, guess in zip(x_test, y_test, y_pred)]
result_df = pd.DataFrame(results)[['word','y','pred']]
# 1 where the prediction differs from the truth, 0 where it matches.
result_df['wrong_flag'] = (result_df.y != result_df.pred).astype(int)

In [129]:
# Error rate restricted to tokens whose word is a single space. The loading
# step maps both '<space>' and the empty word of the '//' record to ' ', so
# every 'E' row falls inside this subset.
space_df = result_df[result_df.word == ' '].copy()
space_df.wrong_flag.mean(), space_df.shape

(0.1312552418227565, (14308, 4))

In [130]:
# Same report, limited to the space tokens. Passing `labels=` pins the row
# order to match `target_names` instead of relying on the sorted order of
# whatever classes happen to be present.
print(classification_report(
    space_df.y, space_df.pred,
    labels=["E", "I"],
    target_names=["E", "I"]))

              precision    recall  f1-score   support

           E       0.85      0.71      0.77      4520
           I       0.88      0.94      0.91      9788

    accuracy                           0.87     14308
   macro avg       0.86      0.83      0.84     14308
weighted avg       0.87      0.87      0.87     14308



In [None]:
#               precision    recall  f1-score   support

#            E       0.85      0.71      0.77      4520
#            I       0.88      0.94      0.91      9788

#     accuracy                           0.87     14308
#    macro avg       0.86      0.83      0.84     14308
# weighted avg       0.87      0.87      0.87     14308