In [2]:
#adapted from @bact at https://colab.research.google.com/drive/1hdtmwTXHLrqNmDhDqHnTQGpDVy1aJc4t
import json
import pandas as pd
import numpy as np
import re
import pycrfsuite
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
with open('data/talks.transcript.th-en.24-10-2019_11:12.json', 'r') as f:
    data = json.load(f)

In [3]:
talk_idx = []
idx = []
th_phrases = []
en_phrases = []

for i in range(len(data)):
    nb_phrases = min(len(data[i]['en']),len(data[i]['th']))
    for j in range(nb_phrases):
        talk_idx.append(i)
        th_phrases.append(data[i]['th'][j])
        en_phrases.append(data[i]['en'][j])
        if data[i]['en'][j][-1]=='.':
            idx.append(1)
        else:
            idx.append(0)

In [4]:
phrase_df = pd.DataFrame({'talk_idx':talk_idx,'en_phrase':en_phrases ,'th_phrase':th_phrases, 'idx':idx})
phrase_df.idx.sum()

136463

In [5]:
all_sentences = []
for i in range(phrase_df.talk_idx.max()):
    df = phrase_df[phrase_df.talk_idx==i]
    sentences = []
    for j,row in df.iterrows():
        sentences.append(row.th_phrase)
        if row.idx==1:
            sentences.append(' |')
    joined_sentences = ''.join(sentences)
    #remove parantheses like (audience claps)
    joined_sentences = re.sub(r'\([^)]*\)', '', joined_sentences)
    #skip if talk is nothing but parantheses
    if joined_sentences=='': continue
    #remove | at the last sentence if present
    if joined_sentences[-1]=='|': joined_sentences = joined_sentences[:-1]
    all_sentences.append(joined_sentences)

In [6]:
len(all_sentences)

1544

In [7]:
all_sentences[0]

'บนหลังอาชาแห้งกร่องดอนกิโฆเต้ พระเอกของเราบุกตะลุยสู้กับกองทัพยักษ์ |ในสายตาของเขา มันเป็นหน้าที่ของเขาที่จะปราบอสูรร้ายเหล่านี้ในนามแห่งหญิงอันเป็นที่รักของเขา ดุลสิเนอา |ทว่า การกระทำอันหาญกล้านี้ก็สูญเปล่า |เมื่อซานโซ่ ปันซ่า ผู้รับใช้ของเขาอธิบายครั้งแล้วครั้งเล่า ว่าสิ่งเหล่านี้จะเป็นยักษ์ก็หาไม่พวกมันเป็นเพียงกังหันลมเท่านั้น |ดอนกิโฆเต้ หาได้เสียความแน่วแน่แทงทวนของเขาเข้าไปยังใบพัดอย่างจัง |ด้วยพลังใจที่ไม่เคยถดถอยอัศวินผู้นั้นยืนขึ้นอย่างภาคภูมิและยิ่งเชื่อมั่นในปฏิบัติการของเขามากขึ้น |ลำดับเหตุการณ์นี้ครอบคลุมเรื่องราวส่วนใหญ่ของดอนกิโฆเต้ ที่เป็นที่รักมหากาพย์ ไร้ตรรกะ และมีชีวิตชีวาของ อลองโซ กีฆานาผู้กลายเป็น ดอนกิโฆเต้ แห่งลามันช่าผู้ซุ่มซ่ามแต่กล้าหาญหรือที่รู้จักกันในนาม ขุนนางต่ำศักดิ์นักฝัน |แต่เดิมวรรณกรรมนี้มีสองเล่มบรรยายเรื่องราวของดอนกิโฆเต้ในขณะที่เขาเดินทางผ่านตอนกลาง และตอนเหนือของสเปนเพื่อต่อสู้กับบรรดาปิศาจร้าย |แม้จินตนาการในเรื่อง ดอนกิโฆเต้ จะสูงล้ำเหนือเมฆผู้ประพันธ์ มิเกล เด เซร์บันเตสก็ไม่เคยนึกฝันว่าหนังสือของเขาจะกลายเป็นนิยายที่ขายดีที่สุดตลอดกาล 

In [8]:
all_tuples = []
for i in range(len(all_sentences)):
    tuples = []
    for s in all_sentences[i].split('|'):
        s_lst = word_tokenize(s)
        for j in range(len(s_lst)):
            lab = 'B' if j==0 else 'I'
            tuples.append((s_lst[j],lab))
    all_tuples.append(tuples)

In [9]:
len(all_tuples)

1544

In [10]:
def extract_features(doc, window=2, max_n_gram=3):
    doc_features = []
    #padding
    doc_pos = ['xxpad' for i in range(window)] + [p for (w,p) in pos_tag(doc)] + ['xxpad' for i in range(window)]
    doc = ['xxpad' for i in range(window)] + doc + ['xxpad' for i in range(window)]
    #for each word
    for i in range(window, len(doc)-window):
        #bias term
        word_features = ['bias'] 
        
        #ngram features
        for n_gram in range(1, min(max_n_gram+1,2+window*2)):
            for j in range(i-window,i+window+2-n_gram):
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                pos_ =f'{"|".join(doc_pos[j:(j+n_gram)])}'
                word_features += [f'word_{feature_position}={word_}']
                word_features += [f'pos_{feature_position}={pos_}']
                
        #number of verbs features
        nb_verbs_left = 0
        for l in range(i)[::-1]:
            if doc[l]=='<space>': break
            if doc_pos[l]=='VACT': nb_verbs_left+=1
        nb_verbs_right = 0
        for r in range(i+1,len(doc)):
            if doc[r]=='<space>': break
            if doc_pos[r]=='VACT': nb_verbs_right+=1
        word_features += [f'nb_verbs_left={nb_verbs_left}',f'nb_verbs_right={nb_verbs_right}']
        
        #append to feature per word
        doc_features.append(word_features)
    return doc_features

In [11]:
extract_features(word_tokenize('ฉันชอบกินมะนาว ฉันชอบกินโอเด้ง แต่ดี'), window=2, max_n_gram = 3)

[['bias',
  'word_1_-2_-1=xxpad',
  'pos_1_-2_-1=xxpad',
  'word_1_-1_0=xxpad',
  'pos_1_-1_0=xxpad',
  'word_1_0_1=ฉัน',
  'pos_1_0_1=PPRS',
  'word_1_1_2=ชอบ',
  'pos_1_1_2=VACT',
  'word_1_2_3=กิน',
  'pos_1_2_3=VACT',
  'word_2_-2_0=xxpad|xxpad',
  'pos_2_-2_0=xxpad|xxpad',
  'word_2_-1_1=xxpad|ฉัน',
  'pos_2_-1_1=xxpad|PPRS',
  'word_2_0_2=ฉัน|ชอบ',
  'pos_2_0_2=PPRS|VACT',
  'word_2_1_3=ชอบ|กิน',
  'pos_2_1_3=VACT|VACT',
  'word_3_-2_1=xxpad|xxpad|ฉัน',
  'pos_3_-2_1=xxpad|xxpad|PPRS',
  'word_3_-1_2=xxpad|ฉัน|ชอบ',
  'pos_3_-1_2=xxpad|PPRS|VACT',
  'word_3_0_3=ฉัน|ชอบ|กิน',
  'pos_3_0_3=PPRS|VACT|VACT',
  'nb_verbs_left=0',
  'nb_verbs_right=2'],
 ['bias',
  'word_1_-2_-1=xxpad',
  'pos_1_-2_-1=xxpad',
  'word_1_-1_0=ฉัน',
  'pos_1_-1_0=PPRS',
  'word_1_0_1=ชอบ',
  'pos_1_0_1=VACT',
  'word_1_1_2=กิน',
  'pos_1_1_2=VACT',
  'word_1_2_3=มะนาว',
  'pos_1_2_3=NCMN',
  'word_2_-2_0=xxpad|ฉัน',
  'pos_2_-2_0=xxpad|PPRS',
  'word_2_-1_1=ฉัน|ชอบ',
  'pos_2_-1_1=PPRS|VACT',
  'word_2_0_

In [12]:
#target
y = [[l for (w,l) in t] for t in all_tuples]
#features
x_pre = [[w for (w,l) in t] for t in all_tuples]
x = [extract_features(x_, window=2, max_n_gram = 3) for x_ in x_pre]

In [13]:
len(x),len(y)

(1544, 1544)

In [14]:
# Split train and test set at 80/20 proportion
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1412)

In [15]:
x_train[0][:5]

[['bias',
  'word_1_-2_-1=xxpad',
  'pos_1_-2_-1=xxpad',
  'word_1_-1_0=xxpad',
  'pos_1_-1_0=xxpad',
  'word_1_0_1=วงดนตรี',
  'pos_1_0_1=NCMN',
  'word_1_1_2=ที่',
  'pos_1_1_2=PREL',
  'word_1_2_3=คุณ',
  'pos_1_2_3=VACT',
  'word_2_-2_0=xxpad|xxpad',
  'pos_2_-2_0=xxpad|xxpad',
  'word_2_-1_1=xxpad|วงดนตรี',
  'pos_2_-1_1=xxpad|NCMN',
  'word_2_0_2=วงดนตรี|ที่',
  'pos_2_0_2=NCMN|PREL',
  'word_2_1_3=ที่|คุณ',
  'pos_2_1_3=PREL|VACT',
  'word_3_-2_1=xxpad|xxpad|วงดนตรี',
  'pos_3_-2_1=xxpad|xxpad|NCMN',
  'word_3_-1_2=xxpad|วงดนตรี|ที่',
  'pos_3_-1_2=xxpad|NCMN|PREL',
  'word_3_0_3=วงดนตรี|ที่|คุณ',
  'pos_3_0_3=NCMN|PREL|VACT',
  'nb_verbs_left=0',
  'nb_verbs_right=4'],
 ['bias',
  'word_1_-2_-1=xxpad',
  'pos_1_-2_-1=xxpad',
  'word_1_-1_0=วงดนตรี',
  'pos_1_-1_0=NCMN',
  'word_1_0_1=ที่',
  'pos_1_0_1=PREL',
  'word_1_1_2=คุณ',
  'pos_1_1_2=VACT',
  'word_1_2_3=ชื่นชอบ',
  'pos_1_2_3=VACT',
  'word_2_-2_0=xxpad|วงดนตรี',
  'pos_2_-2_0=xxpad|NCMN',
  'word_2_-1_1=วงดนตรี|ที่',


In [16]:
# Train model
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in zip(x_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1,
    'c2': 1e-1,
    'max_iterations': 500,
    'epsilon': 1,
    'feature.possible_transitions': True,
})

trainer.train('sentenceseg-crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 7172442
Seconds required: 120.489

L-BFGS optimization
c1: 1.000000
c2: 0.100000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 460698.728817
Feature norm: 1.000000
Error norm: 236243.781028
Active features: 992058
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 38.980

***** Iteration #2 *****
Loss: 447649.503515
Feature norm: 1.012339
Error norm: 231577.008407
Active features: 735629
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 19.149

***** Iteration #3 *****
Loss: 339203.024651
Feature norm: 2.918894
Error norm: 573968.630882
Active features: 206587
Line search trials: 1
Line search step: 1.000000
Secon

***** Iteration #39 *****
Loss: 122653.677718
Feature norm: 58.073374
Error norm: 4735.346911
Active features: 101549
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.231

***** Iteration #40 *****
Loss: 122236.755385
Feature norm: 60.701575
Error norm: 1896.745932
Active features: 100737
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.343

***** Iteration #41 *****
Loss: 121847.005991
Feature norm: 63.701571
Error norm: 5220.311260
Active features: 98598
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.954

***** Iteration #42 *****
Loss: 121546.073651
Feature norm: 65.772372
Error norm: 2347.624225
Active features: 96736
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.862

***** Iteration #43 *****
Loss: 121258.609563
Feature norm: 68.239511
Error norm: 2920.708446
Active features: 94344
Line search trials: 1
Line search step: 1

***** Iteration #82 *****
Loss: 119178.149316
Feature norm: 85.533089
Error norm: 990.851928
Active features: 72612
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.171

***** Iteration #83 *****
Loss: 119172.219799
Feature norm: 85.547714
Error norm: 1130.101085
Active features: 72512
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.473

***** Iteration #84 *****
Loss: 119166.816873
Feature norm: 85.568101
Error norm: 1423.627382
Active features: 72387
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.206

***** Iteration #85 *****
Loss: 119162.930279
Feature norm: 85.583953
Error norm: 1938.940951
Active features: 72223
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.420

***** Iteration #86 *****
Loss: 119156.834308
Feature norm: 85.599516
Error norm: 1230.745167
Active features: 72118
Line search trials: 1
Line search step: 1.00

***** Iteration #122 *****
Loss: 119064.798393
Feature norm: 86.212420
Error norm: 822.049920
Active features: 70848
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.804

***** Iteration #123 *****
Loss: 119063.717738
Feature norm: 86.229219
Error norm: 1009.206570
Active features: 70821
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.023

***** Iteration #124 *****
Loss: 119062.100763
Feature norm: 86.241847
Error norm: 803.483431
Active features: 70808
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.169

***** Iteration #125 *****
Loss: 119061.019777
Feature norm: 86.259130
Error norm: 968.940703
Active features: 70799
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.637

***** Iteration #126 *****
Loss: 119059.464118
Feature norm: 86.273948
Error norm: 804.955407
Active features: 70770
Line search trials: 1
Line search step: 1.

***** Iteration #163 *****
Loss: 119018.432607
Feature norm: 86.607900
Error norm: 931.919601
Active features: 69982
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.848

***** Iteration #164 *****
Loss: 119017.201805
Feature norm: 86.615411
Error norm: 607.847203
Active features: 69941
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.185

***** Iteration #165 *****
Loss: 119016.513530
Feature norm: 86.619190
Error norm: 864.208268
Active features: 69942
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.906

***** Iteration #166 *****
Loss: 119015.420204
Feature norm: 86.627555
Error norm: 635.564002
Active features: 69929
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.150

***** Iteration #167 *****
Loss: 119014.856169
Feature norm: 86.632480
Error norm: 936.684283
Active features: 69917
Line search trials: 1
Line search step: 1.0

***** Iteration #204 *****
Loss: 118988.115550
Feature norm: 86.678339
Error norm: 513.954076
Active features: 69458
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.133

***** Iteration #205 *****
Loss: 118987.984330
Feature norm: 86.678745
Error norm: 998.148790
Active features: 69440
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.577

***** Iteration #206 *****
Loss: 118986.930127
Feature norm: 86.679060
Error norm: 551.914434
Active features: 69410
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.119

***** Iteration #207 *****
Loss: 118986.708693
Feature norm: 86.679457
Error norm: 960.830363
Active features: 69397
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.422

***** Iteration #208 *****
Loss: 118985.737082
Feature norm: 86.679940
Error norm: 553.785017
Active features: 69390
Line search trials: 1
Line search step: 1.0

***** Iteration #243 *****
Loss: 118967.327835
Feature norm: 86.682535
Error norm: 846.770363
Active features: 69027
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.294

***** Iteration #244 *****
Loss: 118966.679371
Feature norm: 86.681537
Error norm: 551.107161
Active features: 69020
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.168

***** Iteration #245 *****
Loss: 118966.351507
Feature norm: 86.680183
Error norm: 759.788988
Active features: 69003
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.584

***** Iteration #246 *****
Loss: 118965.772059
Feature norm: 86.678404
Error norm: 498.113715
Active features: 68998
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.588

***** Iteration #247 *****
Loss: 118965.584999
Feature norm: 86.676257
Error norm: 866.076243
Active features: 68980
Line search trials: 1
Line search step: 1.0

***** Iteration #283 *****
Loss: 118948.082162
Feature norm: 86.500394
Error norm: 891.262749
Active features: 68648
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.080

***** Iteration #284 *****
Loss: 118947.394803
Feature norm: 86.493907
Error norm: 155.282279
Active features: 68634
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.007

***** Iteration #285 *****
Loss: 118947.049919
Feature norm: 86.489009
Error norm: 824.381880
Active features: 68640
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.031

***** Iteration #286 *****
Loss: 118946.544709
Feature norm: 86.481753
Error norm: 410.426749
Active features: 68644
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.079

***** Iteration #287 *****
Loss: 118946.448387
Feature norm: 86.475233
Error norm: 923.334198
Active features: 68621
Line search trials: 1
Line search step: 1.0

***** Iteration #323 *****
Loss: 118935.168963
Feature norm: 86.298126
Error norm: 361.219501
Active features: 68319
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 12.980

***** Iteration #324 *****
Loss: 118934.877495
Feature norm: 86.296340
Error norm: 188.648036
Active features: 68310
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 11.461

***** Iteration #325 *****
Loss: 118934.730340
Feature norm: 86.294412
Error norm: 360.602650
Active features: 68297
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 12.696

***** Iteration #326 *****
Loss: 118934.448411
Feature norm: 86.292207
Error norm: 198.358078
Active features: 68298
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 12.642

***** Iteration #327 *****
Loss: 118934.297680
Feature norm: 86.290795
Error norm: 334.233855
Active features: 68302
Line search trials: 2
Line search step:

***** Iteration #362 *****
Loss: 118926.632485
Feature norm: 86.251850
Error norm: 371.389270
Active features: 68064
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 6.621

***** Iteration #363 *****
Loss: 118926.384658
Feature norm: 86.250412
Error norm: 201.749746
Active features: 68056
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 12.260

***** Iteration #364 *****
Loss: 118926.288561
Feature norm: 86.248246
Error norm: 303.310487
Active features: 68048
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 10.980

***** Iteration #365 *****
Loss: 118926.097796
Feature norm: 86.248032
Error norm: 324.550239
Active features: 68054
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 11.005

***** Iteration #366 *****
Loss: 118925.934674
Feature norm: 86.247493
Error norm: 303.075243
Active features: 68032
Line search trials: 2
Line search step: 

***** Iteration #401 *****
Loss: 118920.037363
Feature norm: 86.234945
Error norm: 479.793399
Active features: 67785
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 17.706

***** Iteration #402 *****
Loss: 118919.779394
Feature norm: 86.233984
Error norm: 143.463649
Active features: 67762
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 18.492

***** Iteration #403 *****
Loss: 118919.731304
Feature norm: 86.233453
Error norm: 473.908066
Active features: 67750
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 17.511

***** Iteration #404 *****
Loss: 118919.720572
Feature norm: 86.233025
Error norm: 572.022748
Active features: 67739
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.789

***** Iteration #405 *****
Loss: 118919.346837
Feature norm: 86.234026
Error norm: 525.393924
Active features: 67742
Line search trials: 1
Line search step: 

***** Iteration #441 *****
Loss: 118914.038526
Feature norm: 86.219274
Error norm: 405.646313
Active features: 67612
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 14.974

***** Iteration #442 *****
Loss: 118913.847322
Feature norm: 86.218895
Error norm: 155.906497
Active features: 67613
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 14.137

***** Iteration #443 *****
Loss: 118913.792952
Feature norm: 86.219262
Error norm: 434.063419
Active features: 67614
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 14.279

***** Iteration #444 *****
Loss: 118913.581048
Feature norm: 86.218858
Error norm: 142.225021
Active features: 67615
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 13.975

***** Iteration #445 *****
Loss: 118913.525318
Feature norm: 86.219364
Error norm: 437.085436
Active features: 67625
Line search trials: 2
Line search step:

***** Iteration #481 *****
Loss: 118908.304805
Feature norm: 86.211324
Error norm: 506.042487
Active features: 67511
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 14.399

***** Iteration #482 *****
Loss: 118908.054505
Feature norm: 86.210148
Error norm: 106.880518
Active features: 67503
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 13.700

***** Iteration #483 *****
Loss: 118908.050189
Feature norm: 86.209024
Error norm: 517.523395
Active features: 67494
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 13.688

***** Iteration #484 *****
Loss: 118908.037422
Feature norm: 86.206558
Error norm: 557.355788
Active features: 67483
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 7.108

L-BFGS terminated with the stopping criteria
Total seconds required for training: 4942.057

Storing the model
Number of active features: 67483 (7172442)
Numb

In [None]:
# #training until convergence is not much different than training for about 200 iterations
# ***** Iteration #484 *****
# Loss: 118908.037422
# Feature norm: 86.206558
# Error norm: 557.355788
# Active features: 67483
# Line search trials: 1
# Line search step: 1.000000
# Seconds required for this iteration: 7.108

# L-BFGS terminated with the stopping criteria
# Total seconds required for training: 4942.057

# Storing the model
# Number of active features: 67483 (7172442)
# Number of active attributes: 37888 (7013949)
# Number of active labels: 2 (2)
# Writing labels
# Writing attributes
# Writing feature references for transitions
# Writing feature references for attributes
# Seconds required: 1.586

In [17]:
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('sentenceseg-crf.model')
y_pred = [tagger.tag(xseq) for xseq in x_test]

In [18]:
# Evaluate
labels = {'B': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.73      0.70      0.71     27763
           I       0.99      0.99      0.99    649564

    accuracy                           0.98    677327
   macro avg       0.86      0.84      0.85    677327
weighted avg       0.98      0.98      0.98    677327



In [22]:
def sentence_tokenize(s):
    toks = word_tokenize(s)
    feat = extract_features(toks)
    labs = tagger.tag(feat)
    sentences = []
    sentence = ''
    for i, w in enumerate(toks):
        if labs[i] == 'B':
            if sentence:
                sentences.append(sentence)
            sentence = ''
        sentence = sentence + w
    if sentence:
        sentences.append(sentence)
    return sentences

In [23]:
s = 'เธอคือหุ่นยนต์รูปแบบใหม่ที่ฉันคนนี้สร้างขึ้นมาเธอมีความสามารถในการคิดและรู้สึกเหมือนมนุษย์ เธอสามารถตัดสินใจด้วยตัวเอง แต่ถึงอย่างนั้นมันก็เป็นดาบสองคม ที่อาจเป็นอันตรายถ้าเธอคิดว่ามนุษย์เป็นภัย จนแหกกฎข้อแรกที่หุ่นยนต์ห้ามทำร้ายมนุษย์ ฉันจำเป็นต้องใช้เวลาทดสอบระบบอีก 30 ปีเพื่อความมั่นใจว่าเธอจะไม่เป็นอันตรายกับมนุษย์ แต่ฉันคงจะมีชีวิตอยู่ไม่ถึงวันนั้น ดังนั้นฉันจึงปิดผนึกเธอเอาไว้ในแคปซูลจนกว่าระบบต่าง ๆ จะพร้อมเสียก่อน'

In [24]:
sentence_tokenize(s)

['เธอคือหุ่นยนต์รูปแบบใหม่ที่ฉันคนนี้สร้างขึ้นมาเธอมีความสามารถในการคิดและรู้สึกเหมือนมนุษย์<space>',
 'เธอสามารถตัดสินใจด้วยตัวเอง<space>',
 'แต่ถึงอย่างนั้นมันก็เป็นดาบสองคม<space>ที่อาจเป็นอันตรายถ้าเธอคิดว่ามนุษย์เป็นภัย<space>จนแหกกฎข้อแรกที่หุ่นยนต์ห้ามทำร้ายมนุษย์<space>',
 'ฉันจำเป็นต้องใช้เวลาทดสอบระบบอีก<space>30<space>ปีเพื่อความมั่นใจว่าเธอจะไม่เป็นอันตรายกับมนุษย์<space>',
 'แต่ฉันคงจะมีชีวิตอยู่ไม่ถึงวันนั้น<space>',
 'ดังนั้นฉันจึงปิดผนึกเธอเอาไว้ในแคปซูลจนกว่าระบบต่าง ๆ<space>จะพร้อมเสียก่อน']

In [68]:
results = []
for i in range(len(y_test)):
    s=0
    for j in range(len(y_test[i])):
        if y_test[i][j]=='B': s+=1
        results.append({'sentence_idx':f'{i}_{s}', 
                        'word':x_test[i][j][5].split('=')[1],
                        'y':y_test[i][j],
                        'pred':y_pred[i][j]})
result_df = pd.DataFrame(results)[['sentence_idx','word','y','pred']]
result_df['wrong_flag'] = result_df.apply(lambda row: 0 if row.y==row.pred else 1,1)

In [81]:
result_agg = result_df.groupby('sentence_idx').agg({'word': lambda x: ''.join(x),
                                                    'wrong_flag': max,
                                                    'y': lambda x: '|'.join(x),
                                                    'pred': lambda x: '|'.join(x)}).reset_index()

In [93]:
print(result_agg.iloc[102,1])

คุณจะต้องลากอุปกรณ์น้ำหนักรวมกันกว่า<space>1000<space>ปอนด์  ขึ้นไปบนยอดของภูเขาไฟสูง<space>20,000<space>ฟุต<space>ที่อยู่ในเทือกเขาแอนดีส<space>


In [94]:
#widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display

In [None]:
def error_sentences()

In [None]:
interact(utils.value_dist, 
         df=fixed(df.select_dtypes(exclude=[np.number])),
         col=widgets.Dropdown(options=df.select_dtypes(exclude=[np.number]).columns, 
         value='brand'))