In [3]:
#adapted from @bact at https://colab.research.google.com/drive/1hdtmwTXHLrqNmDhDqHnTQGpDVy1aJc4t
import json
import pandas as pd
import numpy as np
import re
import pycrfsuite
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
with open('data/talks.transcript.th-en.24-10-2019_11:12.json', 'r') as f:
    data = json.load(f)

In [4]:
talk_idx = []
idx = []
th_phrases = []
en_phrases = []

for i in range(len(data)):
    nb_phrases = min(len(data[i]['en']),len(data[i]['th']))
    for j in range(nb_phrases):
        talk_idx.append(i)
        th_phrases.append(data[i]['th'][j])
        en_phrases.append(data[i]['en'][j])
        if data[i]['en'][j][-1]=='.':
            idx.append(1)
        else:
            idx.append(0)

In [5]:
phrase_df = pd.DataFrame({'talk_idx':talk_idx,'en_phrase':en_phrases ,'th_phrase':th_phrases, 'idx':idx})
phrase_df.idx.sum()

136463

In [6]:
all_sentences = []
for i in range(phrase_df.talk_idx.max()):
    df = phrase_df[phrase_df.talk_idx==i]
    sentences = []
    for j,row in df.iterrows():
        sentences.append(row.th_phrase)
        if row.idx==1:
            sentences.append('|')
    joined_sentences = ''.join(sentences)
    #remove parantheses like (audience claps)
    joined_sentences = re.sub(r'\([^)]*\)', '', joined_sentences)
    #skip if talk is nothing but parantheses
    if joined_sentences=='': continue
    #remove | at the last sentence if present
    if joined_sentences[-1]=='|': joined_sentences = joined_sentences[:-1]
    all_sentences.append(joined_sentences)

In [7]:
len(all_sentences)

1544

In [8]:
all_tuples = []
for i in range(len(all_sentences)):
    tuples = []
    for s in all_sentences[i].split('|'):
        s_lst = word_tokenize(s)
        for j in range(len(s_lst)):
            lab = 'B' if j==0 else 'I'
            tuples.append((s_lst[j],lab))
    all_tuples.append(tuples)

In [9]:
len(all_tuples)

1544

In [10]:
# Extract features of each wordacter from text, in CRFSuite format
def extract_features(doc):
    doc_features = []
    for i, word in enumerate(doc):
        word_features = [
            'bias',
            'word=' + word,
        ]
        if i > 0:
            feats = [
                'word[-1]=' + doc[i-1],
            ]
            word_features.extend(feats)

        if i > 1:
            feats = [
                'word[-2]=' + doc[i-2],
            ]
            word_features.extend(feats)

        if i < len(doc)-1:
            feats = [
                'word[+1]=' + doc[i+1],           
            ]
            word_features.extend(feats)
            
        if i < len(doc)-2:
            feats = [
                'word[+2]=' + doc[i+2],              
            ]
            word_features.extend(feats)
        doc_features.append(word_features)
    return doc_features

In [11]:
#target
y = [[l for (w,l) in t] for t in all_tuples]
#features
x_pre = [[w for (w,l) in t] for t in all_tuples]
x = [extract_features(x_) for x_ in x_pre]

In [12]:
len(x),len(y)

(1544, 1544)

In [16]:
# Split train and test set at 80/20 proportion
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1412)

In [17]:
# Train model
trainer = pycrfsuite.Trainer(verbose=True)
#trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(x_train, y_train):
  trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.1,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
})

trainer.train('sentenceseg-crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 195473
Seconds required: 4.403

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 467295.662820
Feature norm: 1.000000
Error norm: 196493.750890
Active features: 192303
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 1.404

***** Iteration #2 *****
Loss: 454826.338810
Feature norm: 1.083542
Error norm: 100494.877781
Active features: 188805
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.770

***** Iteration #3 *****
Loss: 449070.484299
Feature norm: 1.172912
Error norm: 36085.905940
Active features: 184229
Line search trials: 1
Line search step: 1.000000
Seconds req

***** Iteration #39 *****
Loss: 209528.009340
Feature norm: 305.329633
Error norm: 6086.417537
Active features: 113446
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #40 *****
Loss: 208886.964162
Feature norm: 315.106298
Error norm: 2020.899305
Active features: 113469
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.576

***** Iteration #41 *****
Loss: 208366.467234
Feature norm: 325.180937
Error norm: 4066.104760
Active features: 112426
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.562

***** Iteration #42 *****
Loss: 207949.825373
Feature norm: 331.704736
Error norm: 2457.192992
Active features: 111884
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.563

***** Iteration #43 *****
Loss: 207602.659235
Feature norm: 339.375001
Error norm: 3316.793703
Active features: 111897
Line search trials: 1
Line search

***** Iteration #80 *****
Loss: 205183.972021
Feature norm: 378.064567
Error norm: 2231.072403
Active features: 103368
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.667

***** Iteration #81 *****
Loss: 205171.641694
Feature norm: 378.222821
Error norm: 2519.583120
Active features: 103335
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.645

***** Iteration #82 *****
Loss: 205157.331824
Feature norm: 378.361794
Error norm: 2114.536082
Active features: 103252
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.603

***** Iteration #83 *****
Loss: 205147.514715
Feature norm: 378.444105
Error norm: 2809.569127
Active features: 103189
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.554

***** Iteration #84 *****
Loss: 205137.055382
Feature norm: 378.555111
Error norm: 2946.609046
Active features: 103189
Line search trials: 1
Line search

***** Iteration #120 *****
Loss: 204778.753840
Feature norm: 379.073703
Error norm: 3484.748754
Active features: 102553
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.612

***** Iteration #121 *****
Loss: 204763.520167
Feature norm: 379.086206
Error norm: 1766.995901
Active features: 102526
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.574

***** Iteration #122 *****
Loss: 204762.121119
Feature norm: 379.100696
Error norm: 3556.861136
Active features: 102505
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.560

***** Iteration #123 *****
Loss: 204746.319679
Feature norm: 379.112134
Error norm: 1692.821613
Active features: 102490
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.541

***** Iteration #124 *****
Loss: 204746.025440
Feature norm: 379.124989
Error norm: 3589.055988
Active features: 102481
Line search trials: 1
Line s

***** Iteration #160 *****
Loss: 204407.502761
Feature norm: 379.628214
Error norm: 3483.088780
Active features: 101785
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.553

***** Iteration #161 *****
Loss: 204389.717022
Feature norm: 379.687772
Error norm: 1405.086985
Active features: 101745
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.551

***** Iteration #162 *****
Loss: 204382.097765
Feature norm: 379.709594
Error norm: 1264.462537
Active features: 101739
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.099

***** Iteration #163 *****
Loss: 204376.013358
Feature norm: 379.748528
Error norm: 1324.859165
Active features: 101743
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 1.102

***** Iteration #164 *****
Loss: 204369.078124
Feature norm: 379.780516
Error norm: 1949.803385
Active features: 101721
Line search trials: 2
Line s

Storing the model
Number of active features: 100255 (195473)
Number of active attributes: 63550 (156795)
Number of active labels: 2 (2)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.584



In [18]:
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('sentenceseg-crf.model')
y_pred = [tagger.tag(xseq) for xseq in x_test]

In [19]:
# Evaluate
labels = {'B': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.56      0.36      0.44     27731
           I       0.97      0.99      0.98    622194

    accuracy                           0.96    649925
   macro avg       0.77      0.67      0.71    649925
weighted avg       0.95      0.96      0.96    649925

