In [None]:
!cd data/checkpoint; unzip ted_fake.zip

In [1]:
#adapted from @bact at https://colab.research.google.com/drive/1hdtmwTXHLrqNmDhDqHnTQGpDVy1aJc4t
import json
import pandas as pd
import numpy as np
import re
import pycrfsuite
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from ast import literal_eval
from tqdm import tqdm
pd.set_option('display.max_rows', 10)
warnings.filterwarnings('ignore')

In [2]:
orchid = pd.read_csv('data/orchid_corpus/orchid97.crp.utf',sep='\t',header=None)
orchid.columns = ['text']
#remove weird words
orchid['first_char'] = orchid.text.map(lambda x: x[0])
orchid = orchid[(orchid.first_char!='%')&(orchid.first_char!='#')][['text']]
#get word,pos
orchid['word'] = orchid.text.map(lambda x: x.split('/')[0])
orchid['word'] = orchid.word.map(lambda x: ' ' if (x=='<space>')|(x=='') else x)
orchid['pos'] = orchid.text.map(lambda x: x.split('/')[1] if len(x.split('/'))==2 else None)
#labels
orchid['lab'] = orchid.apply(lambda row: 'E' if row['text']=='//' else 'I',1)
orchid = orchid[(orchid.lab=='E')|(~orchid.pos.isna())].reset_index(drop=True)

In [3]:
%%time
ted_all_sentences = np.load('data/checkpoint/ted-all-sentences.npy') 
fake_review_all_sentences = np.load('data/checkpoint/fake-review-all-sentences.npy') 

CPU times: user 1.92 ms, sys: 371 ms, total: 373 ms
Wall time: 371 ms


In [4]:
# Sample from 3 datasets
np.random.seed(42)
ratio = .25
ted_sample = np.random.choice(ted_all_sentences, int(len(ted_all_sentences) * ratio))
orchid_sample = orchid.iloc[:int(len(orchid) * ratio)]
fake_review_sample = np.random.choice(fake_review_all_sentences, int(len(fake_review_all_sentences) * ratio))

In [5]:
print(f"Length of TED: {len(ted_sample)}")
print(f"Length of orchid: {len(orchid_sample)}")
print(f"Length of fake review: {len(fake_review_sample)}")

Length of TED: 385
Length of orchid: 91453
Length of fake review: 54370


In [6]:
def assign_word_lab(all_sentences):
    all_tuples = []
    # all_tuples = np.array([])
    for i in tqdm(range(len(all_sentences)), total=len(all_sentences)):
        tuples = []
        # tuples = np.array()
        for s in all_sentences[i].split('|'):
            s_lst = word_tokenize(s)
            for j in range(len(s_lst)):
                lab = 'E' if j==len(s_lst)-1 else 'I'
                tuples.append((s_lst[j],lab))
        all_tuples.append(tuples)
        # all_tuples = np.append(all_tuples, tuples)
    return all_tuples

In [7]:
%%time
ted_all_tuples = assign_word_lab(ted_sample)
orchid_all_tuples = [(row['word'],row['lab']) for i,row in orchid_sample.iterrows()]
fake_review_all_tuples = assign_word_lab(fake_review_sample)

100%|██████████| 385/385 [00:09<00:00, 40.72it/s]
100%|██████████| 54370/54370 [00:42<00:00, 1269.39it/s]

CPU times: user 1min 3s, sys: 884 ms, total: 1min 3s
Wall time: 1min 3s





In [8]:
enders = ["ครับ","ค่ะ","คะ","นะคะ","นะ","จ้ะ","จ้า","จ๋า","ฮะ", #ending honorifics
          #enders
          "ๆ","ได้","แล้ว","ด้วย","เลย","มาก","น้อย","กัน","เช่นกัน","เท่านั้น",
          "อยู่","ลง","ขึ้น","มา","ไป","ไว้","เอง","อีก","ใหม่","จริงๆ",
          "บ้าง","หมด","ทีเดียว","เดียว",
          #demonstratives
          "นั้น","นี้","เหล่านี้","เหล่านั้น",
          #questions
          "อย่างไร","ยังไง","หรือไม่","มั้ย","ไหน","อะไร","ทำไม","เมื่อไหร่"]
starters = ["ผม","ฉัน","ดิฉัน","ชั้น","คุณ","มัน","เขา","เค้า",
            "เธอ","เรา","พวกเรา","พวกเขา", #pronouns
            #connectors
            "และ","หรือ","แต่","เมื่อ","ถ้า","ใน",
            "ด้วย","เพราะ","เนื่องจาก","ซึ่ง","ไม่",
            "ตอนนี้","ทีนี้","ดังนั้น","เพราะฉะนั้น","ฉะนั้น",
            "ตั้งแต่","ในที่สุด",
            #demonstratives
            "นั้น","นี้","เหล่านี้","เหล่านั้น"]

def extract_features(doc, window=2, max_n_gram=3):
    doc_features = []
    #paddings for word and POS
    doc = ['xxpad' for i in range(window)] + doc + ['xxpad' for i in range(window)]
    doc_ender = []
    doc_starter = []
    #add enders
    for i in range(len(doc)):
        if doc[i] in enders:
            doc_ender.append('ender')
        else:
            doc_ender.append('normal')
    #add starters
    for i in range(len(doc)):
        if doc[i] in starters:
            doc_starter.append('starter')
        else:
            doc_starter.append('normal')
    #for each word
    for i in range(window, len(doc)-window):
        #bias term
        word_features = ['bias'] 
        
        #ngram features
        for n_gram in range(1, min(max_n_gram+1,2+window*2)):
            for j in range(i-window,i+window+2-n_gram):
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f'word_{feature_position}={word_}']
                ender_ =  f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f'ender_{feature_position}={ender_}']
                starter_ =  f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f'starter_{feature_position}={starter_}']
        
        #append to feature per word
        doc_features.append(word_features)
    return doc_features

In [9]:
%%time
# ted
#target
ted_y = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    ted_y.append(temp)

#features
ted_x_pre = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    ted_x_pre.append(temp)
ted_x = []
for x_ in tqdm(ted_x_pre, total=len(ted_x_pre)):
    ted_x.append(extract_features(x_, window=2, max_n_gram = 3))

100%|██████████| 385/385 [00:00<00:00, 2742.15it/s]
100%|██████████| 385/385 [00:00<00:00, 2698.12it/s]
100%|██████████| 385/385 [00:27<00:00, 14.09it/s]

CPU times: user 26.3 s, sys: 1.16 s, total: 27.5 s
Wall time: 27.6 s





In [10]:
%%time
# orchid
#target
orchid_y = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_y.append(l)
#features
orchid_x_pre = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_x_pre.append(w)
orchid_x = extract_features(orchid_x_pre, window=2, max_n_gram = 3) 

100%|██████████| 91453/91453 [00:00<00:00, 1539641.58it/s]
100%|██████████| 91453/91453 [00:00<00:00, 1557521.34it/s]


CPU times: user 3 s, sys: 148 ms, total: 3.15 s
Wall time: 3.2 s


In [11]:
# fake review
#target
fake_review_y = []
for t in tqdm(fake_review_all_tuples, total=len(fake_review_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    fake_review_y.append(temp)

#features
fake_review_x_pre = []
for t in tqdm(fake_review_all_tuples, total=len(fake_review_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    fake_review_x_pre.append(temp)
fake_review_x = []
for x_ in tqdm(fake_review_x_pre, total=len(fake_review_x_pre)):
    fake_review_x.append(extract_features(x_, window=2, max_n_gram = 3))

100%|██████████| 54370/54370 [00:00<00:00, 68578.92it/s]
100%|██████████| 54370/54370 [00:00<00:00, 64152.16it/s]
100%|██████████| 54370/54370 [03:45<00:00, 241.00it/s]  


In [13]:
# Split train and test set at 80/20 proportion
ted_x_train, ted_x_test, ted_y_train, ted_y_test = train_test_split(ted_x, ted_y, test_size=0.2, random_state=1412)
idx = int(len(orchid_x)*0.8)
orchid_x_train, orchid_x_test = orchid_x[:idx], orchid_x[idx:]
orchid_y_train, orchid_y_test = orchid_y[:idx], orchid_y[idx:]
fake_review_x_train, fake_review_x_test, fake_review_y_train, fake_review_y_test = \
    train_test_split(fake_review_x, fake_review_y, test_size=0.2, random_state=1412)

In [None]:
%%time
# Train model
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in tqdm(zip(ted_x_train, ted_y_train), total=len(ted_y_train)):
    trainer.append(xseq, yseq)
    
trainer.append(orchid_x_train, orchid_y_train)

for xseq, yseq in tqdm(zip(fake_review_x_train, fake_review_y_train), total=len(fake_review_y_train)):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1,
    'c2': 0,
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('models/datasets-crf.model')

In [14]:
# ted
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
# y_pred = [tagger.tag(xseq) for xseq in x_test]
y_pred = []
for xseq in tqdm(ted_x_test, total=len(ted_x_test)):
    y_pred.append(tagger.tag(xseq))

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in ted_y_test for tag in row])

print("Validate TED dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

100%|██████████| 77/77 [00:10<00:00,  7.64it/s]


Validate TED dataset
              precision    recall  f1-score   support

           E       0.68      0.76      0.72      6990
           I       0.99      0.98      0.99    155115

    accuracy                           0.97    162105
   macro avg       0.84      0.87      0.85    162105
weighted avg       0.98      0.97      0.98    162105



In [16]:
# orchid
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
y_pred = tagger.tag(orchid_x_test)

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in orchid_y_test for tag in row])

print("Validate orchid dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

Validate orchid dataset
              precision    recall  f1-score   support

           E       0.76      0.63      0.69      1179
           I       0.97      0.99      0.98     17112

    accuracy                           0.96     18291
   macro avg       0.87      0.81      0.83     18291
weighted avg       0.96      0.96      0.96     18291



In [18]:
# fake review
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
# y_pred = [tagger.tag(xseq) for xseq in x_test]
y_pred = []
for xseq in tqdm(fake_review_x_test, total=len(fake_review_x_test)):
    y_pred.append(tagger.tag(xseq))

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in fake_review_y_test for tag in row])

print("Validate TED dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

100%|██████████| 10874/10874 [00:58<00:00, 187.20it/s]


Validate TED dataset
              precision    recall  f1-score   support

           E       0.98      0.96      0.97     48984
           I       1.00      1.00      1.00    663897

    accuracy                           1.00    712881
   macro avg       0.99      0.98      0.98    712881
weighted avg       1.00      1.00      1.00    712881

