In [8]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import pandas as pd

In [9]:
# Both corpora are tab-separated files without a header row, so the columns
# end up with the integer labels 0, 1, 2, ...
tsv_options = dict(sep="\t", header=None)
data_train = pd.read_csv("dataset.tsv", **tsv_options)
data_test = pd.read_csv("data_test.tsv", **tsv_options)

In [10]:
# Accumulators for the sentence-grouping cells below: the two corpora as lists
# of sentences, plus the sentence currently being assembled.
train_sents, test_sents, sent = [], [], []

In [11]:
# Group the token rows of the training frame into sentences.
# Columns 0/1/2 hold word, POS tag and label; a row whose label is null acts
# as a sentence separator.
train_sents = []  # rebuilt from scratch so re-running this cell cannot duplicate sentences
sent = []         # never inherit half-built tokens from an earlier run

for word, tag, label in zip(data_train[0], data_train[1], data_train[2]):
    if not pd.isna(label):
        sent.append((word, tag, label))
    elif sent:
        # Separator row: close the sentence collected so far (empty ones are skipped).
        print('Train sent:', sent)
        train_sents.append(sent)
        sent = []

# Flush the trailing sentence. Without this, a file that does not end with a
# separator row silently loses its last sentence, and the leftover tokens
# would stay in `sent` for whichever cell uses it next.
if sent:
    print('Train sent:', sent)
    train_sents.append(sent)
    sent = []

Train sent: [('SEA', 'NN', 'B-ORG'), ('GAMES', 'NN', 'I-ORG'), ('2019', 'CDP', 'I-ORG'), ('merupakan', 'VBT', 'O'), ('sebuah', 'NN', 'O'), ('Pesta', 'NN', 'B-ORG'), ('Olahraga', 'NN', 'I-ORG'), ('Asia', 'NNP', 'I-ORG'), ('Tenggara', 'NN', 'I-ORG'), ('2019', 'NN', 'I-ORG'), ('yang', 'SC', 'O'), ('akan', 'MD', 'O'), ('diselenggarakan', 'VBT', 'O'), ('di', 'IN', 'O'), ('Manila', 'NNP', 'B-LOC')]
Train sent: [('ibu', 'NNP', 'O'), ('kota', 'NN', 'O'), ('dari', 'IN', 'O'), ('negara', 'NN', 'O'), ('Filipina', 'NN', 'B-LOC')]
Train sent: [('Pesta', 'NN', 'B-ORG'), ('Olahraga', 'NN', 'I-ORG'), ('Asia', 'NNP', 'I-ORG'), ('Tenggara', 'NN', 'I-ORG'), ('kali', 'NN', 'O'), ('ini', 'DT', 'O'), ('akan', 'MD', 'O'), ('menjadi', 'VBT', 'O'), ('Pesta', 'NN', 'B-ORG'), ('Olahraga', 'NN', 'I-ORG'), ('Asia', 'NNP', 'I-ORG'), ('Tenggara', 'NN', 'I-ORG'), ('yang', 'SC', 'O'), ('ke', 'IN', 'O')]
Train sent: [('30', 'CDP', 'O'), ('(', 'OP', 'O'), ('tiga', 'CDP', 'O'), ('puluh)', 'CDP', 'O')]
Train sent: [('Pest

In [12]:
# Group the token rows of the test frame into sentences.
# Columns 0/1/2 hold word, POS tag and label; a row whose label is null acts
# as a sentence separator.
test_sents = []  # rebuilt from scratch so re-running this cell cannot duplicate sentences
sent = []        # drop anything left over from the training cell or an earlier run

for word, tag, label in zip(data_test[0], data_test[1], data_test[2]):
    if not pd.isna(label):
        sent.append((word, tag, label))
    elif sent:
        # Separator row: close the sentence collected so far (empty ones are skipped).
        print('Test sent:', sent)
        test_sents.append(sent)
        sent = []

# Flush the trailing sentence. Without this, a file that does not end with a
# separator row silently loses its last sentence.
if sent:
    print('Test sent:', sent)
    test_sents.append(sent)
    sent = []

Test sent: [('Pada', 'IN', 'O'), ('pemberitaan', 'NN', 'O'), ('sebelumnya', 'NNG', 'O')]
Test sent: [('September', 'NN', 'O')]
Test sent: [('Datu', 'NNP', 'B-PER')]
Test sent: [('Ramos', 'NNP', 'I-PER'), ('sudah', 'MD', 'O'), ('mencuatkan', 'VBT', 'O'), ('isu', 'NNP', 'O'), ('terkait', 'VBT', 'O'), ('sertifikasi', 'NN', 'O'), ('halal', 'NN', 'O'), ('katering', 'NN', 'O'), ('untuk', 'IN', 'O'), ('para', 'DT', 'O'), ('tamu', 'NN', 'I-PER'), ('muslim', 'NN', 'I-ORG'), ('di', 'IN', 'O'), ('SEA', 'NN', 'I-ORG'), ('Games', 'NN', 'I-ORG'), ('2019', 'CDP', 'I-ORG')]
Test sent: [('Dia', 'PRP', 'B-PER'), ('juga', 'RB', 'O'), ('sempat', 'RB', 'O'), ('mempertanyakan', 'VBT', 'O'), ('ketersediaan', 'NN', 'O'), ('tempat', 'NN', 'O')]
Test sent: [('tempat', 'NN', 'O'), ('salat', 'NN', 'O'), ('dengan', 'IN', 'O'), ('petunjuk', 'NN', 'O'), ('arah', 'NN', 'O'), ('kiblat', 'NN', 'O'), ('yang', 'SC', 'O'), ('sesuai', 'RB', 'O')]
Test sent: [('Pada', 'IN', 'O'), ('Minggu', 'NN', 'O'), ('(', 'OP', 'O'), ('2

In [13]:
def word2features(sent, i):
    """Build the string features that describe token ``i`` of ``sent``.

    Args:
        sent: Sequence of token records; element 0 of a record is the word
            and element 1 is its POS tag.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: features of the token itself, then either
        the previous token's features or the marker 'BOS', then either the
        next token's features or the marker 'EOS'.
    """
    token = sent[i][0]
    pos = sent[i][1]

    # Features of the token itself.
    feats = ['bias']
    feats += [
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + pos,
        'postag[:2]=' + pos[:2],
    ]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats.append('BOS')
    else:
        prev_token = sent[i - 1][0]
        prev_pos = sent[i - 1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle=%s' % prev_token.istitle(),
            '-1:word.isupper=%s' % prev_token.isupper(),
            '-1:postag=' + prev_pos,
            '-1:postag[:2]=' + prev_pos[:2],
        ]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        next_token = sent[i + 1][0]
        next_pos = sent[i + 1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=%s' % next_token.istitle(),
            '+1:word.isupper=%s' % next_token.isupper(),
            '+1:postag=' + next_pos,
            '+1:postag[:2]=' + next_pos[:2],
        ]

    return feats

In [14]:
def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [15]:
# Sanity check: show the features generated for the first token of the first
# training sentence.
first_sentence_features = sent2features(train_sents[0])
print(first_sentence_features[0])

['bias', 'word.lower=sea', 'word[-3:]=SEA', 'word[-2:]=EA', 'word.isupper=True', 'word.istitle=False', 'word.isdigit=False', 'postag=NN', 'postag[:2]=NN', 'BOS', '+1:word.lower=games', '+1:word.istitle=False', '+1:word.isupper=True', '+1:postag=NN', '+1:postag[:2]=NN']


In [16]:
# Per-sentence feature sequences (X) and label sequences (y) for the training set.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

# The same transformation applied to the test set.
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [17]:
# Create the CRF trainer that the following cells feed with training sequences
# and parameters. NOTE(review): verbose=False presumably silences pycrfsuite's
# training log output - confirm against the python-crfsuite docs.
trainer = pycrfsuite.Trainer(verbose=False)

In [18]:
# Hand every training sentence to the trainer as a (features, labels) pair.
for sentence_features, sentence_labels in zip(X_train, y_train):
    trainer.append(sentence_features, sentence_labels)

In [19]:
# Training hyper-parameters, kept in a named dict so they are easy to inspect.
crf_params = {
    'c1': 1.0,             # L1 penalty coefficient
    'c2': 1e-3,            # L2 penalty coefficient
    'max_iterations': 50,  # cap the number of iterations to stop earlier

    # also consider transitions that are possible but never observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [20]:
# Train with the configured trainer, passing the model file name, then open
# that same file with a fresh tagger.
model_path = 'sample.crfsuite'
trainer.train(model_path)

tagger = pycrfsuite.Tagger()
tagger.open(model_path)

<contextlib.closing at 0x18ba4cde288>

In [21]:
# Use the first test sentence as a worked example and show its tokens.
example_sent = test_sents[0]
example_tokens = sent2tokens(example_sent)
print(' '.join(example_tokens), end='\n\n')

Pada pemberitaan sebelumnya



In [22]:
# Compare the tagger's labels for the example sentence with the annotated ones.
example_prediction = tagger.tag(sent2features(example_sent))
print("Predicted:", ' '.join(example_prediction))
example_gold = sent2labels(example_sent)
print("Correct:  ", ' '.join(example_gold))

Predicted: O O O
Correct:   O O O


In [23]:
# Flatten predictions and gold labels of all test sentences into two parallel,
# token-level lists.
predicted = []
target = []
for test_sentence in test_sents:
    tagged = tagger.tag(sent2features(test_sentence))
    gold = sent2labels(test_sentence)
    # zip() keeps only positions present in both sequences.
    aligned = list(zip(tagged, gold))
    predicted.extend(guess for guess, _ in aligned)
    target.extend(truth for _, truth in aligned)

In [24]:
# Token-level accuracy: the fraction of positions where the predicted label
# equals the target label.
if not predicted:
    # Fail with an explicit message instead of an opaque ZeroDivisionError
    # when there is nothing to evaluate (e.g. no test sentences were built).
    raise ValueError('No predictions to evaluate: `predicted` is empty.')

correct = sum(p == t for p, t in zip(predicted, target))
print('Akurasi:', correct/len(predicted))

Akurasi: 0.8004866180048662
