In [2]:
import numpy as np
import pandas as pd

import re
import html
import json

import nltk
import anago

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from keras.models import Sequential
from keras.layers import LSTM, GRU, Activation, Dense, Dropout, Input, Embedding, MaxPooling1D
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping

import seaborn as sn

In [78]:
def get_ner_label(text):
    text = re.sub(r'\t.+?\n', '', text)
    
    cur_label = re.findall("TYPE=\"(.+?)\">(.+?)<", text)
    dict_label = dict((y, x) for x, y in cur_label)
    
    new_dict = {}
    for x in dict_label:
        token = x.split(' ')
        for tok in token:
            if tok not in new_dict:
                new_dict[tok] = dict_label[x]
    
    text = ' '.join(re.split('<ENAMEX.+?>|</ENAMEX>| ', text))
    data = re.findall(r"[\w']+|[!\"#$%&\(\)*+,-./:;<=>?@\[\\\]\^_`\{\|\}~]", text)

    label = []
    
    for w in data:
        if w in new_dict:
            if(len(label) > 0 and label[-1][2:] == new_dict[w][:3]):
                label.append('I-' + new_dict[w][:3])
            else:
                label.append('B-' + new_dict[w][:3])
        else:
            label.append('O')
    
    return data, label

In [55]:
ner_train = tuple(open("ner/training_data.txt", 'r'))
ner_valid = tuple(open("ner/testing_data.txt", 'r'))

In [58]:
train_x = []
train_y = []
valid_x = []
valid_y = []

In [59]:
for t in ner_train:
    x, y = get_ner_label(t)
    train_x.append(x)
    train_y.append(y)

for t in ner_valid:
    x, y = get_ner_label(t)
    valid_x.append(x)
    valid_y.append(y)

In [60]:
train_x = np.array(train_x)
train_y = np.array(train_y)

valid_x = np.array(valid_x)
valid_y = np.array(valid_y)

In [61]:
model = anago.Sequence(char_emb_size=25, word_emb_size=100, char_lstm_units=25,
                       word_lstm_units=100, dropout=0.5, char_feature=True, crf=True,
                       batch_size=20, optimizer='adam', learning_rate=0.001,lr_decay=0.9,
                       clip_gradients=5.0, max_epoch=30, early_stopping=True, patience=3,
                       train_embeddings=True, max_checkpoints_to_keep=5, log_dir=None)
 
model.train(train_x, train_y, valid_x, valid_y)


Epoch 1/30
 - f1: 0.18
Epoch 2/30
 - f1: 15.35
Epoch 3/30
 - f1: 48.82
Epoch 4/30
 - f1: 63.00
Epoch 5/30
 - f1: 65.12
Epoch 6/30
 - f1: 69.07
Epoch 7/30
 - f1: 71.40
Epoch 8/30
 - f1: 67.77
Epoch 9/30
 - f1: 71.73
Epoch 10/30
 - f1: 67.60
Epoch 11/30
 - f1: 72.14
Epoch 12/30
 - f1: 69.34
Epoch 13/30
 - f1: 70.81
Epoch 14/30
 - f1: 69.29


In [64]:
words = '[ TOEFL TEST HIMASIKA ITS ] , ✅ Training TOEFL, 📆 Sabtu, 13 April 2019, ⏱ 08.30 - selesai, 📍 J103, ✅ TOEFL TEST, 📆 Senin, 15 April 2018, ⏱ 18.30 - selesai, 📍 UPT Bahasa ITS, 💸 Fee 60k, LIMITED SEAT !!!,  ,  ,  ,  ,  '
words = re.findall(r"[\w']+|[!\"#$%&\(\)*+,-./:;<=>?@\[\\\]\^_`\{\|\}~]", words)

In [65]:
model.analyze(words)

{'words': ['[',
  'TOEFL',
  'TEST',
  'HIMASIKA',
  'ITS',
  ']',
  ',',
  'Training',
  'TOEFL',
  ',',
  'Sabtu',
  ',',
  '13',
  'April',
  '2019',
  ',',
  '08',
  '.',
  '30',
  '-',
  'selesai',
  ',',
  'J103',
  ',',
  'TOEFL',
  'TEST',
  ',',
  'Senin',
  ',',
  '15',
  'April',
  '2018',
  ',',
  '18',
  '.',
  '30',
  '-',
  'selesai',
  ',',
  'UPT',
  'Bahasa',
  'ITS',
  ',',
  'Fee',
  '60k',
  ',',
  'LIMITED',
  'SEAT',
  '!',
  '!',
  '!',
  ',',
  ',',
  ',',
  ',',
  ','],
 'entities': [{'text': 'TOEFL TEST HIMASIKA',
   'type': 'ORG',
   'score': 1.0,
   'beginOffset': 1,
   'endOffset': 4},
  {'text': 'TOEFL TEST',
   'type': 'PER',
   'score': 1.0,
   'beginOffset': 24,
   'endOffset': 26},
  {'text': 'UPT Bahasa',
   'type': 'ORG',
   'score': 1.0,
   'beginOffset': 39,
   'endOffset': 41},
  {'text': 'LIMITED SEAT',
   'type': 'ORG',
   'score': 1.0,
   'beginOffset': 46,
   'endOffset': 48}]}