In [414]:
from gensim.models import Word2Vec
from string import punctuation
import pandas as pd
import numpy as np
from stem import IndonesianStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input, Dropout, Concatenate, Reshape, Conv2D, MaxPool2D
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from functools import reduce

In [4]:
word_vector = Word2Vec.load('word2vec/id.bin')

In [5]:
stemmer = IndonesianStemmer()

def is_number(word):
    return word.replace(',', '.').replace('.', '').replace('-', '', 1).isdigit()
    
def preprocess_word(word):
    if '\\' in word:
        word = word.split('\\')[0]
    
    while word[-1] in punctuation and len(word) > 1:
        word = word[:-1]
    
    while word[0] in punctuation and len(word) > 1:
        word = word[1:]
        
    word = word.lower()
        
    if word not in word_vector.wv:
        tmp = word.split('-')
        if len(tmp) == 2 and tmp[0] == tmp[1]:
            word = tmp[0]
            
    if word not in word_vector.wv:
        word = stemmer.stem(word)

    if word not in word_vector.wv:
        tmp = word.split('-')
        if len(tmp) == 2 and tmp[0] == tmp[1]:
            word = tmp[0]
            
    if word not in word_vector.wv:
        word = stemmer.stem(word)
        
    if is_number(word):
        word = '<angka>'
        
    return word

def preprocess_sentence(sentence):
    return ' '.join([preprocess_word(word) for word in sentence.split() if preprocess_word(word) != ''])

def preprocess_arr(arr):
    return [preprocess_word(word) for word in arr if preprocess_word(word) != '']

In [6]:
def get_entity_types(labels):
    entity_types = set()
    
    for label in labels:
        for entity_type in label.split('|'):
            entity_types.add(entity_type)
    
    return list(entity_types)

def entity_to_bow(entities):
    idx = {entities[i]: i for i in range(len(entities))}
    
    def f(label):
        bow = [0 for _ in entities]
        
        for entity_type in label.split('|'):
            bow[idx[entity_type]] = 1
        
        return bow
    
    return f

def entity_to_id(entities):
    idx = {entities[i]: i for i in range(len(entities))}
    
    def f(label):
        return idx[label]
    
    return f

In [256]:
markables = pd.read_csv('markables.csv')
markables.previous_words = markables.previous_words.map(lambda x: preprocess_arr(eval(x)))
markables.next_words = markables.next_words.map(lambda x: preprocess_arr(eval(x)))
markables.entity = markables.entity.map(entity_to_bow(get_entity_types(markables.entity)))
markables.first_pos_tag = markables.first_pos_tag.map(entity_to_id(get_entity_types(markables.first_pos_tag)))
markables.is_singleton = markables.is_singleton.map(int)
markables.text = markables.text.map(preprocess_sentence)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(markables.text)
tokenizer.fit_on_texts(markables.previous_words)
tokenizer.fit_on_texts(markables.next_words)

markables.previous_words = tokenizer.texts_to_sequences(markables.previous_words)
markables.next_words = tokenizer.texts_to_sequences(markables.next_words)
markables.text = tokenizer.texts_to_sequences(markables.text)

markables.is_singleton = markables.is_singleton.map(lambda x: to_categorical(x, num_classes=2))

In [257]:
markables.head()

Unnamed: 0,id,text,num_words,first_pos_tag,entity,is_singleton,previous_words,next_words
0,1,[3440],1,15,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0.0, 1.0]",[],"[12, 731, 1955, 621, 38, 1057, 778, 3289, 283,..."
1,2,"[1955, 621]",2,15,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.0, 1.0]","[3440, 12, 731]","[38, 1057, 778, 3289, 283, 12, 1091, 283, 132, 2]"
2,3,"[38, 1057, 778]",3,16,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1.0, 0.0]","[3440, 12, 731, 1955, 621]","[3289, 283, 12, 1091, 283, 132, 2, 1926, 41, 302]"
3,4,[283],1,15,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.0, 1.0]","[3440, 12, 731, 1955, 621, 38, 1057, 778, 3289]","[12, 1091, 283, 132, 2, 1926, 41, 302, 9, 1754]"
4,5,"[283, 132]",2,15,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.0, 1.0]","[731, 1955, 621, 38, 1057, 778, 3289, 283, 12,...","[2, 1926, 41, 302, 9, 1754, 1955, 621, 3441, 214]"


In [258]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

def sequence_to_text(sequence):
    return ' '.join([reverse_word_map[word_id] for word_id in sequence if word_id > 0])

not_found = set()
found = set()
total_found = 0
total_not_found = 0

for sentence in markables.text.map(sequence_to_text):
    for word in sentence.split():
        if word not in word_vector.wv:
            not_found.add(word)
            total_not_found += 1
        else:
            found.add(word)
            total_found += 1

for sentence in list(markables.previous_words.map(sequence_to_text)) + list(markables.next_words.map(sequence_to_text)):
    for word in sentence.split():
        if word not in word_vector.wv:
            not_found.add(word)
            total_not_found += 1
        else:
            found.add(word)
            total_found += 1

print("found: %d" % len(found))
print("not found: %d" % len(not_found))
print("total found: %d" % total_found)
print("total not found: %d" % total_not_found)

found: 3065
not found: 570
total found: 140989
total not found: 16018


In [259]:
min_val = min(map(min, word_vector.wv.vectors))
max_val = max(map(max, word_vector.wv.vectors))

vocab_size = len(tokenizer.word_index) + 1
vector_size = word_vector.wv.vector_size

embedding_matrix = np.zeros((vocab_size, vector_size))

for word, i in tokenizer.word_index.items():
    if word in word_vector.wv:
        embedding_matrix[i] = word_vector.wv[word]
    else:
        embedding_matrix[i] = np.random.rand(300) * (max_val - min_val) + min_val

In [328]:
max_text_length = 10
max_prev_words_length = 5
max_next_words_length = 5

padded_text = pad_sequences(markables.text, maxlen=max_text_length, padding='post')
padded_prev_words = pad_sequences(markables.previous_words.map(lambda seq: seq[(-1*max_prev_words_length):]), maxlen=max_prev_words_length, padding='pre')
padded_next_words = pad_sequences(markables.next_words.map(lambda seq: seq[:max_next_words_length]), maxlen=max_next_words_length, padding='post')

In [381]:
train_idx, test_idx = train_test_split([i for i in range(len(padded_text))], test_size=0.3)

padded_text_train = padded_text[train_idx]
padded_text_test = padded_text[test_idx]

padded_prev_words_train = padded_prev_words[train_idx]
padded_prev_words_test = padded_prev_words[test_idx]

padded_next_words_train = padded_next_words[train_idx]
padded_next_words_test = padded_next_words[test_idx]

numeric_train = markables[['num_words', 'first_pos_tag', 'entity']].iloc[train_idx]
numeric_test = markables[['num_words', 'first_pos_tag', 'entity']].iloc[test_idx]

label_train = markables.is_singleton[train_idx]
label_test = markables.is_singleton[test_idx]


numeric_train = np.array(list(map(lambda p: reduce(lambda x,y: x + y, [i if type(i) is list else [i] for i in p]), numeric_train.values)))
numeric_test = np.array(list(map(lambda p: reduce(lambda x,y: x + y, [i if type(i) is list else [i] for i in p]), numeric_test.values)))

In [534]:
filter_sizes = [2,3,4]
num_filters = 64

words_input = Input(shape=(max_text_length,))
words_embedding = Embedding(vocab_size, vector_size, weights=[embedding_matrix], input_length=10, trainable=False)(words_input)
reshape_words = Reshape((max_text_length,vector_size,1))(words_embedding)
conv_0_words = Conv2D(num_filters, kernel_size=(filter_sizes[0], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_words)
conv_1_words = Conv2D(num_filters, kernel_size=(filter_sizes[1], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_words)
conv_2_words = Conv2D(num_filters, kernel_size=(filter_sizes[2], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_words)
maxpool_0_words = MaxPool2D(pool_size=(max_text_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0_words)
maxpool_1_words = MaxPool2D(pool_size=(max_text_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1_words)
maxpool_2_words = MaxPool2D(pool_size=(max_text_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2_words)
words_representation = Concatenate(axis=1)([maxpool_0_words, maxpool_1_words, maxpool_2_words])
words_representation = Flatten()(words_representation)
words_representation = Dense(16, activation='sigmoid')(words_representation)

prev_words_input = Input(shape=(max_prev_words_length,))
prev_words_embedding = Embedding(vocab_size, vector_size, weights=[embedding_matrix], input_length=10, trainable=False)(prev_words_input)
reshape_prev_words = Reshape((max_prev_words_length,vector_size,1))(prev_words_embedding)
conv_0_prev_words = Conv2D(num_filters, kernel_size=(filter_sizes[0], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_prev_words)
conv_1_prev_words = Conv2D(num_filters, kernel_size=(filter_sizes[1], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_prev_words)
conv_2_prev_words = Conv2D(num_filters, kernel_size=(filter_sizes[2], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_prev_words)
maxpool_0_prev_words = MaxPool2D(pool_size=(max_prev_words_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0_prev_words)
maxpool_1_prev_words = MaxPool2D(pool_size=(max_prev_words_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1_prev_words)
maxpool_2_prev_words = MaxPool2D(pool_size=(max_prev_words_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2_prev_words)
prev_words_representation = Concatenate(axis=1)([maxpool_0_prev_words, maxpool_1_prev_words, maxpool_2_prev_words])
prev_words_representation = Flatten()(prev_words_representation)
prev_words_representation = Dense(16, activation='sigmoid')(prev_words_representation)

next_words_input = Input(shape=(max_next_words_length,))
next_words_embedding = Embedding(vocab_size, vector_size, weights=[embedding_matrix], input_length=10, trainable=False)(next_words_input)
reshape_next_words = Reshape((max_next_words_length,vector_size,1))(next_words_embedding)
conv_0_next_words = Conv2D(num_filters, kernel_size=(filter_sizes[0], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_next_words)
conv_1_next_words = Conv2D(num_filters, kernel_size=(filter_sizes[1], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_next_words)
conv_2_next_words = Conv2D(num_filters, kernel_size=(filter_sizes[2], vector_size), padding='valid', kernel_initializer='normal', activation='relu')(reshape_next_words)
maxpool_0_next_words = MaxPool2D(pool_size=(max_next_words_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0_next_words)
maxpool_1_next_words = MaxPool2D(pool_size=(max_next_words_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1_next_words)
maxpool_2_next_words = MaxPool2D(pool_size=(max_next_words_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2_next_words)
next_words_representation = Concatenate(axis=1)([maxpool_0_next_words, maxpool_1_next_words, maxpool_2_next_words])
next_words_representation = Flatten()(next_words_representation)
next_words_representation = Dense(16, activation='sigmoid')(next_words_representation)

context_representation = Concatenate()([prev_words_representation, next_words_representation])
context_representation = Dense(16, activation='sigmoid')(context_representation)

numeric_input = Input(shape=(12,))
numeric_representation = Dense(64, activation='relu')(numeric_input)
numeric_representation = Dropout(0.2)(numeric_representation)
numeric_representation = Dense(32, activation='relu')(numeric_representation)
numeric_representation = Dropout(0.2)(numeric_representation)
numeric_representation = Dense(16, activation='sigmoid')(numeric_representation)

markable_representation = Concatenate()([words_representation, context_representation, numeric_representation])
# markable_representation = Dense(64, activation='relu')(markable_representation)
# markable_representation = Dropout(0.2)(markable_representation)
markable_representation = Dense(32, activation='relu')(markable_representation)
markable_representation = Dropout(0.2)(markable_representation)
# markable_representation = Dense(16, activation='relu')(markable_representation)
# markable_representation = Dropout(0.2)(markable_representation)
markable_representation = Dense(8, activation='relu')(markable_representation)
markable_representation = Dropout(0.2)(markable_representation)

output_layer = Dense(2, activation='softmax')(markable_representation)

model = Model(inputs=[words_input, prev_words_input, next_words_input, numeric_input], outputs=[output_layer])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
# fit the model
model.fit([padded_text_train, padded_prev_words_train, padded_next_words_train, numeric_train], np.stack(label_train.values), epochs=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_184 (InputLayer)          (None, 5)            0                                            
__________________________________________________________________________________________________
input_185 (InputLayer)          (None, 5)            0                                            
__________________________________________________________________________________________________
embedding_161 (Embedding)       (None, 5, 300)       1090800     input_184[0][0]                  
__________________________________________________________________________________________________
embedding_162 (Embedding)       (None, 5, 300)       1090800     input_185[0][0]                  
__________________________________________________________________________________________________
input_183 

<tensorflow.python.keras.callbacks.History at 0x1afadead68>

In [536]:
def get_classes(output, threshold=0.5):
    return list(map(lambda x: 1 if x[1] > threshold else 0, output))

pred = model.predict([padded_text_test, padded_prev_words_test, padded_next_words_test, numeric_test])

for i in range(1, 10):
    print('threshold %f:' % (i * 0.1))
    print(classification_report(get_classes(label_test), list(get_classes(pred, i*0.1))))


threshold 0.100000:
              precision    recall  f1-score   support

           0       0.59      0.49      0.54       278
           1       0.93      0.95      0.94      1858

   micro avg       0.89      0.89      0.89      2136
   macro avg       0.76      0.72      0.74      2136
weighted avg       0.88      0.89      0.88      2136

threshold 0.200000:
              precision    recall  f1-score   support

           0       0.57      0.53      0.55       278
           1       0.93      0.94      0.93      1858

   micro avg       0.89      0.89      0.89      2136
   macro avg       0.75      0.73      0.74      2136
weighted avg       0.88      0.89      0.88      2136

threshold 0.300000:
              precision    recall  f1-score   support

           0       0.55      0.54      0.55       278
           1       0.93      0.93      0.93      1858

   micro avg       0.88      0.88      0.88      2136
   macro avg       0.74      0.74      0.74      2136
weighted avg  

In [540]:
from random import randint

n = 20

start = randint(0, len(label_test) - n)

pred = get_classes(model.predict([padded_text_test, padded_prev_words_test, padded_next_words_test, numeric_test])[start:start+n], 0.6)
lab = get_classes(label_test[start:start+n])

for a, b, c in zip(list(map(sequence_to_text, padded_text_test))[start:start+n], lab, pred):
    print(a,'| label:', b, '| prediksi:', c)

perusahaan minyak asing | label: 1 | prediksi: 1
nya | label: 1 | prediksi: 1
majalah forbes | label: 1 | prediksi: 1
pendiri facebook | label: 0 | prediksi: 0
country director | label: 0 | prediksi: 1
semester pertama | label: 1 | prediksi: 1
nya | label: 1 | prediksi: 0
stabilitas nilai tukar rupiah | label: 1 | prediksi: 1
rp angka triliun | label: 1 | prediksi: 1
nya | label: 1 | prediksi: 0
kesepakatan | label: 1 | prediksi: 1
nya | label: 0 | prediksi: 0
jakarta | label: 1 | prediksi: 1
stimulus fiskal | label: 1 | prediksi: 1
hari minggu | label: 1 | prediksi: 1
neraca pembayaran angka | label: 1 | prediksi: 1
itu | label: 1 | prediksi: 1
icmi | label: 0 | prediksi: 0
peningkatan produksi ini | label: 1 | prediksi: 1
pemda papua | label: 1 | prediksi: 1


In [541]:
model.save('singleton_classifier.model')