In [1]:
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables
from model_builders.antecedentless_classifier import AntecedentlessClassifierModelBuilder
from functools import reduce

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Locations of the two embedding helper files, relative to the notebook's working directory.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

# get_embedding_variables is a project helper (utils.data_helper) that returns four values.
# NOTE(review): judging by the names, these are presumably the word->vector mapping, the
# embedding matrix handed to the model builder, and the word<->index lookups — confirm
# against the helper's implementation.
word_vector, embedding_matrix, idx_by_word, word_by_idx = get_embedding_variables(embedding_indexes_file_path, indexed_embedding_file_path)

In [3]:
# Load the training markables through the project helper, handing it the embedding
# lookups loaded earlier in the notebook.
markables_csv_path = "data/training/markables.csv"
data = get_markable_dataframe(markables_csv_path, word_vector, idx_by_word)

# Show the first rows as a quick sanity check of the parsed columns.
data.head()

['EVENT', 'FACILITY', 'LOCATION', 'NUM', 'ORGANIZATION', 'OTHER', 'PERSON', 'THINGS', 'TIME', 'TITLE']


Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,previous_words,next_words,all_previous_words,is_singleton,is_antecedentless
0,1,[1],0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,[],"[2, 3, 4, 5, 6, 7, 8, 9, 10, 2]",[],"[0.0, 1.0]","[0.0, 1.0]"
1,2,"[4, 5]",0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[1, 2, 3]","[6, 7, 8, 9, 10, 2, 11, 12, 13, 14]","[1, 2, 3]","[0.0, 1.0]","[0.0, 1.0]"
2,3,"[6, 7, 8]",0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1]",1,0,"[1, 2, 3, 4, 5]","[9, 10, 2, 11, 12, 13, 14, 15, 16, 17]","[1, 2, 3, 4, 5]","[1.0, 0.0]","[0.0, 1.0]"
3,4,[10],0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 11, 12, 13, 14, 15, 16, 17, 18, 19]","[1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.0, 1.0]","[0.0, 1.0]"
4,5,"[12, 13]",0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[3, 4, 5, 6, 7, 8, 9, 10, 2, 11]","[14, 15, 16, 17, 18, 19, 4, 5, 20, 21]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11]","[0.0, 1.0]","[0.0, 1.0]"


In [58]:
# --- Turn the markable dataframe into fixed-size arrays for the model inputs ---

# Every markable's own token ids are padded/truncated to this many positions.
max_text_length = 10
# Longest left context found in the dataset; using it as maxlen means no context
# sequence is ever truncated.
max_prev_words_length = max(map(len, data.all_previous_words))

# Markable text: padded on the right. With pad_sequences' default truncating='pre',
# a text longer than max_text_length keeps its last max_text_length ids.
data_text = pad_sequences(data.text, maxlen=max_text_length, padding='post')

# Left context: padded on the left so the words nearest the markable stay at the end.
# No pre-slicing is needed because maxlen already equals the longest sequence.
data_all_previous_words = pad_sequences(data.all_previous_words, maxlen=max_prev_words_length, padding='pre')

# Syntactic features: scalar flags plus the list-valued entity_type column, flattened
# into one flat feature row per markable.
syntactic_columns = ['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']
data_syntactic = np.array([
    [value for cell in row for value in (cell if isinstance(cell, list) else [cell])]
    for row in data[syntactic_columns].values
])

# One row per markable, built from the per-row is_antecedentless lists.
label = np.vstack(data.is_antecedentless)

# Build Model

## Words + Context + Numeric

In [69]:
# Configure a builder with all three feature groups switched on: the markable's words,
# its context, and the flattened syntactic features.
feature_flags = dict(
    use_words_feature=True,
    use_context_feature=True,
    use_syntactic_feature=True,
)

words_context_numeric_model_builder = AntecedentlessClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    # Width of one flattened syntactic feature row.
    syntactic_features_num=data_syntactic.shape[1],
    **feature_flags
)

In [70]:
# Ask the configured builder for the model object that is trained with .fit() in the next cell.
words_context_numeric_model = words_context_numeric_model_builder.create_model()

In [73]:
# Train on the three prepared inputs: markable text, left context, syntactic features.
# NOTE(review): the list order presumably has to match the input order of the model
# returned by the builder — confirm against AntecedentlessClassifierModelBuilder.
# NOTE(review): the saved output under this cell shows "Epoch 1/20" and a
# KeyboardInterrupt, so it comes from an earlier, interrupted run with a different
# epochs value; re-run the cell to refresh it.
model_inputs = [data_text, data_all_previous_words, data_syntactic]
words_context_numeric_model.fit(model_inputs, label, epochs=5)

Epoch 1/20
 704/5032 [===>..........................] - ETA: 2:27 - loss: 0.3454 - acc: 0.8494

KeyboardInterrupt: 