In [1]:
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables
from model_builders.singleton_classifier import SingletonClassifierModelBuilder
from functools import reduce

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Locations of the embedding helper files consumed by get_embedding_variables.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

# Load the embedding lookups. `embedding_matrix` is later handed to the model builders,
# while `word_vector` and `idx_by_word` are needed to load the markable data.
(
    word_vector,
    embedding_matrix,
    idx_by_word,
    word_by_idx,
) = get_embedding_variables(
    embedding_indexes_file_path,
    indexed_embedding_file_path,
)

In [3]:
# Load the training markables; the helper receives the embedding lookups
# (presumably to turn words into vocabulary indexes — verify in utils.data_helper).
data = get_markable_dataframe(
    "data/training/markables.csv",
    word_vector,
    idx_by_word,
)

# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,previous_words,next_words,is_singleton
0,1,[1],0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,[],"[2, 3, 4, 5, 6, 7, 8, 9, 10, 2]","[0.0, 1.0]"
1,2,"[4, 5]",0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[1, 2, 3]","[6, 7, 8, 9, 10, 2, 11, 12, 13, 14]","[0.0, 1.0]"
2,3,"[6, 7, 8]",0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1]",1,0,"[1, 2, 3, 4, 5]","[9, 10, 2, 11, 12, 13, 14, 15, 16, 17]","[1.0, 0.0]"
3,4,[10],0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 11, 12, 13, 14, 15, 16, 17, 18, 19]","[0.0, 1.0]"
4,5,"[12, 13]",0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[3, 4, 5, 6, 7, 8, 9, 10, 2, 11]","[14, 15, 16, 17, 18, 19, 4, 5, 20, 21]","[0.0, 1.0]"


In [4]:
# Fixed window sizes (in tokens) for each sequence feature fed to the models.
max_text_length = 10
max_prev_words_length = 10
max_next_words_length = 10

# Markable text: right-padded. `truncating='pre'` is pad_sequences' default and is spelled
# out here because it means over-long texts keep their LAST `max_text_length` tokens.
# NOTE(review): padding is 'post' while truncation is 'pre' — confirm this is intended.
data_text = pad_sequences(
    data.text, maxlen=max_text_length, padding='post', truncating='pre'
)

# Left context: keep the trailing tokens (the ones that end right before the markable in
# the previewed rows) and left-pad. `truncating='pre'` performs the same `seq[-maxlen:]`
# cut that was previously done by hand for every row.
data_previous_words = pad_sequences(
    data.previous_words, maxlen=max_prev_words_length, padding='pre', truncating='pre'
)

# Right context: keep the leading tokens and right-pad. `truncating='post'` performs the
# same `seq[:maxlen]` cut that was previously done by hand for every row.
data_next_words = pad_sequences(
    data.next_words, maxlen=max_next_words_length, padding='post', truncating='post'
)

# Flatten every row of syntactic features into one flat vector: scalar columns contribute
# a single value, list-valued columns (entity_type in the previewed rows) are spliced in
# element by element. Built in one expression so `data_syntactic` is only ever an ndarray.
data_syntactic = np.array([
    [
        value
        for feature in row
        for value in (feature if isinstance(feature, list) else [feature])
    ]
    for row in data[['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']].values
])

# Stack the per-row `is_singleton` entries (two-element lists in the previewed rows)
# into a single 2-D label array.
label = np.vstack(data.is_singleton)

# Build Model

## Words

In [5]:
# Builder for the variant that uses only the markable-words feature.
words_model_builder = SingletonClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    use_words_feature=True,
    use_context_feature=False,
    use_syntactic_feature=False,
)

In [6]:
# Create the words-only model from its builder (presumably a Keras model, given the
# fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
words_model = words_model_builder.create_model()

In [7]:
# Train the words-only model on the padded markable text for 20 epochs.
words_model.fit(
    [data_text],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x139fbfb38>

In [8]:
# Write the trained words-only model to models/singleton_classifiers/words.model.
words_model.save('models/singleton_classifiers/words.model')

## Context

In [9]:
# Builder for the variant that uses only the surrounding-context feature.
context_model_builder = SingletonClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    use_words_feature=False,
    use_context_feature=True,
    use_syntactic_feature=False,
)

In [10]:
# Create the context-only model from its builder (presumably a Keras model, given the
# fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
context_model = context_model_builder.create_model()

In [11]:
# Train the context-only model on the previous-words and next-words windows for 20 epochs.
context_model.fit(
    [
        data_previous_words,
        data_next_words,
    ],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a43c89748>

In [12]:
# Write the trained context-only model to models/singleton_classifiers/context.model.
context_model.save('models/singleton_classifiers/context.model')

## Syntactic

In [13]:
# Builder for the variant that uses only the syntactic feature vector. No embedding
# matrix is passed here; the builder is told the width of the flattened vector instead.
syntactic_model_builder = SingletonClassifierModelBuilder(
    syntactic_features_num=data_syntactic.shape[1],
    use_words_feature=False,
    use_context_feature=False,
    use_syntactic_feature=True,
)

In [14]:
# Create the syntactic-only model from its builder (presumably a Keras model, given the
# fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
syntactic_model = syntactic_model_builder.create_model()

In [15]:
# Train the syntactic-only model on the flattened syntactic vectors for 20 epochs.
syntactic_model.fit(
    [data_syntactic],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a56421a20>

In [16]:
# Write the trained syntactic-only model to models/singleton_classifiers/syntactic.model.
syntactic_model.save('models/singleton_classifiers/syntactic.model')

## Words + Context

In [17]:
# Builder for the variant that combines the markable-words and context features.
words_context_model_builder = SingletonClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    use_words_feature=True,
    use_context_feature=True,
    use_syntactic_feature=False,
)

In [18]:
# Create the words + context model from its builder (presumably a Keras model, given the
# fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
words_context_model = words_context_model_builder.create_model()

In [19]:
# Train the words + context model for 20 epochs; inputs are passed in the order
# markable text, previous words, next words.
words_context_model.fit(
    [
        data_text,
        data_previous_words,
        data_next_words,
    ],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a588a3978>

In [20]:
# Write the trained words + context model to models/singleton_classifiers/words_context.model.
words_context_model.save('models/singleton_classifiers/words_context.model')

## Words + Syntactic

In [21]:
# Builder for the variant that combines the markable-words and syntactic features.
words_syntactic_model_builder = SingletonClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    syntactic_features_num=data_syntactic.shape[1],
    use_words_feature=True,
    use_context_feature=False,
    use_syntactic_feature=True,
)

In [22]:
# Create the words + syntactic model from its builder (presumably a Keras model, given the
# fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
words_syntactic_model = words_syntactic_model_builder.create_model()

In [23]:
# Train the words + syntactic model for 20 epochs; inputs are passed in the order
# markable text, syntactic vector.
words_syntactic_model.fit(
    [
        data_text,
        data_syntactic,
    ],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a5a448e10>

In [24]:
# Write the trained words + syntactic model to models/singleton_classifiers/words_syntactic.model.
words_syntactic_model.save('models/singleton_classifiers/words_syntactic.model')

## Context + Syntactic

In [25]:
# Builder for the variant that combines the context and syntactic features.
context_syntactic_model_builder = SingletonClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    syntactic_features_num=data_syntactic.shape[1],
    use_words_feature=False,
    use_context_feature=True,
    use_syntactic_feature=True,
)

In [26]:
# Create the context + syntactic model from its builder (presumably a Keras model, given the
# fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
context_syntactic_model = context_syntactic_model_builder.create_model()

In [27]:
# Train the context + syntactic model for 20 epochs; inputs are passed in the order
# previous words, next words, syntactic vector.
context_syntactic_model.fit(
    [
        data_previous_words,
        data_next_words,
        data_syntactic,
    ],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a59e4eeb8>

In [28]:
# Write the trained context + syntactic model to models/singleton_classifiers/context_syntactic.model.
context_syntactic_model.save('models/singleton_classifiers/context_syntactic.model')

## Words + Context + Syntactic

In [29]:
# Builder for the full variant: markable words, context and syntactic features together.
words_context_syntactic_model_builder = SingletonClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    syntactic_features_num=data_syntactic.shape[1],
    use_words_feature=True,
    use_context_feature=True,
    use_syntactic_feature=True,
)

In [30]:
# Create the words + context + syntactic model from its builder (presumably a Keras model,
# given the fit()/save() calls made on it — TODO confirm in SingletonClassifierModelBuilder).
words_context_syntactic_model = words_context_syntactic_model_builder.create_model()

In [31]:
# Train the full model for 20 epochs; inputs are passed in the order
# markable text, previous words, next words, syntactic vector.
words_context_syntactic_model.fit(
    [
        data_text,
        data_previous_words,
        data_next_words,
        data_syntactic,
    ],
    label,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a5dcfacf8>

In [32]:
# Write the trained full model to models/singleton_classifiers/words_context_syntactic.model.
words_context_syntactic_model.save('models/singleton_classifiers/words_context_syntactic.model')