In [1]:
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables
from model_builders.coreference_classifier import CoreferenceClassifierModelBuilder
from functools import reduce
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Locations of the two embedding helper files read by get_embedding_variables.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

# embedding_matrix is passed to every model builder below; word_vector and
# idx_by_word are passed to get_markable_dataframe when the markables are loaded.
word_vector, embedding_matrix, idx_by_word, word_by_idx = get_embedding_variables(embedding_indexes_file_path, indexed_embedding_file_path)

In [3]:
# Load the training markables. word_vector and idx_by_word are handed to the helper
# (presumably to turn words into embedding indices -- confirm in utils.data_helper).
markables = get_markable_dataframe("data/training/markables.csv", word_vector, idx_by_word)

# Preview the first rows.
markables.head()

Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,num_words,previous_words,next_words,is_singleton
0,1,[1],0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",0,0,1,[],"[2, 3, 4, 5, 6, 7, 8, 9, 10, 2]","[0.0, 1.0]"
1,2,"[4, 5]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",0,0,2,"[1, 2, 3]","[6, 7, 8, 9, 10, 2, 11, 10, 12, 13]","[0.0, 1.0]"
2,3,"[6, 7, 8]",0,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0]",1,0,3,"[1, 2, 3, 4, 5]","[9, 10, 2, 11, 10, 12, 13, 14, 15, 16]","[1.0, 0.0]"
3,4,[10],0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",0,0,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 11, 10, 12, 13, 14, 15, 16, 17, 18]","[0.0, 1.0]"
4,5,"[10, 12]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",0,0,2,"[3, 4, 5, 6, 7, 8, 9, 10, 2, 11]","[13, 14, 15, 16, 17, 18, 4, 5, 19, 20]","[0.0, 1.0]"


In [90]:
# Mention-pair training sets, one CSV per pair-generation method.
# The "budi" set is currently disabled (its load is left commented out).
# pairs_budi = pd.read_csv("data/training/mention_pairs_budi.csv")
pairs_soon = pd.read_csv("data/training/mention_pairs_soon.csv")
pairs_gilang = pd.read_csv("data/training/mention_pairs_gilang.csv")

# Preview the first rows of the Soon-generated pairs.
pairs_soon.head()

Unnamed: 0,m1_id,m2_id,is_exact_match,is_words_match,is_substring,is_abbreviation,is_appositive,is_nearest_candidate,sentence_distance,word_distance,markable_distance,is_coreference
0,3,11,0,0,0,0,0,0,2,31,8,1
1,4,11,0,0,0,0,0,0,2,29,7,0
2,5,11,0,0,0,0,0,0,2,25,6,0
3,6,11,0,0,0,0,0,0,2,16,5,0
4,7,11,0,0,0,0,0,0,1,14,4,0


In [5]:
# Fixed sequence lengths used by get_data when padding/truncating, respectively,
# a markable's own text, the words before it, and the words after it.
max_text_length = 10
max_prev_words_length = 10
max_next_words_length = 10

def get_data(markable_ids):
    """Collect padded feature arrays for the given markables.

    Each id is looked up in the module-level ``markables`` frame. The output
    keeps the order of ``markable_ids`` (repeated ids yield repeated rows);
    if an id occurs on several rows of ``markables``, the first row is used.

    Args:
        markable_ids: iterable of values to match against ``markables['id']``.

    Returns:
        A 5-tuple ``(data_text, data_previous_words, data_next_words,
        data_syntactic, is_singleton)`` where

        * ``data_text`` is ``text`` post-padded to ``max_text_length``. Longer
          sequences are cut by ``pad_sequences``' default ``truncating='pre'``,
          i.e. their leading tokens are dropped.
        * ``data_previous_words`` keeps the last ``max_prev_words_length``
          preceding words, pre-padded.
        * ``data_next_words`` keeps the first ``max_next_words_length``
          following words, post-padded.
        * ``data_syntactic`` has one flat row per markable built from
          ``is_pronoun``, ``entity_type``, ``is_proper_name`` and
          ``is_first_person`` (list-valued cells are spliced in place).
        * ``is_singleton`` is the ``is_singleton`` column stacked row-wise.

    Raises:
        IndexError: if an id does not occur in ``markables['id']``.
    """
    # Build the id -> row-label lookup once instead of scanning the whole frame
    # with a boolean mask for every requested id. setdefault keeps the first
    # occurrence, matching the previous "first match wins" behaviour.
    first_label_by_id = {}
    for row_label, markable_id in zip(markables.index, markables['id']):
        first_label_by_id.setdefault(markable_id, row_label)

    indices = []
    for markable_id in markable_ids:
        if markable_id not in first_label_by_id:
            # Same exception type as before, but it now names the missing id.
            raise IndexError('markable id %r not found in markables' % (markable_id,))
        indices.append(first_label_by_id[markable_id])
    data = markables.loc[indices]

    data_text = pad_sequences(data.text, maxlen=max_text_length, padding='post')
    # Keep the words closest to the markable: the tail of the preceding context ...
    data_previous_words = pad_sequences(
        data.previous_words.map(lambda seq: seq[-max_prev_words_length:]),
        maxlen=max_prev_words_length,
        padding='pre',
    )
    # ... and the head of the following context.
    data_next_words = pad_sequences(
        data.next_words.map(lambda seq: seq[:max_next_words_length]),
        maxlen=max_next_words_length,
        padding='post',
    )

    syntactic_rows = data[['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']].values
    # Flatten each row: scalar cells contribute one value, list cells all of theirs.
    data_syntactic = np.array([
        [value for cell in row for value in (cell if isinstance(cell, list) else [cell])]
        for row in syntactic_rows
    ])
    is_singleton = np.vstack(data.is_singleton)

    return data_text, data_previous_words, data_next_words, data_syntactic, is_singleton

def get_pair_data(markable_ids_1, markable_ids_2):
    """Fetch get_data features for both sides of a mention pair, interleaved.

    Returns:
        A 10-tuple ordered feature by feature, first markable before second:
        ``(text_1, text_2, prev_1, prev_2, next_1, next_2,
        syntactic_1, syntactic_2, is_singleton_1, is_singleton_2)``.
    """
    first_features = get_data(markable_ids_1)
    second_features = get_data(markable_ids_2)

    # zip pairs up the matching feature of each side; flattening the pairs
    # gives the "feature_1, feature_2" ordering callers unpack.
    return tuple(
        feature
        for feature_pair in zip(first_features, second_features)
        for feature in feature_pair
    )

def get_relation_data(mention_pairs):
    """Select the pairwise relation feature columns from a mention-pair frame.

    Args:
        mention_pairs: DataFrame containing (at least) the nine relation columns.

    Returns:
        DataFrame with only the relation columns, in the fixed order below.
    """
    relation_columns = [
        'is_exact_match',
        'is_words_match',
        'is_substring',
        'is_abbreviation',
        'is_appositive',
        'is_nearest_candidate',
        'sentence_distance',
        'word_distance',
        'markable_distance',
    ]
    return mention_pairs[relation_columns]

# Soon-Generated

In [6]:
# Per-markable features for both sides of every Soon-generated pair.
(
    text_11, text_21,
    prev_11, prev_21,
    next_11, next_21,
    syntactic_11, syntactic_21,
    is_singleton_11, is_singleton_21,
) = get_pair_data(pairs_soon.m1_id, pairs_soon.m2_id)

# Pair-level relation features and two-class one-hot labels.
relation_1 = get_relation_data(pairs_soon)
label_soon = np.vstack(
    to_categorical(pairs_soon.is_coreference, num_classes=2)
)

## Build Model

### Words

In [7]:
# Builder for classifiers that use only the words-feature inputs.
words_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=True,
    use_context_feature=False,
    use_syntactic_feature=False,
    use_relation_feature=False,
    embedding_matrix=embedding_matrix,
)

In [8]:
# Create the model described by words_model_builder (fitted on the Soon-generated pairs below).
words_model_1 = words_model_builder.create_model()

In [9]:
# Fit on the text of both markables of each Soon-generated pair.
words_model_1.fit(
    [text_11, text_21],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a422f4320>

In [10]:
#words_model_1.save('models/coreference_classifiers/words_soon.model')

### Context

In [11]:
# Builder for classifiers that use only the context-feature inputs.
context_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=False,
    use_context_feature=True,
    use_syntactic_feature=False,
    use_relation_feature=False,
    embedding_matrix=embedding_matrix,
)

In [12]:
# Create the model described by context_model_builder (fitted on the Soon-generated pairs below).
context_model_1 = context_model_builder.create_model()

In [13]:
# Fit on the previous/next word windows of both markables of each Soon-generated pair.
context_model_1.fit(
    [prev_11, prev_21, next_11, next_21],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a4f851a20>

In [14]:
#context_model_1.save('models/coreference_classifiers/context_soon.model')

### Syntactic

In [15]:
# Builder for classifiers that use the syntactic and relation feature inputs;
# the feature widths are taken from the Soon-generated arrays.
syntactic_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=False,
    use_context_feature=False,
    use_syntactic_feature=True,
    use_relation_feature=True,
    embedding_matrix=embedding_matrix,
    syntactic_features_num=syntactic_11.shape[1],
    relation_features_num=relation_1.shape[1],
)

In [16]:
# Create the model described by syntactic_model_builder (fitted on the Soon-generated pairs below).
syntactic_model_1 = syntactic_model_builder.create_model()

In [17]:
# Fit on the syntactic features of both markables plus the pair relation features.
syntactic_model_1.fit(
    [syntactic_11, syntactic_21, relation_1],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a4e2ec470>

In [24]:
#syntactic_model_1.save('models/coreference_classifiers/syntactic_soon.model')

### Words + Context

In [20]:
# Builder for classifiers that combine the words and context feature inputs.
words_context_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=True,
    use_context_feature=True,
    use_syntactic_feature=False,
    use_relation_feature=False,
    embedding_matrix=embedding_matrix,
)

In [21]:
# Create the model described by words_context_model_builder (fitted on the Soon-generated pairs below).
words_context_model_1 = words_context_model_builder.create_model()

In [22]:
# Fit on markable text plus previous/next word windows of each Soon-generated pair.
words_context_model_1.fit(
    [
        text_11, text_21,
        prev_11, prev_21,
        next_11, next_21,
    ],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a5e70ffd0>

In [26]:
#words_context_model_1.save('models/coreference_classifiers/words_context_soon.model')

### Words + Syntactic

In [44]:
# Builder for classifiers that combine the words, syntactic and relation feature inputs;
# the feature widths are taken from the Soon-generated arrays.
words_syntactic_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=True,
    use_context_feature=False,
    use_syntactic_feature=True,
    use_relation_feature=True,
    embedding_matrix=embedding_matrix,
    syntactic_features_num=syntactic_11.shape[1],
    relation_features_num=relation_1.shape[1],
)

In [30]:
# Create the model described by words_syntactic_model_builder (fitted on the Soon-generated pairs below).
words_syntactic_model_1 = words_syntactic_model_builder.create_model()

In [31]:
# Fit on markable text, syntactic features and pair relation features (Soon-generated pairs).
words_syntactic_model_1.fit(
    [
        text_11, text_21,
        syntactic_11, syntactic_21,
        relation_1,
    ],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a5defdf98>

In [33]:
#words_syntactic_model_1.save('models/coreference_classifiers/words_syntactic_soon.model')

### Context + Syntactic

In [36]:
# Builder for classifiers that combine the context, syntactic and relation feature inputs;
# the feature widths are taken from the Soon-generated arrays.
context_syntactic_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=False,
    use_context_feature=True,
    use_syntactic_feature=True,
    use_relation_feature=True,
    embedding_matrix=embedding_matrix,
    syntactic_features_num=syntactic_11.shape[1],
    relation_features_num=relation_1.shape[1],
)

In [37]:
# Create the model described by context_syntactic_model_builder (fitted on the Soon-generated pairs below).
context_syntactic_model_1 = context_syntactic_model_builder.create_model()

In [38]:
# Fit on word windows, syntactic features and pair relation features (Soon-generated pairs).
context_syntactic_model_1.fit(
    [
        prev_11, prev_21,
        next_11, next_21,
        syntactic_11, syntactic_21,
        relation_1,
    ],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a64045630>

In [40]:
#context_syntactic_model_1.save('models/coreference_classifiers/context_syntactic_soon.model')

### Words + Context + Syntactic

In [43]:
# Builder for classifiers that use every feature group (words, context, syntactic, relation);
# the feature widths are taken from the Soon-generated arrays.
words_context_syntactic_model_builder = CoreferenceClassifierModelBuilder(
    use_words_feature=True,
    use_context_feature=True,
    use_syntactic_feature=True,
    use_relation_feature=True,
    embedding_matrix=embedding_matrix,
    syntactic_features_num=syntactic_11.shape[1],
    relation_features_num=relation_1.shape[1],
)

In [45]:
# Create the model described by words_context_syntactic_model_builder (fitted on the Soon-generated pairs below).
words_context_syntactic_model_1 = words_context_syntactic_model_builder.create_model()

In [58]:
# Fit on all feature groups of each Soon-generated pair.
words_context_syntactic_model_1.fit(
    [
        text_11, text_21,
        prev_11, prev_21,
        next_11, next_21,
        syntactic_11, syntactic_21,
        relation_1,
    ],
    label_soon,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a70b30278>

In [59]:
#words_context_syntactic_model_1.save('models/coreference_classifiers/words_context_syntactic_soon.model')

# Gilang-Generated

In [91]:
# Per-markable features for both sides of every Gilang-generated pair.
(
    text_12, text_22,
    prev_12, prev_22,
    next_12, next_22,
    syntactic_12, syntactic_22,
    is_singleton_12, is_singleton_22,
) = get_pair_data(pairs_gilang.m1_id, pairs_gilang.m2_id)

# Pair-level relation features and two-class one-hot labels.
relation_2 = get_relation_data(pairs_gilang)
label_gilang = np.vstack(
    to_categorical(pairs_gilang.is_coreference, num_classes=2)
)

## Build Model

### Words

In [92]:
# Second model from the same words_model_builder, fitted on the Gilang-generated pairs below.
words_model_2 = words_model_builder.create_model()

In [93]:
# Fit on the text of both markables of each Gilang-generated pair.
words_model_2.fit(
    [text_12, text_22],
    label_gilang,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a66bd1518>

In [94]:
# Write the Gilang-trained words model to disk.
words_model_2.save('models/coreference_classifiers/words_gilang.model')

### Context

In [None]:
# Second model from the same context_model_builder, fitted on the Gilang-generated pairs below.
context_model_2 = context_model_builder.create_model()

In [None]:
# Fit on the previous/next word windows of both markables of each Gilang-generated pair.
context_model_2.fit(
    [prev_12, prev_22, next_12, next_22],
    label_gilang,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x1a97027f98>

In [None]:
# Write the Gilang-trained context model to disk.
context_model_2.save('models/coreference_classifiers/context_gilang.model')

### Syntactic

In [None]:
# Second model from the same syntactic_model_builder, fitted on the Gilang-generated pairs below.
syntactic_model_2 = syntactic_model_builder.create_model()

In [None]:
# Fit on the syntactic features of both markables plus the pair relation features.
syntactic_model_2.fit(
    [syntactic_12, syntactic_22, relation_2],
    label_gilang,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a9526ddd8>

In [None]:
# Write the Gilang-trained syntactic model to disk.
syntactic_model_2.save('models/coreference_classifiers/syntactic_gilang.model')

### Words + Context

In [None]:
# Second model from the same words_context_model_builder, fitted on the Gilang-generated pairs below.
words_context_model_2 = words_context_model_builder.create_model()

In [None]:
# Fit on markable text plus previous/next word windows of each Gilang-generated pair.
words_context_model_2.fit(
    [
        text_12, text_22,
        prev_12, prev_22,
        next_12, next_22,
    ],
    label_gilang,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [None]:
# Write the Gilang-trained words + context model to disk.
words_context_model_2.save('models/coreference_classifiers/words_context_gilang.model')

### Words + Syntactic

In [None]:
# Second model from the same words_syntactic_model_builder, fitted on the Gilang-generated pairs below.
words_syntactic_model_2 = words_syntactic_model_builder.create_model()

In [105]:
# Fit on markable text, syntactic features and pair relation features (Gilang-generated pairs).
words_syntactic_model_2.fit(
    [
        text_12, text_22,
        syntactic_12, syntactic_22,
        relation_2,
    ],
    label_gilang,
    epochs=10,
)

Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a7f666d30>

In [106]:
# Write the Gilang-trained words + syntactic model to disk.
words_syntactic_model_2.save('models/coreference_classifiers/words_syntactic_gilang.model')

### Context + Syntactic

In [107]:
# Second model from the same context_syntactic_model_builder, fitted on the Gilang-generated pairs below.
context_syntactic_model_2 = context_syntactic_model_builder.create_model()

In [108]:
# Fit on word windows, syntactic features and pair relation features (Gilang-generated pairs).
context_syntactic_model_2.fit(
    [
        prev_12, prev_22,
        next_12, next_22,
        syntactic_12, syntactic_22,
        relation_2,
    ],
    label_gilang,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x1ab8e2d4a8>

In [109]:
# Write the Gilang-trained context + syntactic model to disk.
context_syntactic_model_2.save('models/coreference_classifiers/context_syntactic_gilang.model')

### Words + Context + Syntactic

In [110]:
# Second model from the same words_context_syntactic_model_builder, fitted on the Gilang-generated pairs below.
words_context_syntactic_model_2 = words_context_syntactic_model_builder.create_model()

In [111]:
# Fit on all feature groups of each Gilang-generated pair.
words_context_syntactic_model_2.fit(
    [
        text_12, text_22,
        prev_12, prev_22,
        next_12, next_22,
        syntactic_12, syntactic_22,
        relation_2,
    ],
    label_gilang,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x1abcd612b0>

In [112]:
# Write the Gilang-trained words + context + syntactic model to disk.
words_context_syntactic_model_2.save('models/coreference_classifiers/words_context_syntactic_gilang.model')