In [1]:
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables
from model_builders.coreference_classifier import CoreferenceClassifierModelBuilder
from functools import reduce
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Locations of the two embedding helper files consumed by get_embedding_variables.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

# The helper hands back four objects; later cells use word_vector and
# idx_by_word to load the markables and embedding_matrix to build the model.
(
    word_vector,
    embedding_matrix,
    idx_by_word,
    word_by_idx,
) = get_embedding_variables(
    embedding_indexes_file_path,
    indexed_embedding_file_path,
)

In [3]:
# Load the training markables through the project helper, which also receives
# the embedding lookups obtained above.
markables = get_markable_dataframe(
    "data/training/markables.csv",
    word_vector,
    idx_by_word,
)

# Preview the first rows.
markables.head()

Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,num_words,previous_words,next_words,is_singleton
0,1,[1],0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,1,[],"[2, 3, 4, 5, 6, 7, 8, 9, 10, 2]","[0.0, 1.0]"
1,2,"[4, 5]",0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,2,"[1, 2, 3]","[6, 7, 8, 9, 10, 2, 11, 10, 12, 13]","[0.0, 1.0]"
2,3,"[6, 7, 8]",0,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]",1,0,3,"[1, 2, 3, 4, 5]","[9, 10, 2, 11, 10, 12, 13, 14, 15, 16]","[1.0, 0.0]"
3,4,[10],0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 11, 10, 12, 13, 14, 15, 16, 17, 18]","[0.0, 1.0]"
4,5,"[10, 12]",0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,2,"[3, 4, 5, 6, 7, 8, 9, 10, 2, 11]","[13, 14, 15, 16, 17, 18, 4, 5, 19, 20]","[0.0, 1.0]"


In [4]:
# Three mention-pair tables, one CSV per pair-generation scheme
# (budi, soon, gilang), read in that order.
pairs_budi, pairs_soon, pairs_gilang = map(
    pd.read_csv,
    (
        "data/training/mention_pairs_budi.csv",
        "data/training/mention_pairs_soon.csv",
        "data/training/mention_pairs_gilang.csv",
    ),
)

# Preview the first rows of the gilang table.
pairs_gilang.head()

Unnamed: 0,m1_id,m2_id,is_exact_match,is_words_match,is_substring,is_abbreviation,is_appositive,is_nearest_candidate,sentence_distance,word_distance,markable_distance,is_coreference
0,3,11,0,0,0,0,0,0,2,31,8,1
1,3,4,0,0,0,0,0,1,0,1,1,0
2,4,11,0,0,0,0,0,0,2,29,7,0
3,3,5,0,0,0,0,0,0,0,4,2,0
4,5,11,0,0,0,0,0,0,2,25,6,0


In [None]:
# Fixed lengths that get_data pads / cuts each sequence to:
# the markable text, the words before it and the words after it.
max_text_length, max_prev_words_length, max_next_words_length = 10, 10, 10

def get_data(markable_ids):
    """Collect the model inputs for the given markables, in the order given.

    Each id is resolved to the first row of the module-level ``markables``
    frame whose ``id`` column equals it; an id may appear more than once.

    Args:
        markable_ids: iterable of values to look up in ``markables['id']``.

    Returns:
        A 5-tuple ``(data_text, data_previous_words, data_next_words,
        data_syntactic, is_singleton)``:

        * ``data_text`` - ``text`` padded at the end to ``max_text_length``.
        * ``data_previous_words`` - the last ``max_prev_words_length`` items
          of ``previous_words``, padded at the front.
        * ``data_next_words`` - the first ``max_next_words_length`` items of
          ``next_words``, padded at the end.
        * ``data_syntactic`` - per row, the values of ``is_pronoun``,
          ``entity_type``, ``is_proper_name`` and ``is_first_person`` with
          list-valued cells flattened in place.
        * ``is_singleton`` - the ``is_singleton`` cells stacked row-wise.

    Raises:
        IndexError: if an id is not present in ``markables['id']``.
    """
    # Build the id -> row label lookup once instead of scanning the whole id
    # column for every requested id. setdefault keeps the first matching row.
    first_index_by_id = {}
    for row_label, markable_id in zip(markables.index, markables['id']):
        first_index_by_id.setdefault(markable_id, row_label)

    try:
        indices = [first_index_by_id[markable_id] for markable_id in markable_ids]
    except KeyError as missing:
        # Same exception type as the previous `.tolist()[0]` lookup, but the
        # message now says which id could not be found.
        raise IndexError('markable id {} not found in markables'.format(missing)) from None

    data = markables.loc[indices]

    # NOTE(review): `truncating` is not passed, so the Keras default ('pre')
    # applies and a text longer than max_text_length loses its first tokens.
    # Confirm that keeping the tail of the mention is intended.
    data_text = pad_sequences(data.text, maxlen=max_text_length, padding='post')
    data_previous_words = pad_sequences(
        data.previous_words.map(lambda seq: seq[-max_prev_words_length:]),
        maxlen=max_prev_words_length,
        padding='pre',
    )
    data_next_words = pad_sequences(
        data.next_words.map(lambda seq: seq[:max_next_words_length]),
        maxlen=max_next_words_length,
        padding='post',
    )

    # Flatten each row: list cells contribute all their items, scalar cells
    # contribute themselves, in column order.
    syntactic_columns = ['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']
    data_syntactic = np.array([
        [value
         for cell in row
         for value in (cell if isinstance(cell, list) else [cell])]
        for row in data[syntactic_columns].values
    ])
    is_singleton = np.vstack(data.is_singleton)

    return data_text, data_previous_words, data_next_words, data_syntactic, is_singleton

def get_pair_data(markable_ids_1, markable_ids_2):
    """Fetch get_data() features for both sides of a list of markable pairs.

    Args:
        markable_ids_1: ids of the first markable of each pair.
        markable_ids_2: ids of the second markable of each pair.

    Returns:
        A 10-tuple that interleaves the two get_data() results feature by
        feature: text_1, text_2, prev_1, prev_2, next_1, next_2,
        syntactic_1, syntactic_2, is_singleton_1, is_singleton_2.
    """
    first_side = get_data(markable_ids_1)
    second_side = get_data(markable_ids_2)

    # zip pairs up the same feature from both sides; flattening the pairs
    # yields the alternating "side 1, side 2" layout.
    return tuple(
        feature
        for feature_pair in zip(first_side, second_side)
        for feature in feature_pair
    )

# Gilang-Generated

In [None]:
# Features for the first 100 gilang pairs only; each line below holds the
# same feature for markable 1 and markable 2 of the pair.
(
    text_11, text_21,
    prev_11, prev_21,
    next_11, next_21,
    syntactic_11, syntactic_21,
    is_singleton_11, is_singleton_21,
) = get_pair_data(pairs_gilang.m1_id[0:100], pairs_gilang.m2_id[0:100])

# Two-class one-hot labels for every gilang pair (not just the first 100).
label_gilang = np.vstack(to_categorical(pairs_gilang.is_coreference, num_classes=2))

## Build Model

### Word

In [None]:
# Builder configured with only the words feature switched on; the context,
# syntactic and relation features are all disabled.
words_model_builder = CoreferenceClassifierModelBuilder(
    embedding_matrix=embedding_matrix,
    use_words_feature=True,
    use_context_feature=False,
    use_syntactic_feature=False,
    use_relation_feature=False
)

In [None]:
# Instantiate the model from the words-only builder configured above.
words_model = words_model_builder.create_model()

In [None]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
words_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 300)      1085700     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 300)      1085700     input_2[0][0]                    
__________________________________________________________________________________________________
reshape (R

In [None]:
# Train on the text inputs of both markables. label_gilang covers every gilang
# pair, so it is cut to the first 100 rows to line up with text_11 / text_21,
# which were extracted from pairs_gilang[0:100] only.
words_model.fit([text_11, text_21], label_gilang[0:100], epochs=10)

Epoch 1/10
