In [5]:
import json
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables, get_sentence_variables, get_document_id_variables
from model_builders.coreference_classifier import CoreferenceClassifierModelBuilder
from functools import reduce
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from utils.clusterers import BestFirstClusterer, get_anaphora_scores_by_antecedent, ClosestFirstClusterer
from utils.scorers import MUCScorer, B3Scorer, AverageScorer
from utils.data_structures import UFDS

In [6]:
# Lookups between sentences and markables, read from the corpus XML.
sentence_id_by_markable_id, markable_ids_by_sentence_id = get_sentence_variables('data/full.xml')

# Lookups between documents, sentences and markables, read from the
# document-id CSV together with the sentence -> markables mapping above.
(
    document_id_by_sentence_id,
    document_id_by_markable_id,
    sentence_ids_by_document_id,
    markable_ids_by_document_id,
) = get_document_id_variables('data/document_id.csv', markable_ids_by_sentence_id)

In [7]:
# Locations of the word-index file and of the embedding file keyed by those indexes.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

(
    word_vector,
    embedding_matrix,
    idx_by_word,
    word_by_idx,
) = get_embedding_variables(
    embedding_indexes_file_path,
    indexed_embedding_file_path,
)

In [8]:
markables = get_markable_dataframe("data/testing/markables_with_predicted_singleton.csv", word_vector, idx_by_word)

# Ids of the markables whose second `is_singleton` component is positive.
# NOTE(review): presumably index 1 of the two-element vector is the "singleton"
# class of the singleton classifier's prediction - confirm against the data helper.
has_positive_second_component = markables['is_singleton'].map(lambda prediction: prediction[1] > 0)
singletons = set(markables.loc[has_positive_second_component, 'id'])
markables.head()

Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,previous_words,next_words,is_singleton
0,1916,"[1263, 1264, 1968, 1395]",0,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]",1,0,[],"[999, 379, 1161, 213, 27, 1263, 1969, 1188, 14...","[0.0, 1.0]"
1,1917,[213],1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",0,0,"[1263, 1264, 1968, 1395, 999, 379, 1161]","[27, 1263, 1969, 1188, 1470, 25, 1161, 63, 424...","[1.0, 0.0]"
2,1918,"[1263, 1969, 1188]",0,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0]",1,0,"[1263, 1264, 1968, 1395, 999, 379, 1161, 213, 27]","[1470, 25, 1161, 63, 424, 1223, 25, 1415, 1161...","[0.0, 1.0]"
3,1919,"[1470, 25, 1161]",0,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0]",0,0,"[1968, 1395, 999, 379, 1161, 213, 27, 1263, 19...","[63, 424, 1223, 25, 1415, 1161, 876, 344, 213,...","[0.0, 1.0]"
4,1920,[424],0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[1161, 213, 27, 1263, 1969, 1188, 1470, 25, 11...","[1223, 25, 1415, 1161, 876, 344, 213, 406, 122...","[0.0, 1.0]"


In [13]:
pairs = pd.read_csv("data/testing/mention_pairs.csv")

# Two-class one-hot encoding of the gold `is_coreference` column.
label = np.vstack(to_categorical(pairs.is_coreference, num_classes=2))

# Gold chains: the gold labels are fed through the same score-dict + clusterer
# pipeline that is later used for model predictions.
label_scores_by_antecedent = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, label)
label_chains = ClosestFirstClusterer().get_chains(label_scores_by_antecedent)

pairs.head()

Unnamed: 0,m1_id,m2_id,is_exact_match,is_words_match,is_substring,is_abbreviation,is_appositive,is_nearest_candidate,sentence_distance,word_distance,markable_distance,is_coreference
0,1916,1917,0,0,0,0,0,1,0,3,1,1
1,1916,1918,0,0,0,0,0,0,0,5,2,0
2,1916,1919,0,0,0,0,0,0,0,8,3,0
3,1916,1920,0,0,0,0,0,0,0,12,4,0
4,1916,1921,0,0,0,0,0,0,0,13,5,0


In [14]:
# Fixed lengths (in tokens) that the word-index sequences are padded / cut to.
max_text_length = 10
max_prev_words_length = 10
max_next_words_length = 10

def get_data(markable_ids):
    """Build the per-markable model inputs for a sequence of markable ids.

    Rows are looked up in the global ``markables`` frame by their ``id`` column
    and returned in the same order as ``markable_ids`` (ids may repeat). When an
    id occurs on several rows, the first such row is used.

    Args:
        markable_ids: iterable of markable ids present in ``markables['id']``.

    Returns:
        A 5-tuple ``(data_text, data_previous_words, data_next_words,
        data_syntactic, is_singleton)``:
        - the ``text`` sequences post-padded to ``max_text_length``;
        - the last ``max_prev_words_length`` previous words, pre-padded;
        - the first ``max_next_words_length`` next words, post-padded;
        - one flat row per markable made of ``is_pronoun``, ``entity_type``,
          ``is_proper_name`` and ``is_first_person`` (list cells are spliced in);
        - the ``is_singleton`` cells stacked row-wise.

    Raises:
        KeyError: if an id in ``markable_ids`` is not present in ``markables``.
    """
    # Build the id -> row-label map in a single pass instead of scanning the
    # whole frame once per requested id. setdefault keeps the first occurrence.
    index_by_id = {}
    for row_label, markable_id in zip(markables.index, markables['id']):
        index_by_id.setdefault(markable_id, row_label)

    try:
        indices = [index_by_id[markable_id] for markable_id in markable_ids]
    except KeyError as error:
        raise KeyError(f'markable id {error.args[0]!r} is not present in markables') from None
    data = markables.loc[indices]

    # NOTE(review): pad_sequences truncates from the front by default
    # (truncating='pre'), so a text longer than max_text_length keeps its LAST
    # tokens. Left unchanged here - confirm it matches training-time preprocessing.
    data_text = pad_sequences(data.text, maxlen=max_text_length, padding='post')
    # Context windows are sliced explicitly, so pad_sequences only ever pads them.
    data_previous_words = pad_sequences(data.previous_words.map(lambda seq: seq[-max_prev_words_length:]), maxlen=max_prev_words_length, padding='pre')
    data_next_words = pad_sequences(data.next_words.map(lambda seq: seq[:max_next_words_length]), maxlen=max_next_words_length, padding='post')

    # Flatten each row: scalar cells contribute one value, list cells all of theirs.
    syntactic_columns = ['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']
    syntactic_rows = []
    for row in data[syntactic_columns].values:
        flat_row = []
        for value in row:
            if isinstance(value, list):
                flat_row.extend(value)
            else:
                flat_row.append(value)
        syntactic_rows.append(flat_row)
    data_syntactic = np.array(syntactic_rows)

    is_singleton = np.vstack(data.is_singleton)

    return data_text, data_previous_words, data_next_words, data_syntactic, is_singleton

def get_pair_data(markable_ids_1, markable_ids_2):
    """Return the `get_data` outputs of both pair members, interleaved per feature.

    The result is a 10-tuple ordered as
    (text_1, text_2, prev_1, prev_2, next_1, next_2,
     syntactic_1, syntactic_2, is_singleton_1, is_singleton_2).
    """
    (first_text, first_prev, first_next,
     first_syntactic, first_is_singleton) = get_data(markable_ids_1)
    (second_text, second_prev, second_next,
     second_syntactic, second_is_singleton) = get_data(markable_ids_2)

    return (
        first_text, second_text,
        first_prev, second_prev,
        first_next, second_next,
        first_syntactic, second_syntactic,
        first_is_singleton, second_is_singleton,
    )

def get_relation_data(mention_pairs):
    """Select the pairwise relation feature columns of a mention-pair frame.

    Returns a DataFrame holding only the nine relation columns, in the fixed
    order listed below; id and label columns are left out.
    """
    relation_columns = [
        'is_exact_match',
        'is_words_match',
        'is_substring',
        'is_abbreviation',
        'is_appositive',
        'is_nearest_candidate',
        'sentence_distance',
        'word_distance',
        'markable_distance',
    ]
    return mention_pairs[relation_columns]

# Compute Baseline Score

In [16]:
baseline_result_file_path = 'baseline/suherik_and_purwarianti/test_result.txt'

baseline_ufds = UFDS()

# Make every markable that appears in a test pair known to the union-find
# structure before joining anything.
for m1, m2 in zip(pairs.m1_id, pairs.m2_id):
    baseline_ufds.init_id(m1, m2)

# Each line of the baseline result starts with two ', '-separated markable ids.
# The context manager closes the file handle, which the previous bare
# open(...).readlines() never did.
with open(baseline_result_file_path, 'r') as baseline_result_file:
    for line in baseline_result_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue

        fields = line.split(', ')
        m1_id, m2_id = int(fields[0]), int(fields[1])

        # Only link markables that belong to the same document.
        if document_id_by_markable_id[m1_id] == document_id_by_markable_id[m2_id]:
            baseline_ufds.join(m1_id, m2_id)

baseline_chains = baseline_ufds.get_chain_list()

print('MUC: ', MUCScorer().get_scores(baseline_chains, label_chains))
print('B3: ', B3Scorer().get_scores(baseline_chains, label_chains))
print('Average: ', AverageScorer([MUCScorer(), B3Scorer()]).get_scores(baseline_chains, label_chains))

MUC:  (0.6395348837209303, 0.7051282051282052, 0.6707317073170733)
B3:  (0.5041087231352718, 0.6379818594104308, 0.5631991462778547)
Average:  (0.616965426797464, 0.616965426797464, 0.616965426797464)


# Test Models

In [17]:
# Model inputs for every test pair: per-markable features of both members,
# followed by the pairwise relation features.
(
    text_1, text_2,
    prev_1, prev_2,
    next_1, next_2,
    syntactic_1, syntactic_2,
    is_singleton_1, is_singleton_2,
) = get_pair_data(pairs.m1_id, pairs.m2_id)
relation = get_relation_data(pairs)

In [18]:
# Cache of loaded models, keyed by '<features joined by _>_<data_generation>_<epoch>'.
models = {}

def get_model(features, data_generation, epoch):
    """Return the classifier for the given configuration, loading it at most once.

    The cache key (and file stem) is the feature names, the data generation
    name and the epoch joined with underscores. A model that is not cached yet
    is loaded from 'models/coreference_classifiers/<key>.model' and stored in
    `models` before being returned.
    """
    name_parts = list(features) + [data_generation, str(epoch)]
    name = '_'.join(name_parts)

    if name in models:
        return models[name]

    loaded_model = load_model(f'models/coreference_classifiers/{name}.model')
    models[name] = loaded_model
    return loaded_model

In [29]:
# Candidate thresholds: 0, then 1x..9x of every base value
# (0.1 ... 0.9, 0.01 ... 0.09, and so on down to 0.00001 ... 0.00009).
base_thresholds = [0.1, 0.01, 0.001, 0.0001, 0.00001]
thresholds = [0]
thresholds.extend(base * multiplier for base in base_thresholds for multiplier in range(1, 10))

# Scorer instances shared by all evaluations below.
muc_scorer, b3_scorer = MUCScorer(), B3Scorer()
average_scorer = AverageScorer([muc_scorer, b3_scorer])

def get_sorted_scores(clusterer, pred, thresholds=thresholds):
    """Score `clusterer` on `pred` at every threshold, best result first.

    For each threshold the predicted chains are scored against the global
    `label_chains` with the MUC and B3 scorers. Every entry of the returned
    list is a tuple `(avg_f1, muc, b3, threshold)`, where `muc` and `b3` are
    the scorers' result tuples and `avg_f1` is the mean of their third
    components. Entries are sorted in descending tuple order, so the highest
    `avg_f1` comes first.
    """
    def score_at(threshold):
        predicted_chains = clusterer.get_chains(pred, threshold)
        muc = muc_scorer.get_scores(predicted_chains, label_chains)
        b3 = b3_scorer.get_scores(predicted_chains, label_chains)
        return ((muc[2] + b3[2]) / 2, muc, b3, threshold)

    scores = [score_at(threshold) for threshold in thresholds]
    scores.sort(reverse=True)
    return scores

def reorder_score(score):
    """Turn `(avg_f1, muc, b3, threshold)` into `(muc, b3, avg_f1, threshold)` for printing."""
    average_f1, muc_scores, b3_scores, used_threshold = score
    reordered = (muc_scores, b3_scores, average_f1, used_threshold)
    return reordered

def evaluate(features, data_generation, epoch):
    """Print best-first clustering scores of one trained model on the test pairs.

    The model is obtained through `get_model` and fed the global test arrays
    selected by `features` ('words', 'context', 'syntactic'). Two results are
    printed, each as `(muc, b3, avg_f1, threshold)`:

    * with the singleton set: the best entry over all default thresholds;
    * without the singleton set: the entry at that same threshold only.

    NOTE(review): the threshold is picked by its score against `label_chains`,
    i.e. on the same data that is being reported - confirm this is intended.
    """
    model = get_model(features, data_generation, epoch)

    # Input groups, listed in the order they are passed to the model.
    inputs_by_feature = (
        ('words', [text_1, text_2]),
        ('context', [prev_1, prev_2, next_1, next_2]),
        ('syntactic', [syntactic_1, syntactic_2, relation]),
    )
    test_features = [
        model_input
        for feature, model_inputs in inputs_by_feature
        if feature in features
        for model_input in model_inputs
    ]

    print('getting anaphora scores by antecedent dict')
    raw_pred = model.predict(test_features, verbose=1)
    # The second dict is additionally given the `singletons` id set.
    pred_without_singleton_classifier = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred)
    pred_with_singleton_classifier = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred, singletons)

    print('get sorted_scores_with_sc_best')
    sorted_scores_with_sc_best = get_sorted_scores(BestFirstClusterer(), pred_with_singleton_classifier)
    best_with_sc = sorted_scores_with_sc_best[0]
    print('With singleton classifier, best-first:', reorder_score(best_with_sc))

    print()

    print('get sorted_scores_without_sc_best')
    # Reuse the threshold (last tuple component) selected for the run above.
    selected_threshold = best_with_sc[3]
    sorted_scores_without_sc_best = get_sorted_scores(BestFirstClusterer(), pred_without_singleton_classifier, [selected_threshold])
    print('Without singleton classifier, best-first:', reorder_score(sorted_scores_without_sc_best[0]))

    print()

## Budi

### Words + Context + Syntactic

In [30]:
evaluate(['words', 'context', 'syntactic'], 'budi', 5)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.5909090909090909, 0.6666666666666666, 0.6265060240963856), (0.44425024611940495, 0.6115192743764172, 0.5146342698374162), 0.5705701469669009, 0.09)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.5196850393700787, 0.8461538461538461, 0.6439024390243901), (0.32174336774336776, 0.8064399092970522, 0.4599725905892746), 0.5519375148068324, 0.09)



In [31]:
evaluate(['words', 'context', 'syntactic'], 'budi', 10)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.6986301369863014, 0.6538461538461539, 0.6754966887417219), (0.5461981566820278, 0.6003854875283446, 0.5720113804910595), 0.6237540346163907, 0.2)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.6631578947368421, 0.8076923076923077, 0.7283236994219653), (0.4915732878232878, 0.7603854875283446, 0.5971206104808658), 0.6627221549514155, 0.2)



In [32]:
evaluate(['words', 'context', 'syntactic'], 'budi', 20)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.7666666666666667, 0.5897435897435898, 0.6666666666666667), (0.6473176612417119, 0.5327664399092971, 0.5844822848367961), 0.6255744757517314, 0.2)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.7333333333333333, 0.7051282051282052, 0.7189542483660132), (0.6033110119047619, 0.6526303854875283, 0.6270023411694138), 0.6729782947677134, 0.2)



## Gilang

### Words + Context + Syntactic

In [33]:
evaluate(['words', 'context', 'syntactic'], 'gilang', 5)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.6206896551724138, 0.6923076923076923, 0.6545454545454545), (0.43332702895809694, 0.6345351473922903, 0.5149751274620836), 0.584760291003769, 0.5)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.5779816513761468, 0.8076923076923077, 0.6737967914438502), (0.3602799650043745, 0.7571201814058958, 0.4882319611955166), 0.5810143763196833, 0.5)



In [34]:
evaluate(['words', 'context', 'syntactic'], 'gilang', 10)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.6619718309859155, 0.6025641025641025, 0.6308724832214764), (0.5348335572473504, 0.5301133786848072, 0.5324630073107586), 0.5816677452661174, 0.6000000000000001)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.6352941176470588, 0.6923076923076923, 0.6625766871165644), (0.4646665834165834, 0.6312698412698412, 0.535304774527802), 0.5989407308221832, 0.6000000000000001)



In [35]:
evaluate(['words', 'context', 'syntactic'], 'gilang', 20)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.5955056179775281, 0.6794871794871795, 0.6347305389221557), (0.4402153145208701, 0.6154875283446711, 0.5133017073980377), 0.5740161231600966, 0.4)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.5447154471544715, 0.8589743589743589, 0.6666666666666666), (0.3573314642106588, 0.8181179138321997, 0.49740852733850427), 0.5820375970025855, 0.4)



## Soon

### Words + Context + Syntactic

In [36]:
evaluate(['words', 'context', 'syntactic'], 'soon', 5)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.5730337078651685, 0.6538461538461539, 0.6107784431137724), (0.39757431042477775, 0.5956462585034014, 0.4768601414228524), 0.5438192922683124, 0.5)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.22794117647058823, 0.3974358974358974, 0.2897196261682243), (0.22598072562358276, 0.3927664399092971, 0.2868947125272212), 0.2883071693477227, 0.5)



In [37]:
evaluate(['words', 'context', 'syntactic'], 'soon', 10)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.5520833333333334, 0.6794871794871795, 0.6091954022988507), (0.35728364555798187, 0.6281859410430838, 0.45549972552324275), 0.5323475639110468, 0.2)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.3218390804597701, 0.717948717948718, 0.4444444444444445), (0.1960860969671542, 0.6849659863945576, 0.3048907308972888), 0.3746675876708666, 0.2)



In [38]:
evaluate(['words', 'context', 'syntactic'], 'soon', 20)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: ((0.6470588235294118, 0.5641025641025641, 0.6027397260273972), (0.5609848484848485, 0.4987074829931973, 0.5280161674756727), 0.565377946751535, 0.4)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: ((0.3543307086614173, 0.5769230769230769, 0.4390243902439024), (0.2727737422652677, 0.5226303854875283, 0.3584589043402067), 0.3987416472920545, 0.4)



# Saving Best Model

In [39]:
def get_markable_text(idx):
    """Return the words of the first markable row whose `id` equals `idx`.

    The row's `text` cell holds word indexes; each one is translated back to a
    word through `word_by_idx`.
    """
    matching_rows = markables[markables['id'] == idx]
    word_indexes = matching_rows.text.values[0]
    return [word_by_idx[word_index] for word_index in word_indexes]

In [40]:
# Fetch through get_model (which loads on a cache miss) rather than indexing
# `models` directly, so this cell also works on a fresh kernel where the
# matching evaluate(...) cell has not been run.
best_model = get_model(['words', 'context', 'syntactic'], 'budi', 20)

# Threshold reported for this model by evaluate(['words', 'context', 'syntactic'], 'budi', 20).
best_threshold = 0.2

raw_pred_best = best_model.predict([text_1, text_2, prev_1, prev_2, next_1, next_2, syntactic_1, syntactic_2, relation], verbose=1)

# Score dicts without / with the singleton id set, then best-first chains for each.
pred_best_wo_sc = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred_best)
pred_best_w_sc = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred_best, singletons)
pred_chains_best_wo_sc = BestFirstClusterer().get_chains(pred_best_wo_sc, best_threshold)
pred_chains_best_w_sc = BestFirstClusterer().get_chains(pred_best_w_sc, best_threshold)



In [42]:
def convert_chains(chains):
    """Convert chains of markable ids into chains of `(id, text)` pairs.

    Chains with at most one member are dropped; the text of a markable is its
    words joined with single spaces.
    """
    readable_chains = []
    for chain in chains:
        if len(chain) <= 1:
            continue
        readable_chains.append(
            [(markable_id, ' '.join(get_markable_text(markable_id))) for markable_id in chain]
        )
    return readable_chains

def save_chains(chains, file_path):
    """Write the multi-member chains to `file_path` as JSON indented by 4 spaces.

    Chains are first passed through `convert_chains`, so the file contains
    lists of `[id, text]` entries. An existing file is overwritten.
    """
    readable_chains = convert_chains(chains)

    with open(file_path, 'w') as output_file:
        serialized = json.dumps(readable_chains, indent=4)
        output_file.write(serialized)

In [43]:
save_chains(pred_chains_best_w_sc, 'result/with_singleton_classifier.json')

In [44]:
save_chains(pred_chains_best_wo_sc, 'result/without_singleton_classifier.json')