In [1]:
import json
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables, get_sentence_variables, get_document_id_variables
from model_builders.coreference_classifier import CoreferenceClassifierModelBuilder
from functools import reduce
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from utils.clusterers import BestFirstClusterer, get_anaphora_scores_by_antecedent, ClosestFirstClusterer
from utils.scorers import MUCScorer, B3Scorer, CEAFeScorer, AverageScorer
from utils.data_structures import UFDS

In [2]:
# Sentence/markable lookups produced by the helper from 'data/full.xml'.
(
    sentence_id_by_markable_id,
    markable_ids_by_sentence_id,
) = get_sentence_variables('data/full.xml')

# Document-level lookups; the helper receives the document id CSV together
# with the sentence -> markables lookup loaded just above.
(
    document_id_by_sentence_id,
    document_id_by_markable_id,
    sentence_ids_by_document_id,
    markable_ids_by_document_id,
) = get_document_id_variables('data/document_id.csv', markable_ids_by_sentence_id)

In [3]:
# Locations of the embedding index file and the indexed embedding file.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

(
    word_vector,
    embedding_matrix,
    idx_by_word,
    word_by_idx,
) = get_embedding_variables(embedding_indexes_file_path, indexed_embedding_file_path)

In [4]:
# `markables` carries the predicted singleton flags, `label_markables` is
# loaded from the plain markables CSV and is used for the label singletons.
markables = get_markable_dataframe("data/testing/markables_with_predicted_singleton.csv", word_vector, idx_by_word)
label_markables = get_markable_dataframe("data/testing/markables.csv", word_vector, idx_by_word)

# A markable counts as a singleton when the second entry of its
# `is_singleton` value is positive.
predicted_singleton_mask = markables['is_singleton'].map(lambda flags: flags[1] > 0)
singletons = set(markables.loc[predicted_singleton_mask, 'id'])

label_singleton_mask = label_markables['is_singleton'].map(lambda flags: flags[1] > 0)
label_singletons = set(label_markables.loc[label_singleton_mask, 'id'])

markables.head()

Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,previous_words,next_words,is_singleton
0,1916,"[1263, 1264, 1968, 1395]",0,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]",1,0,[],"[999, 379, 1161, 213, 27, 1263, 1969, 1188, 14...","[0.0, 1.0]"
1,1917,[213],1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",0,0,"[1263, 1264, 1968, 1395, 999, 379, 1161]","[27, 1263, 1969, 1188, 1470, 25, 1161, 63, 424...","[0.0, 1.0]"
2,1918,"[1263, 1969, 1188]",0,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0]",1,0,"[1263, 1264, 1968, 1395, 999, 379, 1161, 213, 27]","[1470, 25, 1161, 63, 424, 1223, 25, 1415, 1161...","[0.0, 1.0]"
3,1919,"[1470, 25, 1161]",0,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0]",0,0,"[1968, 1395, 999, 379, 1161, 213, 27, 1263, 19...","[63, 424, 1223, 25, 1415, 1161, 876, 344, 213,...","[0.0, 1.0]"
4,1920,[424],0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0,0,"[1161, 213, 27, 1263, 1969, 1188, 1470, 25, 11...","[1223, 25, 1415, 1161, 876, 344, 213, 406, 122...","[0.0, 1.0]"


In [5]:
pairs = pd.read_csv("data/testing/mention_pairs.csv")

# One-hot encode the is_coreference column into two classes.
label = np.vstack(to_categorical(pairs.is_coreference, num_classes=2))

# Reference chains: cluster the label scores with the closest-first clusterer.
label_clusterer = ClosestFirstClusterer()
label_scores_by_antecedent = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, label)
label_chains = label_clusterer.get_chains(label_scores_by_antecedent)

pairs.head()

Unnamed: 0,m1_id,m2_id,is_exact_match,is_words_match,is_substring,is_abbreviation,is_appositive,is_nearest_candidate,sentence_distance,word_distance,markable_distance,is_coreference
0,1916,1917,0,0,0,0,0,1,0,3,1,1
1,1916,1918,0,0,0,0,0,0,0,5,2,0
2,1916,1919,0,0,0,0,0,0,0,8,3,0
3,1916,1920,0,0,0,0,0,0,0,12,4,0
4,1916,1921,0,0,0,0,0,0,0,13,5,0


In [6]:
# Fixed sequence lengths used when padding/truncating the word-index inputs.
max_text_length = 10
max_prev_words_length = 10
max_next_words_length = 10

def get_data(markable_ids):
    """Build the per-markable model inputs for a sequence of markable ids.

    Rows are taken from the module-level ``markables`` frame, one per entry of
    ``markable_ids`` and in the same order (ids may repeat). When an id occurs
    more than once in ``markables`` its first row is used.

    Args:
        markable_ids: iterable of markable ids to look up in ``markables['id']``.

    Returns:
        A 5-tuple ``(text, previous_words, next_words, syntactic, is_singleton)``:
        * text: ``pad_sequences`` output, post-padded to ``max_text_length``.
        * previous_words: last ``max_prev_words_length`` entries, pre-padded.
        * next_words: first ``max_next_words_length`` entries, post-padded.
        * syntactic: array with ``is_pronoun``, ``entity_type``,
          ``is_proper_name`` and ``is_first_person`` flattened into one row
          per markable.
        * is_singleton: the ``is_singleton`` values stacked row-wise.

    Raises:
        KeyError: if an id in ``markable_ids`` is not present in ``markables``.
    """
    # Resolve every id through one id -> first index label map instead of
    # re-scanning the whole frame for each requested id.
    first_index_by_id = {}
    for index_label, markable_id in zip(markables.index, markables['id']):
        first_index_by_id.setdefault(markable_id, index_label)

    try:
        indices = [first_index_by_id[markable_id] for markable_id in markable_ids]
    except KeyError as error:
        raise KeyError(f'markable id {error.args[0]!r} not found in markables') from error

    data = markables.loc[indices]

    data_text = pad_sequences(data['text'], maxlen=max_text_length, padding='post')

    # Keep only the words closest to the markable: the tail of the preceding
    # context and the head of the following context.
    previous_words = data['previous_words'].map(lambda seq: seq[-max_prev_words_length:])
    data_previous_words = pad_sequences(previous_words, maxlen=max_prev_words_length, padding='pre')
    next_words = data['next_words'].map(lambda seq: seq[:max_next_words_length])
    data_next_words = pad_sequences(next_words, maxlen=max_next_words_length, padding='post')

    # Flatten scalar and list-valued columns into one flat feature row each.
    syntactic_columns = ['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']
    data_syntactic = np.array([
        [value for field in row for value in (field if type(field) is list else [field])]
        for row in data[syntactic_columns].values
    ])
    is_singleton = np.vstack(data.is_singleton)

    return data_text, data_previous_words, data_next_words, data_syntactic, is_singleton

def get_pair_data(markable_ids_1, markable_ids_2):
    """Return the ``get_data`` outputs of both id sequences, interleaved.

    The result is a 10-tuple ordered as
    ``text_1, text_2, prev_1, prev_2, next_1, next_2, syntactic_1,
    syntactic_2, is_singleton_1, is_singleton_2``.
    """
    first_text, first_prev, first_next, first_syntactic, first_singleton = get_data(markable_ids_1)
    second_text, second_prev, second_next, second_syntactic, second_singleton = get_data(markable_ids_2)

    return (
        first_text, second_text,
        first_prev, second_prev,
        first_next, second_next,
        first_syntactic, second_syntactic,
        first_singleton, second_singleton,
    )

def get_relation_data(mention_pairs):
    """Select the pairwise relation feature columns from ``mention_pairs``.

    Returns a frame containing only the nine relation columns, in the fixed
    order listed below.
    """
    relation_columns = [
        'is_exact_match',
        'is_words_match',
        'is_substring',
        'is_abbreviation',
        'is_appositive',
        'is_nearest_candidate',
        'sentence_distance',
        'word_distance',
        'markable_distance',
    ]
    return mention_pairs[relation_columns]

# Compute Baseline Score

In [8]:
baseline_result_file_path = 'baseline/suherik_and_purwarianti/test_result.txt'

baseline_ufds = UFDS()

# Register every markable that appears in a test pair with the union-find.
for m1, m2 in zip(pairs.m1_id, pairs.m2_id):
    baseline_ufds.init_id(m1, m2)

# Each non-empty line starts with two ', '-separated markable ids. The file is
# read inside a `with` block so the handle is always closed.
with open(baseline_result_file_path, 'r') as baseline_result_file:
    for line in baseline_result_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        fields = line.split(', ')
        m1_id, m2_id = int(fields[0]), int(fields[1])

        # Only join markables that belong to the same document.
        if document_id_by_markable_id[m1_id] == document_id_by_markable_id[m2_id]:
            baseline_ufds.join(m1_id, m2_id)

baseline_chains = baseline_ufds.get_chain_list()

print('MUC: ', MUCScorer().get_scores(baseline_chains, label_chains))
print('B3: ', B3Scorer().get_scores(baseline_chains, label_chains))
print('Average: ', AverageScorer([MUCScorer(), B3Scorer(),]).get_scores(baseline_chains, label_chains))

MUC:  (0.6395348837209303, 0.7051282051282052, 0.6707317073170733)
B3:  (0.5041087231352718, 0.6379818594104308, 0.5631991462778547)
Average:  (0.616965426797464, 0.616965426797464, 0.616965426797464)


# Test Models

In [9]:
# Inputs for every test mention pair, in the order returned by get_pair_data.
(
    text_1, text_2,
    prev_1, prev_2,
    next_1, next_2,
    syntactic_1, syntactic_2,
    is_singleton_1, is_singleton_2,
) = get_pair_data(pairs.m1_id, pairs.m2_id)

relation = get_relation_data(pairs)

In [10]:
# Cache of loaded models keyed by their relative name under
# models/coreference_classifiers/.
models = {}

def get_model(hyperparameter, features, data_generation, epoch):
    """Return the classifier for the given configuration, loading it at most once.

    The model name is ``<hyperparameter>_hyperparameter/`` followed by the
    feature names, the data generation name and the epoch joined with ``_``.
    Loaded models are kept in the module-level ``models`` dict.
    """
    name_parts = [*features, data_generation, str(epoch)]
    name = hyperparameter + '_hyperparameter/' + '_'.join(name_parts)

    if name in models:
        return models[name]

    models[name] = load_model(f'models/coreference_classifiers/{name}.model')
    return models[name]

In [50]:
# Threshold grid: 0, then the 1x..9x multiples of each base value.
base_thresholds = [0.1, 0.01, 0.001, 0.0001, 0.00001]
thresholds = [
    0,
    *(base * multiplier for base in base_thresholds for multiplier in range(1, 10)),
]

# Scorer instances shared by the evaluation helpers.
muc_scorer = MUCScorer()
b3_scorer = B3Scorer()
ceafe_scorer = CEAFeScorer()
average_scorer = AverageScorer([muc_scorer, b3_scorer])

def get_sorted_scores(clusterer, pred, thresholds=thresholds):
    """Score the clusterer's chains at every threshold, best first.

    For each threshold the chains from ``clusterer.get_chains(pred, threshold)``
    are scored against ``label_chains`` with the MUC and B3 scorers. Each entry
    of the result is ``(avg_f1, muc, b3, threshold)`` where ``muc`` and ``b3``
    are the scorer outputs and ``avg_f1`` is the mean of their third elements.
    The list is sorted in descending order, so the first entry has the highest
    ``avg_f1``.
    """
    def score_at(threshold):
        predicted_chains = clusterer.get_chains(pred, threshold)
        muc = muc_scorer.get_scores(predicted_chains, label_chains)
        b3 = b3_scorer.get_scores(predicted_chains, label_chains)
        avg_f1 = (muc[2] + b3[2]) / 2
        return (avg_f1, muc, b3, threshold)

    ranked = [score_at(threshold) for threshold in thresholds]
    ranked.sort(reverse=True)
    return ranked

def reorder_score(score):
    """Rearrange ``(avg_f1, muc, b3, threshold)`` into ``(muc, b3, avg_f1, threshold)``."""
    (mean_f1, muc_scores, b3_scores, cutoff) = score
    reordered = (muc_scores, b3_scores, mean_f1, cutoff)
    return reordered

def _format_score(score):
    """Render one ``get_sorted_scores`` entry as a space-separated string."""
    return ' '.join(
        str(item) if type(item) != tuple else ' '.join(str(value) for value in item)
        for item in reorder_score(score)
    )

def evaluate(hyperparameter, features, data_generation, epoch):
    """Evaluate one saved classifier with best-first clustering and print the scores.

    Three variants are printed: predictions filtered with the predicted
    singletons, with the label singletons, and without any singleton filter.
    The threshold is searched only for the first variant; the other two reuse
    the threshold that scored best there.

    Args:
        hyperparameter: hyperparameter set name, forwarded to ``get_model``.
        features: feature group names; 'words', 'context' and 'syntactic'
            select which precomputed test inputs are fed to the model.
        data_generation: data generation name, forwarded to ``get_model``.
        epoch: epoch number, forwarded to ``get_model``.

    Raises:
        ValueError: if ``features`` selects none of the known feature groups.
    """
    # get_model takes the hyperparameter name as its first argument, and every
    # call in this notebook passes four arguments, so it is threaded through.
    model = get_model(hyperparameter, features, data_generation, epoch)

    test_features = []
    if 'words' in features:
        test_features.extend([text_1, text_2])
    if 'context' in features:
        test_features.extend([prev_1, prev_2, next_1, next_2])
    if 'syntactic' in features:
        test_features.extend([syntactic_1, syntactic_2, relation])

    if not test_features:
        raise ValueError(f'no known feature group selected by features={features!r}')

    print('getting anaphora scores by antecedent dict')
    raw_pred = model.predict(test_features, verbose=1)
    pred_without_singleton_classifier = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred)
    pred_with_singleton_classifier = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred, singletons)
    pred_with_label_singletons = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred, label_singletons)

    # NOTE(review): the threshold is chosen by scoring against label_chains,
    # which are built from the testing pairs - confirm this is intended rather
    # than tuning on a held-out development split.
    print('get sorted_scores_with_sc_best')
    sorted_scores_with_sc_best = get_sorted_scores(BestFirstClusterer(), pred_with_singleton_classifier)
    print('With singleton classifier, best-first:', _format_score(sorted_scores_with_sc_best[0]))

    # Reuse the best threshold found with the singleton classifier.
    best_threshold = sorted_scores_with_sc_best[0][3]

    print('get sorted_scores_with_label_singletons_best')
    sorted_scores_with_label_singletons_best = get_sorted_scores(BestFirstClusterer(), pred_with_label_singletons, [best_threshold])
    print('With label singletons, best-first:', _format_score(sorted_scores_with_label_singletons_best[0]))

    print('get sorted_scores_without_sc_best')
    sorted_scores_without_sc_best = get_sorted_scores(BestFirstClusterer(), pred_without_singleton_classifier, [best_threshold])
    print('Without singleton classifier, best-first:', _format_score(sorted_scores_without_sc_best[0]))

    print()

## Budi

### Words + Context + Syntactic

In [51]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'budi', 5)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.5974025974025974 0.5897435897435898 0.5935483870967742 0.3984814635913538 0.5231065759637188 0.45236757652566695 0.5229579818112206 0.07
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8441558441558441 0.8333333333333334 0.8387096774193548 0.6004799548277809 0.7988208616780045 0.6855937040524324 0.7621516907358936 0.07
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.2831858407079646 0.8205128205128205 0.42105263157894735 0.10169835920357859 0.7497732426303857 0.17910335091862667 0.300077991248787 0.07



In [52]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'budi', 10)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.6052631578947368 0.5897435897435898 0.5974025974025974 0.41207570207570215 0.530249433106576 0.463752688461121 0.5305776429318592 0.02
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.85 0.8717948717948718 0.8607594936708861 0.6002946760841499 0.8585714285714287 0.7065704741026533 0.7836649838867698 0.02
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.14316239316239315 0.8589743589743589 0.2454212454212454 0.031411545720768944 0.8353968253968254 0.060546497819591656 0.15298387162041854 0.02



In [53]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'budi', 20)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.6133333333333333 0.5897435897435898 0.6013071895424836 0.4256945301889123 0.530249433106576 0.4722542156782639 0.5367807026103738 0.06
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8701298701298701 0.8589743589743589 0.8645161290322582 0.6213088498802783 0.832154195011338 0.7114384746728767 0.7879773018525674 0.06
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.2882882882882883 0.8205128205128205 0.4266666666666667 0.10332250309030494 0.797233560090703 0.1829362331651084 0.30480144991588753 0.06



## Gilang

### Words + Context + Syntactic

In [54]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'gilang', 5)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.6388888888888888 0.5897435897435898 0.6133333333333332 0.45876250186595013 0.530249433106576 0.4919223882808494 0.5526278608070913 0.009000000000000001
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8648648648648649 0.8205128205128205 0.8421052631578947 0.6282041191132101 0.7886167800453515 0.6993294775938265 0.7707173703758605 0.009000000000000001
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.32275132275132273 0.782051282051282 0.4569288389513108 0.13433062509149465 0.7337414965986393 0.2270870159999155 0.34200792747561315 0.009000000000000001



In [55]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'gilang', 10)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.6153846153846154 0.6153846153846154 0.6153846153846154 0.4240414476827521 0.5569160997732426 0.48147957023853094 0.5484320928115731 3.0000000000000004e-05
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8607594936708861 0.8717948717948718 0.8662420382165605 0.605438408664215 0.8421541950113378 0.7044419740511945 0.7853420061338775 3.0000000000000004e-05
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.2537878787878788 0.8589743589743589 0.391812865497076 0.08201345656792812 0.8150113378684809 0.14903021048072965 0.27042153798890284 3.0000000000000004e-05



In [56]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'gilang', 20)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.6301369863013698 0.5897435897435898 0.609271523178808 0.45828469283525464 0.5169160997732426 0.48583786601018847 0.5475546945944982 0.04
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8493150684931506 0.7948717948717948 0.8211920529801324 0.6436062065275548 0.7522675736961452 0.6937075346779171 0.7574497938290248 0.04
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.3905325443786982 0.8461538461538461 0.5344129554655871 0.17562541076535385 0.8108390022675737 0.2887158035432525 0.4115643795044198 0.04



## Soon

### Words + Context + Syntactic

In [60]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'soon', 5)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.618421052631579 0.6025641025641025 0.6103896103896105 0.4337391774891775 0.5473922902494331 0.48398301255986265 0.5471863114747366 0.2
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8433734939759037 0.8974358974358975 0.8695652173913043 0.6004121162284428 0.8847619047619049 0.7153663612324334 0.7924657893118688 0.2
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.24056603773584906 0.6538461538461539 0.3517241379310345 0.14909822903520387 0.5973696145124716 0.23863520009100944 0.295179669011022 0.2



In [61]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'soon', 10)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.6527777777777778 0.6025641025641025 0.6266666666666667 0.4990502942750134 0.5426303854875283 0.5199287244567992 0.5732976955617329 0.2
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.8625 0.8846153846153846 0.8734177215189874 0.6445744404507292 0.8575510204081633 0.7359644497623874 0.8046910856406875 0.2
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.1408839779005525 0.6538461538461539 0.2318181818181818 0.07901802438306162 0.62437641723356 0.14028257274191186 0.18605037728004684 0.2



In [62]:
evaluate('wu_ma', ['words', 'context', 'syntactic'], 'soon', 20)

getting anaphora scores by antecedent dict
get sorted_scores_with_sc_best
With singleton classifier, best-first: 0.7636363636363637 0.5384615384615384 0.6315789473684211 0.6763046314416177 0.4665759637188208 0.5521967675688427 0.5918878574686319 0.8
get sorted_scores_with_label_singletons_best
With label singletons, best-first: 0.864406779661017 0.6538461538461539 0.7445255474452555 0.7715538847117793 0.5909070294784582 0.6692545956352384 0.7068900715402469 0.8
get sorted_scores_without_sc_best
Without singleton classifier, best-first: 0.36231884057971014 0.6410256410256411 0.46296296296296297 0.25697879248411165 0.5986167800453516 0.3595900264934673 0.41127649472821515 0.8



# Saving Best Model

In [39]:
def get_markable_text(idx):
    """Return the words of the markable whose id is ``idx``.

    The word indices stored in the markable's ``text`` column (first matching
    row) are translated back to words through ``word_by_idx``.
    """
    matching_rows = markables[markables['id'] == idx]
    word_indices = matching_rows.text.values[0]

    words = []
    for word_index in word_indices:
        words.append(word_by_idx[word_index])
    return words

In [40]:
# Fetch the model through get_model: the cache keys have the form
# '<hyperparameter>_hyperparameter/<features>_<data_generation>_<epoch>', so a
# bare 'words_context_syntactic_budi_20' key is never present in `models`.
# This also loads the model when it is not cached yet (fresh kernel).
best_model = get_model('wu_ma', ['words', 'context', 'syntactic'], 'budi', 20)
raw_pred_best = best_model.predict([text_1, text_2, prev_1, prev_2, next_1, next_2, syntactic_1, syntactic_2, relation], verbose=1)
pred_best_wo_sc = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred_best)
pred_best_w_sc = get_anaphora_scores_by_antecedent(pairs.m1_id, pairs.m2_id, raw_pred_best, singletons)

# NOTE(review): the evaluation output recorded in this notebook for the
# budi / 20-epoch model reports 0.06 as its best threshold, not 0.2 - confirm
# which value is intended before relying on the saved chains.
saved_chains_threshold = 0.2
pred_chains_best_wo_sc = BestFirstClusterer().get_chains(pred_best_wo_sc, saved_chains_threshold)
pred_chains_best_w_sc = BestFirstClusterer().get_chains(pred_best_w_sc, saved_chains_threshold)



In [42]:
def convert_chains(chains):
    """Turn chains of markable ids into chains of ``(id, text)`` pairs.

    Chains with a single member (or none) are dropped; the text of each
    markable is its words joined with single spaces.
    """
    readable_chains = []
    for chain in chains:
        if len(chain) <= 1:
            continue
        readable_chains.append(
            [(markable_id, ' '.join(get_markable_text(markable_id))) for markable_id in chain]
        )
    return readable_chains

def save_chains(chains, file_path):
    """Write the converted chains to ``file_path`` as JSON indented by 4 spaces."""
    readable_chains = convert_chains(chains)

    with open(file_path, 'w') as output_file:
        serialized = json.dumps(readable_chains, indent=4)
        output_file.write(serialized)

In [43]:
save_chains(pred_chains_best_w_sc, 'result/with_singleton_classifier.json')

In [44]:
save_chains(pred_chains_best_wo_sc, 'result/without_singleton_classifier.json')