In [11]:
import pandas as pd
import numpy as np
from utils.data_helper import get_markable_dataframe, get_embedding_variables
from model_builders.coreference_classifier import CoreferenceClassifierModelBuilder
from functools import reduce
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from utils.clusterers import BestFirstClusterer, get_anaphora_scores_by_antecedent, ClosestFirstClusterer
from utils.scorers import MUCScorer, B3Scorer, AverageScorer
from utils.data_structures import UFDS

In [12]:
# Locations of the two embedding helper files handed to get_embedding_variables.
embedding_indexes_file_path = 'helper_files/embedding/embedding_indexes.txt'
indexed_embedding_file_path = 'helper_files/embedding/indexed_embedding.txt'

# The project helper returns four values. In the visible cells only word_vector and
# idx_by_word are used afterwards (both are passed to get_markable_dataframe).
# NOTE(review): presumably these are the embedding vectors/matrix plus word<->index
# lookups — confirm against utils.data_helper.
word_vector, embedding_matrix, idx_by_word, word_by_idx = get_embedding_variables(embedding_indexes_file_path, indexed_embedding_file_path)

In [13]:
# Test-split markables, loaded through the project helper together with the
# embedding lookups created above.
markables = get_markable_dataframe(
    "data/testing/markables_with_predicted_singleton.csv",
    word_vector,
    idx_by_word,
)

# Ids of all markables whose `is_singleton` entry has a positive value at position 1.
singletons = set(
    markables.loc[
        markables['is_singleton'].map(lambda scores: bool(scores[1] > 0)),
        'id',
    ]
)

markables.head()

Unnamed: 0,id,text,is_pronoun,entity_type,is_proper_name,is_first_person,previous_words,next_words,is_singleton
0,1916,"[1263, 1264, 1968, 1395]",0,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0]",1,0,[],"[999, 379, 1161, 213, 27, 1263, 1969, 1188, 14...","[0.0, 1.0]"
1,1917,[213],1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",0,0,"[1263, 1264, 1968, 1395, 999, 379, 1161]","[27, 1263, 1969, 1188, 1470, 25, 1161, 63, 424...","[1.0, 0.0]"
2,1918,"[1263, 1969, 1188]",0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]",1,0,"[1263, 1264, 1968, 1395, 999, 379, 1161, 213, 27]","[1470, 25, 1161, 63, 424, 1223, 25, 1415, 1161...","[0.0, 1.0]"
3,1919,"[1470, 25, 1161]",0,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0]",0,0,"[1968, 1395, 999, 379, 1161, 213, 27, 1263, 19...","[63, 424, 1223, 25, 1415, 1161, 876, 344, 213,...","[0.0, 1.0]"
4,1920,[424],0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",0,0,"[1161, 213, 27, 1263, 1969, 1188, 1470, 25, 11...","[1223, 25, 1415, 1161, 876, 344, 213, 406, 122...","[0.0, 1.0]"


In [14]:
pairs = pd.read_csv("data/testing/mention_pairs.csv")

# Gold labels: the 0/1 `is_coreference` column expanded to two-class one-hot rows.
label = np.vstack(to_categorical(pairs['is_coreference'], num_classes=2))

# Gold chains, obtained by running the gold labels through the same
# scores-by-antecedent + clustering pipeline that is used for predictions.
label_chains = ClosestFirstClusterer().get_chains(
    get_anaphora_scores_by_antecedent(pairs['m1_id'], pairs['m2_id'], label)
)

pairs.head()

Unnamed: 0,m1_id,m2_id,is_exact_match,is_words_match,is_substring,is_abbreviation,is_appositive,is_nearest_candidate,sentence_distance,word_distance,markable_distance,is_coreference
0,1916,1917,0,0,0,0,0,1,0,3,1,1
1,1916,1918,0,0,0,0,0,0,0,5,2,0
2,1916,1919,0,0,0,0,0,0,0,8,3,0
3,1916,1920,0,0,0,0,0,0,0,12,4,0
4,1916,1921,0,0,0,0,0,0,0,13,5,0


In [15]:
# Fixed lengths (in tokens) of the padded sequence inputs built by get_data.
max_text_length = 10
max_prev_words_length = 10
max_next_words_length = 10

def get_data(markable_ids):
    """Build the per-markable model inputs for a sequence of markable ids.

    Rows are looked up in the global ``markables`` frame through its ``id``
    column (the first row wins if an id occurs more than once) and are returned
    in the order of ``markable_ids``, repeated ids included.

    Args:
        markable_ids: iterable of markable ids; every id must occur in
            ``markables['id']``.

    Returns:
        A 5-tuple with one row per requested id:
        ``data_text`` (post-padded to ``max_text_length``),
        ``data_previous_words`` (last ``max_prev_words_length`` tokens, pre-padded),
        ``data_next_words`` (first ``max_next_words_length`` tokens, post-padded),
        ``data_syntactic`` (``is_pronoun``, ``entity_type``, ``is_proper_name`` and
        ``is_first_person`` flattened into one vector per row) and
        ``is_singleton`` (the ``is_singleton`` entries stacked row-wise).

    Raises:
        IndexError: if an id is not present in ``markables`` (the exception type
            of the original per-id lookup is kept on purpose).
    """
    # One pass over the frame instead of a full column scan per requested id.
    first_index_by_id = {}
    for row_index, markable_id in zip(markables.index, markables['id']):
        first_index_by_id.setdefault(markable_id, row_index)

    try:
        indices = [first_index_by_id[markable_id] for markable_id in markable_ids]
    except KeyError as error:
        raise IndexError(f'unknown markable id: {error.args[0]!r}') from None

    data = markables.loc[indices]

    # `text` is not pre-sliced, so texts longer than max_text_length are cut by
    # pad_sequences' default truncating='pre' (leading tokens are dropped).
    data_text = pad_sequences(data.text, maxlen=max_text_length, padding='post')
    # Keep only the words closest to the markable on either side.
    data_previous_words = pad_sequences(
        data.previous_words.map(lambda seq: seq[-max_prev_words_length:]),
        maxlen=max_prev_words_length,
        padding='pre',
    )
    data_next_words = pad_sequences(
        data.next_words.map(lambda seq: seq[:max_next_words_length]),
        maxlen=max_next_words_length,
        padding='post',
    )

    # Flatten each row: scalar cells contribute one value, list cells all of theirs.
    syntactic_rows = data[['is_pronoun', 'entity_type', 'is_proper_name', 'is_first_person']].values
    data_syntactic = np.array([
        [value for cell in row for value in (cell if type(cell) is list else [cell])]
        for row in syntactic_rows
    ])
    is_singleton = np.vstack(data.is_singleton)

    return data_text, data_previous_words, data_next_words, data_syntactic, is_singleton

def get_pair_data(markable_ids_1, markable_ids_2):
    """Build the model inputs for both sides of a list of mention pairs.

    Calls ``get_data`` once per side and interleaves the two results, so the
    returned tuple is ordered
    ``(text_1, text_2, prev_1, prev_2, next_1, next_2, syntactic_1, syntactic_2,
    is_singleton_1, is_singleton_2)``.
    """
    first_side = get_data(markable_ids_1)
    second_side = get_data(markable_ids_2)

    # zip pairs up matching parts; flattening yields part_1, part_2 for each part in turn.
    return tuple(part for matching_parts in zip(first_side, second_side) for part in matching_parts)

def get_relation_data(mention_pairs):
    """Return only the pairwise relation feature columns of ``mention_pairs``.

    The id columns and the ``is_coreference`` label are left out; the column
    order of the result is fixed by the list below.
    """
    relation_columns = [
        'is_exact_match',
        'is_words_match',
        'is_substring',
        'is_abbreviation',
        'is_appositive',
        'is_nearest_candidate',
        'sentence_distance',
        'word_distance',
        'markable_distance',
    ]
    return mention_pairs[relation_columns]

# Compute Baseline Score

In [16]:
baseline_result_file_path = 'baseline/test_result.txt'

baseline_ufds = UFDS()

# Make every markable id that occurs in a test pair known to the union-find
# structure before any joins are applied.
for m1, m2 in zip(pairs.m1_id, pairs.m2_id):
    baseline_ufds.init_id(m1, m2)

# Each line of the baseline result is read as "<id>, <id>" and the two ids are
# joined. The file is closed deterministically and blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing int().
with open(baseline_result_file_path, 'r') as baseline_result_file:
    for line in baseline_result_file:
        if not line.strip():
            continue
        fields = line.split(', ')
        baseline_ufds.join(int(fields[0]), int(fields[1]))

baseline_chains = baseline_ufds.get_chain_list()

print('MUC: ', MUCScorer().get_scores(baseline_chains, label_chains))
print('B3: ', B3Scorer().get_scores(baseline_chains, label_chains))
print('Average: ', AverageScorer([MUCScorer(), B3Scorer()]).get_scores(baseline_chains, label_chains))

MUC:  (0.5544554455445545, 0.7272727272727273, 0.6292134831460674)
B3:  (0.3124361294443262, 0.6732829670329671, 0.4268110965737344)
Average:  (0.5280122898599009, 0.5280122898599009, 0.5280122898599009)


# Test Models

In [17]:
# Build the model inputs for every row of `pairs` once, up front; evaluate()
# reads these notebook-level variables when assembling its feature list.
text_1, text_2, prev_1, prev_2, next_1, next_2, syntactic_1, syntactic_2, is_singleton_1, is_singleton_2 = get_pair_data(pairs.m1_id, pairs.m2_id)
# Pairwise relation feature columns, in the same row order as `pairs`.
relation = get_relation_data(pairs)

In [18]:
# Cache of already-loaded models, keyed by model name (see get_model).
models = {}

def get_model(features, data_generation, epoch):
    """Return the saved classifier for a configuration, loading it at most once.

    The model name is the feature names, the data-generation name and the epoch
    joined with underscores; the model is read from
    ``models/coreference_classifiers/<name>.model`` on first use and served from
    the ``models`` cache afterwards.
    """
    name = '_'.join(list(features) + [data_generation, str(epoch)])

    if name in models:
        return models[name]

    loaded = load_model(f'models/coreference_classifiers/{name}.model')
    models[name] = loaded
    return loaded

In [19]:
# Threshold grid searched by get_sorted_scores: every base value multiplied by
# 1..9, i.e. 45 values (0.1..0.9, 0.01..0.09, ... down to 0.00001..0.00009),
# subject to floating-point rounding of the products.
base_thresholds = [0.1, 0.01, 0.001, 0.0001, 0.00001]
thresholds = [base * multiplier for base in base_thresholds for multiplier in range(1, 10)]

# Scorer instances created once for the cells below; muc_scorer and b3_scorer are
# used by get_sorted_scores, average_scorer is not used by any active code in the
# visible cells.
muc_scorer = MUCScorer()
b3_scorer = B3Scorer()
average_scorer = AverageScorer([muc_scorer, b3_scorer])

def get_sorted_scores(clusterer, pred):
    """Score ``pred`` at every threshold in ``thresholds`` and rank the results.

    Args:
        clusterer: object exposing ``get_chains(pred, threshold)``.
        pred: prediction structure handed unchanged to ``clusterer.get_chains``.

    Returns:
        A list with one tuple per threshold, sorted in descending tuple order
        (best average first):
        ``(average_f1, (prec_muc, rec_muc, f1_muc), (prec_b3, rec_b3, f1_b3), threshold)``,
        where ``average_f1`` is the mean of the third entry of the MUC and B3 scores.
    """
    def score_at(threshold):
        # Cluster at this threshold and score the chains against the gold chains.
        predicted_chains = clusterer.get_chains(pred, threshold)
        muc = muc_scorer.get_scores(predicted_chains, label_chains)
        b3 = b3_scorer.get_scores(predicted_chains, label_chains)
        return ((muc[2] + b3[2]) / 2, muc, b3, threshold)

    scores = [score_at(threshold) for threshold in thresholds]
    scores.sort(reverse=True)
    return scores

def evaluate(features, data_generation, epoch):
    """Run one saved classifier on the test pairs and print the best score per setup.

    The model selected by ``get_model(features, data_generation, epoch)`` predicts
    on the pre-built pair inputs. The predictions are then clustered four ways
    (closest-first / best-first, each without and with the predicted singletons),
    and for every setup the top entry of ``get_sorted_scores`` is printed.

    Args:
        features: collection of feature-group names; ``'words'``, ``'context'``
            and ``'syntactic'`` select which inputs are fed to the model.
        data_generation: data-generation name, part of the model file name.
        epoch: epoch number, part of the model file name.

    Returns:
        None; results are only printed.
    """
    model = get_model(features, data_generation, epoch)

    # Inputs are appended in a fixed order: words, then context, then syntactic.
    test_features = []
    if 'words' in features:
        test_features += [text_1, text_2]
    if 'context' in features:
        test_features += [prev_1, prev_2, next_1, next_2]
    if 'syntactic' in features:
        test_features += [syntactic_1, syntactic_2, relation]

    print('getting anaphora scores by antecedent dict')
    raw_pred = model.predict(test_features, verbose=1)
    first_ids, second_ids = pairs.m1_id, pairs.m2_id
    pred_without_singleton_classifier = get_anaphora_scores_by_antecedent(first_ids, second_ids, raw_pred)
    pred_with_singleton_classifier = get_anaphora_scores_by_antecedent(first_ids, second_ids, raw_pred, singletons)

    # (progress message, result label, clusterer class, scores to cluster)
    runs = (
        ('get sorted_scores_without_sc_closest', 'Without singleton classifier, closest-first:',
         ClosestFirstClusterer, pred_without_singleton_classifier),
        ('get sorted_scores_with_sc_closest', 'With singleton classifier, closest-first:',
         ClosestFirstClusterer, pred_with_singleton_classifier),
        ('get sorted_scores_without_sc_best', 'Without singleton classifier, best-first:',
         BestFirstClusterer, pred_without_singleton_classifier),
        ('get sorted_scores_with_sc_best', 'With singleton classifier, best-first:',
         BestFirstClusterer, pred_with_singleton_classifier),
    )

    for run_number, (progress_message, result_label, make_clusterer, scores_by_antecedent) in enumerate(runs):
        if run_number:
            # Blank line between consecutive setups, none after the last one.
            print()
        print(progress_message)
        ranked = get_sorted_scores(make_clusterer(), scores_by_antecedent)
        print(result_label, ranked[0])

## Budi

### Words + Context + Syntactic

In [20]:
# All three feature groups, 'budi' data generation, epoch 5
# (loads models/coreference_classifiers/words_context_syntactic_budi_5.model).
evaluate(['words', 'context', 'syntactic'], 'budi', 5)

getting anaphora scores by antecedent dict
get sorted_scores_without_sc_closest
Without singleton classifier, closest-first: (0.35413218980519395, (0.4482758620689655, 0.33766233766233766, 0.3851851851851852), (0.3568181818181818, 0.2951694139194139, 0.3230791944252027), 0.05)

get sorted_scores_with_sc_closest
With singleton classifier, closest-first: (0.5324111982788836, (0.5612244897959183, 0.7142857142857143, 0.6285714285714286), (0.32605634287050217, 0.6589514652014654, 0.4362509679863387), 2e-05)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: (0.3675733338394137, (0.46551724137931033, 0.35064935064935066, 0.4), (0.3675555555555555, 0.3079899267399267, 0.33514666767882745), 0.05)

get sorted_scores_with_sc_best
With singleton classifier, best-first: (0.5155418802575614, (0.5408163265306123, 0.6883116883116883, 0.6057142857142858), (0.32063771997982526, 0.6317078754578755, 0.4253694748008371), 2e-05)


In [21]:
# All three feature groups, 'budi' data generation, epoch 10
# (loads models/coreference_classifiers/words_context_syntactic_budi_10.model).
evaluate(['words', 'context', 'syntactic'], 'budi', 10)

getting anaphora scores by antecedent dict
get sorted_scores_without_sc_closest
Without singleton classifier, closest-first: (0.40764365653432477, (0.5370370370370371, 0.37662337662337664, 0.44274809160305345), (0.43550514800514806, 0.32548076923076924, 0.37253922146559615), 0.02)

get sorted_scores_with_sc_closest
With singleton classifier, closest-first: (0.5289191930872699, (0.5353535353535354, 0.6883116883116883, 0.6022727272727273), (0.3545748742782641, 0.6369963369963371, 0.45556565890181244), 1e-05)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: (0.44239623560721375, (0.4375, 0.5454545454545454, 0.48554913294797686), (0.33154971643343734, 0.5016712454212454, 0.3992433382664507), 0.0005)

get sorted_scores_with_sc_best
With singleton classifier, best-first: (0.5225860893208769, (0.5360824742268041, 0.6753246753246753, 0.5977011494252873), (0.34993231406274883, 0.6203983516483518, 0.44747102921646664), 2e-05)


In [22]:
# All three feature groups, 'budi' data generation, epoch 20
# (loads models/coreference_classifiers/words_context_syntactic_budi_20.model).
evaluate(['words', 'context', 'syntactic'], 'budi', 20)

getting anaphora scores by antecedent dict
get sorted_scores_without_sc_closest
Without singleton classifier, closest-first: (0.3683888682725824, (0.5897435897435898, 0.2987012987012987, 0.39655172413793105), (0.5307017543859649, 0.25036630036630036, 0.3402260124072337), 0.06)

get sorted_scores_with_sc_closest
With singleton classifier, closest-first: (0.505133139932475, (0.5675675675675675, 0.5454545454545454, 0.5562913907284768), (0.4346938227257376, 0.4750457875457875, 0.4539748891364731), 1e-05)

get sorted_scores_without_sc_best
Without singleton classifier, best-first: (0.3785229305096004, (0.5531914893617021, 0.33766233766233766, 0.41935483870967744), (0.4143229166666667, 0.28498168498168497, 0.3376910223095233), 0.02)

get sorted_scores_with_sc_best
With singleton classifier, best-first: (0.4709314903261924, (0.5405405405405406, 0.5194805194805194, 0.5298013245033113), (0.3735162180814354, 0.459478021978022, 0.4120616561490735), 1e-05)


## Gilang

### Words + Syntactic

In [None]:
# Loads models/coreference_classifiers/words_syntactic_budi_5.model.
# NOTE(review): this cell sits under the "## Gilang" heading but evaluates the
# 'budi' data generation, and it omits 'context' while the two following Gilang
# cells use ['words', 'context', 'syntactic'] with 'gilang'. This looks like a
# copy-paste leftover — confirm which model was meant before trusting the result.
evaluate(['words', 'syntactic'], 'budi', 5)

In [None]:
# All three feature groups, 'gilang' data generation, epoch 10
# (loads models/coreference_classifiers/words_context_syntactic_gilang_10.model).
evaluate(['words', 'context', 'syntactic'], 'gilang', 10)

In [None]:
# All three feature groups, 'gilang' data generation, epoch 20
# (loads models/coreference_classifiers/words_context_syntactic_gilang_20.model).
evaluate(['words', 'context', 'syntactic'], 'gilang', 20)

## Soon

### Words + Context + Syntactic

In [None]:
# All three feature groups, 'soon' data generation, epoch 5
# (loads models/coreference_classifiers/words_context_syntactic_soon_5.model).
evaluate(['words', 'context', 'syntactic'], 'soon', 5)

In [None]:
# All three feature groups, 'soon' data generation, epoch 10
# (loads models/coreference_classifiers/words_context_syntactic_soon_10.model).
evaluate(['words', 'context', 'syntactic'], 'soon', 10)

In [None]:
# All three feature groups, 'soon' data generation, epoch 20
# (loads models/coreference_classifiers/words_context_syntactic_soon_20.model).
evaluate(['words', 'context', 'syntactic'], 'soon', 20)

In [None]:
# Show every chain with more than one member, each member mapped through get_markable_text.
# NOTE(review): neither get_markable_text nor pred_chains2 is defined in any visible
# cell, so this raises NameError on a fresh kernel unless they are defined elsewhere.
[[get_markable_text(b) for b in a] for a in pred_chains2 if len(a) > 1]

In [None]:
# Show every baseline chain with more than one member, each member mapped through get_markable_text.
# NOTE(review): get_markable_text is not defined in any visible cell, so this
# raises NameError on a fresh kernel unless it is defined elsewhere.
[[get_markable_text(b) for b in a] for a in baseline_chains if len(a) > 1]

In [None]:
# Show every gold chain with more than one member, each member mapped through get_markable_text.
# NOTE(review): get_markable_text is not defined in any visible cell, so this
# raises NameError on a fresh kernel unless it is defined elsewhere.
[[get_markable_text(b) for b in a] for a in label_chains if len(a) > 1]