In [2]:
import json
import numpy as np
import logging

# Terms the evaluation is restricted to: one term per line of the file.
with open("./data/subset_terms_list", "r") as handle:
    subset = {entry.strip("\n") for entry in handle}

# Term-frequency data; its top-level keys are used as document ids further down.
with open("./data/term_freqs_rev_3_all_terms.json", "r") as handle:
    temp = json.load(handle)

# UIDs (first tab-separated field of each row) that belong to the subset,
# kept in file order because that order defines the array columns later on.
with open("./data/mesh_data.tab", "r") as handle:
    first_fields = (record.strip("\n").split("\t")[0] for record in handle)
    uids = [uid for uid in first_fields if uid in subset]

# Split the documents 80/20 in key order; the trailing 20% is the test split.
docs_list = list(temp)
partition = int(len(docs_list) * .8)
train_docs, test_docs = docs_list[:partition], docs_list[partition:]

# Frequencies for the test documents only.
test_freqs = {doc: temp[doc] for doc in test_docs}

# Load the solution terms, but only for test documents.
# A set makes the per-line membership check cheap.
docs_list = set(test_docs)
solution = {}
with open("./data/pm_doc_term_counts.csv", "r") as handle:
    for record in handle:
        doc_id, *candidates = record.strip("\n").split(",")
        if doc_id not in docs_list:
            continue
        # Keep only the terms that are part of the evaluated subset.
        terms = [term for term in candidates if term in subset]
        if terms:
            solution[doc_id] = terms

# Drop test documents that ended up without any solution terms.
test_docs = [doc for doc in test_docs if doc in solution]

# Feature matrix: one row per test document, one column per UID (in `uids`
# order). A UID missing from the document's frequency map contributes 0.
x_test = np.array([
    [test_freqs[doc].get(uid, 0) for uid in uids]
    for doc in test_docs
])

# The y_test construction below is disabled: it sits inside a bare string
# literal, so none of it runs and y_test is not defined by this cell. Since the
# literal is the cell's final expression, its text is echoed as the cell output.
"""
y_test = []
for doc in test_docs:
    row = []
    for uid in uids:
        if uid in solution[doc]:
            row.append(1)
        else:
            row.append(0)
    y_test.append(row)

y_test = np.array(y_test)
"""

'\ny_test = []\nfor doc in test_docs:\n    row = []\n    for uid in uids:\n        if uid in solution[doc]:\n            row.append(1)\n        else:\n            row.append(0)\n    y_test.append(row)\n\ny_test = np.array(y_test)\n'

In [3]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Layer sizes. 7221 is both the input and the output width and matches the
# number embedded in the weights filename loaded below.
# NOTE(review): presumably this equals len(uids) - confirm before changing either.
N_TERMS = 7221
HIDDEN_UNITS = 2048

# Input -> dense ReLU layer -> dropout -> one sigmoid output per term.
a = Input(shape=(N_TERMS,))
hidden = Dense(HIDDEN_UNITS, activation="relu")(a)
hidden = Dropout(0.1)(hidden)
b = Dense(N_TERMS, activation="sigmoid")(hidden)

model = Model(inputs=a, outputs=b)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Assigned here but not passed to any call in this cell.
batch_size = 16

model.load_weights("weights.final.7221.hdf5")

# Round the sigmoid outputs so that each entry becomes 0 or 1.
y_pred = np.round(model.predict(x_test))

In [4]:
# Map every test document to the UIDs whose rounded prediction equals 1.
# Row i of y_pred belongs to test_docs[i]; column j belongs to uids[j].
preds = {doc: [] for doc in test_docs}

for r_idx, row in enumerate(y_pred):
    # np.flatnonzero yields the matching column indices in ascending order, so
    # the UIDs are appended in the same order as a column-by-column scan,
    # without a Python-level loop over every column.
    preds[test_docs[r_idx]].extend(
        uids[c_idx] for c_idx in np.flatnonzero(row == 1)
    )

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
def _precision_recall_f1(true_pos, false_pos, false_neg):
    """Return (precision, recall, F1) for the given counts.

    All three values are reported as 0 when there are no true positives, which
    also keeps every division below away from a zero denominator.
    """
    if true_pos == 0:
        return 0, 0, 0
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1 = (2 * precision * recall) / (precision + recall)
    return precision, recall, f1


def _mean(values):
    """Return the arithmetic mean of `values`, or 0 when `values` is empty."""
    return sum(values) / len(values) if values else 0


# One pass over the documents gathers everything the three averages need.
# doc_counts holds (true_pos, false_pos, false_neg) per document, in `preds`
# order; label_counts holds [true_pos, false_pos, false_neg] per UID.
doc_counts = []
label_counts = {uid: [0, 0, 0] for uid in uids}

for pmid, predicted in preds.items():
    expected = solution[pmid]
    # Sets give constant-time membership; the counts still iterate the original
    # lists so that any repeated entries are counted exactly as before.
    predicted_set = set(predicted)
    expected_set = set(expected)

    doc_tp = sum(1 for uid in predicted if uid in expected_set)
    doc_fp = len(predicted) - doc_tp
    doc_fn = sum(1 for uid in expected if uid not in predicted_set)
    doc_counts.append((doc_tp, doc_fp, doc_fn))

    # Per-label tallies count each document at most once per UID.
    for uid in predicted_set | expected_set:
        counts = label_counts.get(uid)
        if counts is None:
            # Term is not one of the evaluated UIDs, so it has no macro entry.
            continue
        if uid not in expected_set:
            counts[1] += 1
        elif uid not in predicted_set:
            counts[2] += 1
        else:
            counts[0] += 1

# Micro average: pool the counts over all documents, then compute the metrics.
true_pos = sum(tp for tp, _, _ in doc_counts)
false_pos = sum(fp for _, fp, _ in doc_counts)
false_neg = sum(fn for _, _, fn in doc_counts)

mi_precision, mi_recall, mi_f1 = _precision_recall_f1(true_pos, false_pos, false_neg)

print(f"Micro-averaged F1 from test set: {mi_f1}")
print(f"Micro-averaged precision from test set: {mi_precision}")
print(f"Micro-averaged recall from test set: {mi_recall}\n")

# Example-based average: compute the metrics per document, then average them.
eb_ps = []
eb_rs = []
eb_f1s = []

for doc_tp, doc_fp, doc_fn in doc_counts:
    doc_precision, doc_recall, doc_f1 = _precision_recall_f1(doc_tp, doc_fp, doc_fn)
    eb_ps.append(doc_precision)
    eb_rs.append(doc_recall)
    eb_f1s.append(doc_f1)

eb_f1 = _mean(eb_f1s)
eb_recall = _mean(eb_rs)
eb_precision = _mean(eb_ps)

print(f"Example-based F1 from test set: {eb_f1}")
print(f"Example-based precision from test set: {eb_precision}")
print(f"Example-based recall from test set: {eb_recall}\n")

# Macro average: compute the metrics per UID, then average over the UIDs that
# were predicted or expected at least once in the test set.
ma_ps = []
ma_rs = []
ma_f1s = []

for uid in uids:
    label_tp, label_fp, label_fn = label_counts[uid]
    if label_tp + label_fp + label_fn > 0:
        label_precision, label_recall, label_f1 = _precision_recall_f1(
            label_tp, label_fp, label_fn
        )
        ma_ps.append(label_precision)
        ma_rs.append(label_recall)
        ma_f1s.append(label_f1)

ma_f1 = _mean(ma_f1s)
ma_recall = _mean(ma_rs)
ma_precision = _mean(ma_ps)

print(f"Macro-averaged F1 from test set: {ma_f1}")
print(f"Macro-averaged precision from test set: {ma_precision}")
print(f"Macro-averaged recall from test set: {ma_recall}\n")

Micro-averaged F1 from test set: 0.4427023396290225
Micro-averaged precision from test set: 0.6937186262531098
Micro-averaged recall from test set: 0.3250762247699461

Example-based F1 from test set: 0.42814472752355864
Example-based precision from test set: 0.6916253773087887
Example-based recall from test set: 0.33731962295133666

Macro-averaged F1 from test set: 0.16752501915943868
Macro-averaged precision from test set: 0.3564535370602334
Macro-averaged recall from test set: 0.12394390582064842

