In [1]:
import os
import re
from pprint import pprint

In [2]:
# Helper to parse the relation ("R") lines of an .ann file.
def extract_relations(ann_path):
    """Read the relation annotations from an ``.ann`` file.

    Only lines whose first tab-separated field starts with ``R`` are used.
    Each such line is expected to look like
    ``R3<TAB>Substrate Arg1:T12 Arg2:T40``; every other line is ignored.

    Args:
        ann_path: Path of the annotation file to read.

    Returns:
        A list of dicts in file order, each with the keys ``"id"`` (first
        field, e.g. ``"R3"``), ``"type"`` (the relation label) and
        ``"entities"`` (a 2-tuple of the two argument ids). The tuple is
        ordered by plain string comparison, so the same pair yields the same
        tuple whichever id was written as Arg1.

    Raises:
        ValueError: If a relation line has no tab-separated body or its body
            does not match ``<type> Arg1:<id> Arg2:<id>``. The message names
            the file and line so the offending annotation can be found.
    """
    res = []
    with open(ann_path, "r") as fin:
        for line_no, line in enumerate(fin, start=1):
            items = line.strip().split("\t")
            if not items[0].startswith("R"):
                continue
            # A relation line needs a second field holding "<type> Arg1:.. Arg2:..".
            match = None
            if len(items) > 1:
                match = re.match("(.*) Arg1:(.*) Arg2:(.*)", items[1])
            if match is None:
                # Fail with a pointer to the bad line instead of an opaque
                # "'NoneType' object is not subscriptable".
                raise ValueError(
                    f"{ann_path}:{line_no}: malformed relation line {line.strip()!r}"
                )
            rel_type, arg1, arg2 = match.groups()
            res.append({
                "id": items[0],
                "type": rel_type,
                # Canonical order: smaller id (string comparison) first.
                "entities": (arg1, arg2) if arg1 < arg2 else (arg2, arg1),
            })
    return res

# Helper to parse the entity ("T") lines of an .ann file.
def extract_entities_id_to_loc(ann_path):
    """Map every entity id in an ``.ann`` file to its ``(begin, end)`` offsets.

    Only lines whose first tab-separated field starts with ``T`` are used.
    The second field is split on whitespace and its 2nd and 3rd tokens are
    parsed as the integer begin/end offsets (e.g. ``Gene 544 572``).

    Args:
        ann_path: Path of the annotation file to read.

    Returns:
        A dict ``{entity_id: (begin, end)}``; a repeated id keeps its last
        occurrence.

    Raises:
        AssertionError: If an entity does not satisfy ``begin < end``.
    """
    spans = {}
    with open(ann_path, "r") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            entity_id = fields[0]
            if not entity_id.startswith("T"):
                continue
            span_tokens = fields[1].split()
            start = int(span_tokens[1])
            stop = int(span_tokens[2])
            assert start < stop
            spans[entity_id] = (start, stop)
    return spans

In [3]:
def freq_counter(data, key):
    """Count how often each ``key(item)`` value occurs in ``data``.

    Args:
        data: Iterable of items to tally.
        key: Callable mapping an item to a hashable bucket.

    Returns:
        A plain dict ``{bucket: count}`` in first-seen order.
    """
    counts = {}
    for element in data:
        bucket = key(element)
        counts[bucket] = counts.get(bucket, 0) + 1
    return counts

In [6]:
# Evaluate annotator 1 (pred_dir) against annotator 2 (target_dir).
def evaluate_annotator(pred_dir, target_dir, pub_num, use_loc=False):
    """Compare two annotators' relation labels for one publication.

    Both sides are read from ``<dir>/<pub_num>/<pub_num>_NA.ann``. Relations
    are keyed by their entity pair, and a pair present on only one side is
    labelled ``"NA"`` on the other. If the same pair occurs more than once in
    a file, the last relation wins.

    Prints, in order: the pairs found only in ``pred``, the pairs found only
    in ``target`` (both sorted), the label list, and the confusion matrix with
    ``pred`` as rows and ``target`` as columns.

    Args:
        pred_dir: Directory holding the first annotator's files.
        target_dir: Directory holding the second annotator's files.
        pub_num: Publication id, used as sub-directory and file stem.
        use_loc: If True, key each pair by the entities' ``(begin, end)``
            offsets instead of their ids, so files whose entity ids differ
            can still be compared.

    Returns:
        ``(y1, y2)``: two equally long lists holding the ``pred`` and
        ``target`` label for every distinct entity pair.
    """
    from sklearn.metrics import confusion_matrix

    pred_path = os.path.join(pred_dir, f"{pub_num}/{pub_num}_NA.ann")
    target_path = os.path.join(target_dir, f"{pub_num}/{pub_num}_NA.ann")
    pred = extract_relations(pred_path)
    target = extract_relations(target_path)
    if use_loc:
        pred_id_to_loc = extract_entities_id_to_loc(pred_path)
        target_id_to_loc = extract_entities_id_to_loc(target_path)
        pred = {
            (pred_id_to_loc[x["entities"][0]], pred_id_to_loc[x["entities"][1]]): x["type"]
            for x in pred
        }
        target = {
            (target_id_to_loc[x["entities"][0]], target_id_to_loc[x["entities"][1]]): x["type"]
            for x in target
        }
    else:
        pred = {x["entities"]: x["type"] for x in pred}
        target = {x["entities"]: x["type"] for x in target}

    def helpercomp(t):
        # Canonical form of a pair: smaller element first.
        if t[0] < t[1]:
            return t
        return (t[1], t[0])

    k1 = [helpercomp(k) for k in pred.keys()]
    k2 = [helpercomp(k) for k in target.keys()]
    print(sorted(set(k1)-set(k2)))
    print(sorted(set(k2)-set(k1)))

    y1, y2 = [], []
    # Iterate over the canonical pairs. The raw dict keys are ordered by
    # entity id, so with use_loc=True the same offset pair can be (a, b) on one
    # side and (b, a) on the other; merging raw keys would score it twice.
    # Sorting only makes the order of the returned lists reproducible.
    for k in sorted(set(k1 + k2)):
        # The dicts may still hold the pair in either order, so try both.
        y1.append(pred.get(k, pred.get((k[1], k[0]), "NA")))
        y2.append(target.get(k, target.get((k[1], k[0]), "NA")))

    lbls = ["UpRegulator", "DownRegulator", "Substrate", "NA"]
    print(lbls)
    print(confusion_matrix(y1, y2, labels=lbls))

    return y1, y2


In [7]:
# Inter-annotator comparison: the "eric" annotations are treated as pred and
# the "kevin" annotations as target, for two publications.
pred_dir   = "../../datasets/acs-20210505-eric"
target_dir = "../../datasets/acs-20210505-kevin"
# use_loc=True keys pairs by entity offsets rather than entity ids.
y1, y2 = evaluate_annotator(pred_dir, target_dir, "sb300091d", use_loc=True)
print()
# Chained assignment: all_lists receives the whole (y1, y2) tuple, which is
# also unpacked into y1n / y2n.
y1n, y2n = all_lists = evaluate_annotator(pred_dir, target_dir, "sb4001382", use_loc=True)

[((781, 791), (845, 863)), ((781, 791), (868, 876)), ((1177, 1186), (1187, 1196)), ((1187, 1196), (1267, 1276)), ((1187, 1196), (1370, 1379)), ((1526, 1535), (1536, 1545)), ((1536, 1545), (1599, 1608)), ((1536, 1545), (1675, 1684)), ((6382, 6391), (6566, 6574)), ((6382, 6391), (6578, 6596)), ((6482, 6492), (6566, 6574)), ((6482, 6492), (6578, 6596)), ((6642, 6651), (6713, 6722)), ((7395, 7404), (7486, 7495)), ((7578, 7587), (7849, 7856)), ((7640, 7649), (7849, 7856)), ((7732, 7741), (7849, 7856)), ((7816, 7828), (7849, 7856)), ((7874, 7881), (7968, 7972)), ((7874, 7881), (7973, 7998)), ((7874, 7881), (8012, 8016)), ((7968, 7972), (8028, 8035)), ((7968, 7972), (8181, 8187)), ((7973, 7998), (8028, 8035)), ((7973, 7998), (8181, 8187)), ((8012, 8016), (8028, 8035)), ((8012, 8016), (8181, 8187)), ((8770, 8781), (8835, 8842)), ((9856, 9863), (9888, 9898)), ((11799, 11813), (11913, 11922)), ((11867, 11876), (11913, 11922)), ((11913, 11922), (12003, 12015)), ((20652, 20661), (21002, 21012)), (

In [25]:
from sklearn.metrics import cohen_kappa_score
# Cohen's kappa over the concatenated label lists of both publications.
# Relies on y1, y2, y1n, y2n still being bound in the kernel.
cohen_kappa_score(y1+y1n, y2+y2n)

0.7442599991366616

In [21]:
# Evaluate one annotator (pred_dir) against the gold annotations (target_dir).
def evaluate_gld(pred_dir, target_dir, pub_num, use_loc=False):
    """Compare an annotator's relation labels with the gold labels.

    The annotator file is ``<pred_dir>/<pub_num>/<pub_num>_NA.ann`` and the
    gold file is ``<target_dir>/<pub_num>/<pub_num>.ann``. Relations are keyed
    by their entity pair, and a pair present on only one side is labelled
    ``"NA"`` on the other. If the same pair occurs more than once in a file,
    the last relation wins.

    Prints the label list and the confusion matrix with ``pred`` as rows and
    ``target`` as columns.

    Args:
        pred_dir: Directory holding the annotator's files.
        target_dir: Directory holding the gold files.
        pub_num: Publication id, used as sub-directory and file stem.
        use_loc: If True, key each pair by the entities' ``(begin, end)``
            offsets instead of their ids.

    Returns:
        ``(y1, y2)``: two equally long lists holding the ``pred`` and
        ``target`` label for every distinct entity pair, the same shape
        ``evaluate_annotator`` returns, so callers that bind the result can
        feed it to further metrics.
    """
    from sklearn.metrics import confusion_matrix

    pred_path = os.path.join(pred_dir, f"{pub_num}/{pub_num}_NA.ann")
    target_path = os.path.join(target_dir, f"{pub_num}/{pub_num}.ann")
    pred = extract_relations(pred_path)
    target = extract_relations(target_path)
    if use_loc:
        pred_id_to_loc = extract_entities_id_to_loc(pred_path)
        target_id_to_loc = extract_entities_id_to_loc(target_path)
        pred = {
            (pred_id_to_loc[x["entities"][0]], pred_id_to_loc[x["entities"][1]]): x["type"]
            for x in pred
        }
        target = {
            (target_id_to_loc[x["entities"][0]], target_id_to_loc[x["entities"][1]]): x["type"]
            for x in target
        }
    else:
        pred = {x["entities"]: x["type"] for x in pred}
        target = {x["entities"]: x["type"] for x in target}

    def helpercomp(t):
        # Canonical form of a pair: smaller element first.
        if t[0] < t[1]:
            return t
        return (t[1], t[0])

    k1 = [helpercomp(k) for k in pred.keys()]
    k2 = [helpercomp(k) for k in target.keys()]

    y1, y2 = [], []
    # Each canonical pair is scored once; sorting only makes the order of the
    # returned lists reproducible.
    for k in sorted(set(k1 + k2)):
        # The dicts may hold the pair in either order, so try both.
        y1.append(pred.get(k, pred.get((k[1], k[0]), "NA")))
        y2.append(target.get(k, target.get((k[1], k[0]), "NA")))

    lbls = ["UpRegulator", "DownRegulator", "Substrate", "NA"]
    print(lbls)
    print(confusion_matrix(y1, y2, labels=lbls))

    return y1, y2
    

In [22]:
# "kevin" annotations (pred) against the gold target set, for two
# publications, with pairs keyed by entity offsets (use_loc=True).
pred_dir   = "../../datasets/acs-20210505-kevin/"
target_dir = "../../datasets/acs-20210530-gold-target"
evaluate_gld(pred_dir, target_dir, "sb300091d", use_loc=True)
print()
all_lists = evaluate_gld(pred_dir, target_dir, "sb4001382", use_loc=True)

['UpRegulator', 'DownRegulator', 'Substrate', 'NA']
[[  5   0   0   0]
 [  0   2   0   1]
 [  1   0  19   1]
 [ 18   0   3 187]]

['UpRegulator', 'DownRegulator', 'Substrate', 'NA']
[[  0   0   0   0]
 [  0  30   1   6]
 [  0   0  77  33]
 [  1  10  49 357]]


In [11]:
# "eric" annotations (pred) against the gold target set, for two publications.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm which definition is intended
# (evaluate_gld has the same call signature).
pred_dir   = "../../datasets/acs-20210505-eric"
target_dir = "../../datasets/acs-20210530-gold-target"
evaluate(pred_dir, target_dir, "sb300091d", use_loc=True)
print()
all_lists = evaluate(pred_dir, target_dir, "sb4001382", use_loc=True)

['UpRegulator', 'DownRegulator', 'Substrate', 'NA']
[[  5   0   0   0]
 [  0   2   0   1]
 [  1   0  14  19]
 [ 18   0   8 234]]

['UpRegulator', 'DownRegulator', 'Substrate', 'NA']
[[  0   0   0   0]
 [  0  15   1  10]
 [  0   0  74  42]
 [  1  25  52 510]]


In [36]:
# Display the list. NOTE(review): `not_in_target_list` is not assigned in any
# cell shown here; it must come from earlier kernel state. Confirm its source.
not_in_target_list

[(('T13', 'T716'), 'Substrate'),
 (('T30', 'T720'), 'Substrate'),
 (('T31', 'T720'), 'Substrate'),
 (('T45', 'T722'), 'Substrate'),
 (('T312', 'T359'), 'Substrate'),
 (('T358', 'T365'), 'Substrate'),
 (('T362', 'T365'), 'Substrate'),
 (('T369', 'T371'), 'Substrate'),
 (('T369', 'T372'), 'Substrate'),
 (('T372', 'T373'), 'Substrate'),
 (('T372', 'T374'), 'Substrate'),
 (('T372', 'T375'), 'Substrate'),
 (('T394', 'T699'), 'Substrate'),
 (('T394', 'T397'), 'Substrate'),
 (('T699', 'T700'), 'Substrate'),
 (('T397', 'T700'), 'Substrate'),
 (('T416', 'T702'), 'Substrate'),
 (('T478', 'T479'), 'Substrate'),
 (('T486', 'T488'), 'Substrate'),
 (('T487', 'T488'), 'Substrate'),
 (('T884', 'T885'), 'Substrate'),
 (('T100', 'T897'), 'Substrate'),
 (('T101', 'T897'), 'Substrate'),
 (('T112', 'T900'), 'DownRegulator'),
 (('T115', 'T902'), 'DownRegulator'),
 (('T148', 'T913'), 'Substrate'),
 (('T164', 'T29'), 'Substrate'),
 (('T165', 'T29'), 'Substrate'),
 (('T167', 'T177'), 'DownRegulator'),
 (('T186

In [37]:
# Display the list. NOTE(review): `not_in_pred_list` is not assigned in any
# cell shown here; it must come from earlier kernel state. Confirm its source.
not_in_pred_list

[(('T277', 'T365'), 'Substrate'),
 (('T279', 'T365'), 'Substrate'),
 (('T371', 'T372'), 'Substrate'),
 (('T307', 'T411'), 'Substrate'),
 (('T307', 'T412'), 'Substrate'),
 (('T322', 'T430'), 'Substrate'),
 (('T322', 'T431'), 'Substrate'),
 (('T7', 'T8'), 'DownRegulator'),
 (('T7', 'T9'), 'DownRegulator'),
 (('T882', 'T884'), 'Substrate'),
 (('T883', 'T884'), 'Substrate'),
 (('T69', 'T71'), 'Substrate'),
 (('T70', 'T71'), 'Substrate'),
 (('T97', 'T99'), 'Substrate'),
 (('T98', 'T99'), 'Substrate'),
 (('T100', 'T892'), 'Substrate'),
 (('T101', 'T892'), 'Substrate'),
 (('T115', 'T894'), 'DownRegulator'),
 (('T148', 'T164'), 'Substrate'),
 (('T148', 'T165'), 'Substrate'),
 (('T177', 'T72'), 'DownRegulator'),
 (('T185', 'T186'), 'Substrate'),
 (('T185', 'T188'), 'Substrate'),
 (('T234', 'T910'), 'Substrate'),
 (('T235', 'T910'), 'Substrate'),
 (('T22', 'T240'), 'Substrate'),
 (('T242', 'T243'), 'Substrate'),
 (('T262', 'T263'), 'Substrate'),
 (('T262', 'T264'), 'Substrate'),
 (('T268', 'T269

In [39]:
# Build two aligned label lists (y1 = predicted, y2 = target) from three lists
# of (entity_pair, label) items. This rebinds the notebook-level y1 / y2.
# NOTE(review): found_correct_list, not_in_target_list and not_in_pred_list
# are not assigned in any cell shown here; confirm where they come from.
# NOTE(review): only these three lists are used, so pairs predicted with a
# different label than the target are not represented unless one of the lists
# already contains them. Confirm this is intended.
y1 , y2 = [], []

# Correct predictions: the same label goes on both sides.
y1.extend([k[1] for k in found_correct_list])
y2.extend([k[1] for k in found_correct_list])
print(len(y1), len(y2))
print(y1[-5:], y2[-5:])

# False positives: predicted label on the pred side, "NA" on the target side.
y1.extend([k[1] for k in not_in_target_list])
y2.extend(["NA" for k in not_in_target_list])
print(len(y1), len(y2))
print(y1[-5:], y2[-5:])

# False negatives: "NA" on the pred side, target label on the target side.
y1.extend(["NA" for k in not_in_pred_list])
y2.extend([k[1] for k in not_in_pred_list])
print(len(y1), len(y2))
print(y1[-5:], y2[-5:])


100 100
['Substrate', 'DownRegulator', 'Substrate', 'Substrate', 'Substrate'] ['Substrate', 'DownRegulator', 'Substrate', 'Substrate', 'Substrate']
184 184
['Substrate', 'DownRegulator', 'Substrate', 'Substrate', 'Substrate'] ['NA', 'NA', 'NA', 'NA', 'NA']
260 260
['NA', 'NA', 'NA', 'NA', 'NA'] ['Substrate', 'Substrate', 'Substrate', 'DownRegulator', 'DownRegulator']


In [40]:
from sklearn.metrics import cohen_kappa_score
# Cohen's kappa over whatever y1 / y2 are currently bound in the kernel.
# NOTE(review): if these are the lists built from correct / false-positive /
# false-negative items, no row has "NA" on both sides, which presumably pulls
# the score towards zero. Confirm before reporting it.
cohen_kappa_score(y1, y2)

-0.030187464401574937

In [51]:
# Shell escape: list the contents of the biobert_RE datasets directory.
!ls /sbksvol/jiawei/sbks-ucsd/relation-extraction/biobert_RE/datasets/

CHEMPROT


In [6]:
# Output directory "acs-20210530-gold-3layer-e2e-2" (pred) against the gold
# target set, for two publications.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm where it comes from.
pred_dir   = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-3layer-e2e-2"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-target"
evaluate(pred_dir, target_dir, "sb300091d")
print()
evaluate(pred_dir, target_dir, "sb4001382")


# correct predictions:	24
{'DownRegulator': 1, 'Substrate': 17, 'UpRegulator': 6}

# incorrect predictions:	1
{('Substrate', 'UpRegulator'): 1}

# extra predictions (false positive):	3
{'Substrate': 3}

# targets not found (false negative):	23
{'DownRegulator': 1, 'Substrate': 5, 'UpRegulator': 17}


# correct predictions:	95
{'DownRegulator': 24, 'Substrate': 71}

# incorrect predictions:	3
{('Substrate', 'DownRegulator'): 3}

# extra predictions (false positive):	18
{'DownRegulator': 4, 'Substrate': 14}

# targets not found (false negative):	70
{'DownRegulator': 13, 'Substrate': 56, 'UpRegulator': 1}


In [7]:
# "kevin" annotations (pred) against the gold target set. Each publication is
# evaluated twice: with the default arguments and with use_loc=True. The star
# lines only separate the four printed reports.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm where it comes from.
pred_dir   = "/sbksvol/data/acs-data/acs-data-20210530/acs-20210505-kevin"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-target"
evaluate(pred_dir, target_dir, "sb300091d")
print("*" * 50)
evaluate(pred_dir, target_dir, "sb300091d", use_loc=True)
print("*" * 50)
evaluate(pred_dir, target_dir, "sb4001382")
print("*" * 50)
evaluate(pred_dir, target_dir, "sb4001382", use_loc=True)


# correct predictions:	0
{}

# incorrect predictions:	0
{}

# extra predictions (false positive):	29
{'DownRegulator': 3, 'Substrate': 21, 'UpRegulator': 5}

# targets not found (false negative):	48
{'DownRegulator': 2, 'Substrate': 22, 'UpRegulator': 24}
**************************************************

# correct predictions:	26
{'DownRegulator': 2, 'Substrate': 19, 'UpRegulator': 5}

# incorrect predictions:	1
{('Substrate', 'UpRegulator'): 1}

# extra predictions (false positive):	2
{'DownRegulator': 1, 'Substrate': 1}

# targets not found (false negative):	21
{'Substrate': 3, 'UpRegulator': 18}
**************************************************

# correct predictions:	0
{}

# incorrect predictions:	0
{}

# extra predictions (false positive):	147
{'DownRegulator': 37, 'Substrate': 110}

# targets not found (false negative):	168
{'DownRegulator': 40, 'Substrate': 127, 'UpRegulator': 1}
**************************************************

# correct predictions:	107
{'DownRegulator':

In [8]:
# "eric" annotations (pred) against the gold target set, use_loc=True runs
# only; the default-argument runs are commented out, but their separator
# prints remain.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm where it comes from.
pred_dir   = "/sbksvol/data/acs-data/acs-data-20210530/acs-20210505-eric"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-target"
# evaluate(pred_dir, target_dir, "sb300091d")
print("*" * 50)
evaluate(pred_dir, target_dir, "sb300091d", use_loc=True)
print("*" * 50)
# evaluate(pred_dir, target_dir, "sb4001382")
print("*" * 50)
evaluate(pred_dir, target_dir, "sb4001382", use_loc=True)

**************************************************

# correct predictions:	21
{'DownRegulator': 2, 'Substrate': 14, 'UpRegulator': 5}

# incorrect predictions:	1
{('Substrate', 'UpRegulator'): 1}

# extra predictions (false positive):	20
{'DownRegulator': 1, 'Substrate': 19}

# targets not found (false negative):	26
{'Substrate': 8, 'UpRegulator': 18}
**************************************************
**************************************************

# correct predictions:	89
{'DownRegulator': 15, 'Substrate': 74}

# incorrect predictions:	1
{('DownRegulator', 'Substrate'): 1}

# extra predictions (false positive):	52
{'DownRegulator': 10, 'Substrate': 42}

# targets not found (false negative):	78
{'DownRegulator': 25, 'Substrate': 52, 'UpRegulator': 1}


# 

In [9]:
# "eric" annotations (pred) against the gold target set. Each publication is
# evaluated twice: with the default arguments and with use_loc=True. The star
# lines only separate the four printed reports.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm where it comes from.
pred_dir   = "/sbksvol/data/acs-data/acs-data-20210530/acs-20210505-eric"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-target"
evaluate(pred_dir, target_dir, "sb300091d")
print("*" * 50)
evaluate(pred_dir, target_dir, "sb300091d", use_loc=True)
print("*" * 50)
evaluate(pred_dir, target_dir, "sb4001382")
print("*" * 50)
evaluate(pred_dir, target_dir, "sb4001382", use_loc=True)


# correct predictions:	0
{}

# incorrect predictions:	0
{}

# extra predictions (false positive):	42
{'DownRegulator': 3, 'Substrate': 34, 'UpRegulator': 5}

# targets not found (false negative):	48
{'DownRegulator': 2, 'Substrate': 22, 'UpRegulator': 24}
**************************************************

# correct predictions:	21
{'DownRegulator': 2, 'Substrate': 14, 'UpRegulator': 5}

# incorrect predictions:	1
{('Substrate', 'UpRegulator'): 1}

# extra predictions (false positive):	20
{'DownRegulator': 1, 'Substrate': 19}

# targets not found (false negative):	26
{'Substrate': 8, 'UpRegulator': 18}
**************************************************

# correct predictions:	0
{}

# incorrect predictions:	0
{}

# extra predictions (false positive):	142
{'DownRegulator': 26, 'Substrate': 116}

# targets not found (false negative):	168
{'DownRegulator': 40, 'Substrate': 127, 'UpRegulator': 1}
**************************************************

# correct predictions:	89
{'DownRegulator'

In [10]:
# Shell escape: show the last lines of the "gold-3-1024-1" .ann file for sb300091d.
!tail /sbksvol/data/acs-data/acs-re/acs-20210530-gold-3-1024-1/sb300091d/sb300091d.ann

R16	Substrate Arg1:T340 Arg2:T57
R17	Substrate Arg1:T232 Arg2:T641
R18	Substrate Arg1:T232 Arg2:T520
R19	Substrate Arg1:T603 Arg2:T436
R20	Substrate Arg1:T624 Arg2:T235
R21	Substrate Arg1:T534 Arg2:T706
R22	Substrate Arg1:T226 Arg2:T706
R23	Substrate Arg1:T263 Arg2:T369
R24	Substrate Arg1:T448 Arg2:T511
R25	Substrate Arg1:T206 Arg2:T511


In [11]:
# Shell escape: show the first lines of the "eric" .ann file for sb300091d.
!head /sbksvol/data/acs-data/acs-data-20210530/acs-20210505-eric/sb300091d/sb300091d.ann

T0	Gene 544 572	tetracycline resistance gene
T1	Chemical 621 631	antibiotic
T2	Chemical 845 863	dicarboxylic acids
T3	Chemical 868 876	alcohols
T4	Species 933 951	Pseudomonas putida
T5	Species 953 973	Thauera butanivorans
T6	Species 978 985	E. coli
T7	Gene 987 1008	Transcription factors
T8	Chemical 1057 1066	succinate
T9	Chemical 1068 1075	adipate


In [12]:
# Shell escape: show the first lines of the "kevin" .ann file for sb300091d.
!head /sbksvol/data/acs-data/acs-data-20210530/acs-20210505-kevin/sb300091d/sb300091d.ann

T0	Gene 544 572	tetracycline resistance gene
T2	Chemical 845 863	dicarboxylic acids
T3	Chemical 868 876	alcohols
T4	Species 933 951	Pseudomonas putida
T5	Species 953 973	Thauera butanivorans
T6	Species 978 985	E. coli
T7	Gene 987 1008	Transcription factors
T8	Chemical 1057 1066	succinate
T9	Chemical 1068 1075	adipate
T10	Chemical 1080 1089	1-butanol


In [13]:
# Shell escape: show the last lines of the "eric" .ann file for sb300091d.
!tail /sbksvol/data/acs-data/acs-data-20210530/acs-20210505-eric/sb300091d/sb300091d.ann

R32	Substrate Arg1:T409 Arg2:T412
R33	Substrate Arg1:T413 Arg2:T416
R34	Substrate Arg1:T702 Arg2:T416
R35	Substrate Arg1:T428 Arg2:T431
R36	Substrate Arg1:T478 Arg2:T479
R37	Substrate Arg1:T486 Arg2:T488
R38	Substrate Arg1:T487 Arg2:T488
R39	Substrate Arg1:T489 Arg2:T490
R40	Substrate Arg1:T518 Arg2:T520
R41	Substrate Arg1:T519 Arg2:T520


In [14]:
# Shell escape: show the last lines of the "kevin" .ann file for sb300091d.
!tail /sbksvol/data/acs-data/acs-data-20210530/acs-20210505-kevin/sb300091d/sb300091d.ann

R19	Substrate Arg1:T307 Arg2:T411
R20	Substrate Arg1:T409 Arg2:T412
R21	Substrate Arg1:T307 Arg2:T412
R22	Substrate Arg1:T413 Arg2:T416
R23	Substrate Arg1:T322 Arg2:T430
R24	Substrate Arg1:T428 Arg2:T431
R25	Substrate Arg1:T322 Arg2:T431
R26	Substrate Arg1:T489 Arg2:T490
R27	Substrate Arg1:T518 Arg2:T520
R28	Substrate Arg1:T519 Arg2:T520


In [15]:
# "eric" annotations (pred) against the "gold-3-1024-1" directory (target),
# default arguments, for two publications.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm where it comes from.
pred_dir   = "/sbksvol/data/acs-data/acs-data-20210530/acs-20210505-eric"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-3-1024-1"
evaluate(pred_dir, target_dir, "sb300091d")
print()
evaluate(pred_dir, target_dir, "sb4001382")


# correct predictions:	0
{}

# incorrect predictions:	0
{}

# extra predictions (false positive):	42
{'DownRegulator': 3, 'Substrate': 34, 'UpRegulator': 5}

# targets not found (false negative):	26
{'DownRegulator': 1, 'Substrate': 20, 'UpRegulator': 5}


# correct predictions:	0
{}

# incorrect predictions:	0
{}

# extra predictions (false positive):	142
{'DownRegulator': 26, 'Substrate': 116}

# targets not found (false negative):	136
{'DownRegulator': 38, 'Substrate': 98}


In [16]:
# "gold-3-1024-1" directory (pred) against the gold target set, default
# arguments, for two publications.
# NOTE(review): `evaluate` is not defined in any cell shown in this notebook;
# it must already exist in the kernel. Confirm where it comes from.
pred_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-3-1024-1"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-target"
evaluate(pred_dir, target_dir, "sb300091d")
print()
evaluate(pred_dir, target_dir, "sb4001382")


# correct predictions:	18
{'DownRegulator': 1, 'Substrate': 12, 'UpRegulator': 5}

# incorrect predictions:	2
{('Substrate', 'UpRegulator'): 2}

# extra predictions (false positive):	6
{'Substrate': 6}

# targets not found (false negative):	28
{'DownRegulator': 1, 'Substrate': 10, 'UpRegulator': 17}


# correct predictions:	101
{'DownRegulator': 26, 'Substrate': 75}

# incorrect predictions:	1
{('DownRegulator', 'Substrate'): 1}

# extra predictions (false positive):	34
{'DownRegulator': 11, 'Substrate': 23}

# targets not found (false negative):	66
{'DownRegulator': 14, 'Substrate': 51, 'UpRegulator': 1}


In [17]:
# Not used for evaluation.
# Collects false positives and conflicting pairs for manual labeling.
def get_fp_and_dual(pred_dir, target_dir, pub_num):
    """List the relations that need a manual look for one publication.

    Both sides are read from ``<dir>/<pub_num>/<pub_num>.ann`` and grouped by
    entity-id pair. Two kinds of pairs are reported:

    * pairs predicted but absent from the target: every predicted relation is
      emitted with ``"_biobert"`` appended to its type;
    * pairs predicted and present in the target with two or more target
      relations: every predicted relation is emitted (suffixed as above),
      followed by every target relation with its type unchanged.

    Args:
        pred_dir: Directory holding the predicted ``.ann`` files.
        target_dir: Directory holding the target ``.ann`` files.
        pub_num: Publication id, used as sub-directory and file stem.

    Returns:
        A list of dicts with keys ``"entities"``, ``"id"`` and ``"type"``, in
        the order the pairs first appear in the prediction file.
    """
    ann_name = f"{pub_num}/{pub_num}.ann"
    pred = extract_relations(os.path.join(pred_dir, ann_name))
    target = extract_relations(os.path.join(target_dir, ann_name))

    def group_by_pair(relations):
        # {entity_pair: {"id": [...], "type": [...]}}, parallel lists in file order.
        grouped = {}
        for rel in relations:
            slot = grouped.setdefault(rel["entities"], {"id": [], "type": []})
            slot["id"].append(rel["id"])
            slot["type"].append(rel["type"])
        return grouped

    target_by_pair = group_by_pair(target)
    pred_by_pair = group_by_pair(pred)

    flagged = []
    for pair, pred_group in pred_by_pair.items():
        target_group = target_by_pair.get(pair)
        has_target = target_group is not None
        # A pair with exactly one target relation is neither a false positive
        # nor a conflict, so it is not reported.
        if has_target and len(target_group["type"]) < 2:
            continue

        flagged.extend(
            {"entities": pair, "id": rel_id, "type": rel_type + "_biobert"}
            for rel_id, rel_type in zip(pred_group["id"], pred_group["type"])
        )
        if has_target:
            flagged.extend(
                {"entities": pair, "id": rel_id, "type": rel_type}
                for rel_id, rel_type in zip(target_group["id"], target_group["type"])
            )
    return flagged

In [18]:
def to_ann_string(data):
    """Render relation dicts as ``.ann`` relation lines.

    Relations are renumbered ``R0``, ``R1``, ... by position; any ``"id"`` key
    on the items is ignored.

    Args:
        data: Sequence of dicts with a ``"type"`` string and an ``"entities"``
            pair.

    Returns:
        The lines joined with newlines, with no trailing newline; an empty
        string for empty input.
    """
    return "\n".join(
        f"R{idx}\t{rel['type']} Arg1:{rel['entities'][0]} Arg2:{rel['entities'][1]}"
        for idx, rel in enumerate(data)
    )

In [19]:
# Print, in .ann relation syntax, the relations flagged by get_fp_and_dual for
# the "gold-3layer-e2e-2" directory (pred) against the gold target set.
pred_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-3layer-e2e-2"
target_dir = "/sbksvol/data/acs-data/acs-re/acs-20210530-gold-target"
print(to_ann_string(get_fp_and_dual(pred_dir, target_dir, "sb4001382")))
print()
print(to_ann_string(get_fp_and_dual(pred_dir, target_dir, "sb300091d")))

R0	Substrate_biobert Arg1:T315 Arg2:T952
R1	Substrate_biobert Arg1:T1016 Arg2:T1119
R2	Substrate_biobert Arg1:T626 Arg2:T740
R3	Substrate_biobert Arg1:T664 Arg2:T740
R4	Substrate_biobert Arg1:T515 Arg2:T96
R5	Substrate_biobert Arg1:T487 Arg2:T839
R6	Substrate_biobert Arg1:T661 Arg2:T991
R7	DownRegulator_biobert Arg1:T100 Arg2:T580
R8	Substrate_biobert Arg1:T131 Arg2:T514
R9	Substrate_biobert Arg1:T485 Arg2:T920
R10	DownRegulator_biobert Arg1:T151 Arg2:T851
R11	Substrate_biobert Arg1:T1093 Arg2:T609
R12	DownRegulator_biobert Arg1:T1022 Arg2:T776
R13	Substrate_biobert Arg1:T394 Arg2:T688
R14	Substrate_biobert Arg1:T1056 Arg2:T933
R15	Substrate_biobert Arg1:T181 Arg2:T371
R16	Substrate_biobert Arg1:T255 Arg2:T310
R17	DownRegulator_biobert Arg1:T475 Arg2:T897

R0	Substrate_biobert Arg1:T599 Arg2:T682
R1	Substrate_biobert Arg1:T184 Arg2:T682
R2	Substrate_biobert Arg1:T67 Arg2:T685


In [20]:
# Scratch check: parse the entity offsets of one file into xx.
xx = extract_entities_id_to_loc("/sbksvol/data/acs-data/acs-re/acs-20210530-gold-3layer-e2e-2/sb300091d/sb300091d.ann")

In [21]:
# Scratch check: parse the relations of the same file into xx.
xx = extract_relations("/sbksvol/data/acs-data/acs-re/acs-20210530-gold-3layer-e2e-2/sb300091d/sb300091d.ann")

In [22]:
# Display whatever xx is currently bound to in the kernel.
xx

[{'id': 'R0', 'type': 'Substrate', 'entities': ('T599', 'T682')},
 {'id': 'R1', 'type': 'Substrate', 'entities': ('T184', 'T682')},
 {'id': 'R2', 'type': 'UpRegulator', 'entities': ('T279', 'T582')},
 {'id': 'R3', 'type': 'UpRegulator', 'entities': ('T143', 'T279')},
 {'id': 'R4', 'type': 'UpRegulator', 'entities': ('T140', 'T279')},
 {'id': 'R5', 'type': 'Substrate', 'entities': ('T67', 'T685')},
 {'id': 'R6', 'type': 'Substrate', 'entities': ('T227', 'T274')},
 {'id': 'R7', 'type': 'UpRegulator', 'entities': ('T24', 'T557')},
 {'id': 'R8', 'type': 'UpRegulator', 'entities': ('T377', 'T598')},
 {'id': 'R9', 'type': 'DownRegulator', 'entities': ('T104', 'T46')},
 {'id': 'R10', 'type': 'Substrate', 'entities': ('T328', 'T366')},
 {'id': 'R11', 'type': 'Substrate', 'entities': ('T287', 'T328')},
 {'id': 'R12', 'type': 'Substrate', 'entities': ('T458', 'T57')},
 {'id': 'R13', 'type': 'Substrate', 'entities': ('T340', 'T57')},
 {'id': 'R14', 'type': 'UpRegulator', 'entities': ('T291', 'T59