In [84]:
import spacy
import json
from spacy import displacy
from tqdm import tqdm_notebook, tqdm
import glob
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from nltk import tokenize

### Fonction evaluate

In [100]:
def evaluate_by_ent(nlp, examples, ent='GENE'):
    scorer = Scorer()
    for input_, annot in tqdm_notebook(examples):
        try:
            text_entities = []
            for entity in annot.get('entities'):
                if ent in entity:
                    text_entities.append(entity)
            doc_gold_text = nlp.make_doc(input_)
            gold = GoldParse(doc_gold_text, entities=text_entities)
            pred_value = nlp(input_)
            scorer.score(pred_value, gold)
        except:
            pass
    return scorer.scores

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in tqdm_notebook(examples):
#         try:
            doc_gold_text = ner_model.make_doc(input_)
            gold = GoldParse(doc_gold_text, entities=annot["entities"])
            pred_value = ner_model(input_)
            scorer.score(pred_value, gold)
#         except:
#             pass
    return scorer.scores

### Load data

In [5]:
TEST_DATA = []
files = glob.glob("test/normal/*.json")
for f in files:
    with open(f) as fl:
        js = json.load(fl)
        for j in js:
            TEST_DATA.append(tuple(j))

### Load model 

In [247]:
nlp = spacy.load("models/fs_normal/epoch_32")

#### score general

In [79]:
res = evaluate(nlp, TEST_DATA[0:50000])

In [26]:
res

{u'ents_f': 87.95439401791663,
 u'ents_p': 90.70779567839963,
 u'ents_r': 85.36322483293813,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

#### Evaluation on NCBItestset_corpus

In [184]:
nlp = spacy.load("models/fs_normal_ncbi/epoch_71/")

In [185]:
with open("test/normal_NCBI/test_set_NCBI_only.json") as g:
    NCBI_TEST = json.load(g)

In [199]:
res = evaluate(nlp, NCBI_TEST)

HBox(children=(IntProgress(value=0, max=99), HTML(value=u'')))

In [200]:
res

{u'ents_f': 85.23063901819721,
 u'ents_p': 88.33333333333333,
 u'ents_r': 82.33851185609157,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

In [196]:
doc = nlp(NCBI_TEST[0][0])

In [197]:
[(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "DISEASE"]

[(u'inherited disorder', u'DISEASE'),
 (u'hepatic copper accumulation', u'DISEASE'),
 (u'Wilson disease', u'DISEASE'),
 (u'WD', u'DISEASE'),
 (u'autosomal recessive copper toxicosis', u'DISEASE'),
 (u'liver disease', u'DISEASE'),
 (u'WD', u'DISEASE'),
 (u'WD', u'DISEASE'),
 (u'CTR2', u'DISEASE'),
 (u'BAC', u'DISEASE')]

In [201]:
res2 = evaluate_by_ent(nlp, NCBI_TEST, "DISEASE")
res2

HBox(children=(IntProgress(value=0, max=99), HTML(value=u'')))

{u'ents_f': 66.36280765724703,
 u'ents_p': 57.50394944707741,
 u'ents_r': 78.44827586206897,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

In [202]:
res2 = evaluate_by_ent(nlp, NCBI_TEST, "GENE")
res2

HBox(children=(IntProgress(value=0, max=99), HTML(value=u'')))

{u'ents_f': 43.21868916288125,
 u'ents_p': 29.67914438502674,
 u'ents_r': 79.47494033412887,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

In [246]:
nlp.get_pipe("ner").moves.

<spacy.syntax.ner.BiluoPushDown at 0x7f7a68790130>

In [238]:
scorer = Scorer()
ent = "DISEASE"
for input_, annot in tqdm_notebook(NCBI_TEST):
    text_entities = []
    for entity in annot.get('entities'):
        if ent in entity:
            text_entities.append(entity)
    doc_gold_text = nlp.make_doc(input_)
    gold = GoldParse(doc_gold_text, entities=text_entities)
    pred_value = nlp(input_)
    scorer.score(pred_value, gold)
    break

HBox(children=(IntProgress(value=0, max=99), HTML(value=u'')))

In [266]:
pred_value.ents

(inherited disorder,
 hepatic copper accumulation,
 ATP7B,
 Wilson disease,
 WD,
 ATP7B,
 ATP7B,
 autosomal recessive copper toxicosis,
 liver disease,
 WD,
 ATP7B,
 ATP7B,
 ATP7B,
 WD,
 CTR2,
 BAC)

In [218]:
# len(text_entities)

In [219]:
scorer.scores

{u'ents_f': 42.42424242424242,
 u'ents_p': 43.75,
 u'ents_r': 41.17647058823529,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

In [220]:
[(ent.text, ent.label_) for ent in pred_value.ents if ent.label_ == "DISEASE"]

[(u'inherited disorder', u'DISEASE'),
 (u'hepatic copper accumulation', u'DISEASE'),
 (u'Wilson disease', u'DISEASE'),
 (u'WD', u'DISEASE'),
 (u'autosomal recessive copper toxicosis', u'DISEASE'),
 (u'liver disease', u'DISEASE'),
 (u'WD', u'DISEASE'),
 (u'WD', u'DISEASE'),
 (u'CTR2', u'DISEASE'),
 (u'BAC', u'DISEASE')]

In [179]:
text_entities

[[23, 39, u'DISEASE'],
 [158, 185, u'DISEASE'],
 [206, 224, u'DISEASE'],
 [272, 299, u'DISEASE'],
 [346, 360, u'DISEASE'],
 [362, 364, u'DISEASE'],
 [499, 514, u'DISEASE'],
 [523, 553, u'DISEASE'],
 [637, 653, u'DISEASE'],
 [655, 657, u'DISEASE'],
 [738, 751, u'DISEASE'],
 [777, 779, u'DISEASE'],
 [814, 816, u'DISEASE'],
 [999, 1001, u'DISEASE'],
 [1147, 1149, u'DISEASE'],
 [1174, 1176, u'DISEASE'],
 [1261, 1263, u'DISEASE']]

In [223]:
alld = []
allg = []
for dg in NCBI_TEST[0:1]:
    if dg[1]["entities"] :
        for d  in dg[1]["entities"]:
            if d[2] == "DISEASE":
                allg.append(dg[0][d[0]:d[1]])
            if d[2] == "GENE":
#                 print()
#                 print(dg[0][d[0]:d[1]], d[2])
#                 if dg[0][d[0]:d[1]] == "set":
#                     print(dg)
                alld.append(dg[0][d[0]:d[1]])

In [225]:
allg

[u'copper toxicosis',
 u'hepatic copper accumulation',
 u'inherited disorder',
 u'hepatic copper accumulation',
 u'Wilson disease',
 u'WD',
 u'copper overload',
 u'non-Indian childhood cirrhosis',
 u'copper toxicosis',
 u'CT',
 u'liver disease',
 u'WD',
 u'CT',
 u'CT',
 u'WD',
 u'CT',
 u'CT']

In [86]:
# c = 0
# val = []
# ent = sorted(TEST_DATA[5][1]["entities"], reverse=True)
# for t in text.split(" "):
#     c += len(t)-1
#     if ent:
#         if c+1 > ent[-1][0]: 
#             val.append((t, ent[-1][2]))
#             ent.pop()
#         else:
#             val.append((t, "default"))
#     else:
#         val.append((t, "default"))

In [25]:
# doc = nlp2(unicode(datas[8:2000]))

In [25]:
# print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [24]:
# [(t.text, t.ent_type_ if t.ent_type_ else "default") for t in doc]

In [10]:
# displacy.render(doc, style='ent', jupyter=True)

In [17]:
# nlp = spacy.load('en')
# doc = nlp(u'This is a sentence.')
# displacy.serve(doc, style='ent')