In [84]:
import spacy
import json
from spacy import displacy
from tqdm import tqdm_notebook, tqdm
import glob
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from nltk import tokenize

### Fonction evaluate

In [100]:
def evaluate_by_ent(nlp, examples, ent='GENE'):
    scorer = Scorer()
    for input_, annot in tqdm_notebook(examples):
        try:
            text_entities = []
            for entity in annot.get('entities'):
                if ent in entity:
                    text_entities.append(entity)
            doc_gold_text = nlp.make_doc(input_)
            gold = GoldParse(doc_gold_text, entities=text_entities)
            pred_value = nlp(input_)
            scorer.score(pred_value, gold)
        except:
            pass
    return scorer.scores

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in tqdm_notebook(examples):
#         try:
            doc_gold_text = ner_model.make_doc(input_)
            gold = GoldParse(doc_gold_text, entities=annot["entities"])
            pred_value = ner_model(input_)
            scorer.score(pred_value, gold)
#         except:
#             pass
    return scorer.scores

### Load data

In [5]:
TEST_DATA = []
files = glob.glob("test/normal/*.json")
for f in files:
    with open(f) as fl:
        js = json.load(fl)
        for j in js:
            TEST_DATA.append(tuple(j))

### Load model 

In [116]:
nlp = spacy.load("models/fs_normal/epoch_32")

#### score general

In [79]:
res = evaluate(nlp, TEST_DATA[0:50000])

In [26]:
res

{u'ents_f': 87.95439401791663,
 u'ents_p': 90.70779567839963,
 u'ents_r': 85.36322483293813,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

#### Evaluation on NCBItestset_corpus

In [124]:
nlp = spacy.load("models/fs_normal_ncbi/epoch_7/")

In [95]:
with open("NCBItestset_corpus.txt") as g:
    NCBItest = g.readlines()

TEST_NCB = []
art = ["", {"entities":[]}]
for line in tqdm_notebook(NCBItest):
    
    if "|a|" in line:
        text = line.split("|a|")[1]
        art[0] = text.decode("utf-8")
    
    if "Modifier" in line or "SpecificDisease" in line or "DiseaseClass" in line:
        d = tokenize.word_tokenize(line)[:-1]
        entity = [int(d[1]), int(d[2]), "DISEASE"]
        art[1]["entities"].append(entity)
    if line == '\n':
        TEST_NCB.append(tuple(art))
        art = ["", {"entities":[]}]

HBox(children=(IntProgress(value=0, max=1259), HTML(value=u'')))

In [125]:
res = evaluate_by_ent(nlp, TEST_NCB, "DISEASE")

HBox(children=(IntProgress(value=0, max=99), HTML(value=u'')))

In [126]:
res

{u'ents_f': 0.0,
 u'ents_p': 0.0,
 u'ents_r': 0.0,
 u'las': 0.0,
 u'tags_acc': 0.0,
 u'token_acc': 100.0,
 u'uas': 0.0}

In [127]:
doc = nlp(TEST_NCB[0][0])

In [128]:
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

('Entities', [(u'ATP7B', u'GENE'), (u'ATP7B', u'GENE'), (u'ATP7B', u'GENE'), (u'ATP7B', u'GENE'), (u'ATP7B', u'GENE'), (u'ATP7B', u'GENE')])


In [120]:
TEST_NCB[0]

(u'Abnormal hepatic copper accumulation is recognized as an inherited disorder in man, mouse, rat and dog. The major cause of hepatic copper accumulation in man is a dysfunctional ATP7B gene, causing Wilson disease (WD). Mutations in the ATP7B genes have also been demonstrated in mouse and rat. The ATP7B gene has been excluded in the much rarer human copper overload disease non-Indian childhood cirrhosis, indicating genetic heterogeneity. By investigating the common autosomal recessive copper toxicosis (CT) in Bedlington terriers, we have identified a new locus involved in progressive liver disease. We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH). C04107 is an anonymous microsatellite marker closely linked to CT. However, BAC clones containing ATP7B and C04107 mapped to the canine chromosome regions CFA22q11 and CFA10q26, respectively, demonstrating that

In [86]:
# c = 0
# val = []
# ent = sorted(TEST_DATA[5][1]["entities"], reverse=True)
# for t in text.split(" "):
#     c += len(t)-1
#     if ent:
#         if c+1 > ent[-1][0]: 
#             val.append((t, ent[-1][2]))
#             ent.pop()
#         else:
#             val.append((t, "default"))
#     else:
#         val.append((t, "default"))

In [25]:
# doc = nlp2(unicode(datas[8:2000]))

In [25]:
# print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [24]:
# [(t.text, t.ent_type_ if t.ent_type_ else "default") for t in doc]

In [10]:
# displacy.render(doc, style='ent', jupyter=True)

In [17]:
# nlp = spacy.load('en')
# doc = nlp(u'This is a sentence.')
# displacy.serve(doc, style='ent')