## The entities and labels were extracted in a previous clinical entity recognition task



## scispaCy Python [library](https://allenai.github.io/scispacy/)

In [1]:
%%capture                                                               
!pip install scispacy
!pip install 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz'

In [2]:
import spacy
import scispacy
import pandas as pd
import en_core_sci_md
from scispacy.linking import EntityLinker

In [3]:
# Load the previously extracted entities (columns shown below: Entity, Label, Ner_model).
# NOTE(review): the path is absolute ('/content/...'); presumably a Colab upload — confirm
# and consider a configurable data directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the CSV's
# saved index is being read as data; consider index_col=0 if that is the case.
entities_dataframe = pd.read_csv('/content/bionlp_entities.csv')

In [4]:
# Show the loaded frame; pandas elides the middle rows in the rendered output.
entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,patients,ORGANISM,bionlp13cg
1,loperamide hydrochloride,SIMPLE_CHEMICAL,bionlp13cg
2,sodium chloride,SIMPLE_CHEMICAL,bionlp13cg
3,gut-liver,CELLULAR_COMPONENT,bionlp13cg
4,lymphocytes,CELL,bionlp13cg
...,...,...,...
96,electrolytes,CELLULAR_COMPONENT,bionlp13cg
97,lymphocyte,CELL,bionlp13cg
98,C-reactive protein,GENE_OR_GENE_PRODUCT,bionlp13cg
99,glutathione,SIMPLE_CHEMICAL,bionlp13cg


In [None]:
# Pipeline whose entity linker targets MeSH (~30k entities).
# NOTE(review): resolve_abbreviations presumably only takes effect when an abbreviation
# detector is part of the pipeline; none is added here — confirm.
mesh_nlp = spacy.load("en_core_sci_md")
# add_pipe returns the component it just added, so `linker` is the MeSH linker.
# NOTE: other cells in this notebook also assign `linker`; the name always refers to
# whichever of those cells ran last.
linker = mesh_nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "mesh"},
)
def mesh_entity_linker(document):
    """Link the first entity found in ``document`` to MeSH concepts.

    Args:
        document: Text passed to the ``mesh_nlp`` pipeline.

    Returns:
        A list whose first item is the first entry of ``doc.ents`` (or the
        string ``'Nan'`` when the pipeline finds no entity). For every
        ``(concept_id, score)`` pair in that entity's ``_.kb_ents`` two more
        items follow: the string ``'Entity_Matching_Score :<score>'`` and the
        knowledge-base entry ``kb.cui_to_entity[concept_id]``.
    """
    # Look the linker up on this pipeline instead of reading the notebook-global
    # `linker`: other cells rebind that name to different knowledge bases, which
    # would make this function resolve concept ids against the wrong one.
    kb = mesh_nlp.get_pipe("scispacy_linker").kb
    doc = mesh_nlp(document)
    if not doc.ents:
        # No entity detected: keep the original sentinel so the output shape is unchanged.
        return ['Nan']
    entity = doc.ents[0]
    entity_details = [entity]
    for concept_id, score in entity._.kb_ents:
        entity_details.append('Entity_Matching_Score :{}'.format(score))
        entity_details.append(kb.cui_to_entity[concept_id])
    return entity_details

In [None]:
# Run the MeSH linker over every entity string and store the per-row result lists.
entities_dataframe['mesh_output'] = entities_dataframe['Entity'].apply(mesh_entity_linker)

In [None]:
# Show the frame with the new mesh_output column; cell contents are truncated in the rendering.
entities_dataframe

Unnamed: 0,Entity,Label,Ner_model,mesh_output
0,patients,ORGANISM,bionlp13cg,"[(patients), Entity_Matching_Score :0.99999988..."
1,loperamide hydrochloride,SIMPLE_CHEMICAL,bionlp13cg,"[(loperamide, hydrochloride), Entity_Matching_..."
2,sodium chloride,SIMPLE_CHEMICAL,bionlp13cg,"[(sodium, chloride), Entity_Matching_Score :0...."
3,gut-liver,CELLULAR_COMPONENT,bionlp13cg,[(gut-liver)]
4,lymphocytes,CELL,bionlp13cg,"[(lymphocytes), Entity_Matching_Score :1.0, (D..."
...,...,...,...,...
96,electrolytes,CELLULAR_COMPONENT,bionlp13cg,"[(electrolytes), Entity_Matching_Score :1.0, (..."
97,lymphocyte,CELL,bionlp13cg,"[(lymphocyte), Entity_Matching_Score :0.907585..."
98,C-reactive protein,GENE_OR_GENE_PRODUCT,bionlp13cg,"[(C-reactive, protein), Entity_Matching_Score ..."
99,glutathione,SIMPLE_CHEMICAL,bionlp13cg,"[(glutathione), Entity_Matching_Score :1.0, (D..."


In [None]:
# Full (untruncated) MeSH linking result for the row labelled 0.
entities_dataframe.loc[0, 'mesh_output']

[patients,
 'Entity_Matching_Score :0.9999998807907104',
 CUI: D010361, Name: Patients
Definition: Individuals participating in the health care system for the purpose of receiving therapeutic, diagnostic, or preventive procedures.
TUI(s): 
Aliases: (total: 2): 
	 Patients, Clients,
 'Entity_Matching_Score :0.8265573382377625',
 CUI: D028642, Name: Mentally Ill Persons
Definition: Persons with psychiatric illnesses or diseases, particularly psychotic and severe mood disorders.
TUI(s): 
Aliases: (total: 3): 
	 Mentally Ill Persons, Mentally Ill, Mental Patients,
 'Entity_Matching_Score :0.7403873801231384',
 CUI: D007297, Name: Inpatients
Definition: Persons admitted to health facilities which provide board and room, for the purpose of observation, care, diagnosis or treatment.
TUI(s): 
Aliases: (total: 1): 
	 Inpatients]

In [None]:
# Pipeline whose entity linker targets HPO: 16k concepts focused on phenotypic
# abnormalities encountered in human disease.
# NOTE(review): resolve_abbreviations presumably only takes effect when an abbreviation
# detector is part of the pipeline; none is added here — confirm.
hpo_nlp = spacy.load("en_core_sci_md")
# add_pipe returns the component it just added, so `linker` is the HPO linker.
# NOTE: other cells in this notebook also assign `linker`; the name always refers to
# whichever of those cells ran last.
linker = hpo_nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "hpo"},
)
def hpo_entity_linker(document):
    """Link the first entity found in ``document`` to HPO concepts.

    Args:
        document: Text passed to the ``hpo_nlp`` pipeline.

    Returns:
        A list whose first item is the first entry of ``doc.ents`` (or the
        string ``'Nan'`` when the pipeline finds no entity). For every
        ``(concept_id, score)`` pair in that entity's ``_.kb_ents`` two more
        items follow: the string ``'Entity_Matching_Score :<score>'`` and the
        knowledge-base entry ``kb.cui_to_entity[concept_id]``.
    """
    # Look the linker up on this pipeline instead of reading the notebook-global
    # `linker`: other cells rebind that name to different knowledge bases, which
    # would make this function resolve concept ids against the wrong one.
    kb = hpo_nlp.get_pipe("scispacy_linker").kb
    doc = hpo_nlp(document)
    if not doc.ents:
        # No entity detected: keep the original sentinel so the output shape is unchanged.
        return ['Nan']
    entity = doc.ents[0]
    entity_details = [entity]
    for concept_id, score in entity._.kb_ents:
        entity_details.append('Entity_Matching_Score :{}'.format(score))
        entity_details.append(kb.cui_to_entity[concept_id])
    return entity_details

In [None]:
# Run the HPO linker over every entity string and store the per-row result lists.
entities_dataframe['hpo_output'] = entities_dataframe['Entity'].apply(hpo_entity_linker)

In [None]:
# Show the frame with the new hpo_output column; cell contents are truncated in the rendering.
entities_dataframe

Unnamed: 0,Entity,Label,Ner_model,mesh_output,hpo_output
0,patients,ORGANISM,bionlp13cg,"[(patients), Entity_Matching_Score :0.99999988...",[(patients)]
1,loperamide hydrochloride,SIMPLE_CHEMICAL,bionlp13cg,"[(loperamide, hydrochloride), Entity_Matching_...","[(loperamide, hydrochloride)]"
2,sodium chloride,SIMPLE_CHEMICAL,bionlp13cg,"[(sodium, chloride), Entity_Matching_Score :0....","[(sodium, chloride)]"
3,gut-liver,CELLULAR_COMPONENT,bionlp13cg,[(gut-liver)],[(gut-liver)]
4,lymphocytes,CELL,bionlp13cg,"[(lymphocytes), Entity_Matching_Score :1.0, (D...","[(lymphocytes), Entity_Matching_Score :0.91434..."
...,...,...,...,...,...
96,electrolytes,CELLULAR_COMPONENT,bionlp13cg,"[(electrolytes), Entity_Matching_Score :1.0, (...","[(electrolytes), Entity_Matching_Score :0.7121..."
97,lymphocyte,CELL,bionlp13cg,"[(lymphocyte), Entity_Matching_Score :0.907585...","[(lymphocyte), Entity_Matching_Score :0.839224..."
98,C-reactive protein,GENE_OR_GENE_PRODUCT,bionlp13cg,"[(C-reactive, protein), Entity_Matching_Score ...","[(C-reactive, protein), Entity_Matching_Score ..."
99,glutathione,SIMPLE_CHEMICAL,bionlp13cg,"[(glutathione), Entity_Matching_Score :1.0, (D...",[(glutathione)]


In [None]:
# Full (untruncated) HPO linking result for the row labelled 4.
entities_dataframe.loc[4, 'hpo_output']

[lymphocytes,
 'Entity_Matching_Score :0.9143446683883667',
 CUI: C0221277, Name: Abnormal lymphocyte morphology
Definition: A lymphocyte that may be irregular or not conforming to type.
TUI(s): T033
Aliases: (total: 2): 
	 Abnormality of cells of the lymphoid lineage, Abnormal lymphocytes,
 'Entity_Matching_Score :0.7550817728042603',
 CUI: C0580550, Name: Abnormal number of lymphocytes
Definition: Any abnormality in the total number of lymphocytes in the blood. []
TUI(s): T033
Aliases: (total: 4): 
	 Abnormal numbers of lymphocytes, Abnormal lymphocyte count, Abnormal lymphocyte counts, Abnormality of lymphocyte number,
 'Entity_Matching_Score :0.7446752190589905',
 CUI: C0024282, Name: Lymphocytosis
Definition: Excess of normal lymphocytes in the blood or in any effusion.
TUI(s): T047
Aliases: (total: 1): 
	 High lymphocyte count,
 'Entity_Matching_Score :0.7003375291824341',
 CUI: C1836855, Name: Vacuolated blood lymphocytes
Definition: The presence of clear, sharply defined vacuol

In [None]:
# Pipeline whose entity linker targets RxNorm: ~100k concepts focused on normalized
# names for clinical drugs.
# NOTE(review): resolve_abbreviations presumably only takes effect when an abbreviation
# detector is part of the pipeline; none is added here — confirm.
rxnorm_nlp = spacy.load("en_core_sci_md")
# add_pipe returns the component it just added, so `linker` is the RxNorm linker.
# NOTE: other cells in this notebook also assign `linker`; the name always refers to
# whichever of those cells ran last.
linker = rxnorm_nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "rxnorm"},
)
def rxnorm_entity_linker(document):
    """Link the first entity found in ``document`` to RxNorm concepts.

    Args:
        document: Text passed to the ``rxnorm_nlp`` pipeline.

    Returns:
        A list whose first item is the first entry of ``doc.ents`` (or the
        string ``'Nan'`` when the pipeline finds no entity). For every
        ``(concept_id, score)`` pair in that entity's ``_.kb_ents`` two more
        items follow: the string ``'Entity_Matching_Score :<score>'`` and the
        knowledge-base entry ``kb.cui_to_entity[concept_id]``.
    """
    # Look the linker up on this pipeline instead of reading the notebook-global
    # `linker`: other cells rebind that name to different knowledge bases, which
    # would make this function resolve concept ids against the wrong one.
    kb = rxnorm_nlp.get_pipe("scispacy_linker").kb
    doc = rxnorm_nlp(document)
    if not doc.ents:
        # No entity detected: keep the original sentinel so the output shape is unchanged.
        return ['Nan']
    entity = doc.ents[0]
    entity_details = [entity]
    for concept_id, score in entity._.kb_ents:
        entity_details.append('Entity_Matching_Score :{}'.format(score))
        entity_details.append(kb.cui_to_entity[concept_id])
    return entity_details

In [None]:
# Run the RxNorm linker over every entity string and store the per-row result lists.
entities_dataframe['rxnorm_output'] = entities_dataframe['Entity'].apply(rxnorm_entity_linker)

In [None]:
# Show the frame with the new rxnorm_output column; cell contents are truncated in the rendering.
entities_dataframe

Unnamed: 0,Entity,Label,Ner_model,mesh_output,hpo_output,rxnorm_output
0,patients,ORGANISM,bionlp13cg,"[(patients), Entity_Matching_Score :0.99999988...",[(patients)],[(patients)]
1,loperamide hydrochloride,SIMPLE_CHEMICAL,bionlp13cg,"[(loperamide, hydrochloride), Entity_Matching_...","[(loperamide, hydrochloride)]","[(loperamide, hydrochloride), Entity_Matching_..."
2,sodium chloride,SIMPLE_CHEMICAL,bionlp13cg,"[(sodium, chloride), Entity_Matching_Score :0....","[(sodium, chloride)]","[(sodium, chloride), Entity_Matching_Score :0...."
3,gut-liver,CELLULAR_COMPONENT,bionlp13cg,[(gut-liver)],[(gut-liver)],[(gut-liver)]
4,lymphocytes,CELL,bionlp13cg,"[(lymphocytes), Entity_Matching_Score :1.0, (D...","[(lymphocytes), Entity_Matching_Score :0.91434...",[(lymphocytes)]
...,...,...,...,...,...,...
96,electrolytes,CELLULAR_COMPONENT,bionlp13cg,"[(electrolytes), Entity_Matching_Score :1.0, (...","[(electrolytes), Entity_Matching_Score :0.7121...",[(electrolytes)]
97,lymphocyte,CELL,bionlp13cg,"[(lymphocyte), Entity_Matching_Score :0.907585...","[(lymphocyte), Entity_Matching_Score :0.839224...","[(lymphocyte), Entity_Matching_Score :0.703145..."
98,C-reactive protein,GENE_OR_GENE_PRODUCT,bionlp13cg,"[(C-reactive, protein), Entity_Matching_Score ...","[(C-reactive, protein), Entity_Matching_Score ...","[(C-reactive, protein)]"
99,glutathione,SIMPLE_CHEMICAL,bionlp13cg,"[(glutathione), Entity_Matching_Score :1.0, (D...",[(glutathione)],"[(glutathione), Entity_Matching_Score :1.0, (C..."


In [None]:
# Full (untruncated) RxNorm linking result for the row labelled 99.
entities_dataframe.loc[99, 'rxnorm_output']

[glutathione,
 'Entity_Matching_Score :1.0',
 CUI: C0017817, Name: L-Glutathione
Definition: A tripeptide with many roles in cells. It conjugates to drugs to make them more soluble for excretion, is a cofactor for some enzymes, is involved in protein disulfide bond rearrangement and reduces peroxides.
TUI(s): T116, T121, T123
Aliases: (total: 1): 
	 Glutathione]

In [None]:
# Pipeline whose entity linker targets the Gene Ontology: ~67k concepts focused on
# the functions of genes.
# NOTE(review): resolve_abbreviations presumably only takes effect when an abbreviation
# detector is part of the pipeline; none is added here — confirm.
go_nlp = spacy.load("en_core_sci_md")
# add_pipe returns the component it just added, so `linker` is the GO linker.
# NOTE: other cells in this notebook also assign `linker`; the name always refers to
# whichever of those cells ran last.
linker = go_nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "go"},
)
def go_entity_linker(document):
    """Link the first entity found in ``document`` to Gene Ontology concepts.

    Args:
        document: Text passed to the ``go_nlp`` pipeline.

    Returns:
        A list whose first item is the first entry of ``doc.ents`` (or the
        string ``'Nan'`` when the pipeline finds no entity). For every
        ``(concept_id, score)`` pair in that entity's ``_.kb_ents`` two more
        items follow: the string ``'Entity_Matching_Score :<score>'`` and the
        knowledge-base entry ``kb.cui_to_entity[concept_id]``.
    """
    # Look the linker up on this pipeline instead of reading the notebook-global
    # `linker`: other cells rebind that name to different knowledge bases, which
    # would make this function resolve concept ids against the wrong one.
    kb = go_nlp.get_pipe("scispacy_linker").kb
    doc = go_nlp(document)
    if not doc.ents:
        # No entity detected: keep the original sentinel so the output shape is unchanged.
        return ['Nan']
    entity = doc.ents[0]
    entity_details = [entity]
    for concept_id, score in entity._.kb_ents:
        entity_details.append('Entity_Matching_Score :{}'.format(score))
        entity_details.append(kb.cui_to_entity[concept_id])
    return entity_details

In [None]:
# Run the Gene Ontology linker over every entity string and store the per-row result lists.
entities_dataframe['go_output'] = entities_dataframe['Entity'].apply(go_entity_linker)

In [None]:
# Show the frame with the new go_output column; cell contents are truncated in the rendering.
entities_dataframe

Unnamed: 0,Entity,Label,Ner_model,mesh_output,hpo_output,rxnorm_output,go_output
0,patients,ORGANISM,bionlp13cg,"[(patients), Entity_Matching_Score :0.99999988...",[(patients)],[(patients)],[(patients)]
1,loperamide hydrochloride,SIMPLE_CHEMICAL,bionlp13cg,"[(loperamide, hydrochloride), Entity_Matching_...","[(loperamide, hydrochloride)]","[(loperamide, hydrochloride), Entity_Matching_...","[(loperamide, hydrochloride)]"
2,sodium chloride,SIMPLE_CHEMICAL,bionlp13cg,"[(sodium, chloride), Entity_Matching_Score :0....","[(sodium, chloride)]","[(sodium, chloride), Entity_Matching_Score :0....","[(sodium, chloride)]"
3,gut-liver,CELLULAR_COMPONENT,bionlp13cg,[(gut-liver)],[(gut-liver)],[(gut-liver)],[(gut-liver)]
4,lymphocytes,CELL,bionlp13cg,"[(lymphocytes), Entity_Matching_Score :1.0, (D...","[(lymphocytes), Entity_Matching_Score :0.91434...",[(lymphocytes)],"[(lymphocytes), Entity_Matching_Score :0.77868..."
...,...,...,...,...,...,...,...
96,electrolytes,CELLULAR_COMPONENT,bionlp13cg,"[(electrolytes), Entity_Matching_Score :1.0, (...","[(electrolytes), Entity_Matching_Score :0.7121...",[(electrolytes)],[(electrolytes)]
97,lymphocyte,CELL,bionlp13cg,"[(lymphocyte), Entity_Matching_Score :0.907585...","[(lymphocyte), Entity_Matching_Score :0.839224...","[(lymphocyte), Entity_Matching_Score :0.703145...","[(lymphocyte), Entity_Matching_Score :0.896695..."
98,C-reactive protein,GENE_OR_GENE_PRODUCT,bionlp13cg,"[(C-reactive, protein), Entity_Matching_Score ...","[(C-reactive, protein), Entity_Matching_Score ...","[(C-reactive, protein)]","[(C-reactive, protein)]"
99,glutathione,SIMPLE_CHEMICAL,bionlp13cg,"[(glutathione), Entity_Matching_Score :1.0, (D...",[(glutathione)],"[(glutathione), Entity_Matching_Score :1.0, (C...","[(glutathione), Entity_Matching_Score :0.83759..."


In [None]:
# Full (untruncated) Gene Ontology linking result for the row labelled 4.
entities_dataframe.loc[4, 'go_output']

[lymphocytes,
 'Entity_Matching_Score :0.7786888480186462',
 CUI: C1326202, Name: B cell apoptotic process
Definition: Any apoptotic process in a B cell, a lymphocyte of B lineage with the phenotype CD19-positive and capable of B cell mediated immunity. [CL:0000236, GOC:add, GOC:mtg_apoptosis, ISBN:0781735149]
TUI(s): T043
Aliases (abbreviated, total: 20): 
	 B-cell apoptosis, programmed cell death of B-lymphocytes by apoptosis, programmed cell death, B-cells, apoptosis of B-lymphocytes, B-lymphocyte programmed cell death by apoptosis, B cell programmed cell death by apoptosis, programmed cell death, B lymphocytes, apoptosis of B-cells, apoptosis of B cells, programmed cell death, B cells,
 'Entity_Matching_Score :0.7447288632392883',
 CUI: C0024262, Name: lymphocyte activation
Definition: Morphologic alteration of small B LYMPHOCYTES or T LYMPHOCYTES in culture into large blast-like cells able to synthesize DNA and RNA and to divide mitotically. It is induced by INTERLEUKINS; MITOGENS

In [None]:
# Pipeline whose entity linker targets UMLS (Unified Medical Language System,
# levels 0, 1, 2 and 9): ~3M concepts.
# NOTE(review): resolve_abbreviations presumably only takes effect when an abbreviation
# detector is part of the pipeline; none is added here — confirm.
umls_nlp = spacy.load("en_core_sci_md")
# add_pipe returns the component it just added, so `linker` is the UMLS linker.
# NOTE: other cells in this notebook also assign `linker`; the name always refers to
# whichever of those cells ran last.
linker = umls_nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "umls"},
)
def umls_entity_linker(document):
    """Link the first entity found in ``document`` to UMLS concepts.

    Args:
        document: Text passed to the ``umls_nlp`` pipeline.

    Returns:
        A list whose first item is the first entry of ``doc.ents`` (or the
        string ``'Nan'`` when the pipeline finds no entity). For every
        ``(concept_id, score)`` pair in that entity's ``_.kb_ents`` two more
        items follow: the string ``'Entity_Matching_Score :<score>'`` and the
        knowledge-base entry ``kb.cui_to_entity[concept_id]``.
    """
    # Look the linker up on this pipeline instead of reading the notebook-global
    # `linker`: other cells rebind that name to different knowledge bases, which
    # would make this function resolve concept ids against the wrong one.
    kb = umls_nlp.get_pipe("scispacy_linker").kb
    doc = umls_nlp(document)
    if not doc.ents:
        # No entity detected: keep the original sentinel so the output shape is unchanged.
        return ['Nan']
    entity = doc.ents[0]
    entity_details = [entity]
    for concept_id, score in entity._.kb_ents:
        entity_details.append('Entity_Matching_Score :{}'.format(score))
        entity_details.append(kb.cui_to_entity[concept_id])
    return entity_details


In [None]:
# Run the UMLS linker over every entity string and store the per-row result lists.
entities_dataframe['umls_output'] = entities_dataframe['Entity'].apply(umls_entity_linker)

In [None]:
# Show the frame with the new umls_output column; cell contents are truncated in the rendering.
# NOTE(review): the saved output below has no mesh/hpo/rxnorm/go columns, so this cell was
# presumably run in a session where those cells had not been executed — re-run the
# notebook top to bottom to confirm the outputs are consistent.
entities_dataframe

Unnamed: 0,Entity,Label,Ner_model,umls_output
0,patients,ORGANISM,bionlp13cg,"[(patients), Entity_Matching_Score :1.0, (C003..."
1,loperamide hydrochloride,SIMPLE_CHEMICAL,bionlp13cg,"[(loperamide, hydrochloride), Entity_Matching_..."
2,sodium chloride,SIMPLE_CHEMICAL,bionlp13cg,"[(sodium, chloride), Entity_Matching_Score :1...."
3,gut-liver,CELLULAR_COMPONENT,bionlp13cg,[(gut-liver)]
4,lymphocytes,CELL,bionlp13cg,"[(lymphocytes), Entity_Matching_Score :1.0, (C..."
...,...,...,...,...
96,electrolytes,CELLULAR_COMPONENT,bionlp13cg,"[(electrolytes), Entity_Matching_Score :1.0, (..."
97,lymphocyte,CELL,bionlp13cg,"[(lymphocyte), Entity_Matching_Score :1.0, (C0..."
98,C-reactive protein,GENE_OR_GENE_PRODUCT,bionlp13cg,"[(C-reactive, protein), Entity_Matching_Score ..."
99,glutathione,SIMPLE_CHEMICAL,bionlp13cg,"[(glutathione), Entity_Matching_Score :0.99999..."
