In [1]:
import spacy
import scispacy
import swifter
import pandas as pd
from spacy import displacy
# import en_core_sci_sm
# import en_core_sci_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_craft_md
import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import OrderedDict,Counter
from pprint import pprint
from tqdm import tqdm
DEBUG = True
tqdm.pandas()

In [2]:
text_file = open("./datasets/covid/cleaned_text.txt", 'r')
data = text_file.readlines()
text_file.close()


In [3]:
if DEBUG: 
    print(data[:3])

['sars-cov-2 has been sequenced\n', 'a phylogenetic analysis found a bat origin for the sars-cov-2\n', 'there is a diversity of possible intermediate_hosts for sars-cov-2 , including pangolins , but not mice and rats\n']


In [4]:
def display_entities(model,document):
    """ A function that returns a tuple of displacy image of named or unnamed word entities and
        a set of unique entities recognized based on scispacy model in use
        Args: 
            model: A pretrained model from spaCy or ScispaCy
            document: text data to be analysed"""
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    return  displacy_image, entity_and_label

### Data in one string
---
Since the input data in one string form is too large. We divide it into one sentence each line.

### bc5dr_entities 

In [10]:
def get_entity_list(model, data):
    output = []
    nlp = model.load()
    for datum in data:
        doc = nlp(datum)
        entity_and_label = set([(X.text, X.label_) for X in doc.ents])
        output.extend(entity_and_label)
    return set(output)

In [208]:
en_core_web_sm = spacy.load("en_core_web_sm")

In [210]:
websm_entities = []
for datum in data[:10000]:
    doc = en_core_web_sm(datum)
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    websm_entities.extend(entity_and_label)
websm_entities = set(websm_entities)
websm_entities

{('0.001', 'DATE'),
 ('4.7 days', 'DATE'),
 ('6.6 %', 'PERCENT'),
 ('rgd_motif', 'GPE'),
 ('1,308', 'CARDINAL'),
 ('about 6 feet', 'QUANTITY'),
 ('10 , 11', 'DATE'),
 ('only three', 'CARDINAL'),
 ('an additional 10,315e10,928', 'CARDINAL'),
 ('4b', 'CARDINAL'),
 ('1c', 'CARDINAL'),
 ('january 10 , 2020', 'DATE'),
 ('subjects_aged 80 and older', 'DATE'),
 ('1366', 'DATE'),
 ('376', 'CARDINAL'),
 ('237', 'CARDINAL'),
 ('14 days', 'DATE'),
 ('20 15 %', 'PERCENT'),
 ('world_health_organization , 2020', 'DATE'),
 ('0.70', 'CARDINAL'),
 ('january 25 , 2018', 'DATE'),
 ('more than 6 months', 'DATE'),
 ('1 to 17 january 2020', 'DATE'),
 ('532', 'CARDINAL'),
 ('25.7', 'CARDINAL'),
 ('12 january 2020', 'DATE'),
 ('280', 'CARDINAL'),
 ('1 billion', 'CARDINAL'),
 ('30 %', 'PERCENT'),
 ('3770', 'CARDINAL'),
 ('87.1 %', 'PERCENT'),
 ('world_health_organization , 2020b', 'DATE'),
 ('≥4', 'ORG'),
 ('as high as 67', 'CARDINAL'),
 ('regn3048 & regn3051', 'ORG'),
 ('21 january to', 'DATE'),
 ('177', 'CAR

In [211]:
websm_entities_dataframe = pd.DataFrame(websm_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
websm_entities_dataframe['Ner_model'] = 'websm'  #include a column with constant value of NER model
websm_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,0.001,DATE,websm
1,4.7 days,DATE,websm
2,6.6 %,PERCENT,websm
3,rgd_motif,GPE,websm
4,1308,CARDINAL,websm
...,...,...,...
3064,"40 , 98 %",PERCENT,websm
3065,6.90,CARDINAL,websm
3066,10 s and,QUANTITY,websm
3067,2604-9601,DATE,websm


In [213]:
bc5dr_entities = get_entity_list(en_ner_bc5cdr_md, data[:10000])
bc5dr_entities

{('kasugamycin', 'CHEMICAL'),
 ('january_23rd', 'CHEMICAL'),
 ('ventilatory_support\n', 'CHEMICAL'),
 ('chlorhexidine\n', 'CHEMICAL'),
 ('social_science', 'CHEMICAL'),
 ('troponin', 'CHEMICAL'),
 ('foshan', 'CHEMICAL'),
 ('linear_epitopes', 'CHEMICAL'),
 ('figure_2b', 'CHEMICAL'),
 ('chloroxylenol', 'CHEMICAL'),
 ('multiplesequence-alignment', 'CHEMICAL'),
 ('direct_contact', 'CHEMICAL'),
 ('room_temperature', 'CHEMICAL'),
 ('human coronavirus ( cov ) of', 'DISEASE'),
 ('shibo jiang', 'CHEMICAL'),
 ('blue dotted_lines', 'CHEMICAL'),
 ('//omict ools.com/aller hunte r-tool', 'CHEMICAL'),
 ('colon , ovary , breast , cerebellum , epididymis , esophagus', 'DISEASE'),
 ('bronchiolitis', 'DISEASE'),
 ('mext-supported', 'CHEMICAL'),
 ('rneasy', 'CHEMICAL'),
 ('bootstrap_support', 'CHEMICAL'),
 ('population_densities', 'CHEMICAL'),
 ("li wenliang 's name", 'CHEMICAL'),
 ('spillover_events', 'CHEMICAL'),
 ('emergent coronaviruses\n', 'DISEASE'),
 ('crispr-based', 'CHEMICAL'),
 ('anorexia', 'DISE

In [216]:
bc5dr_entities_dataframe = pd.DataFrame(bc5dr_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
bc5dr_entities_dataframe['Ner_model'] = 'bc5dr'  #include a column with constant value of NER model
bc5dr_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,kasugamycin,CHEMICAL,bc5dr
1,january_23rd,CHEMICAL,bc5dr
2,ventilatory_support\n,CHEMICAL,bc5dr
3,chlorhexidine\n,CHEMICAL,bc5dr
4,social_science,CHEMICAL,bc5dr
...,...,...,...
2027,consensus_sequence,CHEMICAL,bc5dr
2028,lower lung fields,DISEASE,bc5dr
2029,hematogenous infections,DISEASE,bc5dr
2030,faecal_samples,CHEMICAL,bc5dr


In [215]:
bionlp13cg_entities = get_entity_list(en_ner_bionlp13cg_md, data[:10000])
bionlp13cg_entities

{('computed_tomography ( hrct', 'GENE_OR_GENE_PRODUCT'),
 ('hcov', 'GENE_OR_GENE_PRODUCT'),
 ('possas 2016', 'GENE_OR_GENE_PRODUCT'),
 ('table_s1\n', 'GENE_OR_GENE_PRODUCT'),
 ('barcode cells\n', 'CELL'),
 ('low_tidal_volume', 'GENE_OR_GENE_PRODUCT'),
 ('travel_patterns', 'GENE_OR_GENE_PRODUCT'),
 ('qrt-pcr sybr_green kit', 'GENE_OR_GENE_PRODUCT'),
 ('debemos prepararnos cada día más', 'GENE_OR_GENE_PRODUCT'),
 ('1/ ( 1 +', 'GENE_OR_GENE_PRODUCT'),
 ('pre-amplification_step', 'GENE_OR_GENE_PRODUCT'),
 ('acute_respiratory_syndrome', 'GENE_OR_GENE_PRODUCT'),
 ('post-partum', 'IMMATERIAL_ANATOMICAL_ENTITY'),
 ('luciferase', 'GENE_OR_GENE_PRODUCT'),
 ('pneumocystis_carinii', 'SIMPLE_CHEMICAL'),
 ('social_media', 'CELLULAR_COMPONENT'),
 ('1:40-1:80\n', 'CANCER'),
 ('data_center', 'GENE_OR_GENE_PRODUCT'),
 ('nuclear_factor kappa b', 'GENE_OR_GENE_PRODUCT'),
 ('severe-or common-type\n', 'CELL'),
 ('xuanlin huang 1', 'GENE_OR_GENE_PRODUCT'),
 ('ebola_virus 5', 'GENE_OR_GENE_PRODUCT'),
 ('rd2',

In [217]:
bionlp13cg_entities_dataframe = pd.DataFrame(bionlp13cg_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
bionlp13cg_entities_dataframe['Ner_model'] = 'bionlp13cg'  #include a column with constant value of NER model
bionlp13cg_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,computed_tomography ( hrct,GENE_OR_GENE_PRODUCT,bionlp13cg
1,hcov,GENE_OR_GENE_PRODUCT,bionlp13cg
2,possas 2016,GENE_OR_GENE_PRODUCT,bionlp13cg
3,table_s1\n,GENE_OR_GENE_PRODUCT,bionlp13cg
4,barcode cells\n,CELL,bionlp13cg
...,...,...,...
6520,clinical_trials\n,GENE_OR_GENE_PRODUCT,bionlp13cg
6521,toxin-resolving blood-quickening decoction\n,GENE_OR_GENE_PRODUCT,bionlp13cg
6522,rna genome_sequence,GENE_OR_GENE_PRODUCT,bionlp13cg
6523,cov-sialate binding_sites\n,GENE_OR_GENE_PRODUCT,bionlp13cg


In [218]:
craft_entities = get_entity_list(en_ner_craft_md, data[:10000])
craft_entities

{('nt', 'CHEBI'),
 ('wild-type viruses', 'SO'),
 ('glucocorticoid and interferon\n', 'GGP'),
 ('plant', 'CHEBI'),
 ('probes designed', 'SO'),
 ('residues mutated', 'CHEBI'),
 ('bushmeat', 'TAXON'),
 ('mcp1', 'GGP'),
 ('nuclei', 'GO'),
 ('pipistrellus', 'TAXON'),
 ('viral epitopes', 'TAXON'),
 ('shengmaiyin', 'CHEBI'),
 ('domestic_pig', 'GGP'),
 ('class-1 alleles', 'GGP'),
 ('endothelial_cells', 'GGP'),
 ('ion', 'CHEBI'),
 ('effector', 'CHEBI'),
 ('evans', 'CHEBI'),
 ('water', 'CHEBI'),
 ('dogs', 'TAXON'),
 ('wild chinese_horseshoe_bats', 'SO'),
 ('rpe cells', 'CL'),
 ('//www.viprbrc.org/', 'GGP'),
 ('gln-115', 'GGP'),
 ('positions', 'SO'),
 ('membranes', 'GO'),
 ('class-i predicted epitopes', 'GGP'),
 ('pro-drug', 'CHEBI'),
 ('half-life', 'CHEBI'),
 ('msa', 'GGP'),
 ('s1a', 'GGP'),
 ('ccl20', 'GGP'),
 ('rna molecule', 'SO'),
 ('nucleosome', 'SO'),
 ('human-human', 'GGP'),
 ('antibody protein', 'GO'),
 ('experimentally known', 'SO'),
 ('viral genomes', 'TAXON'),
 ('with~90', 'GGP'),
 ('

In [219]:
craft_entities_dataframe = pd.DataFrame(craft_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
craft_entities_dataframe['Ner_model'] = 'craft' #include a column with constant value of NER model
craft_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,nt,CHEBI,craft
1,wild-type viruses,SO,craft
2,glucocorticoid and interferon\n,GGP,craft
3,plant,CHEBI,craft
4,probes designed,SO,craft
...,...,...,...
1258,target-template,SO,craft
1259,hace2,GGP,craft
1260,query sequences,SO,craft
1261,human neural_progenitor cells,TAXON,craft


In [220]:
jnlpba_entities = get_entity_list(en_ner_jnlpba_md, data[:10000])
jnlpba_entities

{('mers-cov spike_proteins', 'PROTEIN'),
 ('68e80', 'PROTEIN'),
 ('kashiwase', 'PROTEIN'),
 ('2019-ncov', 'PROTEIN'),
 ('exposure_history\n', 'PROTEIN'),
 ('illness_onset\n', 'PROTEIN'),
 ('preventive_measures\n', 'PROTEIN'),
 ('serial_interval ( si )', 'DNA'),
 ('systems_science', 'PROTEIN'),
 ('refseq_database', 'PROTEIN'),
 ('aspartate_transaminase (', 'PROTEIN'),
 ('//www.viprbrc.org/brcdocs/documents/announcements/ corona/2019-ncov-vipr-report_24jan2020.pdf',
  'PROTEIN'),
 ('human dpp4', 'PROTEIN'),
 ('cov-2019', 'DNA'),
 ('written_informed_consent', 'PROTEIN'),
 ('host_cell', 'CELL_TYPE'),
 ('importar las pruebas', 'DNA'),
 ('van_der_hoek', 'PROTEIN'),
 ('interferon alfacon-1', 'PROTEIN'),
 ('kavanagh', 'DNA'),
 ('open_access genbank', 'DNA'),
 ('lymphocytes', 'CELL_TYPE'),
 ('dsp fragments', 'DNA'),
 ('infectious_disease', 'PROTEIN'),
 ('differentially_expressed lncrnas and', 'DNA'),
 ('\uf0a5 \uf03e', 'DNA'),
 ('autonomous regions', 'DNA'),
 ('peripheral_blood_mononuclear_cell

In [221]:
jnlpa_entities_dataframe = pd.DataFrame(jnlpba_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
jnlpa_entities_dataframe['Ner_model'] = 'jnlpa' # include a column with constant value of NER model
jnlpa_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,mers-cov spike_proteins,PROTEIN,jnlpa
1,68e80,PROTEIN,jnlpa
2,kashiwase,PROTEIN,jnlpa
3,2019-ncov,PROTEIN,jnlpa
4,exposure_history\n,PROTEIN,jnlpa
...,...,...,...
3421,statistical_analysis,PROTEIN,jnlpa
3422,z.s\n,PROTEIN,jnlpa
3423,herbal_remedies,PROTEIN,jnlpa
3424,jll,PROTEIN,jnlpa


In [32]:
nlp = en_ner_bc5cdr_md.load()

In [49]:
doc = nlp("gentle_shaking")
for i in range(len(doc)):
#     if (doc[i].tag_ == 'NNS'):
    print(doc[i].tag_, ", ", doc[i])

VBG ,  gentle_shaking


In [45]:
for d in doc:
    print(d)

virus-mab


In [43]:
for i in range(0, 10, 2):
    print(i)

0
2
4
6
8


In [37]:
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [203]:
bc5dr_entities = display_entities(en_ner_bc5cdr_md, data[2])

In [204]:
bc5dr_entities = display_entities(en_ner_bc5cdr_md, "a phylogenetic analysis [ 3 , 4 ] found a bat origin for the sars-cov-2 .")

In [222]:
import json

In [223]:
seed = json.load(open("./datasets/covid/seed.txt", 'r'))
seed

{'COVID': ['incubation',
  'sars-cov-2',
  '2019-ncov',
  'covid-19',
  'sars-2',
  'covid-19',
  'methyltransferase',
  'prrsv_nsp11',
  'mavs-mediated_apoptosis',
  '29-o-mtase',
  'mtase',
  'mhv',
  'n7-mtase',
  'd471g_mutant',
  'sars-cov-2',
  'sars',
  'sars-cov',
  'dpp4-expressing_cells',
  'dipeptidyl_peptidase_4',
  'mers',
  'dpp4',
  'non-rbd',
  's377',
  's-rbd',
  'nbl-7',
  'mers-cov',
  'm41-ck',
  'm41',
  'ibv_beaudette',
  'm41-ck-derived',
  'ribv',
  'replicase_gene',
  'ibv',
  'teschovirus',
  'camelpox',
  'metapneumovirus',
  'severe_acute_respiratory_syndrome_coronavirus',
  'respiratory_syncytial_virus',
  'coronvirus',
  'coronavirus',
  'non-productive_cough',
  'chills',
  'myalgia',
  'headache',
  'dyspnea',
  'malaise',
  'runny_nose',
  'retro-orbital_pain',
  'fever',
  'bronchiolitis',
  'bronchopneumonia',
  'lrti',
  'bronchitis',
  'lrtis',
  'artis',
  'acute_pharyngitis',
  'pneumonia',
  'acute_respiratory_distress_syndrome',
  'septic_shock

In [53]:
type(seed)

dict

In [224]:
def is_in_seed(word, seed):
    for key in seed:
        if word in seed[key]:
            return word, key

In [225]:
def label_based_on_seed(seed, data, model):
    nlp = model.load()
    seed_entities = set()
    for datum in data:
        doc = nlp(datum)
        for d in doc:
            if "NN" in d.tag_:
                entity_and_label = is_in_seed(d.text, seed)
                if entity_and_label is not None: 
                    seed_entities.add(entity_and_label)
    return seed_entities

In [226]:
seed_entities = label_based_on_seed(seed, data[:10000], en_ner_bc5cdr_md)
seed_entities

{('2019-ncov', 'COVID'),
 ('accessory_proteins', 'VIRAL_PROTEIN'),
 ('acute_lung_injury', 'COVID'),
 ('acute_respiratory_distress_syndrome', 'COVID'),
 ('ards', 'COVID'),
 ('bat', 'WILDLIFE'),
 ('bats', 'WILDLIFE'),
 ('birds', 'WILDLIFE'),
 ('bronchiolitis', 'COVID'),
 ('bronchitis', 'COVID'),
 ('chickens', 'WILDLIFE'),
 ('chikv', 'VIRAL_PROTEIN'),
 ('chills', 'COVID'),
 ('coronavirus', 'COVID'),
 ('covid-19', 'COVID'),
 ('cow', 'LIVESTOCK'),
 ('cross-species_transmission', 'WILDLIFE'),
 ('cross_species', 'WILDLIFE'),
 ('deer', 'LIVESTOCK'),
 ('dipeptidyl_peptidase_4', 'COVID'),
 ('dpp4', 'COVID'),
 ('drugs', 'VIRAL_PROTEIN'),
 ('dyspnea', 'COVID'),
 ('envelope', 'VIRAL_PROTEIN'),
 ('fever', 'COVID'),
 ('glycoprotein', 'VIRAL_PROTEIN'),
 ('glycoproteins', 'VIRAL_PROTEIN'),
 ('goat', 'LIVESTOCK'),
 ('goats', 'LIVESTOCK'),
 ('goose', 'LIVESTOCK'),
 ('gp120', 'VIRAL_PROTEIN'),
 ('hamster', 'WILDLIFE'),
 ('headache', 'COVID'),
 ('helicase', 'VIRAL_PROTEIN'),
 ('herd', 'LIVESTOCK'),
 ('herd

In [227]:
seed_entities_dataframe = pd.DataFrame(seed_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
seed_entities_dataframe['Ner_model'] = 'seed' # include a column with constant value of NER model
seed_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,cow,LIVESTOCK,seed
1,glycoprotein,VIRAL_PROTEIN,seed
2,vero,WILDLIFE,seed
3,gp120,VIRAL_PROTEIN,seed
4,chills,COVID,seed
...,...,...,...
95,nsp14,VIRAL_PROTEIN,seed
96,neuraminidase,VIRAL_PROTEIN,seed
97,cross_species,WILDLIFE,seed
98,nonstructural_proteins,VIRAL_PROTEIN,seed


In [228]:
entities_and_label_from_6_NER_model_dataframe = pd.concat([websm_entities_dataframe,
                                                           bc5dr_entities_dataframe,
                                                           bionlp13cg_entities_dataframe,
                                                           craft_entities_dataframe,
                                                           jnlpa_entities_dataframe,
                                                           seed_entities_dataframe
                                                          ])
entities_and_label_from_6_NER_model_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16415 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Entity     16415 non-null  object
 1   Label      16415 non-null  object
 2   Ner_model  16415 non-null  object
dtypes: object(3)
memory usage: 513.0+ KB


In [232]:
entities_and_label_from_6_NER_model_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,0.001,DATE,websm
1,4.7 days,DATE,websm
2,6.6 %,PERCENT,websm
3,rgd_motif,GPE,websm
4,1308,CARDINAL,websm
...,...,...,...
95,nsp14,VIRAL_PROTEIN,seed
96,neuraminidase,VIRAL_PROTEIN,seed
97,cross_species,WILDLIFE,seed
98,nonstructural_proteins,VIRAL_PROTEIN,seed


In [239]:
labels = {}
labels["LABEL"] = list(set(list(entities_and_label_from_6_NER_model_dataframe["Label"])))
labels

{'LABEL': ['PERSON',
  'ANATOMICAL_SYSTEM',
  'MONEY',
  'TIME',
  'PRODUCT',
  'GGP',
  'FAC',
  'SO',
  'CARDINAL',
  'ORGAN',
  'LAW',
  'PERCENT',
  'CL',
  'LIVESTOCK',
  'NORP',
  'TISSUE',
  'MULTI_TISSUE_STRUCTURE',
  'WILDLIFE',
  'LANGUAGE',
  'TAXON',
  'CELL_LINE',
  'VIRAL_PROTEIN',
  'EVENT',
  'ORDINAL',
  'ORGANISM_SUBDIVISION',
  'CHEBI',
  'DEVELOPING_ANATOMICAL_STRUCTURE',
  'CELLULAR_COMPONENT',
  'GENE_OR_GENE_PRODUCT',
  'COVID',
  'GPE',
  'CHEMICAL',
  'CANCER',
  'ORGANISM',
  'DISEASE',
  'CELL',
  'ORG',
  'AMINO_ACID',
  'CELL_TYPE',
  'ORGANISM_SUBSTANCE',
  'RNA',
  'WORK_OF_ART',
  'PROTEIN',
  'IMMATERIAL_ANATOMICAL_ENTITY',
  'LOC',
  'GO',
  'QUANTITY',
  'DATE',
  'PATHOLOGICAL_FORMATION',
  'SIMPLE_CHEMICAL',
  'DNA']}

In [240]:
with open("./datasets/covid/labels.txt", 'w') as labels_output:
    json.dump(labels, labels_output)

In [234]:
len(list(labels))

51

### Get each label and its entities

In [241]:
def get_labels_and_entities_dict_test(df, labels):
    output = {}
#     labels = set(list(df["Label"]))
    for label in labels:
        filter = df["Label"] == label
        output[label] = set([en for en in df.where(filter)["Entity"] if en == en])
    return output

In [242]:
def get_labels_and_entities_dict(df):
    output = {}
    labels = set(list(df["Label"]))
    for label in labels:
        filter = df["Label"] == label
        output[label] = set([en for en in df.where(filter)["Entity"] if en == en])
    return output

In [243]:
labels_and_entities_dict = get_labels_and_entities_dict(entities_and_label_from_6_NER_model_dataframe)
labels_and_entities_dict

{'PERSON': {'//drugvirus.info/',
  '//www.who.int/',
  'acute_respiratory_disease',
  'acute_respiratory_infections',
  'al-kourdi',
  'asn479',
  'at1r-ang_ii',
  'bat_sl_covzxc21',
  'bovine_serum_albumin',
  'broadinstitute.org/files/publications/special/covid-19 %',
  'cairns',
  'case_fatality_risk',
  'chest_ct',
  'chronic_kidney_disease',
  'contact_tracing_data',
  'covid-19',
  'discontinuous_b_cell_epitopes',
  'disease-19',
  'enter_host_cells',
  'european_economic_area',
  'gln-70',
  'gln300x',
  'glu170',
  'gly488',
  'green_fluorescent_protein',
  'human_immunodeficiency_virus',
  'innate_immune_response',
  'intensive_care_unit',
  'intestinal_epithelial_cells',
  'jx869059.2',
  'leu472',
  'li_keqiang',
  'lsm880',
  'lys353',
  'markov_chain_monte_carlo',
  'monoclonal_antibody',
  'n222d',
  'ns7b',
  'oc43',
  'passive_lysis_buffer',
  'pdoc00016',
  'phe472',
  'ratg13',
  'rgd_motif',
  'ser442',
  'severe_acute_respiratory_syndrome_coronavirus',
  'trizol_ls'

In [244]:
labels_and_entities_dict["WILDLIFE"]

{'bat',
 'bats',
 'birds',
 'chickens',
 'cross-species_transmission',
 'cross_species',
 'hamster',
 'host_species',
 'mice',
 'mouse',
 'natural_reservoirs',
 'pangolin',
 'pangolins',
 'poultry',
 'rat',
 'rats',
 'reservoirs',
 'rhinolophus',
 'vero'}

In [245]:
seed_label_entities_dict = {}
seed_label_entities_dict["COVID"] = labels_and_entities_dict["COVID"]
seed_label_entities_dict["WILDLIFE"] = labels_and_entities_dict["WILDLIFE"]
seed_label_entities_dict["VIRAL_PROTEIN"] = labels_and_entities_dict["VIRAL_PROTEIN"]
seed_label_entities_dict["LIVESTOCK"] = labels_and_entities_dict["LIVESTOCK"]
seed_label_entities_dict

{'COVID': {'2019-ncov',
  'acute_lung_injury',
  'acute_respiratory_distress_syndrome',
  'ards',
  'bronchiolitis',
  'bronchitis',
  'chills',
  'coronavirus',
  'covid-19',
  'dipeptidyl_peptidase_4',
  'dpp4',
  'dyspnea',
  'fever',
  'headache',
  'hypotension',
  'incubation',
  'infection',
  'malaise',
  'mers',
  'mers-cov',
  'metapneumovirus',
  'mhv',
  'myalgia',
  'non-productive_cough',
  'pneumonia',
  'respiratory_failure',
  'respiratory_syncytial_virus',
  'runny_nose',
  'sars',
  'sars-2',
  'sars-cov',
  'sars-cov-2',
  'septic_shock',
  'severe_acute_respiratory_syndrome_coronavirus'},
 'WILDLIFE': {'bat',
  'bats',
  'birds',
  'chickens',
  'cross-species_transmission',
  'cross_species',
  'hamster',
  'host_species',
  'mice',
  'mouse',
  'natural_reservoirs',
  'pangolin',
  'pangolins',
  'poultry',
  'rat',
  'rats',
  'reservoirs',
  'rhinolophus',
  'vero'},
 'VIRAL_PROTEIN': {'accessory_proteins',
  'chikv',
  'drugs',
  'envelope',
  'glycoprotein',


In [246]:
print(' '.join(is_in_seed("mice", seed_label_entities_dict)))

mice WILDLIFE


In [247]:
def output_IOB_label(data, model, seed):
    output = []
    nlp = model.load()
    for datum in data:
        doc = nlp(datum)
        for d in doc:
            if 'NN' in d.tag_:
                word_label_pair = is_in_seed(d.text, seed)
                if word_label_pair != None:
                    output.append(' '.join([word_label_pair[0], 'B-'+word_label_pair[1]+'\n']))
                else:
                    output.append(' '.join([d.text, 'O'+'\n']))
            elif d.text != '\n':
                output.append(' '.join([d.text, 'O'+'\n']))
        output.append('\n') # separate for step 3
    return output

In [248]:
data_iob = output_IOB_label(data[:10000], en_ner_bc5cdr_md, seed_label_entities_dict)
data_iob

['sars-cov-2 B-COVID\n',
 'has O\n',
 'been O\n',
 'sequenced O\n',
 '\n',
 'a O\n',
 'phylogenetic O\n',
 'analysis O\n',
 'found O\n',
 'a O\n',
 'bat B-WILDLIFE\n',
 'origin O\n',
 'for O\n',
 'the O\n',
 'sars-cov-2 B-COVID\n',
 '\n',
 'there O\n',
 'is O\n',
 'a O\n',
 'diversity O\n',
 'of O\n',
 'possible O\n',
 'intermediate_hosts O\n',
 'for O\n',
 'sars-cov-2 B-COVID\n',
 ', O\n',
 'including O\n',
 'pangolins B-WILDLIFE\n',
 ', O\n',
 'but O\n',
 'not O\n',
 'mice B-WILDLIFE\n',
 'and O\n',
 'rats B-WILDLIFE\n',
 '\n',
 'there O\n',
 'are O\n',
 'many O\n',
 'similarities O\n',
 'of O\n',
 'sars-cov-2 B-COVID\n',
 'with O\n',
 'the O\n',
 'original O\n',
 'sars-cov B-COVID\n',
 '\n',
 'using O\n',
 'computer O\n',
 'modeling O\n',
 ', O\n',
 'xu O\n',
 'et O\n',
 'al O\n',
 '\n',
 'found O\n',
 'that O\n',
 'the O\n',
 'spike_proteins O\n',
 'of O\n',
 'sars-cov-2 B-COVID\n',
 'and O\n',
 'sars-cov B-COVID\n',
 'have O\n',
 'almost O\n',
 'identical O\n',
 '3-d O\n',
 'struc

In [249]:
iob_file = open('./iob-test.txt', 'w')
# iob_file.writelines("%s\n" % l for l in data_iob)
iob_file.writelines(data_iob)

In [117]:
l = list(labels)[:3]
l

['ANATOMICAL_SYSTEM', 'GGP', 'SO']

In [119]:
get_labels_and_entities_dict_test(entities_and_label_from_5_NER_model_dataframe, l)

{'ANATOMICAL_SYSTEM': {'immune organs ,'},
 'GGP': {'//creativecommons',
  '//www.nytimes.com/2020/01/22/world/asia/coronavirusquarantines-history.html',
  '//www.viprbrc.org/',
  '37,289',
  'a1-6',
  'a471',
  'a59',
  'a59 s1a',
  'a59 vlps',
  'ace2',
  'ace2 +',
  'ace2 + cells',
  'ace2 11',
  'ace2 antibody',
  'ace2 antibody sequences',
  'ace2 cell',
  'ace2 coding variants',
  'ace2 domain',
  'ace2 enzymatic',
  'ace2 enzyme',
  'ace2 extracellular domain',
  'ace2 gene',
  'ace2 gene region',
  'ace2 mrna',
  'ace2 peptidase',
  'ace2 peptidase mutation',
  'ace2 plasmid',
  'ace2 protein',
  'ace2 proteins',
  'ace2 receptor',
  'ace2 residue',
  'ace2 residues',
  'ace2 rna',
  'ace2 variants',
  'acf',
  'albumin',
  'alserehi',
  'anfis',
  'ang1',
  'angiotensin ii',
  'angiotensin ii receptor',
  'angiotensin ii type 1a receptor',
  'annexin v',
  'anti-integrin',
  'antigenic_peptides',
  'antigenicity-score',
  'antivirals/antibacterials +',
  'apoa1',
  'apoa2',
  

In [110]:
filter = entities_and_label_from_5_NER_model_dataframe["Label"] == "CHEMICAL"
chem = set([i for i in entities_and_label_from_5_NER_model_dataframe.where(filter)["Entity"] if i == i])
chem

{'lactate',
 'wuhan-hu-1',
 'gilead_sciences',
 'susceptible_population',
 'genetic_analysis',
 '±',
 'super_spreading',
 'binding_energies',
 'fudan_university',
 'academic_journals\n',
 'membrane_protein',
 'rheumatoid_arthritis',
 'sichuan_province',
 'n95_masks',
 '//www.who.int/emergencies/mers-cov/en/',
 'zika',
 'ifn',
 'bbduk',
 'evans',
 'people^',
 'case-contact',
 'fortify',
 'table_s4',
 'bsaas',
 'inhale',
 'nucleotide',
 'chest_x-rays',
 'world_health_organization , 2020b',
 '1·5',
 'two-beforeand-three-after',
 'slit_lamp',
 'münchen',
 'self-quarantine',
 'tangerine',
 'chest_tightness',
 'aminoglycoside',
 'nonhuman_primate',
 'mortality_rate',
 'collection_date',
 "the 'qlqmgfgitvqygt",
 '1 , 2\n',
 'omeprazole',
 'takes_place',
 'pcdna3.1 expression_vector plasmids , respectively\n',
 'western_medicine',
 'respiratory_illness',
 'chromium',
 'hemorrhagic_fever',
 'qpyrvvvlsf',
 'infected_individuals',
 'mientras',
 'polyclonal_antibodies',
 'line_list',
 'enfermedade

In [109]:
len(chem)

1505

In [93]:
entities_and_label_from_5_NER_model_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
103,,,
104,,,
105,,,
106,septic_shock,COVID,seed


In [143]:
d = entities_and_label_from_5_NER_model_dataframe.groupby(["Entity"], as_index=False).count()
filter = d["Label"] != 1
c = d.where(filter)
c

Unnamed: 0,Entity,Label,Ner_model
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
9376,,,
9377,,2.0,2.0
9378,,2.0,2.0
9379,,,


In [150]:
entities_with_one_more_label = [en for en in c.where(filter)["Entity"] if en == en]
entities_with_one_more_label

['-london',
 '//crdd.osdd.net/ ragha va/toxin pred/',
 '//creativecommons',
 '//github.com/aakhmetz/ wuhanincubationperiod2020\n',
 '//github.com/globalcitizen/2019-wuhan-coronavirus-data\n',
 '//omict ools.com/aller hunte r-tool',
 '//www.allelefrequencies.net/',
 '//www.cbs.dtu.dk/servi ces/tmhmm',
 '//www.cdc.gov/',
 '//www.cdc.gov/flu/weekly/',
 '//www.gisaid.org/',
 '//www.mdpi.com/2077-0383/9/2/538/s1',
 '//www.nytimes.com/2020/01/22/world/asia/coronavirusquarantines-history.html',
 '//www.uniprot.org/uniprot/p59594',
 '//www.viprbrc.org/',
 '//www.who.int/',
 '//www.who.int/docs/default-source/coronaviruse/situationreports/20200211-sitrep-22-ncov.pdf',
 '/oxygen',
 '0-fold_increase',
 '0-folde2-fold_increase',
 '0.7.12',
 '1-2 mg/kg',
 '1.4e2.5',
 '1.7e4.3\n',
 '1/γ b\n',
 '1/ε',
 '1/ω',
 '1/ω b',
 '10/4719',
 '11,317e12,118 cumulative_cases',
 '11,588e13,499',
 '117/47 mm_hg',
 '1186/s40779-020-0233-6\n',
 '13/565',
 '13/565-i.e',
 '15/10009',
 '1918',
 '1:10',
 '1:40-1:80\n',


In [151]:
len(entities_with_one_more_label)

3036

In [153]:
test_en = entities_with_one_more_label[1000]
test_en

'epi_isl_402131'

In [167]:
filter = entities_and_label_from_5_NER_model_dataframe["Entity"] == test_en
entities_and_label_from_5_NER_model_dataframe.where(filter)

Unnamed: 0,Entity,Label,Ner_model
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
103,,,
104,,,
105,,,
106,,,


In [168]:
t = [en for en in entities_and_label_from_5_NER_model_dataframe.where(filter)["Label"] if en == en]
t

['GENE_OR_GENE_PRODUCT', 'CELL_LINE']