In [1]:
import spacy
import scispacy
import swifter
import pandas as pd
from spacy import displacy
import en_core_sci_sm
import en_core_sci_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_craft_md
import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import OrderedDict,Counter
from pprint import pprint
from tqdm import tqdm
DEBUG = True
tqdm.pandas()

In [2]:
text_file = open("./datasets/covid/cleaned_text.txt", 'r')
data = text_file.readlines()
text_file.close()


In [3]:
if DEBUG: 
    print(data[:3])

['sars-cov-2 has been sequenced\n', 'a phylogenetic analysis found a bat origin for the sars-cov-2\n', 'there is a diversity of possible intermediate_hosts for sars-cov-2 , including pangolins , but not mice and rats\n']


In [4]:
def display_entities(model,document):
    """ A function that returns a tuple of displacy image of named or unnamed word entities and
        a set of unique entities recognized based on scispacy model in use
        Args: 
            model: A pretrained model from spaCy or ScispaCy
            document: text data to be analysed"""
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    return  displacy_image, entity_and_label

### Data in one string
---
Since the input data in one string form is too large. We divide it into one sentence each line.

In [88]:
one_string_data = ' '.join(data)

### bc5dr_entities 

In [5]:
len(data)

556143

In [10]:
def get_entity_list(model, data):
    output = []
    nlp = model.load()
    for datum in data:
        doc = nlp(datum)
        entity_and_label = set([(X.text, X.label_) for X in doc.ents])
        output.extend(entity_and_label)
    return set(output)

In [20]:
bc5dr_entities = get_entity_list(en_ner_bc5cdr_md, data[:10000])
bc5dr_entities

In [24]:
bc5dr_entities_dataframe = pd.DataFrame(bc5dr_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
bc5dr_entities_dataframe['Ner_model'] = 'bc5dr'  #include a column with constant value of NER model
bc5dr_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,kasugamycin,CHEMICAL,bc5dr
1,january_23rd,CHEMICAL,bc5dr
2,ventilatory_support\n,CHEMICAL,bc5dr
3,chlorhexidine\n,CHEMICAL,bc5dr
4,social_science,CHEMICAL,bc5dr
...,...,...,...
2027,consensus_sequence,CHEMICAL,bc5dr
2028,lower lung fields,DISEASE,bc5dr
2029,hematogenous infections,DISEASE,bc5dr
2030,faecal_samples,CHEMICAL,bc5dr


In [25]:
bionlp13cg_entities = get_entity_list(en_ner_bionlp13cg_md, data[:10000])
bionlp13cg_entities

In [27]:
bionlp13cg_entities_dataframe = pd.DataFrame(bionlp13cg_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
bionlp13cg_entities_dataframe['Ner_model'] = 'bionlp13cg'  #include a column with constant value of NER model
bionlp13cg_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,computed_tomography ( hrct,GENE_OR_GENE_PRODUCT,bionlp13cg
1,hcov,GENE_OR_GENE_PRODUCT,bionlp13cg
2,possas 2016,GENE_OR_GENE_PRODUCT,bionlp13cg
3,table_s1\n,GENE_OR_GENE_PRODUCT,bionlp13cg
4,barcode cells\n,CELL,bionlp13cg
...,...,...,...
6520,clinical_trials\n,GENE_OR_GENE_PRODUCT,bionlp13cg
6521,toxin-resolving blood-quickening decoction\n,GENE_OR_GENE_PRODUCT,bionlp13cg
6522,rna genome_sequence,GENE_OR_GENE_PRODUCT,bionlp13cg
6523,cov-sialate binding_sites\n,GENE_OR_GENE_PRODUCT,bionlp13cg


In [28]:
craft_entities = get_entity_list(en_ner_craft_md, data[:10000])
craft_entities

{('nt', 'CHEBI'),
 ('wild-type viruses', 'SO'),
 ('glucocorticoid and interferon\n', 'GGP'),
 ('plant', 'CHEBI'),
 ('probes designed', 'SO'),
 ('residues mutated', 'CHEBI'),
 ('bushmeat', 'TAXON'),
 ('mcp1', 'GGP'),
 ('nuclei', 'GO'),
 ('pipistrellus', 'TAXON'),
 ('viral epitopes', 'TAXON'),
 ('shengmaiyin', 'CHEBI'),
 ('domestic_pig', 'GGP'),
 ('class-1 alleles', 'GGP'),
 ('endothelial_cells', 'GGP'),
 ('ion', 'CHEBI'),
 ('effector', 'CHEBI'),
 ('evans', 'CHEBI'),
 ('water', 'CHEBI'),
 ('dogs', 'TAXON'),
 ('wild chinese_horseshoe_bats', 'SO'),
 ('rpe cells', 'CL'),
 ('//www.viprbrc.org/', 'GGP'),
 ('gln-115', 'GGP'),
 ('positions', 'SO'),
 ('membranes', 'GO'),
 ('class-i predicted epitopes', 'GGP'),
 ('pro-drug', 'CHEBI'),
 ('half-life', 'CHEBI'),
 ('msa', 'GGP'),
 ('s1a', 'GGP'),
 ('ccl20', 'GGP'),
 ('rna molecule', 'SO'),
 ('nucleosome', 'SO'),
 ('human-human', 'GGP'),
 ('antibody protein', 'GO'),
 ('experimentally known', 'SO'),
 ('viral genomes', 'TAXON'),
 ('with~90', 'GGP'),
 ('

In [29]:
craft_entities_dataframe = pd.DataFrame(craft_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
craft_entities_dataframe['Ner_model'] = 'craft' #include a column with constant value of NER model
craft_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,nt,CHEBI,craft
1,wild-type viruses,SO,craft
2,glucocorticoid and interferon\n,GGP,craft
3,plant,CHEBI,craft
4,probes designed,SO,craft
...,...,...,...
1258,target-template,SO,craft
1259,hace2,GGP,craft
1260,query sequences,SO,craft
1261,human neural_progenitor cells,TAXON,craft


In [30]:
jnlpba_entities = get_entity_list(en_ner_jnlpba_md, data[:10000])
jnlpba_entities

{('mers-cov spike_proteins', 'PROTEIN'),
 ('68e80', 'PROTEIN'),
 ('kashiwase', 'PROTEIN'),
 ('2019-ncov', 'PROTEIN'),
 ('exposure_history\n', 'PROTEIN'),
 ('illness_onset\n', 'PROTEIN'),
 ('preventive_measures\n', 'PROTEIN'),
 ('serial_interval ( si )', 'DNA'),
 ('systems_science', 'PROTEIN'),
 ('refseq_database', 'PROTEIN'),
 ('aspartate_transaminase (', 'PROTEIN'),
 ('//www.viprbrc.org/brcdocs/documents/announcements/ corona/2019-ncov-vipr-report_24jan2020.pdf',
  'PROTEIN'),
 ('human dpp4', 'PROTEIN'),
 ('cov-2019', 'DNA'),
 ('written_informed_consent', 'PROTEIN'),
 ('host_cell', 'CELL_TYPE'),
 ('importar las pruebas', 'DNA'),
 ('van_der_hoek', 'PROTEIN'),
 ('interferon alfacon-1', 'PROTEIN'),
 ('kavanagh', 'DNA'),
 ('open_access genbank', 'DNA'),
 ('lymphocytes', 'CELL_TYPE'),
 ('dsp fragments', 'DNA'),
 ('infectious_disease', 'PROTEIN'),
 ('differentially_expressed lncrnas and', 'DNA'),
 ('\uf0a5 \uf03e', 'DNA'),
 ('autonomous regions', 'DNA'),
 ('peripheral_blood_mononuclear_cell

In [31]:
jnlpa_entities_dataframe = pd.DataFrame(jnlpba_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
jnlpa_entities_dataframe['Ner_model'] = 'jnlpa' # include a column with constant value of NER model
jnlpa_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,mers-cov spike_proteins,PROTEIN,jnlpa
1,68e80,PROTEIN,jnlpa
2,kashiwase,PROTEIN,jnlpa
3,2019-ncov,PROTEIN,jnlpa
4,exposure_history\n,PROTEIN,jnlpa
...,...,...,...
3421,statistical_analysis,PROTEIN,jnlpa
3422,z.s\n,PROTEIN,jnlpa
3423,herbal_remedies,PROTEIN,jnlpa
3424,jll,PROTEIN,jnlpa


In [32]:
nlp = en_ner_bc5cdr_md.load()

In [49]:
doc = nlp("gentle_shaking")
for i in range(len(doc)):
#     if (doc[i].tag_ == 'NNS'):
    print(doc[i].tag_, ", ", doc[i])

VBG ,  gentle_shaking


In [45]:
for d in doc:
    print(d)

virus-mab


In [43]:
for i in range(0, 10, 2):
    print(i)

0
2
4
6
8


In [37]:
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [38]:
bc5dr_entities = display_entities(en_ner_bc5cdr_md, data[2])

In [39]:
bc5dr_entities = display_entities(en_ner_bc5cdr_md, data[1])



In [50]:
import json

In [52]:
seed = json.load(open("./datasets/covid/seed.txt", 'r'))
seed

{'COVID': ['37ºc',
  'pbs/tween',
  'room_temperature',
  '37uc',
  'c',
  'incubation',
  'sars-cov-2',
  '2019-ncov',
  'confirmed_cases',
  'mainland_china',
  'line_list',
  'hubei_province',
  'sars-2',
  'covid-19',
  'prrsv_nsp11',
  'mavs-mediated_apoptosis',
  '29-o-mtase',
  'mtase',
  'nsp15',
  'mhv',
  'n7-mtase',
  'd471g_mutant',
  'sars-cov',
  'dpp4-expressing_cells',
  'dipeptidyl_peptidase_4',
  'mers',
  'dpp4',
  'non-rbd',
  's377',
  's-rbd',
  'nbl-7',
  'mers-cov',
  'm41-ck',
  'm41',
  'ibv_beaudette',
  'm41-ck-derived',
  'ribv',
  'replicase_gene',
  'beau-r',
  '793/b',
  'ibv',
  'teschovirus',
  'camelpox',
  'metapneumovirus',
  'severe_acute_respiratory_syndrome_coronavirus',
  'respiratory_syncytial_virus',
  'coronavirus',
  'non-productive_cough',
  'chills',
  'myalgia',
  'headache',
  'dyspnea',
  'malaise',
  'runny_nose',
  'retro-orbital_pain',
  'fever',
  'bronchiolitis',
  'bronchopneumonia',
  'lrti',
  'bronchitis',
  'lrtis',
  'artis',

In [None]:
def label_based_on_seed()

In [33]:
doc

there is a diversity of possible intermediate_hosts for sars-cov-2 , including pangolins , but not mice and rats [ 5 ] .

In [34]:
bc5dr_entities[1]

{('intermediate_hosts', 'CHEMICAL'),
 ('mice', 'WILDLIFE'),
 ('pangolins', 'WILDLIFE'),
 ('rats', 'WILDLIFE'),
 ('sars-cov-2', 'COVID')}