In [1]:
import spacy
import scispacy
import swifter
import pandas as pd
from spacy import displacy
import en_core_sci_sm
import en_core_sci_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_craft_md
import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import OrderedDict,Counter
from pprint import pprint
from tqdm import tqdm
DEBUG = True
tqdm.pandas()

In [2]:
text_file = open("./datasets/covid/cleaned_text.txt", 'r')
data = text_file.readlines()
text_file.close()


In [3]:
if DEBUG: 
    print(data[:3])

['sars-cov-2 has been sequenced\n', 'a phylogenetic analysis found a bat origin for the sars-cov-2\n', 'there is a diversity of possible intermediate_hosts for sars-cov-2 , including pangolins , but not mice and rats\n']


In [4]:
def display_entities(model,document):
    """ A function that returns a tuple of displacy image of named or unnamed word entities and
        a set of unique entities recognized based on scispacy model in use
        Args: 
            model: A pretrained model from spaCy or ScispaCy
            document: text data to be analysed"""
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    return  displacy_image, entity_and_label

### Data in one string
---
Since the input data in one string form is too large. We divide it into one sentence each line.

In [88]:
one_string_data = ' '.join(data)

### bc5dr_entities 

In [5]:
len(data)

556143

In [10]:
def get_entity_list(model, data):
    output = []
    nlp = model.load()
    for datum in data:
        doc = nlp(datum)
        entity_and_label = set([(X.text, X.label_) for X in doc.ents])
        output.extend(entity_and_label)
    return set(output)

In [20]:
bc5dr_entities = get_entity_list(en_ner_bc5cdr_md, data[:10000])
bc5dr_entities

In [24]:
bc5dr_entities_dataframe = pd.DataFrame(bc5dr_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
bc5dr_entities_dataframe['Ner_model'] = 'bc5dr'  #include a column with constant value of NER model
bc5dr_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,kasugamycin,CHEMICAL,bc5dr
1,january_23rd,CHEMICAL,bc5dr
2,ventilatory_support\n,CHEMICAL,bc5dr
3,chlorhexidine\n,CHEMICAL,bc5dr
4,social_science,CHEMICAL,bc5dr
...,...,...,...
2027,consensus_sequence,CHEMICAL,bc5dr
2028,lower lung fields,DISEASE,bc5dr
2029,hematogenous infections,DISEASE,bc5dr
2030,faecal_samples,CHEMICAL,bc5dr


In [25]:
bionlp13cg_entities = get_entity_list(en_ner_bionlp13cg_md, data[:10000])
bionlp13cg_entities

In [27]:
bionlp13cg_entities_dataframe = pd.DataFrame(bionlp13cg_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
bionlp13cg_entities_dataframe['Ner_model'] = 'bionlp13cg'  #include a column with constant value of NER model
bionlp13cg_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,computed_tomography ( hrct,GENE_OR_GENE_PRODUCT,bionlp13cg
1,hcov,GENE_OR_GENE_PRODUCT,bionlp13cg
2,possas 2016,GENE_OR_GENE_PRODUCT,bionlp13cg
3,table_s1\n,GENE_OR_GENE_PRODUCT,bionlp13cg
4,barcode cells\n,CELL,bionlp13cg
...,...,...,...
6520,clinical_trials\n,GENE_OR_GENE_PRODUCT,bionlp13cg
6521,toxin-resolving blood-quickening decoction\n,GENE_OR_GENE_PRODUCT,bionlp13cg
6522,rna genome_sequence,GENE_OR_GENE_PRODUCT,bionlp13cg
6523,cov-sialate binding_sites\n,GENE_OR_GENE_PRODUCT,bionlp13cg


In [28]:
craft_entities = get_entity_list(en_ner_craft_md, data[:10000])
craft_entities

{('nt', 'CHEBI'),
 ('wild-type viruses', 'SO'),
 ('glucocorticoid and interferon\n', 'GGP'),
 ('plant', 'CHEBI'),
 ('probes designed', 'SO'),
 ('residues mutated', 'CHEBI'),
 ('bushmeat', 'TAXON'),
 ('mcp1', 'GGP'),
 ('nuclei', 'GO'),
 ('pipistrellus', 'TAXON'),
 ('viral epitopes', 'TAXON'),
 ('shengmaiyin', 'CHEBI'),
 ('domestic_pig', 'GGP'),
 ('class-1 alleles', 'GGP'),
 ('endothelial_cells', 'GGP'),
 ('ion', 'CHEBI'),
 ('effector', 'CHEBI'),
 ('evans', 'CHEBI'),
 ('water', 'CHEBI'),
 ('dogs', 'TAXON'),
 ('wild chinese_horseshoe_bats', 'SO'),
 ('rpe cells', 'CL'),
 ('//www.viprbrc.org/', 'GGP'),
 ('gln-115', 'GGP'),
 ('positions', 'SO'),
 ('membranes', 'GO'),
 ('class-i predicted epitopes', 'GGP'),
 ('pro-drug', 'CHEBI'),
 ('half-life', 'CHEBI'),
 ('msa', 'GGP'),
 ('s1a', 'GGP'),
 ('ccl20', 'GGP'),
 ('rna molecule', 'SO'),
 ('nucleosome', 'SO'),
 ('human-human', 'GGP'),
 ('antibody protein', 'GO'),
 ('experimentally known', 'SO'),
 ('viral genomes', 'TAXON'),
 ('with~90', 'GGP'),
 ('

In [29]:
craft_entities_dataframe = pd.DataFrame(craft_entities,columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
craft_entities_dataframe['Ner_model'] = 'craft' #include a column with constant value of NER model
craft_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,nt,CHEBI,craft
1,wild-type viruses,SO,craft
2,glucocorticoid and interferon\n,GGP,craft
3,plant,CHEBI,craft
4,probes designed,SO,craft
...,...,...,...
1258,target-template,SO,craft
1259,hace2,GGP,craft
1260,query sequences,SO,craft
1261,human neural_progenitor cells,TAXON,craft


In [30]:
jnlpba_entities = get_entity_list(en_ner_jnlpba_md, data[:10000])
jnlpba_entities

{('mers-cov spike_proteins', 'PROTEIN'),
 ('68e80', 'PROTEIN'),
 ('kashiwase', 'PROTEIN'),
 ('2019-ncov', 'PROTEIN'),
 ('exposure_history\n', 'PROTEIN'),
 ('illness_onset\n', 'PROTEIN'),
 ('preventive_measures\n', 'PROTEIN'),
 ('serial_interval ( si )', 'DNA'),
 ('systems_science', 'PROTEIN'),
 ('refseq_database', 'PROTEIN'),
 ('aspartate_transaminase (', 'PROTEIN'),
 ('//www.viprbrc.org/brcdocs/documents/announcements/ corona/2019-ncov-vipr-report_24jan2020.pdf',
  'PROTEIN'),
 ('human dpp4', 'PROTEIN'),
 ('cov-2019', 'DNA'),
 ('written_informed_consent', 'PROTEIN'),
 ('host_cell', 'CELL_TYPE'),
 ('importar las pruebas', 'DNA'),
 ('van_der_hoek', 'PROTEIN'),
 ('interferon alfacon-1', 'PROTEIN'),
 ('kavanagh', 'DNA'),
 ('open_access genbank', 'DNA'),
 ('lymphocytes', 'CELL_TYPE'),
 ('dsp fragments', 'DNA'),
 ('infectious_disease', 'PROTEIN'),
 ('differentially_expressed lncrnas and', 'DNA'),
 ('\uf0a5 \uf03e', 'DNA'),
 ('autonomous regions', 'DNA'),
 ('peripheral_blood_mononuclear_cell

In [31]:
jnlpa_entities_dataframe = pd.DataFrame(jnlpba_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
jnlpa_entities_dataframe['Ner_model'] = 'jnlpa' # include a column with constant value of NER model
jnlpa_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,mers-cov spike_proteins,PROTEIN,jnlpa
1,68e80,PROTEIN,jnlpa
2,kashiwase,PROTEIN,jnlpa
3,2019-ncov,PROTEIN,jnlpa
4,exposure_history\n,PROTEIN,jnlpa
...,...,...,...
3421,statistical_analysis,PROTEIN,jnlpa
3422,z.s\n,PROTEIN,jnlpa
3423,herbal_remedies,PROTEIN,jnlpa
3424,jll,PROTEIN,jnlpa


In [32]:
nlp = en_ner_bc5cdr_md.load()

In [49]:
doc = nlp("gentle_shaking")
for i in range(len(doc)):
#     if (doc[i].tag_ == 'NNS'):
    print(doc[i].tag_, ", ", doc[i])

VBG ,  gentle_shaking


In [45]:
for d in doc:
    print(d)

virus-mab


In [43]:
for i in range(0, 10, 2):
    print(i)

0
2
4
6
8


In [37]:
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [38]:
bc5dr_entities = display_entities(en_ner_bc5cdr_md, data[2])

In [39]:
bc5dr_entities = display_entities(en_ner_bc5cdr_md, data[1])



In [50]:
import json

In [52]:
seed = json.load(open("./datasets/covid/seed.txt", 'r'))
seed

{'COVID': ['37ºc',
  'pbs/tween',
  'room_temperature',
  '37uc',
  'c',
  'incubation',
  'sars-cov-2',
  '2019-ncov',
  'confirmed_cases',
  'mainland_china',
  'line_list',
  'hubei_province',
  'sars-2',
  'covid-19',
  'prrsv_nsp11',
  'mavs-mediated_apoptosis',
  '29-o-mtase',
  'mtase',
  'nsp15',
  'mhv',
  'n7-mtase',
  'd471g_mutant',
  'sars-cov',
  'dpp4-expressing_cells',
  'dipeptidyl_peptidase_4',
  'mers',
  'dpp4',
  'non-rbd',
  's377',
  's-rbd',
  'nbl-7',
  'mers-cov',
  'm41-ck',
  'm41',
  'ibv_beaudette',
  'm41-ck-derived',
  'ribv',
  'replicase_gene',
  'beau-r',
  '793/b',
  'ibv',
  'teschovirus',
  'camelpox',
  'metapneumovirus',
  'severe_acute_respiratory_syndrome_coronavirus',
  'respiratory_syncytial_virus',
  'coronavirus',
  'non-productive_cough',
  'chills',
  'myalgia',
  'headache',
  'dyspnea',
  'malaise',
  'runny_nose',
  'retro-orbital_pain',
  'fever',
  'bronchiolitis',
  'bronchopneumonia',
  'lrti',
  'bronchitis',
  'lrtis',
  'artis',

In [53]:
type(seed)

dict

In [54]:
def is_in_seed(word, seed):
    for key in seed:
        if word in seed[key]:
            return word, key

In [60]:
def label_based_on_seed(seed, data, model):
    nlp = model.load()
    seed_entities = set()
    for datum in data:
        doc = nlp(datum)
        for d in doc:
            if "NN" in d.tag_:
                entity_and_label = is_in_seed(d.text, seed)
                if entity_and_label is not None: 
                    seed_entities.add(entity_and_label)
    return seed_entities

In [61]:
seed_entities = label_based_on_seed(seed, data[:10000], en_ner_bc5cdr_md)

In [62]:
seed_entities

{('2019-ncov', 'COVID'),
 ('7b', 'VIRAL_PROTEIN'),
 ('accessory_proteins', 'VIRAL_PROTEIN'),
 ('acute_lung_injury', 'COVID'),
 ('acute_respiratory_distress_syndrome', 'COVID'),
 ('ards', 'COVID'),
 ('bat', 'WILDLIFE'),
 ('bats', 'WILDLIFE'),
 ('bronchiolitis', 'COVID'),
 ('bronchitis', 'COVID'),
 ('c', 'COVID'),
 ('cell_line', 'WILDLIFE'),
 ('chikv', 'VIRAL_PROTEIN'),
 ('chills', 'COVID'),
 ('confirmed_cases', 'COVID'),
 ('coronavirus', 'COVID'),
 ('cov', 'VIRAL_PROTEIN'),
 ('covid-19', 'COVID'),
 ('cow', 'LIVESTOCK'),
 ('deer', 'LIVESTOCK'),
 ('dipeptidyl_peptidase_4', 'COVID'),
 ('domestic_animals', 'WILDLIFE'),
 ('dpp4', 'COVID'),
 ('drugs', 'VIRAL_PROTEIN'),
 ('dyspnea', 'COVID'),
 ('envelope', 'VIRAL_PROTEIN'),
 ('farm', 'LIVESTOCK'),
 ('fever', 'COVID'),
 ('glycoprotein', 'VIRAL_PROTEIN'),
 ('glycoproteins', 'VIRAL_PROTEIN'),
 ('goat', 'LIVESTOCK'),
 ('goats', 'LIVESTOCK'),
 ('goose', 'LIVESTOCK'),
 ('gp120', 'VIRAL_PROTEIN'),
 ('headache', 'COVID'),
 ('helicase', 'VIRAL_PROTEIN'

In [63]:
seed_entities_dataframe = pd.DataFrame(seed_entities,columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
seed_entities_dataframe['Ner_model'] = 'seed' # include a column with constant value of NER model
seed_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,cow,LIVESTOCK,seed
1,glycoprotein,VIRAL_PROTEIN,seed
2,vero,WILDLIFE,seed
3,gp120,VIRAL_PROTEIN,seed
4,room_temperature,COVID,seed
...,...,...,...
103,neuraminidase,VIRAL_PROTEIN,seed
104,nonstructural_proteins,VIRAL_PROTEIN,seed
105,n,VIRAL_PROTEIN,seed
106,septic_shock,COVID,seed


In [165]:
entities_and_label_from_5_NER_model_dataframe = pd.concat([bc5dr_entities_dataframe,
                                                           bionlp13cg_entities_dataframe,
                                                           craft_entities_dataframe,
                                                           jnlpa_entities_dataframe,
                                                           seed_entities_dataframe
                                                          ])
entities_and_label_from_5_NER_model_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13354 entries, 0 to 107
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Entity     13354 non-null  object
 1   Label      13354 non-null  object
 2   Ner_model  13354 non-null  object
dtypes: object(3)
memory usage: 417.3+ KB


In [166]:
entities_and_label_from_5_NER_model_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,kasugamycin,CHEMICAL,bc5dr
1,january_23rd,CHEMICAL,bc5dr
2,ventilatory_support\n,CHEMICAL,bc5dr
3,chlorhexidine\n,CHEMICAL,bc5dr
4,social_science,CHEMICAL,bc5dr
...,...,...,...
103,neuraminidase,VIRAL_PROTEIN,seed
104,nonstructural_proteins,VIRAL_PROTEIN,seed
105,n,VIRAL_PROTEIN,seed
106,septic_shock,COVID,seed


In [88]:
labels = set(list(entities_and_label_from_5_NER_model_dataframe["Label"]))
labels

{'AMINO_ACID',
 'ANATOMICAL_SYSTEM',
 'CANCER',
 'CELL',
 'CELLULAR_COMPONENT',
 'CELL_LINE',
 'CELL_TYPE',
 'CHEBI',
 'CHEMICAL',
 'CL',
 'COVID',
 'DEVELOPING_ANATOMICAL_STRUCTURE',
 'DISEASE',
 'DNA',
 'GENE_OR_GENE_PRODUCT',
 'GGP',
 'GO',
 'IMMATERIAL_ANATOMICAL_ENTITY',
 'LIVESTOCK',
 'MULTI_TISSUE_STRUCTURE',
 'ORGAN',
 'ORGANISM',
 'ORGANISM_SUBDIVISION',
 'ORGANISM_SUBSTANCE',
 'PATHOLOGICAL_FORMATION',
 'PROTEIN',
 'RNA',
 'SIMPLE_CHEMICAL',
 'SO',
 'TAXON',
 'TISSUE',
 'VIRAL_PROTEIN',
 'WILDLIFE'}

In [113]:
len(list(labels))

33

### Get each label and its entities

In [114]:
def get_labels_and_entities_dict_test(df, labels):
    output = {}
#     labels = set(list(df["Label"]))
    for label in labels:
        filter = df["Label"] == label
        output[label] = set([en for en in df.where(filter)["Entity"] if en == en])
    return output

In [120]:
def get_labels_and_entities_dict(df):
    output = {}
    labels = set(list(df["Label"]))
    for label in labels:
        filter = df["Label"] == label
        output[label] = set([en for en in df.where(filter)["Entity"] if en == en])
    return output

In [122]:
labels_and_entities_dict = get_labels_and_entities_dict(entities_and_label_from_5_NER_model_dataframe)
labels_and_entities_dict

{'ANATOMICAL_SYSTEM': {'immune organs ,'},
 'GGP': {'//creativecommons',
  '//www.nytimes.com/2020/01/22/world/asia/coronavirusquarantines-history.html',
  '//www.viprbrc.org/',
  '37,289',
  'a1-6',
  'a471',
  'a59',
  'a59 s1a',
  'a59 vlps',
  'ace2',
  'ace2 +',
  'ace2 + cells',
  'ace2 11',
  'ace2 antibody',
  'ace2 antibody sequences',
  'ace2 cell',
  'ace2 coding variants',
  'ace2 domain',
  'ace2 enzymatic',
  'ace2 enzyme',
  'ace2 extracellular domain',
  'ace2 gene',
  'ace2 gene region',
  'ace2 mrna',
  'ace2 peptidase',
  'ace2 peptidase mutation',
  'ace2 plasmid',
  'ace2 protein',
  'ace2 proteins',
  'ace2 receptor',
  'ace2 residue',
  'ace2 residues',
  'ace2 rna',
  'ace2 variants',
  'acf',
  'albumin',
  'alserehi',
  'anfis',
  'ang1',
  'angiotensin ii',
  'angiotensin ii receptor',
  'angiotensin ii type 1a receptor',
  'annexin v',
  'anti-integrin',
  'antigenic_peptides',
  'antigenicity-score',
  'antivirals/antibacterials +',
  'apoa1',
  'apoa2',
  

In [124]:
labels_and_entities_dict["WILDLIFE"]

{'bat',
 'bats',
 'cell_line',
 'domestic_animals',
 'mammals',
 'mouse',
 'pangolin',
 'pangolins',
 'rat',
 'ratg13',
 'reservoirs',
 't',
 'vero',
 'wild',
 'wild_animals',
 'wildlife'}

In [169]:
seed_label_entities_dict = {}
seed_label_entities_dict["COVID"] = labels_and_entities_dict["COVID"]
seed_label_entities_dict["WILDLIFE"] = labels_and_entities_dict["WILDLIFE"]
seed_label_entities_dict["VIRAL_PROTEIN"] = labels_and_entities_dict["VIRAL_PROTEIN"]
seed_label_entities_dict["LIVESTOCK"] = labels_and_entities_dict["LIVESTOCK"]
seed_label_entities_dict

{'COVID': {'2019-ncov',
  'acute_lung_injury',
  'acute_respiratory_distress_syndrome',
  'ards',
  'bronchiolitis',
  'bronchitis',
  'c',
  'chills',
  'confirmed_cases',
  'coronavirus',
  'covid-19',
  'dipeptidyl_peptidase_4',
  'dpp4',
  'dyspnea',
  'fever',
  'headache',
  'hubei_province',
  'hypotension',
  'incubation',
  'infection',
  'line_list',
  'mainland_china',
  'malaise',
  'mers',
  'mers-cov',
  'metapneumovirus',
  'mhv',
  'myalgia',
  'non-productive_cough',
  'nsp15',
  'pneumonia',
  'respiratory_failure',
  'respiratory_syncytial_virus',
  'room_temperature',
  'runny_nose',
  'sars-2',
  'sars-cov',
  'sars-cov-2',
  'septic_shock',
  'severe_acute_respiratory_syndrome_coronavirus'},
 'WILDLIFE': {'bat',
  'bats',
  'cell_line',
  'domestic_animals',
  'mammals',
  'mouse',
  'pangolin',
  'pangolins',
  'rat',
  'ratg13',
  'reservoirs',
  't',
  'vero',
  'wild',
  'wild_animals',
  'wildlife'},
 'VIRAL_PROTEIN': {'7b',
  'accessory_proteins',
  'chikv',

In [193]:
print(' '.join(is_in_seed("mice", seed_label_entities_dict)))

TypeError: can only join an iterable

In [198]:
def output_IOB_label(data, model, seed):
    output = []
    nlp = model.load()
    for datum in data:
        doc = nlp(datum)
        for d in doc:
            if 'NN' in d.tag_:
                word_label_pair = is_in_seed(d.text, seed)
                if word_label_pair != None:
                    output.append(' '.join([word_label_pair[0], 'B-'+word_label_pair[1]+'\n']))
                else:
                    output.append(' '.join([d.text, 'O'+'\n']))
            elif d.text != '\n':
                output.append(' '.join([d.text, 'O'+'\n']))
        output.append('\n') # separate for step 3
    return output

In [199]:
data_iob = output_IOB_label(data[:10000], en_ner_bc5cdr_md, seed_label_entities_dict)
data_iob

['sars-cov-2 B-COVID\n',
 'has O\n',
 'been O\n',
 'sequenced O\n',
 '\n',
 'a O\n',
 'phylogenetic O\n',
 'analysis O\n',
 'found O\n',
 'a O\n',
 'bat B-WILDLIFE\n',
 'origin O\n',
 'for O\n',
 'the O\n',
 'sars-cov-2 B-COVID\n',
 '\n',
 'there O\n',
 'is O\n',
 'a O\n',
 'diversity O\n',
 'of O\n',
 'possible O\n',
 'intermediate_hosts O\n',
 'for O\n',
 'sars-cov-2 B-COVID\n',
 ', O\n',
 'including O\n',
 'pangolins B-WILDLIFE\n',
 ', O\n',
 'but O\n',
 'not O\n',
 'mice O\n',
 'and O\n',
 'rats O\n',
 '\n',
 'there O\n',
 'are O\n',
 'many O\n',
 'similarities O\n',
 'of O\n',
 'sars-cov-2 B-COVID\n',
 'with O\n',
 'the O\n',
 'original O\n',
 'sars-cov B-COVID\n',
 '\n',
 'using O\n',
 'computer O\n',
 'modeling O\n',
 ', O\n',
 'xu O\n',
 'et O\n',
 'al O\n',
 '\n',
 'found O\n',
 'that O\n',
 'the O\n',
 'spike_proteins O\n',
 'of O\n',
 'sars-cov-2 B-COVID\n',
 'and O\n',
 'sars-cov B-COVID\n',
 'have O\n',
 'almost O\n',
 'identical O\n',
 '3-d O\n',
 'structures O\n',
 'in O

In [197]:
iob_file = open('./iob-test.txt', 'w')
# iob_file.writelines("%s\n" % l for l in data_iob)
iob_file.writelines(data_iob)

In [117]:
l = list(labels)[:3]
l

['ANATOMICAL_SYSTEM', 'GGP', 'SO']

In [119]:
get_labels_and_entities_dict_test(entities_and_label_from_5_NER_model_dataframe, l)

{'ANATOMICAL_SYSTEM': {'immune organs ,'},
 'GGP': {'//creativecommons',
  '//www.nytimes.com/2020/01/22/world/asia/coronavirusquarantines-history.html',
  '//www.viprbrc.org/',
  '37,289',
  'a1-6',
  'a471',
  'a59',
  'a59 s1a',
  'a59 vlps',
  'ace2',
  'ace2 +',
  'ace2 + cells',
  'ace2 11',
  'ace2 antibody',
  'ace2 antibody sequences',
  'ace2 cell',
  'ace2 coding variants',
  'ace2 domain',
  'ace2 enzymatic',
  'ace2 enzyme',
  'ace2 extracellular domain',
  'ace2 gene',
  'ace2 gene region',
  'ace2 mrna',
  'ace2 peptidase',
  'ace2 peptidase mutation',
  'ace2 plasmid',
  'ace2 protein',
  'ace2 proteins',
  'ace2 receptor',
  'ace2 residue',
  'ace2 residues',
  'ace2 rna',
  'ace2 variants',
  'acf',
  'albumin',
  'alserehi',
  'anfis',
  'ang1',
  'angiotensin ii',
  'angiotensin ii receptor',
  'angiotensin ii type 1a receptor',
  'annexin v',
  'anti-integrin',
  'antigenic_peptides',
  'antigenicity-score',
  'antivirals/antibacterials +',
  'apoa1',
  'apoa2',
  

In [110]:
filter = entities_and_label_from_5_NER_model_dataframe["Label"] == "CHEMICAL"
chem = set([i for i in entities_and_label_from_5_NER_model_dataframe.where(filter)["Entity"] if i == i])
chem

{'lactate',
 'wuhan-hu-1',
 'gilead_sciences',
 'susceptible_population',
 'genetic_analysis',
 '±',
 'super_spreading',
 'binding_energies',
 'fudan_university',
 'academic_journals\n',
 'membrane_protein',
 'rheumatoid_arthritis',
 'sichuan_province',
 'n95_masks',
 '//www.who.int/emergencies/mers-cov/en/',
 'zika',
 'ifn',
 'bbduk',
 'evans',
 'people^',
 'case-contact',
 'fortify',
 'table_s4',
 'bsaas',
 'inhale',
 'nucleotide',
 'chest_x-rays',
 'world_health_organization , 2020b',
 '1·5',
 'two-beforeand-three-after',
 'slit_lamp',
 'münchen',
 'self-quarantine',
 'tangerine',
 'chest_tightness',
 'aminoglycoside',
 'nonhuman_primate',
 'mortality_rate',
 'collection_date',
 "the 'qlqmgfgitvqygt",
 '1 , 2\n',
 'omeprazole',
 'takes_place',
 'pcdna3.1 expression_vector plasmids , respectively\n',
 'western_medicine',
 'respiratory_illness',
 'chromium',
 'hemorrhagic_fever',
 'qpyrvvvlsf',
 'infected_individuals',
 'mientras',
 'polyclonal_antibodies',
 'line_list',
 'enfermedade

In [109]:
len(chem)

1505

In [93]:
entities_and_label_from_5_NER_model_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
103,,,
104,,,
105,,,
106,septic_shock,COVID,seed


In [143]:
d = entities_and_label_from_5_NER_model_dataframe.groupby(["Entity"], as_index=False).count()
filter = d["Label"] != 1
c = d.where(filter)
c

Unnamed: 0,Entity,Label,Ner_model
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
9376,,,
9377,,2.0,2.0
9378,,2.0,2.0
9379,,,


In [150]:
entities_with_one_more_label = [en for en in c.where(filter)["Entity"] if en == en]
entities_with_one_more_label

['-london',
 '//crdd.osdd.net/ ragha va/toxin pred/',
 '//creativecommons',
 '//github.com/aakhmetz/ wuhanincubationperiod2020\n',
 '//github.com/globalcitizen/2019-wuhan-coronavirus-data\n',
 '//omict ools.com/aller hunte r-tool',
 '//www.allelefrequencies.net/',
 '//www.cbs.dtu.dk/servi ces/tmhmm',
 '//www.cdc.gov/',
 '//www.cdc.gov/flu/weekly/',
 '//www.gisaid.org/',
 '//www.mdpi.com/2077-0383/9/2/538/s1',
 '//www.nytimes.com/2020/01/22/world/asia/coronavirusquarantines-history.html',
 '//www.uniprot.org/uniprot/p59594',
 '//www.viprbrc.org/',
 '//www.who.int/',
 '//www.who.int/docs/default-source/coronaviruse/situationreports/20200211-sitrep-22-ncov.pdf',
 '/oxygen',
 '0-fold_increase',
 '0-folde2-fold_increase',
 '0.7.12',
 '1-2 mg/kg',
 '1.4e2.5',
 '1.7e4.3\n',
 '1/γ b\n',
 '1/ε',
 '1/ω',
 '1/ω b',
 '10/4719',
 '11,317e12,118 cumulative_cases',
 '11,588e13,499',
 '117/47 mm_hg',
 '1186/s40779-020-0233-6\n',
 '13/565',
 '13/565-i.e',
 '15/10009',
 '1918',
 '1:10',
 '1:40-1:80\n',


In [151]:
len(entities_with_one_more_label)

3036

In [153]:
test_en = entities_with_one_more_label[1000]
test_en

'epi_isl_402131'

In [167]:
filter = entities_and_label_from_5_NER_model_dataframe["Entity"] == test_en
entities_and_label_from_5_NER_model_dataframe.where(filter)

Unnamed: 0,Entity,Label,Ner_model
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
103,,,
104,,,
105,,,
106,,,


In [168]:
t = [en for en in entities_and_label_from_5_NER_model_dataframe.where(filter)["Label"] if en == en]
t

['GENE_OR_GENE_PRODUCT', 'CELL_LINE']