In [None]:
#!pip install --upgrade pandas

In [None]:
#!pip install featuretools

In [None]:
#!pip install git+https://github.com/dgunning/cord19.git

In [1]:
import pandas as pd
import pickle
import subprocess
import shutil
import hashlib
#import nltk
import os

In [2]:
from cord import ResearchPapers
import featuretools as ft
#from featuretools.nlp_primitives import UniversalSentenceEncoder


  import pandas.util.testing as tm


In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#path information
task='task1'# or 'task2' # specify task
#root_path='/repo1/code/autoreview/'
root_path='./'
data_path=root_path+'data/'+task+'/'
literature_path=data_path+'CORD-19-research-challenge/'#path to save retrieved articles abstract
sentence_file_name='hypercoagulable_sentences.tsv'
#sentence_embedding_file_name='hypercoagulable_sentence_USEfeatures.pkl'
keywords_of_interest =['anticoagulants','venous thromboembolism', 'thrombotic complications', 'hypercoagulability','clot formation', 'Thrombosis', 'Thrombotic', 'D-Dimer']


In [5]:
literature_path

'./data/task1/CORD-19-research-challenge/'

In [6]:
def download_literature():
    """
    Download literatures from CORD19, Google Scholar, and Bioarix.
    """
    #Download
    #Biorxiv
    subprocess.call(['curl -o collection.json https://connect.biorxiv.org/relate/collection_json.php?grp=181'], shell=True)
    #CORD19
    subprocess.call(['wget https://www.dropbox.com/s/osa58hx8rs5yl3t/metadata.csv?raw=1'],shell=True)
    #Google Scholar
    subprocess.call(['wget https://www.dropbox.com/s/5incr3c86sh43gq/hypercoagulable_fulltext.xlsx?raw=1'],shell=True)

    #Create directory to put literature files
    if not os.path.exists(literature_path):
        os.makedirs(literature_path)
    
    #move files to data_path
    shutil.move('./metadata.csv?raw=1',literature_path+'metadata_old.csv')
    shutil.move('./hypercoagulable_fulltext.xlsx?raw=1',literature_path+'hypercoagulable_fulltext.xlsx')
    shutil.move('./collection.json', literature_path+'biorxiv.json')


In [7]:
def merge2cord19_metadata():
    """
    Merge the three different dataframes into one dataframe following CORD19 structure
    """
    
    def hash(sourcedf,destinationdf,*column):
        columnName = ''
        destinationdf['sha'] = pd.DataFrame(sourcedf[list(column)].values.sum(axis=1))[0].str.encode('utf-8').apply(lambda x: (hashlib.sha512(x).hexdigest().upper()))
        destinationdf['cord_uid'] = destinationdf['sha'].apply(lambda x: x[0:8] )
    # hash(df,df,'ID','Salt')

    #Load the literature files as pd.Dataframe
    df_cord19_meta = pd.read_csv(literature_path+'metadata_old.csv')
    df_pap_gscholar = pd.read_excel(literature_path+'hypercoagulable_fulltext.xlsx')
    df_biomedRxiv = pd.json_normalize(pd.read_json(literature_path+'biorxiv.json')['rels'],'rel_authors',['rel_title', 'rel_doi',\
                                                                        'rel_link', 'rel_abs', \
                                                                        'rel_date', 'rel_site'])

    df_biomedRxiv = df_biomedRxiv.groupby(['rel_title', 'rel_doi', 'rel_link',
           'rel_abs', 'rel_date', 'rel_site'])['author_name'].agg(author_name=lambda x: ','.join(x)).reset_index()


    # Merge the three different dataframes into `df_cord19_meta_augmented`
    dic_pap2cord10 = {'Authors':'authors', 'Title': 'title', 
                      'Year':'publish_time',  'ArticleURL':'url', 'Source':'journal',
                       'DOI':'doi', 'Abstract':'abstract'}
    dic_biomedRxiv2cord10 = {
        'rel_title':'title', 'rel_doi':'doi' ,
        'rel_link':'url', 'rel_abs':'abstract', 
        'author_name':'authors', 'rel_date':'publish_time', 
        'rel_site':'journal'
    }

    #display(df_cord19_meta.head(1))

    df_pap_scholar_compatible = df_pap_gscholar[list(dic_pap2cord10.keys())]
    df_pap_scholar_compatible.columns = list(dic_pap2cord10.values())
    hash(df_pap_scholar_compatible,df_pap_scholar_compatible,'title')
    #display(df_pap_scholar_compatible.head(1))

    df_biomedRxiv_compatible = df_biomedRxiv[list(dic_biomedRxiv2cord10.keys())]
    df_biomedRxiv_compatible.columns = list(dic_biomedRxiv2cord10.values())
    hash(df_biomedRxiv_compatible,df_biomedRxiv_compatible,'title')
    #display(df_biomedRxiv_compatible.head(1))

    df_cord19_meta_augmented = df_cord19_meta.append(df_pap_scholar_compatible, ignore_index=True).append(df_biomedRxiv_compatible, ignore_index=True)
    df_cord19_meta_augmented['publish_time'] = df_cord19_meta_augmented['publish_time'].astype('datetime64')
    
    df_cord19_meta_augmented.to_csv(literature_path+'metadata.csv')

    #return df_cord19_meta_augmented



In [8]:
def create_sentence_df(df):
    u = df.abstract.str.split('.',expand=True).stack()

    sentences = pd.DataFrame({
        'ncord_uid': u.index.get_level_values(0) , 
        'sentence': u.values
    })
    
    return sentences
    

In [9]:
def create_entityset(entityset_name, df):
    es = ft.EntitySet(entityset_name)
    sentences=create_sentence_df(df)

    es = es.entity_from_dataframe(entity_id="paper",
                                  dataframe=df,
                                  index = 'ncord_uid',
                                  make_index = True,
                                 )

    es = es.entity_from_dataframe(entity_id="sentence",
                                  dataframe=sentences,
                                  index = 'sid',
                                  make_index = True,
                                 )

    es = es.add_relationship(ft.Relationship(es["paper"]["ncord_uid"],
                                       es["sentence"]["ncord_uid"]))

    return es


In [10]:
# Prepare merged `metadata` if not exist
download_literature()
merge2cord19_metadata()

  if (await self.run_code(code, result,  async_=asy)):


In [11]:
# integrate with CORD19 package for screening/searching
research_papers = ResearchPapers.load(data_dir=data_path)

Loading metadata from data/task1/CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 233.0 seconds


In [12]:
# Search papers related to keywords
fields = ['cord_uid','sha','title','journal','authors','abstract','covid_related','virus','coronavirus','sars','published','when']
hypercoagulable =pd.DataFrame(columns = fields)

for item in keywords_of_interest:
    print('Searching term = ', item)
    temp  = research_papers.contains('treat',column='abstract').search(item,num_results=1000)
    hypercoagulable = hypercoagulable.append(temp.results[fields], ignore_index = True)

Searching term =  anticoagulants
Searching term =  venous thromboembolism
Searching term =  thrombotic complications
Searching term =  hypercoagulability
Searching term =  clot formation
Searching term =  Thrombosis
Searching term =  Thrombotic
Searching term =  D-Dimer


In [13]:
# Generate sentence level dataframe from all papers
es=create_entityset("covid19_complete", research_papers.metadata[fields])
fulldf = es['sentence'].df.merge(es['paper'].df[['ncord_uid','cord_uid','sha']])
fulldf.rename(columns={'sha':'paper_id'},inplace=True)
fulldf.to_csv(literature_path+'metadata_sentences_with_cord_uid.csv')


In [14]:
# Generate sentence level dataframe from hypercoagulabel papers
es_hyper=create_entityset("covid19", hypercoagulable[fields])
es_hyper['sentence'].df.to_csv(data_path+sentence_file_name, sep='\t')


In [20]:
research_papers.metadata[fields]

Unnamed: 0,cord_uid,sha,title,journal,authors,abstract,covid_related,virus,coronavirus,sars,published,when
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,Clinical features of culture-proven Mycoplasma...,BMC Infect Dis,"Madani, Tariq A; Al-Ghamdi, Aisha A",OBJECTIVE: This retrospective chart review des...,False,False,False,False,2001-07-04,19 years ago
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,Nitric oxide: a pro-inflammatory mediator in l...,Respir Res,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Inflammatory diseases of the respiratory tract...,False,False,False,False,2000-08-15,20 years ago
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,Surfactant protein-D and pulmonary host defense,Respir Res,"Crouch, Erika C",Surfactant protein-D (SP-D) participates in th...,False,True,False,False,2000-08-25,20 years ago
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,Role of endothelin-1 in lung disease,Respir Res,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Endothelin-1 (ET-1) is a 21 amino acid peptide...,False,False,False,False,2001-02-22,19 years ago
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,Gene expression in epithelial cells in respons...,Respir Res,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respiratory syncytial virus (RSV) and pneumoni...,False,True,False,False,2001-05-11,19 years ago
...,...,...,...,...,...,...,...,...,...,...,...,...
144424,247517D7,247517D70CB9DFDE79E2B8DB415EAC61CEB02AEDBF6177...,eCovSens-Ultrasensitive Novel In-House Built P...,biorxiv,"Subhasis Mahari,Akanksha Roberts,Deepshikha Sh...",Severe acute respiratory syndrome coronavirus ...,True,True,True,False,2020-04-25,1 month ago
144425,C3134D68,C3134D68AD2A300616A0E6417BEE06D12FC1AC84BDCDF8...,iSCAN: An RT-LAMP-coupled CRISPR-Cas12 module ...,medrxiv,"Zahir Ali,Rashid Aman,Ahmed Mahas,sivakrishna ...",The COVID-19 pandemic caused by SARS-CoV-2 aff...,True,True,False,False,2020-06-05,2 weeks ago
144426,647A925F,647A925FDBE1BB8E47EB95D64C62F4689216E82FAA938A...,,biorxiv,"A. Sina Booeshaghi,Lior Pachter",Single-cell RNA-seq technologies have been suc...,True,False,False,False,2020-05-19,4 weeks ago
144427,21DAC328,21DAC3289CEB276401D586FD85C56FF45683A8DC64D9BC...,protein-sol pKa: prediction of electrostatic f...,biorxiv,"Max Hebditch,Jim Warwicker",Evolution couples differences in ambient pH to...,False,True,True,False,2020-04-22,1 month ago


In [19]:
hypercoagulable

Unnamed: 0,cord_uid,sha,title,journal,authors,abstract,covid_related,virus,coronavirus,sars,published,when
0,6580mi5z,,Anti-thrombotic therapy in patients with atria...,Expert review of neurotherapeutics,"Diener, Hans-Christoph; Stanford, Sophia; Abdu...",Patients with atrial fibrillation have an incr...,False,False,False,False,2014-01-01,6 years ago
1,7e4niv98,,Early Recurrence and Cerebral Bleeding in Pati...,Stroke,"Paciaroni, Maurizio; Agnelli, Giancarlo; Faloc...",AND PURPOSE The best time for administering a...,False,False,False,False,2015-01-01,5 years ago
2,i4jy2wj6,,Use of emerging oral anticoagulants in clinica...,Annals of surgery,"Merli, Geno; Spyropoulos, Alex C; Caprini, Jos...",OBJECTIVE A review of clinical data from oral ...,False,False,False,False,2009-01-01,11 years ago
3,t7memf4w,,Nonvitamin-K-antagonist oral anticoagulants ve...,International journal of stroke : official jou...,"Ntaios, George; Papavasileiou, Vasileios; Dien...",Background In a previous systematic review and...,False,False,False,False,2017-01-01,3 years ago
4,et5totru,,"Sex-related differences in risk factors, type ...",European stroke journal,"Antonenko, Kateryna; Paciaroni, Maurizio; Agne...",Introduction Atrial fibrillation is an indepen...,False,False,False,False,2017-01-01,3 years ago
...,...,...,...,...,...,...,...,...,...,...,...,...
7856,6hii27h6,,Hernia repair and simultaneous continuous ambu...,Hernia : the journal of hernias and abdominal ...,"Horvath, P; Königsrainer, A; Mühlbacher, T; Th...",Occurrence of abdominal wall hernias during a...,False,False,False,False,2019-11-26,6 months ago
7857,0n7905e0,,Suspected Lateral Periodontal Cyst Presenting ...,Journal of veterinary dentistry,"Tjepkema, Jennifer; Soukup, Jason W; Bell, Cyn...",Lateral periodontal cysts (LPCs) are odontogen...,False,False,False,False,2017-01-01,3 years ago
7858,x738yj4w,,Perioperative increase in global blood flow to...,The Cochrane database of systematic reviews,"Grocott, Michael P W; Dushianthan, Ahilanandan...",Studies have suggested that increasing whole ...,False,False,False,False,2012-01-01,8 years ago
7859,5tp3zfze,,Bilateral acute iris transillumination.,Archives of ophthalmology,"Tugal-Tutkun, Ilknur; Onal, Sumru; Garip, Ayli...",OBJECTIVE To describe a series of patients wit...,False,False,False,False,2011-01-01,9 years ago


In [15]:
fulldf

Unnamed: 0,sid,ncord_uid,sentence,cord_uid,paper_id
0,0,0,OBJECTIVE: This retrospective chart review des...,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb
1,1,0,METHODS: Patients with positive M,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb
2,2,0,pneumoniae cultures from respiratory specimen...,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb
3,3,0,Charts of patients were reviewed,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb
4,4,0,"RESULTS: 40 patients were identified, 33 (82",ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb
...,...,...,...,...,...
1420065,1420065,144428,Here we employed single-cell RNA-seq (scRNA-s...,489FD052,489FD052FE0E26BCBB34508813E39E83DC9033B9019B28...
1420066,1420066,144428,We report the co-expression of ACE2 and TMPRS...,489FD052,489FD052FE0E26BCBB34508813E39E83DC9033B9019B28...
1420067,1420067,144428,"Importantly, we fail to detect the expression...",489FD052,489FD052FE0E26BCBB34508813E39E83DC9033B9019B28...
1420068,1420068,144428,These results indicated that in COVID-19 asso...,489FD052,489FD052FE0E26BCBB34508813E39E83DC9033B9019B28...


In [17]:
research_papers.metadata

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,...,pmc_json_files,url,s2_id,when,covid_related,virus,coronavirus,sars,index_tokens,antivirals
0,0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636.0,no-cc,OBJECTIVE: This retrospective chart review des...,...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,19 years ago,False,False,False,False,"[objective, retrospective, chart, review, desc...",
1,1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967.0,no-cc,Inflammatory diseases of the respiratory tract...,...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,20 years ago,False,False,False,False,"[inflammatory, diseases, respiratory, tract, c...",
2,2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972.0,no-cc,Surfactant protein-D (SP-D) participates in th...,...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,20 years ago,False,True,False,False,"[surfactant, protein-d, sp-d, participates, in...",
3,3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,19 years ago,False,False,False,False,"[endothelin-1, et-1, amino, acid, peptide, div...",
4,4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888.0,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,19 years ago,False,True,False,False,"[respiratory, syncytial, virus, rsv, pneumonia...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144424,144424,247517D7,247517D70CB9DFDE79E2B8DB415EAC61CEB02AEDBF6177...,,eCovSens-Ultrasensitive Novel In-House Built P...,10.1101/2020.04.24.059204,,,,Severe acute respiratory syndrome coronavirus ...,...,,http://biorxiv.org/cgi/content/short/2020.04.2...,,1 month ago,True,True,True,False,"[severe, acute, respiratory, syndrome, coronav...",
144425,144425,C3134D68,C3134D68AD2A300616A0E6417BEE06D12FC1AC84BDCDF8...,,iSCAN: An RT-LAMP-coupled CRISPR-Cas12 module ...,10.1101/2020.06.02.20117739,,,,The COVID-19 pandemic caused by SARS-CoV-2 aff...,...,,http://medrxiv.org/cgi/content/short/2020.06.0...,,2 weeks ago,True,True,False,False,"[covid-19, pandemic, caused, sars-cov-2, affec...",
144426,144426,647A925F,647A925FDBE1BB8E47EB95D64C62F4689216E82FAA938A...,,,10.1101/2020.05.19.100214,,,,Single-cell RNA-seq technologies have been suc...,...,,http://biorxiv.org/cgi/content/short/2020.05.1...,,4 weeks ago,True,False,False,False,"[single-cell, rna-seq, technologies, successfu...",
144427,144427,21DAC328,21DAC3289CEB276401D586FD85C56FF45683A8DC64D9BC...,,protein-sol pKa: prediction of electrostatic f...,10.1101/2020.04.21.053967,,,,Evolution couples differences in ambient pH to...,...,,http://biorxiv.org/cgi/content/short/2020.04.2...,,1 month ago,False,True,True,False,"[evolution, couples, differences, ambient, ph,...",


In [None]:
# # Extract sentence embedding from UniversalSentenceEncoder
# It takes around 15 minutes
# fm_inv, features_inv = ft.dfs(entityset=es_hyper,
#                               target_entity='sentence',
#                               max_depth = 1,
#                               trans_primitives =[UniversalSentenceEncoder],
#                               verbose=True)
# fm_inv.filter(regex='UNIVERSAL_SENTENCE_ENCODER(sentence)*|ncord_uid',axis=1)
# fm_inv.to_pickle(data_path+'hypercoagulable_sentence_USEfeatures.pkl')