In [None]:
#!pip install --upgrade pandas

In [None]:
#!pip install featuretools

In [None]:
#!pip install git+https://github.com/dgunning/cord19.git

In [1]:
import pandas as pd
import pickle
import subprocess
import shutil
import hashlib
#import nltk
import os

In [2]:
from cord import ResearchPapers
import featuretools as ft
#from featuretools.nlp_primitives import UniversalSentenceEncoder


  import pandas.util.testing as tm


In [3]:
# Fetch the NLTK 'punkt' resource; the recorded run shows it being placed under
# the local nltk_data directory and skipped when already up to date.
# NOTE(review): nothing in the visible cells calls nltk directly - presumably a
# dependency needs punkt; confirm before removing.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Path configuration
task = 'task1'  # specify task: 'task1' or 'task2'
# Alternative root used previously: '/repo1/code/autoreview/'
root_path = './'
data_path = f'{root_path}data/{task}/'
# Directory where the retrieved article metadata/abstracts are saved
literature_path = f'{data_path}CORD-19-research-challenge/'
sentence_file_name = 'hypercoagulable_sentences.tsv'
# sentence_embedding_file_name = 'hypercoagulable_sentence_USEfeatures.pkl'

# Search terms used to select the papers of interest
keywords_of_interest = [
    'anticoagulants',
    'venous thromboembolism',
    'thrombotic complications',
    'hypercoagulability',
    'clot formation',
    'Thrombosis',
    'Thrombotic',
    'D-Dimer',
]


In [5]:
def download_literature():
    """
    Download the literature sources: the bioRxiv collection, the CORD19
    metadata and the Google Scholar full-text spreadsheet.

    The files are written into `literature_path` as `biorxiv.json`,
    `metadata_old.csv` and `hypercoagulable_fulltext.xlsx`. The directory is
    created when it does not exist yet.

    Raises:
        subprocess.CalledProcessError: if curl or wget exits with a non-zero status.
        FileNotFoundError: if the curl or wget executable is not installed.
    """
    # Create the target directory up front so the tools can write straight into it.
    os.makedirs(literature_path, exist_ok=True)

    # Commands are passed as argument lists (no shell): the URLs contain '?',
    # which a shell would treat as a glob character. check=True makes a failed
    # download stop here instead of surfacing later as a missing-file error.

    # bioRxiv (--fail: an HTTP error status becomes a non-zero exit code
    # instead of saving the error page as the JSON file)
    subprocess.run(
        ['curl', '--fail', '-o', literature_path + 'biorxiv.json',
         'https://connect.biorxiv.org/relate/collection_json.php?grp=181'],
        check=True,
    )
    # CORD19
    subprocess.run(
        ['wget', '-O', literature_path + 'metadata_old.csv',
         'https://www.dropbox.com/s/osa58hx8rs5yl3t/metadata.csv?raw=1'],
        check=True,
    )
    # Google Scholar
    subprocess.run(
        ['wget', '-O', literature_path + 'hypercoagulable_fulltext.xlsx',
         'https://www.dropbox.com/s/5incr3c86sh43gq/hypercoagulable_fulltext.xlsx?raw=1'],
        check=True,
    )


In [6]:
def merge2cord19_metadata():
    """
    Merge the three downloaded literature sources into one CORD19-structured
    metadata file.

    Reads `metadata_old.csv` (CORD19), `hypercoagulable_fulltext.xlsx` (Google
    Scholar) and `biorxiv.json` (bioRxiv) from `literature_path`, renames the
    Google Scholar and bioRxiv columns to their CORD19 names, gives those rows
    `sha` / `cord_uid` identifiers derived from the title, stacks them under
    the CORD19 rows and writes the result to `literature_path + 'metadata.csv'`.
    Returns None.
    """

    def add_hash_ids(frame, *columns):
        """
        Add `sha` and `cord_uid` columns to `frame` in place.

        `sha` is the upper-case SHA-512 hex digest of the UTF-8 encoded
        concatenation of `columns` for each row; `cord_uid` is its first
        8 characters.
        """
        # fillna(''): a missing value hashes as empty text instead of raising
        # a TypeError inside hashlib.
        rows = frame[list(columns)].fillna('').astype(str).values.tolist()
        digests = [
            hashlib.sha512(''.join(parts).encode('utf-8')).hexdigest().upper()
            for parts in rows
        ]
        # Plain lists are assigned by position, so the result does not depend
        # on what index `frame` happens to carry.
        frame['sha'] = digests
        frame['cord_uid'] = [digest[0:8] for digest in digests]

    # Load the literature files as pd.DataFrame
    df_cord19_meta = pd.read_csv(literature_path+'metadata_old.csv')
    df_pap_gscholar = pd.read_excel(literature_path+'hypercoagulable_fulltext.xlsx')

    # One row per author after normalisation, then collapsed back to one row
    # per paper with the author names joined by commas.
    biorxiv_paper_cols = ['rel_title', 'rel_doi', 'rel_link', 'rel_abs', 'rel_date', 'rel_site']
    df_biomedRxiv = pd.json_normalize(pd.read_json(literature_path+'biorxiv.json')['rels'],
                                      'rel_authors', biorxiv_paper_cols)
    df_biomedRxiv = df_biomedRxiv.groupby(biorxiv_paper_cols)['author_name'].agg(
        author_name=lambda x: ','.join(x)).reset_index()

    # Source column name -> CORD19 column name
    dic_pap2cord10 = {'Authors':'authors', 'Title': 'title',
                      'Year':'publish_time',  'ArticleURL':'url', 'Source':'journal',
                      'DOI':'doi', 'Abstract':'abstract'}
    dic_biomedRxiv2cord10 = {
        'rel_title':'title', 'rel_doi':'doi' ,
        'rel_link':'url', 'rel_abs':'abstract',
        'author_name':'authors', 'rel_date':'publish_time',
        'rel_site':'journal'
    }

    # rename() returns a new frame, so adding the id columns afterwards does
    # not write into a slice of the loaded data.
    df_pap_scholar_compatible = df_pap_gscholar[list(dic_pap2cord10.keys())].rename(columns=dic_pap2cord10)
    add_hash_ids(df_pap_scholar_compatible, 'title')

    df_biomedRxiv_compatible = df_biomedRxiv[list(dic_biomedRxiv2cord10.keys())].rename(columns=dic_biomedRxiv2cord10)
    add_hash_ids(df_biomedRxiv_compatible, 'title')

    # Merge the three dataframes. pd.concat replaces the chained
    # DataFrame.append calls, which no longer exist in pandas 2.0.
    df_cord19_meta_augmented = pd.concat(
        [df_cord19_meta, df_pap_scholar_compatible, df_biomedRxiv_compatible],
        ignore_index=True,
    )
    # An explicit unit is required: unit-less 'datetime64' is rejected by pandas 2.0.
    # NOTE(review): Google Scholar 'Year' values presumably arrive as bare
    # numbers; confirm they are parsed as years rather than epoch offsets.
    df_cord19_meta_augmented['publish_time'] = df_cord19_meta_augmented['publish_time'].astype('datetime64[ns]')

    df_cord19_meta_augmented.to_csv(literature_path+'metadata.csv')

    #return df_cord19_meta_augmented



In [7]:
def create_sentence_df(df):
    """
    Build a sentence-level dataframe from the `abstract` column of `df`.

    Each abstract is split on '.'; every resulting piece becomes one row.

    Returns:
        pd.DataFrame with columns `ncord_uid` (the index label of the row in
        `df` the piece came from) and `sentence` (the piece of text).
    """
    # One column per piece; shorter abstracts are padded with missing values.
    pieces = df['abstract'].str.split('.', expand=True)
    # Long format indexed by (row label of df, piece position).
    stacked = pieces.stack()
    source_rows = stacked.index.get_level_values(0)

    return pd.DataFrame({
        'ncord_uid': source_rows,
        'sentence': stacked.values,
    })
    

In [8]:
def create_entityset(entityset_name, df):
    """
    Create a featuretools EntitySet holding a "paper" entity built from `df`
    and a "sentence" entity built from `create_sentence_df(df)`, linked on
    `ncord_uid`.

    Args:
        entityset_name: name passed to `ft.EntitySet`.
        df: paper-level dataframe; must provide the `abstract` column that
            `create_sentence_df` reads.

    Returns:
        The EntitySet after both entities and the relationship were added.
    """
    sentence_df = create_sentence_df(df)
    entityset = ft.EntitySet(entityset_name)

    # (entity id, dataframe, name of the index column featuretools creates)
    entity_specs = (
        ("paper", df, 'ncord_uid'),
        ("sentence", sentence_df, 'sid'),
    )
    for entity_id, frame, index_name in entity_specs:
        entityset = entityset.entity_from_dataframe(
            entity_id=entity_id,
            dataframe=frame,
            index=index_name,
            make_index=True,
        )

    # paper.ncord_uid is the parent side, sentence.ncord_uid the child side.
    parent_variable = entityset["paper"]["ncord_uid"]
    child_variable = entityset["sentence"]["ncord_uid"]
    return entityset.add_relationship(ft.Relationship(parent_variable, child_variable))


In [9]:
# Prepare the merged `metadata.csv` only if it does not exist yet, so that
# re-running the notebook does not download and merge everything again.
# Delete the file to force a refresh.
if not os.path.exists(literature_path + 'metadata.csv'):
    download_literature()
    merge2cord19_metadata()

  if (await self.run_code(code, result,  async_=asy)):


In [10]:
# integrate with CORD19 package for screening/searching
# Loading also builds the search index over the abstracts; the recorded run
# reported roughly 234 seconds for indexing, so expect this cell to be slow.
research_papers = ResearchPapers.load(data_dir=data_path)

Loading metadata from data/task1/CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 234.0 seconds


In [12]:
# Search papers related to keywords
fields = ['cord_uid','sha','title','journal','authors','abstract','covid_related','virus','coronavirus','sars','published','when']

# Collect one result frame per keyword and concatenate once at the end:
# DataFrame.append no longer exists in pandas 2.0, and growing a frame inside
# the loop copies all accumulated rows on every iteration.
# The leading empty frame keeps the `fields` columns even with no keywords.
keyword_hits = [pd.DataFrame(columns = fields)]

for item in keywords_of_interest:
    print('Searching term = ', item)
    # Restrict to papers whose abstract contains 'treat', then search the keyword.
    temp  = research_papers.contains('treat',column='abstract').search(item,num_results=1000)
    keyword_hits.append(temp.results[fields])

# A paper matched by several keywords appears once per matching keyword.
hypercoagulable = pd.concat(keyword_hits, ignore_index = True)

Searching term =  anticoagulants
Searching term =  venous thromboembolism
Searching term =  thrombotic complications
Searching term =  hypercoagulability
Searching term =  clot formation
Searching term =  Thrombosis
Searching term =  Thrombotic
Searching term =  D-Dimer


In [None]:
# Save the keyword-matched papers as TSV; the former row index is kept as the
# `ncord_uid` column.
(
    hypercoagulable
    .reset_index()
    .rename(columns={'index':'ncord_uid'})
    .to_csv(data_path+'metadata_hypercoagulable.tsv', sep='\t')
)

In [None]:
# # Generate sentence level dataframe from all papers
# es=create_entityset("covid19_complete", research_papers.metadata[fields])
# fulldf = es['sentence'].df.merge(es['paper'].df[['ncord_uid','cord_uid','sha']])
# fulldf.rename(columns={'sha':'paper_id'},inplace=True)
# fulldf.to_csv(literature_path+'metadata_sentences_with_cord_uid.csv')


In [13]:
# Generate sentence level dataframe from hypercoagulable papers
#es_hyper=create_entityset("covid19", hypercoagulable[fields])
#es_hyper['sentence'].df.to_csv(data_path+sentence_file_name, sep='\t')
