In [42]:
# This script finds matches to sets of words in tokenized datasets.  It can run over titles, abstracts, body text, etc.

In [43]:
# Switches read by ExtractToCSV: one PullMentions pass is run per enabled section.
RunTitle = True      # match against paper titles
RunAbstract = True   # match against abstract blocks
RunBody = False      # match against body-text blocks

In [44]:
import numpy as np
import pylab
import pandas as pd
import json
import os
import scispacy
import spacy
# Shared spaCy pipeline, loaded once at import time; PullMentions uses it to
# lemmatize both the search words and the paper text.
nlp = spacy.load("en_core_sci_lg")

In [45]:
# Directories whose files are scanned; PullMentions visits every file in each one.
# To run on a single subset (e.g. for a quick test) keep only one entry, such as
# "./CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/".
Paths = [
    "./CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/",
    "./CORD-19-research-challenge/comm_use_subset/comm_use_subset/",
    "./CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/",
    "./CORD-19-research-challenge/custom_license/custom_license/",
]

In [46]:
# These functions determine what blocks are pulled from the paper for matching
def TitleBlocks(paper):
    """Wrap the paper's title in a one-element list of ``{'text': ...}`` blocks.

    The title is read from ``paper['metadata']['title']``; wrapping it gives it
    the same list-of-blocks shape the other selectors return.
    """
    title_block = {'text': paper['metadata']['title']}
    return [title_block]

def AbstractBlocks(paper):
    """Return the value stored under the paper's ``'abstract'`` key, unchanged."""
    abstract_blocks = paper['abstract']
    return abstract_blocks

def BodyBlocks(paper):
    """Return the value stored under the paper's ``'body_text'`` key, unchanged."""
    body_blocks = paper['body_text']
    return body_blocks

In [47]:
# This function finds matching lemmas and notes the positions of
# occurrence in the relevant json block
def PullMentions(Paths, BlockSelector,SecName, Words):
    """Find every token whose lemma matches one of ``Words`` in the selected blocks.

    Every file in every directory of ``Paths`` is parsed as JSON, reduced to a
    sequence of blocks by ``BlockSelector``, and each block's ``'text'`` is run
    through the module-level ``nlp`` pipeline. One record is emitted per token
    whose ``lemma_`` equals the lemma of a search word.

    Parameters
    ----------
    Paths : iterable of str
        Directories to scan. Every entry returned by ``os.listdir`` is opened
        and parsed with ``json.load``.
    BlockSelector : callable
        Called with the parsed paper; must return a sequence of mappings that
        each have a ``'text'`` entry (e.g. ``TitleBlocks``).
    SecName : str
        Label copied into the ``'sec'`` column of every record.
    Words : iterable of str
        Search terms. NOTE: only the lemma of the FIRST token that ``nlp``
        produces for each term is used, so a term the tokenizer splits into
        several tokens is matched by its first piece only.

    Returns
    -------
    dict
        Six parallel lists keyed ``'sha'`` (file name minus its last five
        characters), ``'blockid'`` (index of the block within the selector's
        result), ``'word'`` (the matched lemma), ``'sec'``, ``'pos'`` (the
        token's ``idx`` within the block text) and ``'block'`` (the full block
        text).
    """
    Positions=[]
    FoundWords=[]
    Section=[]
    BlockID=[]
    BlockText=[]
    PaperID=[]

    # Lemmatize the search terms once. A set is used so that two terms sharing
    # a lemma yield a single record per matching token instead of one record
    # per term, and so the per-token membership test is cheap.
    target_lemmas=set()
    for w in Words:
        word_doc=nlp(w)
        if len(word_doc) == 0:
            # Empty / whitespace-only term: nothing to match on.
            continue
        target_lemmas.add(word_doc[0].lemma_)

    for Path in Paths:
        print(Path)

        for p in os.listdir(Path):
            # Context manager closes the handle even if json.load raises.
            with open(os.path.join(Path,p),'r',encoding='utf-8') as readfile:
                paper=json.load(readfile)

            for b,block in enumerate(BlockSelector(paper)):
                block_text=block['text']

                for t in nlp(block_text):
                    if t.lemma_ in target_lemmas:
                        Section.append(SecName)
                        FoundWords.append(t.lemma_)
                        Positions.append(t.idx)
                        BlockText.append(block_text)
                        BlockID.append(b)
                        # Drop the last five characters of the file name
                        # (a ".json" suffix, presumably).
                        PaperID.append(p[:-5])
    return {'sha':PaperID,'blockid':BlockID,'word':FoundWords,'sec':Section,'pos':Positions,'block':BlockText}

In [48]:
# Collect matches for one word list across the enabled sections and save them
def ExtractToCSV(Words,Filename):
    """Run ``PullMentions`` for each enabled section and write all matches to a CSV.

    The module-level flags ``RunTitle``, ``RunAbstract`` and ``RunBody`` choose
    which sections are scanned; the directories come from the module-level
    ``Paths``. The per-section results are concatenated column by column (each
    section contributes its rows exactly once) and written with
    ``DataFrame.to_csv``.

    Parameters
    ----------
    Words : iterable of str
        Search terms, passed through to ``PullMentions``.
    Filename : str
        Destination passed to ``DataFrame.to_csv``.

    Raises
    ------
    ValueError
        If none of ``RunTitle``, ``RunAbstract`` or ``RunBody`` is enabled, so
        there is nothing to extract.
    """
    DataDicts=[]
    if(RunTitle):
        DataDicts.append(PullMentions(Paths, TitleBlocks,    "title",    Words))
    if(RunAbstract):
        DataDicts.append(PullMentions(Paths, AbstractBlocks, "abstract", Words))
    if(RunBody):
        DataDicts.append(PullMentions(Paths, BodyBlocks,     "body",     Words))

    if not DataDicts:
        raise ValueError("Nothing to extract: RunTitle, RunAbstract and RunBody are all disabled")

    # Start from fresh empty lists rather than from DataDicts[0] itself, so the
    # first section's rows are not counted twice and no input dict is mutated.
    SummedDictionary={k: [] for k in DataDicts[0]}
    for d in DataDicts:
        for k in SummedDictionary:
            SummedDictionary[k].extend(d[k])

    dat=pd.DataFrame(SummedDictionary)
    dat.to_csv(Filename)


In [None]:
# Each word list below is matched against the enabled sections of every paper
# and written to its own CSV file.

# Verbs and nouns that indicate a therapy is being discussed
Words=['treat','treatment', 'alleviate', 'manage', 'suppress','suppression', 'prescribe','therapy','cure','remedy', 'therapeutic','administer']
ExtractToCSV(Words, "TitleAbstractMatches_therapies.csv")

# Specific drug names
Words=["naproxen","clarithromycin","chloroquine","kaletra","Favipiravir","Avigan",'hydroxychloroquine','baricitinib']
ExtractToCSV(Words, "TitleAbstractMatches_drugs.csv")

# Names used for the virus / disease
Words=["COVID-19", "Coronavirus", "Corona", "2019-nCoV", "SARS-CoV",]
ExtractToCSV(Words, "TitleAbstractMatches_virusnames.csv")

# Terms describing the type of experiment or study
Words=["vitro", "vivo", "in-vitro", "in-vivo", "mouse","mice","clinical","human","computational","vertical","horizontal","theoretical","simulation"]
ExtractToCSV(Words, "TitleAbstractMatches_exptypes.csv")



./CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/
./CORD-19-research-challenge/comm_use_subset/comm_use_subset/
./CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/
./CORD-19-research-challenge/custom_license/custom_license/
./CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/
./CORD-19-research-challenge/comm_use_subset/comm_use_subset/
./CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/
./CORD-19-research-challenge/custom_license/custom_license/
./CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/
./CORD-19-research-challenge/comm_use_subset/comm_use_subset/
./CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/
./CORD-19-research-challenge/custom_license/custom_license/
./CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/
./CORD-19-research-challenge/comm_use_subset/comm_use_subset/
