In [1]:
#!/usr/bin/env python

In [2]:
# Module metadata: authorship, licensing and maintenance status of this notebook.
__author__ = "Tariq Faquih"
__copyright__ = "Copyright 2020, Clinical Epidemiology Department, LUMC"
__credits__ = ["Tariq Faquih", "Linda Nab", "Ype Jong"]
__license__ = "MIT License"
# Spelling aligned with __author__ / __credits__ (was "Faquiih").
__maintainer__ = "Tariq Faquih"
__email__ = "t.o.faquih@lumc.nl"
__status__ = "Development"

# Import modules

In [3]:
import json, csv , os , sys , datetime
from Bio import Entrez
from Bio import Medline



# COVID class

Objects of this class send PubMed queries through the Entrez API and store the output in JSON files.

In [4]:

class COVID:
    """Run a fixed set of PubMed searches and persist the hits as JSON.

    Instantiating the class does all the work: it loads the previously
    stored records (if the dictionary file exists), runs one search per
    entry of ``querydict`` for the given date range, merges the hits into
    ``mainDict`` and finally writes two JSON files plus a log file.

    Attributes:
        querydict: maps a tag name to the extra PubMed filter for that tag;
            ``None`` means "no extra filter" (main COVID query only).
        mainDict: maps PMID -> record dict with the keys 'PMID', 'Title',
            'JournalName', 'Creation Date', 'Publication Date', 'Abstract',
            'Link' and 'Tag'.
        NumNew: number of records added to ``mainDict`` during this run.
        Log: list of log messages; after the searches the record count is
            appended as the last element.
    """

    # Number of records requested per efetch call.
    BATCH_SIZE = 10
    # Separator between the tags stored in a record's 'Tag' field.
    TAG_SEP = ';'
    # Number of words per line when an abstract is re-wrapped.
    WORDS_PER_LINE = 30

    def __init__(self , startD , endD):
        """Run every query for the range ``startD``..``endD`` and save the results.

        Args:
            startD: first date of the range, formatted like '2020/01/01'.
            endD: last date of the range, same format.
        """
        # Query terms to be used in search_function, keyed by tag name.
        self.querydict = {'COVID':None , 
                 'Big Five':'(NEJM[journal] OR BMJ[journal] OR lancet[journal] OR nature[journal] OR JAMA[journal])', 
                 'Elderly':'elderly[TITLE]',
                 'Clinical Trial':'clinical trial[Title/Abstract]' , 
                 'Italy':'italy[Title/Abstract]' , 
                 'Netherlands':'netherlands[Title/Abstract]' , 
                 'Case Control':'case control study' , 
                 'Epidemiology':'epidemiology' , 
                 'Mortality':'mortality', 
                 'Pregnant':'pregnant[TITLE]',
                 'Treatment':'(treatment[All Fields] OR drug[All Fields] OR intervention[All Fields] OR recovery[All Fields])' }

        # json_file stores the list-of-records format used by the google sheet;
        # dict_file stores the PMID-keyed dictionary that is reloaded on later runs.
        json_file ='results3.json'
        dict_file ='results_dictionary3.json'

        # Reload the stored dictionary or start from an empty one.
        if os.path.isfile(dict_file):
            with open(dict_file) as fp:
                self.mainDict = json.load(fp)
        else:
            self.mainDict = {}

        # Counter of newly added articles and the list of log messages.
        self.NumNew = 0
        self.Log = []

        # Run the search for every tag using the provided start and end dates.
        for K in self.querydict:
            self.search_function(K , startD , endD )

        # The total number of added articles becomes the last log element.
        self.Log.append(self.NumNew)

        # Main json file: a plain list of the record dicts.
        with open(json_file , 'w') as fp:
            json.dump(list(self.mainDict.values()), fp)

        # Exact dictionary as json (read back by the next run).
        with open(dict_file , 'w') as fp:
            json.dump(self.mainDict, fp)

        log_name = 'log_{}_{}.txt'.format(startD.replace('/' , '') , endD.replace('/' , ''))
        with open(log_name ,'w'  , newline='') as fp:
            W = csv.writer(fp)
            W.writerow(['Search results for range {} to {}'.format(startD , endD)])
            W.writerow(['Number of Records Added: {}'.format(self.Log[-1])])
            for Line in self.Log[:-1]:
                W.writerow([Line])

    @staticmethod
    def _split_records(data):
        """Split a MEDLINE text payload into per-record chunks.

        Each returned chunk is the record text with its leading 'PMID'
        removed, which is the form ``add2dict`` expects.
        """
        # Without a leading newline the first record would end up in the
        # discarded element 0 of the split.
        if data.startswith('PMID'):
            data = '\n' + data
        return data.split('\nPMID')[1:]

    @staticmethod
    def _merge_tag(existing, new):
        """Return ``existing`` with ``new`` appended unless it is already one of the tags."""
        # Compare whole tags, not substrings of the joined string.
        if new in existing.split(COVID.TAG_SEP):
            return existing
        return existing + COVID.TAG_SEP + new

    def search_function (self , MyTerms, startD , endD):
        """Search PubMed for one tag and merge all hits into ``mainDict``.

        Args:
            MyTerms: key of ``querydict`` selecting the extra filter.
            startD: start of the [MHDA] date range.
            endD: end of the [MHDA] date range.

        The raw MEDLINE text of all batches is also written to
        ``pubmed_results/corona_<MyTerms>_papers.txt``.
        """
        Entrez.email = "tariqf549@gmail.com"
        MainTerm = """(((("coronavirus"[MeSH Terms] OR "coronavirus"[All Fields]) AND ("COVID-19"[All Fields] OR "severe acute respiratory syndrome coronavirus 2"[Supplementary Concept] OR "severe acute respiratory syndrome coronavirus 2"[All Fields] OR "2019-nCoV"[All Fields] OR "SARS-CoV-2"[All Fields] OR "2019nCoV"[All Fields] ))))"""

        DateRange = '"{}"[MHDA] : "{}"[MHDA]'.format(startD , endD)
        ExtraFilter = self.querydict[MyTerms]
        if ExtraFilter is None:
            # Tags without a filter (the 'COVID' entry) use the main term only.
            Query = MainTerm + ' AND ' + DateRange
        else:
            Query = MainTerm + ' AND ' + ExtraFilter + ' AND ' + DateRange

        print(Query)
        self.Log.append(Query)

        search_handle = Entrez.esearch(
            db="pubmed", term=Query,  datetype="pdat", usehistory="y" , sort = 'relevance'
        )
        try:
            search_results = Entrez.read(search_handle)
        finally:
            search_handle.close()

        count = int(search_results["Count"])
        self.Log.append("Found %i results" % count)
        print("Found %i results" % count)

        batch_size = self.BATCH_SIZE
        out_path = "pubmed_results/corona_{}_papers.txt".format(MyTerms)
        # Create the output directory on first use instead of failing in open().
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        # utf-8 so non-ASCII characters in the records can always be written,
        # whatever the platform's default encoding is.
        with open(out_path, "w", encoding="utf-8") as out_handle:
            for start in range(0, count, batch_size):
                end = min(count, start + batch_size)
                Message = "Going to download record %i to %i" % (start + 1, end)
                print(Message)
                self.Log.append(Message)
                fetch_handle = Entrez.efetch(
                    db="pubmed",
                    rettype="medline",
                    retmode="text",
                    retstart=start,
                    retmax=batch_size,
                    webenv=search_results["WebEnv"],
                    query_key=search_results["QueryKey"],
                )
                try:
                    data = fetch_handle.read()
                finally:
                    fetch_handle.close()

                self.add2dict(self._split_records(data) , MyTerms)
                out_handle.write(data)

    def FormatAbstract (self, AB):
        """Return the abstract of a record wrapped to 30 words per line.

        Args:
            AB: mapping holding the parsed record; the abstract is read
                from its 'AB' key.

        Returns:
            The abstract with a newline after every 30 space-separated
            words, or 'NA' when the record has no 'AB' key.
        """
        if 'AB' not in AB:
            return 'NA'

        Words = AB['AB'].split(' ')
        Step = self.WORDS_PER_LINE
        return '\n'.join(' '.join(Words[N:N + Step]) for N in range(0, len(Words), Step))

    def add2dict(self , dataresults , Q):
        """Merge parsed MEDLINE records into ``mainDict`` under tag ``Q``.

        Args:
            dataresults: iterable of record texts with the leading 'PMID'
                stripped (as produced by splitting on '\\nPMID').
            Q: tag name to attach to every record.

        A PMID that is already stored gets ``Q`` appended to its 'Tag' field
        (if not already present) and its abstract filled in when it was 'NA';
        a new PMID is stored as a fresh record and counted in ``NumNew``.
        """
        for hit in dataresults:
            parse_res = Medline.read(('PMID' + hit).split('\n'))
            PMID = parse_res['PMID']

            if PMID in self.mainDict:
                print('PMID [{}] exists'.format(PMID))
                self.Log.append('PMID exists')
                Stored = self.mainDict[PMID]
                Stored['Tag'] = self._merge_tag(Stored['Tag'], Q)

                if Stored['Abstract'] == 'NA':
                    NewABS = self.FormatAbstract(parse_res)
                    if NewABS != 'NA':
                        Stored['Abstract'] = NewABS
                        print('Updated Abstract')
                        self.Log.append('Updated Abstract')
                continue

            # Missing fields become '' (as was already done for 'JT') so one
            # incomplete record does not abort the whole run with a KeyError.
            self.mainDict[PMID] = {'PMID': PMID,
                                   'Title': parse_res.get('TI', ''),
                                   'JournalName': parse_res.get('JT', ''),
                                   'Creation Date': parse_res.get('MHDA', ''),
                                   'Publication Date': parse_res.get('DP', ''),
                                   'Abstract': self.FormatAbstract(parse_res),
                                   'Link': 'https://www.ncbi.nlm.nih.gov/pubmed/{}'.format(PMID),
                                   'Tag': Q}
            self.NumNew +=1
            print('Added new PMID: {}'.format(PMID))
            self.Log.append('Added new PMID: {}'.format(PMID))


In [5]:
if __name__ == '__main__':
    # Default run: search the window from three days ago up to now,
    # with both bounds formatted as YYYY/MM/DD strings.
    Today = datetime.datetime.now()
    StartDate = (Today - datetime.timedelta(days=3)).strftime("%Y/%m/%d")
    Today = Today.strftime("%Y/%m/%d")

    COVID(StartDate, Today)

(((("coronavirus"[MeSH Terms] OR "coronavirus"[All Fields]) AND ("COVID-19"[All Fields] OR "severe acute respiratory syndrome coronavirus 2"[Supplementary Concept] OR "severe acute respiratory syndrome coronavirus 2"[All Fields] OR "2019-nCoV"[All Fields] OR "SARS-CoV-2"[All Fields] OR "2019nCoV"[All Fields] )))) AND "2020/03/25"[MHDA] : "2020/03/28"[MHDA]
Found 172 results
Going to download record 1 to 10
Added new PMID: 32081636
Added new PMID: 32209231
Added new PMID: 32194944
Added new PMID: 32207910
Added new PMID: 32174053
Added new PMID: 32134278
Added new PMID: 32081569
Added new PMID: 32207679
Added new PMID: 32191676
Added new PMID: 32207377
Going to download record 11 to 20
Added new PMID: 32191830
Added new PMID: 32216248
Added new PMID: 32119825
Added new PMID: 32100486
Added new PMID: 32215622
Added new PMID: 32206694
Added new PMID: 32151324
Added new PMID: 32191813
Added new PMID: 32105090
Added new PMID: 32215956
Going to download record 21 to 30
Added new PMID: 322047

KeyboardInterrupt: 

In [6]:
# Run every query in COVID.querydict for the fixed range 2020/01/01 - 2020/03/28.
COVID('2020/01/01' , '2020/03/28' )

(((("coronavirus"[MeSH Terms] OR "coronavirus"[All Fields]) AND ("COVID-19"[All Fields] OR "severe acute respiratory syndrome coronavirus 2"[Supplementary Concept] OR "severe acute respiratory syndrome coronavirus 2"[All Fields] OR "2019-nCoV"[All Fields] OR "SARS-CoV-2"[All Fields] OR "2019nCoV"[All Fields] )))) AND "2020/01/01"[MHDA] : "2020/03/28"[MHDA]
Found 1069 results
Going to download record 1 to 10
Added new PMID: 32081636
Added new PMID: 32160889
Added new PMID: 32106567
Added new PMID: 32173241
Added new PMID: 32209231
Added new PMID: 32100667
Added new PMID: 32166607
Added new PMID: 32062645
Added new PMID: 32194944
Added new PMID: 32169119
Going to download record 11 to 20
Added new PMID: 32070465
Added new PMID: 32207910
Added new PMID: 32147628
Added new PMID: 32172672
Added new PMID: 32178768
Added new PMID: 32190290
Added new PMID: 32172669
Added new PMID: 32143123
Added new PMID: 32156648
Added new PMID: 32092911
Going to download record 21 to 30
Added new PMID: 32195

Added new PMID: 32184131
Added new PMID: 32184128
Added new PMID: 32183941
Added new PMID: 32183937
Added new PMID: 32183935
Added new PMID: 32183934
Added new PMID: 32183930
Added new PMID: 32183920
Added new PMID: 32183901
Added new PMID: 32183864
Going to download record 291 to 300
Added new PMID: 32183357
Added new PMID: 32183172
Added new PMID: 32182811
Added new PMID: 32182131
Added new PMID: 32181990
Added new PMID: 32181969
Added new PMID: 32181904
Added new PMID: 32181903
Added new PMID: 32181901
Added new PMID: 32181874
Going to download record 301 to 310
Added new PMID: 32181873
Added new PMID: 32181864
Added new PMID: 32181795
Added new PMID: 32181672
Added new PMID: 32181577
Added new PMID: 32181488
Added new PMID: 32181483
PMID [32180426] exists
Added new PMID: 32180292
Added new PMID: 32180175
Going to download record 311 to 320
Added new PMID: 32180173
Added new PMID: 32180140
Added new PMID: 32179910
Added new PMID: 32179908
Added new PMID: 32179860
Added new PMID: 321

Added new PMID: 32141062
Added new PMID: 32141058
Added new PMID: 32140538
Added new PMID: 32139521
PMID [32139372] exists
Added new PMID: 32139299
Added new PMID: 32138488
Added new PMID: 32138266
Added new PMID: 32135587
Added new PMID: 32135586
Going to download record 581 to 590
Added new PMID: 32135585
Added new PMID: 32135584
Added new PMID: 32135077
Added new PMID: 32134909
Added new PMID: 32134861
Added new PMID: 32134800
Added new PMID: 32134681
Added new PMID: 32134381
PMID [32134278] exists
Added new PMID: 32134205
Going to download record 591 to 600
Added new PMID: 32134116
Added new PMID: 32134111
Added new PMID: 32133964
Added new PMID: 32133962
Added new PMID: 32133833
Added new PMID: 32133832
Added new PMID: 32133578
Added new PMID: 32133153
Added new PMID: 32133152
Added new PMID: 32132747
Going to download record 601 to 610
Added new PMID: 32132744
PMID [32132521] exists
Added new PMID: 32132379
Added new PMID: 32132196
Added new PMID: 32132184
Added new PMID: 3213191

Added new PMID: 32074444
Added new PMID: 32073631
Added new PMID: 32073161
Added new PMID: 32072794
Added new PMID: 32072569
Added new PMID: 32072255
Added new PMID: 32071427
Added new PMID: 32071063
Added new PMID: 32070753
Added new PMID: 32070466
Going to download record 871 to 880
PMID [32070465] exists
Added new PMID: 32070391
Added new PMID: 32068012
Added new PMID: 32067043
Added new PMID: 32066541
Added new PMID: 32066526
Added new PMID: 32066525
Added new PMID: 32065221
Added new PMID: 32065057
Added new PMID: 32065055
Going to download record 881 to 890
Added new PMID: 32064855
Added new PMID: 32064853
Added new PMID: 32064795
Added new PMID: 32062875
PMID [32062645] exists
Added new PMID: 32061335
Added new PMID: 32061313
Added new PMID: 32061311
Added new PMID: 32061284
Added new PMID: 32061201
Going to download record 891 to 900
Added new PMID: 32061200
Added new PMID: 32061198
Added new PMID: 32060933
Added new PMID: 32060789
Added new PMID: 32059801
Added new PMID: 32059

PMID [32111649] exists
PMID [32111645] exists
PMID [32109372] exists
PMID [32109013] exists
PMID [32109011] exists
PMID [32107200] exists
PMID [32105637] exists
PMID [32105632] exists
PMID [32105610] exists
PMID [32105609] exists
Going to download record 81 to 90
PMID [32101683] exists
PMID [32091533] exists
PMID [32087820] exists
PMID [32087777] exists
PMID [32087125] exists
PMID [32087122] exists
PMID [32087098] exists
PMID [32085843] exists
PMID [32085842] exists
PMID [32085841] exists
Going to download record 91 to 100
PMID [32085840] exists
PMID [32085839] exists
PMID [32078803] exists
PMID [32075791] exists
PMID [32075786] exists
PMID [32074444] exists
PMID [32071063] exists
PMID [32066541] exists
PMID [32066526] exists
PMID [32066525] exists
Going to download record 101 to 110
PMID [32061335] exists
PMID [32061313] exists
PMID [32061311] exists
PMID [32061284] exists
PMID [32059801] exists
PMID [32059800] exists
PMID [32059799] exists
PMID [32059798] exists
PMID [32050060] exist

PMID [32167445] exists
PMID [32167181] exists
PMID [32167173] exists
PMID [32166607] exists
PMID [32166318] exists
PMID [32166310] exists
PMID [32166128] exists
PMID [32165502] exists
PMID [32165386] exists
PMID [32164834] exists
Going to download record 101 to 110
PMID [32164091] exists
PMID [32164079] exists
PMID [32164078] exists
PMID [32164053] exists
PMID [32161416] exists
PMID [32161107] exists
PMID [32161092] exists
PMID [32160942] exists
PMID [32160889] exists
PMID [32159234] exists
Going to download record 111 to 120
PMID [32158961] exists
PMID [32157235] exists
PMID [32156332] exists
PMID [32156331] exists
PMID [32156327] exists
PMID [32156224] exists
PMID [32155789] exists
PMID [32155431] exists
PMID [32154505] exists
PMID [32152612] exists
Going to download record 121 to 130
PMID [32152595] exists
PMID [32151613] exists
PMID [32151326] exists
PMID [32151325] exists
PMID [32151324] exists
PMID [32150618] exists
PMID [32149043] exists
PMID [32149037] exists
PMID [32148173] ex

PMID [32164092] exists
PMID [32187257] exists
PMID [32167747] exists
Added new PMID: 32209384
PMID [31986264] exists
PMID [32152082] exists
PMID [32184131] exists
PMID [32049687] exists
Added new PMID: 32210742
PMID [32204922] exists
Going to download record 51 to 60
PMID [32007143] exists
PMID [32035018] exists
PMID [32096367] exists
PMID [32075152] exists
PMID [31978293] exists
PMID [32188484] exists
PMID [31992388] exists
PMID [32162896] exists
Added new PMID: 32211816
PMID [32199877] exists
Going to download record 61 to 70
PMID [32032682] exists
PMID [32031570] exists
PMID [32031234] exists
PMID [32010938] exists
PMID [32007143] exists
PMID [31994742] exists
PMID [31992880] exists
PMID [31992388] exists
PMID [31986264] exists
PMID [31978293] exists
(((("coronavirus"[MeSH Terms] OR "coronavirus"[All Fields]) AND ("COVID-19"[All Fields] OR "severe acute respiratory syndrome coronavirus 2"[Supplementary Concept] OR "severe acute respiratory syndrome coronavirus 2"[All Fields] OR "201

PMID [32155789] exists
PMID [32155273] exists
PMID [32153170] exists
PMID [32152612] exists
PMID [32152595] exists
PMID [32152082] exists
PMID [32152059] exists
PMID [32151335] exists
PMID [32151326] exists
PMID [32151325] exists
Going to download record 251 to 260
PMID [32151274] exists
PMID [32150796] exists
PMID [32150360] exists
PMID [32150618] exists
PMID [32149773] exists
PMID [32149769] exists
PMID [32149043] exists
PMID [32149037] exists
PMID [32149036] exists
PMID [32148172] exists
Going to download record 261 to 270
PMID [32147628] exists
PMID [32147496] exists
PMID [32146924] exists
PMID [32145772] exists
PMID [32145718] exists
PMID [32145466] exists
PMID [32145186] exists
PMID [32143502] exists
PMID [32142651] exists
PMID [32142626] exists
Going to download record 271 to 280
PMID [32142596] exists
PMID [32141624] exists
PMID [32141619] exists
PMID [32141588] exists
PMID [32141570] exists
PMID [32141062] exists
PMID [32141058] exists
PMID [32139521] exists
PMID [32139372] ex

<__main__.COVID at 0x169b59b5808>

In [1]:
def MakeTemplate(json_url="https://raw.githubusercontent.com/tofaquih/coronaPubGet/master/results3.json",
                 out_path='template.csv'):
    """Write a one-row CSV of ImportJSON formulas, one per record field.

    Args:
        json_url: URL placed inside every ImportJSON formula. Defaults to
            the results3.json file in the coronaPubGet repository.
        out_path: path of the CSV file to write. Defaults to 'template.csv'.

    The row is written with ';' as delimiter and contains one formula for
    each of the fields PMID, Title, Link, JournalName, Creation Date,
    Publication Date, Abstract and Tag. Each field name is also printed.
    """
    fields = ('PMID',
              'Title',
              'Link',
              'JournalName',
              'Creation Date',
              'Publication Date',
              'Abstract',
              'Tag')

    headerslist = []
    for H in fields:
        print(H)
        headerslist.append(
            '=ImportJSON("{}", "/{}", "noInherit,noTruncate",$A$1)'.format(json_url, H)
        )

    with open(out_path ,'w'  , newline='' ) as fp:
        W = csv.writer(fp, delimiter=';')
        W.writerow(headerslist)

In [4]:
# Generate template.csv with one ImportJSON formula per record field.
MakeTemplate()

PMID
Title
Link
JournalName
Creation Date
Publication Date
Abstract
Tag


# google

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

# Fetch one Google Scholar result page for `query` and collect the title and
# link of every result heading (<h3 class="gs_rt">) into `results`.
query = '"covid-19"'
# The query is percent-encoded so quotes, spaces, '&' or '#' inside it cannot
# break or truncate the URL.
url =  'https://scholar.google.com/scholar?start=0&q='+ quote_plus(query) + '&hl=en&scisbd=1&as_sdt=1,5&as_vis=1&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'

# A timeout keeps the cell from hanging forever; raise_for_status() surfaces
# an HTTP error instead of silently parsing an error page into zero results.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.text

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
page = BeautifulSoup(content, 'html.parser')
results = []
for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
    link = entry.a
    # Headings without a hyperlink have no <a> child; skip them.
    if link is None or not link.has_attr('href'):
        continue
    results.append({"title": link.text, "url": link['href']})

In [None]:
# For every result snippet (class "gs_rs") print its first space-delimited
# token, then the full snippet text, then an empty line.
for snippet in page.find_all(attrs={"class": "gs_rs"}):
    maintext = snippet.get_text()
    D = maintext.partition(' ')[0]
    print(D, maintext, '', sep='\n')

In [None]:
# Display the collected {"title", "url"} dicts.
results