## Query PubMed and get abstract information

In [1]:
from Bio import Entrez
import pandas as pd
import json
import tqdm
import copy
from pprint import pprint

In [2]:
%config Completer.use_jedi = False

In [3]:
def search(term='query', db='pubmed', sort='relevance', retmax=500, retmode='xml'):
    """Run an Entrez ESearch query and return the parsed response.

    Parameters
    ----------
    term : str
        Entrez query string (e.g. a PubMed field-tagged expression).
    db : str
        Entrez database name.
    sort : str
        Sort order passed through to ESearch.
    retmax : int
        Maximum number of ids requested.
    retmode : str
        Return mode passed through to ESearch; the response is parsed with
        ``Entrez.read``.

    Returns
    -------
    The object produced by ``Entrez.read``; callers in this notebook read
    the matching ids from its ``'IdList'`` entry.
    """
    Entrez.email = 'zhijun.cao@fda.hhs.gov'
    handle = Entrez.esearch(db=db,
                            sort=sort,
                            retmax=retmax,
                            retmode=retmode,
                            term=term)
    # Always release the network handle, even if parsing fails.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()

In [4]:
def fetch_details(id=[], db='pubmed', retmode='xml', retmax=100000):
    """Fetch full Entrez records for the given ids and return them parsed.

    Parameters
    ----------
    id : list or str
        One id or a list of ids to fetch. The default list is never mutated
        here; the parameter name is kept for existing keyword callers.
    db : str
        Entrez database name.
    retmode : str
        Return mode passed through to EFetch; the response is parsed with
        ``Entrez.read``.
    retmax : int
        Maximum number of records requested.

    Returns
    -------
    The object produced by ``Entrez.read``; for PubMed the notebook reads the
    article records from its ``'PubmedArticle'`` entry.

    Notes
    -----
    NOTE(review): in this notebook one call with the whole COVID id list
    appears to have yielded only 9997 article records, which is presumably
    why the ids are later fetched in chunks -- confirm the service limit.
    """
    Entrez.email = 'zhijun.cao@fda.hhs.gov'
    handle = Entrez.efetch(db=db,
                           retmode=retmode,
                           id=id,
                           retmax=retmax)
    # Always release the network handle, even if parsing fails.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()

In [12]:
tem =fetch_details(id="33707255")


In [13]:
#pprint(json.dumps(tem, indent=2))

In [5]:
import itertools
def split_seq(iterable, size):
    """Lazily yield successive lists of up to `size` items from `iterable`.

    Works on any iterable (it is consumed through a single iterator), and
    stops as soon as a batch comes back empty.
    """
    source = iter(iterable)

    def next_batch():
        return list(itertools.islice(source, size))

    # iter(callable, sentinel) keeps calling next_batch until it returns [].
    for batch in iter(next_batch, []):
        yield batch

In [170]:
def flatten_lists(mylist):
    """Yield the items of each inner iterable of `mylist`, one level deep."""
    for inner in mylist:
        yield from inner
            
def chunks(mylist, size):
    """Yield consecutive slices of `mylist`, each at most `size` items long.

    `mylist` must support ``len()`` and slicing (list, tuple, str, ...).
    The final slice is shorter when the length is not a multiple of `size`.
    """
    for start in range(0, len(mylist), size):
        yield mylist[start:start + size]

In [6]:
def xml_to_DataFrame(papers):
    """Flatten parsed PubMed records into a DataFrame, one row per article.

    Parameters
    ----------
    papers : mapping
        Parsed EFetch result (or its JSON round-trip) whose
        ``'PubmedArticle'`` entry is a sequence of article records.

    Returns
    -------
    pandas.DataFrame
        Columns: pmid, Title, Abstract, Keywords, Journal, Authors,
        Affiliations, ArticleIds, LatestDate. Optional fields that are
        missing from a record are stored as ``None``.
    """
    rows = []
    for paper in tqdm.tqdm(papers['PubmedArticle']):
        citation = paper['MedlineCitation']
        article = citation['Article']
        pmid = citation['PMID'][:]
        title = article['ArticleTitle']
        journal = article["Journal"]['Title']

        # Only the first keyword list is used; its terms are space-joined.
        keyword_lists = citation.get("KeywordList") or []
        keywords = ' '.join(keyword_lists[0]) if keyword_lists else None

        # Structured abstracts carry several AbstractText sections; keep all
        # of them rather than only the first one.
        try:
            sections = article['Abstract']['AbstractText']
        except (KeyError, TypeError):
            sections = None
        if not sections:
            abstract = None
        elif isinstance(sections, str):
            abstract = sections
        else:
            abstract = ' '.join(str(section) for section in sections)

        # Best effort: the last History entry converted to a timestamp.
        # `except Exception` keeps this tolerant of malformed dates while
        # still letting KeyboardInterrupt stop a long run.
        try:
            date = pd.to_datetime(pd.DataFrame(paper["PubmedData"]['History'])).iloc[-1]
        except Exception:
            date = None

        try:
            ArticleIds = '_'.join(paper["PubmedData"]['ArticleIdList'])
        except (KeyError, TypeError):
            ArticleIds = None

        author_names, author_affiliations = get_authors(paper)
        rows.append([pmid, title, abstract, keywords, journal,
                     author_names, author_affiliations, ArticleIds, date])

    df = pd.DataFrame(rows, columns=['pmid', 'Title', 'Abstract', 'Keywords', 'Journal',
                                     'Authors', 'Affiliations', 'ArticleIds', 'LatestDate'])
    return df


In [7]:
#paper = paper1['PubmedArticle'][48]

def get_authors(paper):
    """Return the authors and their first affiliations of one PubMed record.

    Parameters
    ----------
    paper : mapping
        A single article record with the author list under
        ``['MedlineCitation']['Article']['AuthorList']``.

    Returns
    -------
    tuple of (str, str)
        ``(authors, affiliations)``, each joined with ``'\\n '`` and aligned
        by position. A missing fore name, last name or affiliation becomes
        ``"NA"`` for that author only. ``("NA", "NA")`` is returned when the
        record has no usable author list.
    """
    try:
        authors = paper['MedlineCitation']['Article']['AuthorList']
        names = []
        affiliations = []
        for auth in authors:
            # A missing or empty AffiliationInfo only affects this author.
            info = auth.get('AffiliationInfo') or []
            affiliations.append(info[0]['Affiliation'] if info else "NA")
            forename = auth.get('ForeName', "NA")
            lastname = auth.get('LastName', "NA")
            names.append(' '.join([forename, lastname]))
        return ('\n '.join(names), '\n '.join(affiliations))
    except (KeyError, IndexError, TypeError, AttributeError):
        # Record without the expected structure: keep the best-effort fallback.
        return ("NA", "NA")

#pprint(get_authors(paper))

In [357]:
query = '(metabolomics[Title/Abstract] OR proteomics[Title/Abstract]) AND (toxicity[Title/Abstract])'
results = search(term=query, retmax=5000)
id_list = results['IdList']


In [361]:
covid_query = '("SARS-CoV-2"[Title/Abstract] OR "COVID-19"[Title/Abstract]) AND (fha[Filter])'
covid_results = search(term=covid_query, retmax=100000)

In [176]:
cardio_prot_query = '("cardiotoxicity"[Title/Abstract] and "proteomics"[Title/Abstract])'

In [177]:
cardio_prot_result = search(term=cardio_prot_query, retmax=100000)

In [178]:
cardio_prot_result['IdList']



In [179]:
cardio_prot_abstract = fetch_details(id=cardio_prot_result['IdList'])

In [180]:
cardio_prot_abstract_df = xml_to_DataFrame(cardio_prot_abstract)

100%|██████████| 42/42 [00:00<00:00, 197.18it/s]


In [183]:
cardio_prot_abstract_df.to_excel('C:/zhijuncao/R/textming/cardioprot/cardioprot_abstract.xlsx')

In [8]:
drug_gly_query = '("drug resistance"[Title/Abstract] and "glycosylation"[Title/Abstract])'

In [9]:
drug_gly_result = search(term=drug_gly_query, retmax=100000)

In [11]:
drug_gly_abstract = fetch_details(id=drug_gly_result['IdList'])

In [12]:
drug_gly_abstract_df = xml_to_DataFrame(drug_gly_abstract)

100%|██████████| 140/140 [00:00<00:00, 231.47it/s]


In [15]:
drug_gly_abstract_df.to_excel('C:/zhijuncao/R/textming/druggly/drug_gly_abstract.xlsx')

In [542]:
covid_id10000_chunk190 = list(split_seq(covid_id10000,190))

In [405]:
import time

In [None]:
# Fetch the COVID records chunk by chunk, keeping each parsed batch in memory
# and also writing it to its own JSON file so a failed run loses little work.
covid_batches = []
for i, chunk in enumerate(covid_id10000_chunk190):
    tem = fetch_details(id=chunk)
    covid_batches.append(tem)
    # `with` closes the file so the batch is fully flushed to disk.
    with open(f"C:/zhijuncao/R/textming/covid_abstracts_batch{i}.json", 'w') as fh:
        json.dump(tem, fh)
    print(i + 1)  # number of batches completed so far
    time.sleep(30)  # pause between requests
    
    

In [449]:
len(covid_batches)

327

In [None]:
# Collect one list of rows per batch. Each batch is appended as a single
# element (not extended row by row) so that it matches how covid_1batch is
# added below and so that itertools.chain(*bind_batches) flattens exactly one
# level: batches -> rows.
bind_batches = []
for batch in tqdm.tqdm(covid_batches):
    bind_batches.append(xml_to_DataFrame(papers=batch).values.tolist())

In [466]:
covid_1batch = xml_to_DataFrame(papers=covid_abstracts)

100%|██████████| 9997/9997 [00:35<00:00, 282.86it/s]


In [467]:
columns = covid_1batch.columns.tolist()
bind_batches.append(covid_1batch.values.tolist())

In [508]:
len(covid_batches)

328

In [509]:
# Write all batches to one JSON file; `with` closes (and flushes) the file.
with open("C:/zhijuncao/R/textming/covid_abstract72053.json", 'w') as fh:
    json.dump(covid_batches, fh)

In [482]:
covid_df = pd.DataFrame(list(itertools.chain(*bind_batches)),columns=columns)

In [486]:
covid_df.to_excel('C:/zhijuncao/R/textming/covid_abstract72053.xlsx')

In [484]:
print(covid_df.shape)
covid_df.isna().sum()

(72053, 9)


pmid                0
Title               0
Abstract           68
Keywords        18502
Journal             0
Authors             0
Affiliations        0
ArticleIds          0
LatestDate          0
dtype: int64

In [522]:
#list(itertools.chain(*list(itertools.chain(*tem))))

In [373]:
pd.DataFrame({'covid':covid_id}).to_excel('C:/zhijuncao/R/textming/covid_id.xlsx')

In [369]:
covid_abstracts = fetch_details(id=covid_id) 

100%|██████████| 72119/72119 [00:00<00:00, 3275817.74it/s]


In [370]:
# Save the fetched records as JSON; `with` closes (and flushes) the file.
with open("C:/zhijuncao/R/textming/covid_abstracts.json", 'w') as fh:
    json.dump(covid_abstracts, fh)

In [371]:
covid_abstracts_df = xml_to_DataFrame(covid_abstracts)

100%|██████████| 9997/9997 [00:30<00:00, 322.87it/s]


In [374]:
covid_abstracts_df.shape

(9997, 8)

In [377]:
covid_abstracts_df.to_excel('C:/zhijuncao/R/textming/covid_absract9997.xlsx')

In [368]:
papers = fetch_details(id=id_list)    

In [407]:
# Save the fetched records as JSON; `with` closes (and flushes) the file.
with open("C:/zhijuncao/R/textming/tem.json", 'w') as fh:
    json.dump(papers, fh)

In [359]:
# Reload the saved records; `with` closes the file once it has been read.
with open("C:/zhijuncao/R/textming/tem.json", 'r') as fh:
    paper1 = json.load(fh)
type(paper1)

dict

In [318]:
 author =paper1['PubmedArticle'][151]['MedlineCitation']['Article']['AuthorList'][1]
    #['AffiliationInfo'] #[0]['Affiliation']

In [327]:
abstracts_df1 = xml_to_DataFrame(paper1)

100%|██████████| 2284/2284 [00:07<00:00, 325.26it/s]


In [330]:
abstracts_df1.to_excel("C:/zhijuncao/R/textming/py_metaprot.xlsx")

In [328]:
abstracts_df1.dropna(subset=["Abstract"], inplace=True)
print(abstracts_df1.shape)
abstracts_df1.isna().sum()

(2280, 8)


pmid            0
Title           0
Abstract        0
Journal         0
Authors         0
Affiliations    0
ArticleIds      0
LatestDate      0
dtype: int64

In [None]:
abstracts_df1

In [None]:
print(json.dumps(paper1['PubmedArticle'][1], indent=10))

# Get the full article text from Science

In [29]:
import requests
from bs4 import BeautifulSoup
import re

In [None]:
pmids = [33707255, 33707254, 33707252, 33707243, 33707248, 33674488, 33649167, 33632834, ]

In [72]:
def get_pmids(url="https://pubmed.ncbi.nlm.nih.gov/collections/60600504/?sort=pubdate", timeout=30):
    """Return the PMIDs listed on a PubMed results/collection page.

    The ids are read from the comma-separated ``content`` attribute of the
    page's ``<meta name="log_displayeduids">`` tag.

    Parameters
    ----------
    url : str
        Address of the PubMed page to read.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        The PMIDs in page order; empty when the tag has no content.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected meta tag.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    meta = soup.find("meta", attrs={"name": "log_displayeduids"})
    if meta is None:
        raise ValueError(f"no 'log_displayeduids' meta tag found at {url}")
    content = meta.get('content') or ''
    # Drop blanks so an empty attribute yields [] rather than [''].
    return [pmid.strip() for pmid in content.split(',') if pmid.strip()]

In [73]:
pmids = get_pmids(url="https://pubmed.ncbi.nlm.nih.gov/collections/60600504/?sort=pubdate&size=200")

In [74]:
len(pmids)

31

In [75]:
collection_details = fetch_details(id=pmids)

In [79]:
collection_details_df = xml_to_DataFrame(collection_details)

100%|██████████| 31/31 [00:00<00:00, 176.14it/s]


In [80]:
collection_details_df['pmid']=collection_details_df['pmid'].astype(str)

In [81]:
collection_details_df.dtypes

pmid                    object
Title                   object
Abstract                object
Keywords                object
Journal                 object
Authors                 object
Affiliations            object
ArticleIds              object
LatestDate      datetime64[ns]
dtype: object

In [9]:
collection_details_df.head(2)

NameError: name 'collection_details_df' is not defined

In [83]:
ids_repaired.head(2)

NameError: name 'ids_repaired' is not defined

In [89]:
collection_details_df1 = collection_details_df.join(ids_repaired.set_index('pmid'), how='inner', on='pmid')

In [90]:
collection_details_df1.head(2)

Unnamed: 0,pmid,Title,Abstract,Keywords,Journal,Authors,Affiliations,ArticleIds,LatestDate,pii,doi,pmc,mid
0,33707255,Using digital twins in viral infection.,,,"Science (New York, N.Y.)",Reinhard Laubenbacher\n James P Sluka\n James ...,"Department of Medicine, University of Florida,...",33707255_371/6534/1105_10.1126/science.abf3370,2021-03-20 06:00:00,371/6534/1105,10.1126/science.abf3370,,
1,33707254,Immunity to SARS-CoV-2 variants of concern.,,,"Science (New York, N.Y.)",Daniel M Altmann\n Rosemary J Boyton\n Rupert ...,"Department of Immunology and Inflammation, Imp...",33707254_371/6534/1103_10.1126/science.abg7404,2021-03-17 06:00:00,371/6534/1103,10.1126/science.abg7404,,


In [30]:
#ids = pd.DataFrame([x.split(sep="_") for x in collection_details_df.ArticleIds], columns=['pubmed','pii', 'doi', 'pmc', 'mid'])

#ids.to_excel('C:/zhijuncao/R/textming/collection_abstracts_1.xlsx')
ids_repaired = pd.read_excel('C:/zhijuncao/R/textming/collection_abstracts.xlsx')
ids_repaired.rename(columns={"pubmed":"pmid"}, inplace=True)


In [31]:
ids_repaired['pmid']=ids_repaired['pmid'].astype(str)

In [32]:
ids_repaired.dtypes

pmid    object
pii     object
doi     object
pmc     object
mid     object
dtype: object

In [33]:
import time
import tqdm
#import pickle

In [34]:
urls = 'https://science.sciencemag.org/content/' +ids_repaired.pii + '.long'
urls[1:3]

1    https://science.sciencemag.org/content/371/653...
2    https://science.sciencemag.org/content/371/653...
Name: pii, dtype: object

In [35]:
def get_science_web_content(url="https://science.sciencemag.org/content/371/6533/1019.long", timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an error page is not
        silently parsed as if it were the article.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

def get_science_pages(web):
    """Return every ``<p>`` element of `web` that carries an ``id`` attribute."""
    return web.find_all('p', id=True)

def get_science_full_text(web):
    """Extract the body text and the title from a parsed article page.

    Parameters
    ----------
    web : bs4.BeautifulSoup
        Parsed page.

    Returns
    -------
    tuple of (str, str)
        ``(fulltext, title)``: the text of every ``<p>`` element that has an
        ``id`` attribute, joined with newlines, and the text of the page's
        ``<title>`` element (``""`` when the page has none).
    """
    title_tag = web.find("title")
    title = title_tag.get_text() if title_tag is not None else ""
    paragraphs = [p.get_text() for p in web.find_all('p', id=True)]
    return "\n".join(paragraphs), title

In [46]:
# Download the raw HTML of every article page, pausing between requests.
html_content = []
for url in tqdm.tqdm(urls):
    # A timeout keeps one unresponsive server from hanging the whole loop.
    html = requests.get(url, timeout=60)
    html_content.append(html.content)
    time.sleep(2)

100%|██████████| 31/31 [01:51<00:00,  3.61s/it]


In [60]:
# Save each downloaded page as paper<N>.html, numbered in download order.
for i, web in enumerate(html_content):
    file = f'C:/zhijuncao/R/textming/sciencehtml/paper{i}.html'
    with open(file, 'wb') as f:
        f.write(web)

In [119]:
# Parse every downloaded page into its body text and its title.
# Paragraphs with id 'p-1' / 'p-2' are left out of the body text
# (presumably summary/teaser paragraphs -- TODO confirm).
excluded_ids = ('p-1', 'p-2')
web_pages = []
titles = []
for content in html_content:
    soup = BeautifulSoup(content, "html.parser")
    body_parts = (
        p.text
        for p in soup.find_all('p', id=True)
        if p.attrs.get('id') not in excluded_ids
    )
    web_pages.append('\n'.join(body_parts))
    titles.append(soup.find("title").get_text())

In [120]:
# One row per article. Use the full `urls` series here: `url` is only the
# leftover loop variable from the download loop and would repeat the last URL
# on every row.
full_df = pd.DataFrame({'title': titles, 'urls': list(urls), "full_text": web_pages})


In [168]:
#BeautifulSoup(html_content[30],"html.parser").find_all('p', id=True)

In [127]:
full_df.to_excel('C:/zhijuncao/R/textming/urls_full_text.xlsx')

In [128]:
full_df.to_json('C:/zhijuncao/R/textming/urls_full_text.json')

In [252]:
full_abstract = collection_details_df1.join(full_df)

In [254]:
full_abstract.to_excel('C:/zhijuncao/R/textming/collection_full_abstracts.xlsx')

In [255]:
full_abstract.to_json('C:/zhijuncao/R/textming/collection_full_abstracts.json')

In [256]:
html = requests.get(urls[0])
soup = BeautifulSoup(html.content, "html.parser")

In [270]:
#pprint(soup.find_all('p',id=True))

In [None]:
full_abstract.dtypes