In [2]:
from Bio import Entrez
import pandas as pd
import numpy as np
import random


# get list of IDs of studies that match your query
def search(query):
    Entrez.email = 'vdp14@case.edu'
    handle = Entrez.esearch(db='pubmed', sort='relevance', retmax='250000', retmode='xml', term=query)
    results = Entrez.read(handle)
    return results

# search for cardiac tissue engineering papers
q = 'decellularized ECM for cardiac tissue engineering'
studies = search(q)
studiesIdList = studies['IdList']
print(len(studiesIdList))

# If you want to reduce the number of papers for easy computation
# studiesIdList_shortened = random.sample(studiesIdList, 300)


# Use efetch to get the details of each study
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'vdp14@case.edu'
    handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
    results = Entrez.read(handle)
    return results

# create a pandas df that has the pubmed article info (title, abstract, journal, date, etc.)
# Efetch runs 10,000 studies max, so separate idList in sections
titles = []
abstracts = []
journals = []
pubdate_years = []
pudate_months = []

studies = fetch_details(studiesIdList)
chuncks_size = 10000

'''
Challenge:
Not all full text articles are available so we will have to write a function that retrieves free full text articles. There will probably be another list that has texts
that were not able to be retrieve that we will then feed PDFs to. The function will break down the PDF into similar format as the .xml files for consistency.
'''

# URL to retrieve publically available articles
format = 'json'
id = studiesIdList
url_bioc = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_[format]/[ID]/[encoding]'

387


In [79]:
format = 'json'
id = studiesIdList[:10]
encoding = 'ascii'
# url_bioc = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_' + format + '/' + id + '/' + encoding
# print(url_bioc)



In [80]:
import urllib.request, json

In [81]:
for i in id:
    print(i)
    try:
        url_bioc = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_' + format + '/' + i + '/' + encoding
        response = urllib.request.urlopen(url_bioc)
        data = json.loads(response.read())
    except:
        print('No open access file for ', i)

34901526
31075518
No open access file for  31075518
36361824
26119160
No open access file for  26119160
35960635
No open access file for  35960635
32601935
No open access file for  32601935
35360395
31271596
37406920
No open access file for  37406920
32279344
No open access file for  32279344


In [46]:
studies_table = pd.DataFrame()
keys = data.keys()
keys
# for x in data['source']:
#     keys = x.keys()
#     print(keys)


dict_keys(['source', 'date', 'key', 'infons', 'documents'])

In [47]:
main_content = data['documents'][0]['passages']
main_content

[{'offset': 0,
  'infons': {'article-id_doi': '10.1016/j.bioactmat.2021.09.014',
   'article-id_pii': 'S2452-199X(21)00431-X',
   'article-id_pmc': '8637010',
   'article-id_pmid': '34901526',
   'fpage': '15',
   'kwd': 'Decellularization ECM 3D scaffolds Tissue regeneration Recellularization',
   'license': 'This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).',
   'lpage': '31',
   'name_0': 'surname:Zhang;given-names:Xuewei',
   'name_1': 'surname:Chen;given-names:Xi',
   'name_2': 'surname:Hong;given-names:Hua',
   'name_3': 'surname:Hu;given-names:Rubei',
   'name_4': 'surname:Liu;given-names:Jiashang',
   'name_5': 'surname:Liu;given-names:Changsheng',
   'section_type': 'TITLE',
   'title': 'Keywords',
   'type': 'front',
   'volume': '10',
   'year': '2021'},
  'text': 'Decellularized extracellular matrix scaffolds: Recent trends and emerging strategies in tissue engineering',
  'sentences': [],
  'annotations': [],


In [48]:
# Create a for-loop that iterates throught the text (Create this into its own function that takes the json file, length of dict)
size = len(data['documents'][0]['passages'])
main_content = ''
for i in range(size):
    main_content = main_content + data['documents'][0]['passages'][i]['text'] # note that this also includes the references in the text (title of references)
print(main_content)

Decellularized extracellular matrix scaffolds: Recent trends and emerging strategies in tissue engineeringThe application of scaffolding materials is believed to hold enormous potential for tissue regeneration. Despite the widespread application and rapid advance of several tissue-engineered scaffolds such as natural and synthetic polymer-based scaffolds, they have limited repair capacity due to the difficulties in overcoming the immunogenicity, simulating in-vivo microenvironment, and performing mechanical or biochemical properties similar to native organs/tissues. Fortunately, the emergence of decellularized extracellular matrix (dECM) scaffolds provides an attractive way to overcome these hurdles, which mimic an optimal non-immune environment with native three-dimensional structures and various bioactive components. The consequent cell-seeded construct based on dECM scaffolds, especially stem cell-recellularized construct, is considered an ideal choice for regenerating functional or

In [49]:
title = data['documents'][0]['passages'][0]['text']

pmid = data['documents'][0]['passages'][0]['infons']['article-id_pmid']

content_list = [pmid, title, main_content]

In [50]:
# Dataframe to store retrieved data
studies_table = pd.DataFrame(columns=['id', 'title', 'body_content'])
studies_table

Unnamed: 0,id,title,body_content


In [51]:
studies_table.loc[len(studies_table)] = content_list
pd.set_option('display.max_colwidth', 1)

In [54]:
studies_table
studies_table['body_content'].replace('\t', '') # some text has \t so it needs to be removed (check a few of the text to see what they contain)

0    Decellularized extracellular matrix scaffolds: Recent trends and emerging strategies in tissue engineeringThe application of scaffolding materials is believed to hold enormous potential for tissue regeneration. Despite the widespread application and rapid advance of several tissue-engineered scaffolds such as natural and synthetic polymer-based scaffolds, they have limited repair capacity due to the difficulties in overcoming the immunogenicity, simulating in-vivo microenvironment, and performing mechanical or biochemical properties similar to native organs/tissues. Fortunately, the emergence of decellularized extracellular matrix (dECM) scaffolds provides an attractive way to overcome these hurdles, which mimic an optimal non-immune environment with native three-dimensional structures and various bioactive components. The consequent cell-seeded construct based on dECM scaffolds, especially stem cell-recellularized construct, is considered an ideal choice for regenerating function

In [59]:
studiesIdList[:2]

['34901526', '31075518']