In [1]:
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd

Journal searches to scrape:  
https://www.hindawi.com/search/all/biological+aging/  
https://www.sciencedirect.com/journal/ageing-research-reviews/issues (limited availability)  
https://academic.oup.com/ageing/search-results?q=aging&allJournals=1&f_ContentType=Journal+Article&f_ArticleTypeDisplayName=Research+Article&fl_SiteID=5255&access_openaccess=true&access_unlocked=true&qb=%7b%22q%22%3a%22aging%22%7d&access_free=true (search could be expanded, but no search includes many results regarding injuries).  


In [2]:
## Define the parent website plus the path to the first page of search results.
## The two strings are concatenated (parent_link + search_link_modifier) to build the first URL that is fetched.
parent_link = 'https://www.hindawi.com'  # site root, prepended to every relative link before fetching
search_link_modifier = '/search/all/cellular+aging/'  # path of the first results page for the search term

In [3]:
def fetch_page(page_link, timeout=5):
    """
    Download a web page and parse it with BeautifulSoup's "html.parser".

    Args: 
        page_link (str): any valid URL

        timeout (float): seconds to wait for the server before giving up (default: 5)
        
    Returns:
        page_content (obj): BeautifulSoup parsable webcontent associated with the input URL 

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status code
        requests.exceptions.RequestException: if the request itself fails (connection error, timeout, ...)
    """
    page_response = requests.get(page_link, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page as if it were real content.
    page_response.raise_for_status()
    page_content = BeautifulSoup(page_response.content, "html.parser")
    return page_content

In [4]:
def links_from_parent(parent_content, search_substring):
    """
    Collect the href of every <a> tag whose href matches a pattern.

    Args: 
        parent_content (obj): BeautifulSoup parsed html
            
        search_substring (str): string (compiled as a regular expression) to search for within link URL
   
    Returns:
        list_links (list): List of links meeting search criteria, in document order.
            Empty when nothing matches.
    """ 
    link_pattern = re.compile(search_substring)
    # find_all is the current Beautiful Soup name; findAll is only kept as a deprecated alias.
    return [anchor.get('href') for anchor in parent_content.find_all('a', href=link_pattern)]
            
# def link_next_parent(parent_content, journal_name):
#     if journal_name == 'hindawi':
#         for link in parent_content.findAll('a', href=re.compile('search/')):
#             print(link.get('href'))

In [5]:
def get_all_search_links(parent_link, search_link_modifier, next_search_modifier, max_count = 1000, **kws):
    """
    Follow the pagination of a site search and collect the link of every results page.

    Starting from the first results page, each page is fetched and the LAST link on it
    that matches next_search_modifier is taken as the next page to visit. The walk stops
    when that link has already been collected, when a page contains no matching link at
    all, or after max_count pages have been fetched.

    Args:
        parent_link (str): 
            URL of parent site ('https://www.hindawi.com')
        
        search_link_modifier (str): 
            string to add to parent URL that links to the first search term
            ('/search/all/biological+aging/' within 'https://www.hindawi.com/search/all/biological+aging/')
    
        next_search_modifier (str): 
            string (plus optional regex) within a URL that signifies a link to the next
            page of search results ('search/' within 'https://www.hindawi.com/search/all/biological+aging/2/')
            
        max_count (int):
            maximum number of pages to fetch, to prevent indefinite retrieval.

        **kws:
            accepted for call compatibility; currently ignored.
    
    Returns:
        list_links (list): 
            All links to all pages of search results within the parent site which correspond to the search term.
            The first element is always the search_link_modifier that was passed in.
    """
    list_links = [search_link_modifier]
    pages_fetched = 0
    while pages_fetched < max_count:
        search_content = fetch_page(parent_link + search_link_modifier)
        pages_fetched += 1

        candidate_links = links_from_parent(search_content, next_search_modifier)
        if not candidate_links:
            # No pagination link on this page (single-page result or unexpected markup):
            # stop instead of failing on candidate_links[-1].
            break

        # The last matching link is treated as the pointer to the next results page.
        next_link = candidate_links[-1]
        if next_link in list_links:
            # Already visited: we have reached the final page.
            break

        list_links.append(next_link)
        search_link_modifier = next_link
    return list_links

In [12]:
def get_all_journal_links(parent_link, search_link_list, journal_link_identifier):
    """
    Gather the journal links found on every page of search results.

    Args:
        parent_link (str): 
            URL of parent site ('https://www.hindawi.com')
        
        search_link_list (list): 
            list of strings, each added to the parent URL to get one search page
            (['/search/all/biological+aging/', '/search/all/biological+aging/2/'] for
            'https://www.hindawi.com/search/all/biological+aging/' and 
            'https://www.hindawi.com/search/all/biological+aging/2/', respectively)
    
        journal_link_identifier (str): 
            string (plus optional regex) within a URL that signifies a link to a journal
            ('journals/[a-z]' within 'https://www.hindawi.com/journals/[string starting with any letter]')
    
    Returns:
        list_links (list): 
            All matching links from all search pages, in page order then in the order
            they are returned for each page. Duplicates are kept.
    """
    # Pages are fetched one at a time, in the order given; their links are flattened into one list.
    return [
        journal_href
        for page_modifier in search_link_list
        for journal_href in links_from_parent(
            fetch_page(parent_link + page_modifier), journal_link_identifier
        )
    ]


## Everything below is WIP

In [None]:
def get_journal_text(journal_link_modifier, start_identifier, end_identifier):
    """
    WIP: fetch a journal article page so that its text can be scraped.

    Only the page download is implemented so far; start_identifier and end_identifier
    are not used yet and nothing is returned.

    Args:
        journal_link_modifier (str):
            string within a URL that signifies a link to a specific journal article
            ('/journals/omcl/2012/919832/' within 'https://www.hindawi.com/journals/omcl/2012/919832/')
    
        start_identifier (str):
            String with optional regex that identifies an html tag at which to begin scraping text.
        
        end_identifier (str):
            String with optional regex that identifies an html tag at which to stop scraping text.
            
    Returns:
        journal_text (str): intended return value; not implemented yet (currently returns None).
    """
    # Fetch the article addressed by the argument. The global parent_link supplies the site root.
    page_content = fetch_page(parent_link + journal_link_modifier)
    # TODO: extract the text between start_identifier and end_identifier and return it.

In [8]:
# Walk the paginated search results starting at search_link_modifier;
# 'search/' is the pattern used to recognise links to further results pages.
search_links = get_all_search_links(parent_link, search_link_modifier, 'search/')

In [15]:
# Fetch every search results page and keep each href matching 'journals/[a-z]'.
journal_links = get_all_journal_links(parent_link, search_links, 'journals/[a-z]')

In [17]:
# Total number of links collected across all search pages (duplicates are not removed).
len(journal_links)

15691

In [19]:
# Preview the first nine links instead of displaying the whole list.
journal_links[0:9]

['/journals/omcl/2012/919832/',
 '/journals/omcl/2017/2398696/',
 '/journals/omcl/2017/7941563/',
 '/journals/omcl/2017/2784153/',
 '/journals/omcl/2017/7928981/',
 '/journals/omcl/2012/616128/',
 '/journals/omcl/2017/7280690/',
 '/journals/omcl/2015/732914/',
 '/journals/jar/2011/814096/']