In [1]:
import re
import requests
import time
import pandas as pd

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
from tqdm import tqdm

In [2]:
## Scraper configuration.
## parent_link is the root URL of the site; search_link_modifier is the path that is
## appended to parent_link to reach the first page of search results (the search
## term is embedded in that path).
parent_link = 'https://www.hindawi.com'
search_link_modifier = '/search/all/cellular+aging/'

In [3]:
def fetch_page(page_link, timeout=5):
    """
    Download a web page and parse it with BeautifulSoup's "html.parser".

    Args: 
        page_link (str): any valid URL

        timeout (float or tuple): seconds passed to requests.get as its timeout.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        page_content (obj): BeautifulSoup parsable webcontent associated with the input URL 

    Raises:
        Whatever requests.get raises (for example on a timeout or a failed
        connection); nothing is caught here.

    To-do:
        Add error handling based on page_response. The HTTP status code is not
        inspected, so the body of an error page is parsed like any other page.
    """
    page_response = requests.get(page_link, timeout=timeout)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    return page_content

In [4]:
def links_from_parent(parent_content, search_substring):
    """
    Return the href of every <a> tag whose href matches a pattern.

    Args: 
        parent_content (obj): BeautifulSoup parsed html

        search_substring (str): string (may contain regex syntax) to search for
            within link URL. It is compiled with re.compile and handed to
            find_all as the href filter.

    Returns:
        list_links (list): List of links meeting search criteria, in the order
            find_all yields them. Duplicates are kept. Empty when nothing matches.
    """
    href_pattern = re.compile(search_substring)
    # find_all is the current spelling; findAll is only kept by bs4 as a legacy alias.
    return [anchor.get('href') for anchor in parent_content.find_all('a', href=href_pattern)]
            
# def link_next_parent(parent_content, journal_name):
#     if journal_name == 'hindawi':
#         for link in parent_content.findAll('a', href=re.compile('search/')):
#             print(link.get('href'))

In [5]:
def get_all_search_links(parent_link, search_link_modifier, next_search_modifier, max_count = 1000, **kws):
    """
    Walk the paginated search results and collect the link of every result page.

    Starting from the first search page, each page is fetched and the LAST link
    on it that matches next_search_modifier is taken as the link to the next
    page. The walk stops when that link has already been collected, when a page
    contains no matching link at all, or after max_count pages were fetched.

    Args:
        parent_link (str): 
            URL of parent site ('https://www.hindawi.com')
        
        search_link_modifier (str): 
            string to add to parent URL that links to the first search term
            ('/search/all/biological+aging/' within 'https://www.hindawi.com/search/all/biological+aging/')
    
        next_search_modifier (str): 
            string (plus optional regex) within a URL that signifies a link to the next
            page of search results ('search/' within 'https://www.hindawi.com/search/all/biological+aging/2/')
            
        max_count (int):
            maximum number of pages to fetch, to prevent indefinite retrieval.
            The returned list therefore holds at most max_count + 1 links.

        **kws:
            accepted for backward compatibility and ignored.
    
    Returns:
        list_links (list): 
            All links to all pages of search results within the parent site which correspond to the search term.
            The first element is always the search_link_modifier that was passed in.
    """
    list_links = [search_link_modifier]
    for _ in range(max_count):
        search_content = fetch_page(parent_link + search_link_modifier)
        candidates = links_from_parent(search_content, next_search_modifier)

        # A page without any matching link has no "next" page; stop here
        # instead of failing with an IndexError on candidates[-1].
        if not candidates:
            break

        next_link = candidates[-1]
        # Seeing an already collected link means the pagination has looped back.
        if next_link in list_links:
            break

        list_links.append(next_link)
        search_link_modifier = next_link
    return list_links

In [6]:
def get_all_journal_links(parent_link, search_link_list, journal_link_identifier):
    """
    Fetch every search-result page and gather the journal links found on them.

    Args:
        parent_link (str): 
            URL of parent site ('https://www.hindawi.com')
        
        search_link_list (list): 
            list of strings to add to the parent URL to get the search pages
            (['/search/all/biological+aging/', '/search/all/biological+aging/2/'] 
            within 'https://www.hindawi.com/search/all/biological+aging/' and 
            'https://www.hindawi.com/search/all/biological+aging/2/', respectively)
    
        journal_link_identifier (str): 
            string (plus optional regex) within a URL that signifies a link to a journal
            ('journals/[a-z]' within 'https://www.hindawi.com/journals/[string starting with any letter]')
    
    Returns:
        list_links (list): 
            All links to all journals within the parent site which correspond to the search term,
            in page order. Links that occur more than once are not de-duplicated.
    """
    collected_links = []
    for page_modifier in tqdm(search_link_list):
        page_soup = fetch_page(parent_link + page_modifier)
        collected_links.extend(links_from_parent(page_soup, journal_link_identifier))
    return collected_links


In [7]:
def get_journal_text(parent_link, journal_link_modifier, text_container, text_identifier):
    """
    Fetch one journal article page and return the text around a marker tag.

    The first tag matching text_container / text_identifier is located, and the
    text of that tag's PARENT element is extracted, with ' ' inserted between
    the individual text fragments.

    Args:
        parent_link (str): 
            URL of parent site ('https://www.hindawi.com')
    
        journal_link_modifier (str):
            string within a URL that signifies a link to a specific journal article
            ('/journals/omcl/2012/919832/' within 'https://www.hindawi.com/journals/omcl/2012/919832/')
    
        text_container (str):
            string that determines what kind of html tag contains the text to scrape. 
            (most commonly 'div' but may be 'p' or some other tag)
        
        text_identifier (dict):
            dictionary that defines the properties of the text container from which to scrape the text.
            ({'class': 'article_type'} or {'id': 'article_text'})
            
    Returns:
        unicode_text (str):
            The text of the journal in unicode, or the placeholder
            'Text could not be retrieved.' when the text could not be extracted.

    Raises:
        Errors raised by fetch_page are not caught here and propagate to the caller.
    """
    
    page_content = fetch_page(parent_link + journal_link_modifier)
    try:
        journal_text = page_content.find(text_container, text_identifier).parent.get_text(' ')
    except Exception:
        # Best effort: a page without the marker tag makes find() return None and
        # the attribute access fail. Catching Exception rather than using a bare
        # except keeps KeyboardInterrupt / SystemExit able to stop a long scrape.
        journal_text = 'Text could not be retrieved.'
        
    
    #Convert journal text to unicode and return unicode markup.
    unicode_text = UnicodeDammit(journal_text).unicode_markup
    
    return unicode_text #Can be subbed with journal_text if unicode is not preferred

In [8]:
def process_text(text):
    """
    Strip every tab and newline character from a string.

    Args:
        text (str): text to clean

    Returns:
        text (str): the input with all tab and newline characters removed
            (they are deleted, not replaced by a space)
    """
    for unwanted in ('\t', '\n'):
        text = text.replace(unwanted, '')
    return text

In [9]:
def get_all_journal_text(parent_link, all_journal_modifiers, text_container, text_identifier):
    """
    Download and clean the text of every journal article in a list of links.

    Args:
        parent_link (str): 
            URL of parent site ('https://www.hindawi.com')
    
        all_journal_modifiers (list):
            list of strings, each the part of a URL that links to a specific journal article
            ('/journals/omcl/2012/919832/' within 'https://www.hindawi.com/journals/omcl/2012/919832/')
    
        text_container (str):
            string that determines what kind of html tag contains the text to scrape. 
            (most commonly 'div' but may be 'p' or some other tag)
        
        text_identifier (dict):
            dictionary that defines the properties of the text container from which to scrape the text.
            ({'class': 'article_type'} or {'id': 'article_text'})
            
    Returns:
        journal_text (dict):
            Maps each journal link to its text after process_text was applied.
            A link that appears more than once is fetched each time and keeps
            the text from its last occurrence.
    """
    return {
        modifier: process_text(
            get_journal_text(parent_link, modifier, text_container, text_identifier)
        )
        for modifier in tqdm(all_journal_modifiers)
    }

In [10]:
def save_journal_text(filepath, filename, journal_text_dict, 
                      method = 'a+', encoding = 'utf-8', **kwargs):
    """
    Write every text of a dictionary to a single text file.

    Args:
        filepath (str):
            directory part of the target path. It is concatenated with filename
            as-is, so it must end with a path separator ('./').

        filename (str):
            name of the file to write

        journal_text_dict (dict):
            dictionary whose values are the texts to write; the keys are not written

        method (str):
            mode passed to open(). Defaults to 'a+' (append).

        encoding (str):
            text encoding passed to open(). Defaults to 'utf-8'.

        **kwargs:
            accepted for backward compatibility and ignored.

    Returns:
        None. Each text is written followed by five newline characters.
    """
    dividing_text = '\n\n\n\n\n'
    # The with-block closes the file even if a write fails part-way through.
    with open(filepath + filename, method, encoding=encoding) as textfile:
        for text in tqdm(journal_text_dict.values()):
            textfile.write(text + dividing_text)

In [11]:
#def get_batches()

In [12]:
# Starting at the first search page, follow the links matching 'search/' to
# enumerate every page of search results.
search_links = get_all_search_links(
    parent_link,
    search_link_modifier,
    next_search_modifier='search/',
)

In [13]:
# Collect the links matching 'journals/[a-z]' from every search-result page
# (one HTTP request per page).
journal_links = get_all_journal_links(
    parent_link,
    search_links,
    journal_link_identifier='journals/[a-z]',
)

100%|████████████████████████████████████████████████████████████████████████████████| 629/629 [05:20<00:00,  1.56it/s]


In [14]:
# Fetch and clean the text of every collected link. This is the slow step:
# one HTTP request per link (close to three hours in the recorded run).
journals = get_all_journal_text(
    parent_link,
    journal_links,
    text_container='div',
    text_identifier={'class': 'article_type'},
)

100%|██████████████████████████████████████████████████████████████████████████| 15724/15724 [2:52:58<00:00,  1.39s/it]


In [15]:
# Dump all article texts into one plain-text file in the working directory.
# filepath must end with a separator because it is concatenated with filename.
filepath = './'
filename = 'ResearchJournals_Senescence.txt'
# method='w+' truncates an existing file instead of appending to it.
save_journal_text(filepath, filename, journals, method='w+')

100%|██████████████████████████████████████████████████████████████████████████| 15724/15724 [00:01<00:00, 8773.43it/s]


In [16]:
# One row per article link (the dict keys become the index) and a single
# 'ArticleText' column holding the scraped text.
df_journals = pd.DataFrame.from_dict(
    data=journals,
    orient='index',
    columns=['ArticleText'],
)
# NOTE: the file is tab-separated even though it carries a .csv extension.
df_journals.to_csv(path_or_buf='./ResearchJournals_Senescence.csv', sep='\t')

In [17]:
# Show a bounded preview rather than rendering the whole (multi-thousand-row) frame.
df_journals.head(10)

Unnamed: 0,ArticleText
/journals/omcl/2012/919832/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2017/2398696/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2017/7941563/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2017/2784153/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2017/7928981/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2012/616128/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2017/7280690/,Oxidative Medicine and Cellular Longevity Volu...
/journals/omcl/2015/732914/,Oxidative Medicine and Cellular Longevity Volu...
/journals/jar/2011/814096/,"Journal of Aging Research Volume 2011, Article..."
/journals/bmri/2016/3208429/,"BioMed Research International Volume 2016, Art..."
