In [2]:
from bs4 import BeautifulSoup

import requests
import re
import time

In [3]:
%%bash
pip list | grep beautifulsoup4

beautifulsoup4                4.11.1


# Functions

In [4]:
def soup(url, parser="html.parser"):
    """Get a web page and parse
    
    Args:
        url (string): url of a web page.
        parser (string): parser available in bs4.

    Returns:
        soup_obj (bs4.BeautifulSoup): soup oject
    """
    
    response = requests.get(url)
    #to avoid character corruption
    response.encoding = response.apparent_encoding
    soup_obj = BeautifulSoup(response.text, parser)
    
    return soup_obj

# American Economic Review :[url](https://www.aeaweb.org/journals/aer/issues)

In [5]:
base_url_aer = "https://www.aeaweb.org"
url_aer_issues = base_url_aer + "/journals/aer/issues"

## Get issue titles and links 

### notebooks

In [7]:
# get lists of issues from issues page

base_url = base_url_aer

soup_obj = soup(url_aer_issues)
lst_issues = soup_obj.find_all('a', href=re.compile('/issues/'))

dic_issue_link_title = {}

for issue in lst_issues:
    link = base_url + issue['href']
    title = issue.text
    
    dic_issue_link_title[link] = title

# debug
print("link: ",link, ", ", "title: ", title)

link:  https://www.aeaweb.org/issues/160 ,  title:  March 1999 (Vol. 89, No.1 )


## function

In [8]:
def scrape_aer_issues():
    """get issue titles and links
    
    Get issues from AER web page.
    Older issues is stored in JSTOR.
    
    Args:
    
    Returns:
        dic_link_title (dict): key is issue link, value is issue title.
    """

    dic_issue_link_title = {}
    
    soup_obj = soup(url_aer_issues)
    lst_issues = soup_obj.find_all('a', href=re.compile('/issues/'))

    for issue in lst_issues:
        link = base_url + issue['href']
        title = issue.text

        dic_issue_link_title[link] = title
        
        
    return dic_issue_link_title


## Get papers in one issue.

### notebook

In [27]:
# get lists of paper from a issue page


# ex:
url_aer_issue = "https://www.aeaweb.org/issues/687"

soup_obj = soup(url_aer_issue)
lst_papers = soup_obj.find_all('a', href=re.compile('/articles?'))


dic_paper_link_title = {}

for paper in lst_papers:
    link = base_url_aer + paper['href']
    title = paper.text
    
    if title=="Front Matter":
        continue
    
    dic_paper_link_title[link] = title

#debug
print("link: ",link, ", ", "title: ", title)

link:  https://www.aeaweb.org/articles?id=10.1257/mic.20200129 ,  title:  Stability of Experimental Results: Forecasts and Evidence


In [29]:
dic_paper_link_title

{'https://www.aeaweb.org/articles?id=10.1257/mic.20180033': 'Term Limits and Bargaining Power in Electoral Competition',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20200041': 'Political Competition with Endogenous Party Formation and Citizen Activists',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20200074': 'Adviser Compensation, Endogenous Entry, and the Advice Gap',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20200059': 'Class Actions and Private Antitrust Litigation',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20200128': 'Contract Duration and the Costs of Market Transactions',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20190307': 'Platform Governance',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20190339': 'A Theory of Crime and Vigilance',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20200049': 'Trust and Promises over Time',
 'https://www.aeaweb.org/articles?id=10.1257/mic.20170139': 'Strategic Teaching and Learning in Games',
 'https://www.ae

### function

In [11]:
def scrape_aer_papers_in_issue(url_aer_issue):
    """ get paper titles and links of a issue.
    
    Args:
        url_aer_issue (str): link of a issue page.(ex: "https://www.aeaweb.org/issues/689")
    
    Returns:
        dic_paper_link_title (dict): key is paper link, value is paper title.
    """
    
    dic_paper_link_title = {}
    
    soup_obj = soup(url_aer_issue)
    lst_papers = soup_obj.find_all('a', href=re.compile('/articles?'))
    
    for paper in lst_papers:
        link = base_url_aer + paper['href']
        title = paper.text

        if title=="Front Matter":
            continue

        dic_paper_link_title[link] = title
        
    return dic_paper_link_title
    
    

## Get information from a paper page
### notebook

In [13]:
# get information from a paper page

#ex: 
url_aer_paper = "https://www.aeaweb.org/articles?id=10.1257/000282803322655482"

#ex2:
url_aer_paper = "https://www.aeaweb.org/articles?id=10.1257/aer.20181811"



soup_obj = soup(url_aer_paper)
lst_metadata = soup_obj.find_all('meta')


paper_info = {}
lst_dict_author = []
str_abstract = ''
str_title = ''

#TODO: get "Additional Material"information.
lst_additional_materials = []


dict_author = {
        'name': '',
        'institution': ''
    }

for metadata in lst_metadata:
    name_metadata = metadata.get('name')
    
    if(name_metadata == 'citation_author'):    
        author_name = metadata.get('content')
        dict_author['name'] = author_name
        continue
        
        
    if(name_metadata == 'citation_author_institution'):
        author_institution = metadata.get('content')
        dict_author['institution'] = author_institution

        copy_dict_author = dict_author.copy()        
        lst_dict_author.append(copy_dict_author)
        continue

    
    if(name_metadata == 'citation_title'):
        str_title = metadata.get('content')
        continue
    
    if(name_metadata == 'twitter:description'):
        str_abstract = metadata.get('content')
        continue
        
        
#TODO: get category text information.


# get category codes
lst_jel_categories = []
soup_class_code = soup_obj.find_all("strong", {"class":"code"})

for item in soup_class_code:
    category_code = item.contents[0]
    lst_jel_categories.append(category_code)


In [21]:
lst_jel_categories

['G21', 'G51', 'L25', 'O16', 'P34', 'G21', 'G51', 'L25', 'O16', 'P34']

### WIP: Get "Additional Material"

In [15]:
a = soup_obj.find(id="additionalMaterials")

#a.contents[1].contents[1]
#a.contents[1].contents[1]['href']
#a.contents[1].contents[1].contents[0].strip()

### WIP: Get category code and info

In [16]:
soup_class_code = soup_obj.find_all("strong", {"class":"code"})

for item in soup_class_code:
    category_code = item.contents[0]
    lst_jel_categories.append(category_code)

In [17]:
soup_class_code = soup_obj.find_all("ul", {"class":"jel-codes"})

for item in soup_class_code:
    print(item.contents[1].contents[2].text.strip())
    
    print(type(item.contents[1].contents[2].text))
    
    break

Banks; Depository Institutions; Micro Finance Institutions; Mortgages
<class 'str'>


### function

In [18]:
def scrape_aer_info_in_paper(url_aer_paper):
    """get information about a paper.
    
    returns paper information something like below.
    
        {
            "title_str": "Aggregating Distributional Treatment Effects: A Bayesian Hierarchical Analysis of the Microcredit Literature",
            "author_lst": [
                {
                    "name": "Meager, Rachael",
                    "institution": "London School of Economics and Political Science"
                }
            ],
            "abstract_str": "(June 2022) - Expanding credit access in developing contexts could help some households while harming others. Microcredit studies show different effects at different quantiles of household profit, including some negative effects; yet these findings also differ across studies. I develop new Bayesian hierarchical models to aggregate the evidence on these distributional effects for mixture-type outcomes such as household profit. Applying them to microcredit, I find a precise zero effect from the fifth to seventy-fifth quantiles, and uncertain yet large effects on the upper tails, particularly for households with business experience. These quantile estimates are more reliable than averages because the data are fat tailed.",
            "categorycode_lst": [
                "G21",
                "G51",
                "L25",
                "O16",
                "P34"
            ]
        }
    
    
    Args:
        url_aer_paper (str): link of a paper page.(ex: "https://www.aeaweb.org/articles?id=10.1257/aer.20181811")
    
    Returns:
        paper_info (dict): paper information
    """
    
    soup_obj = soup(url_aer_paper)
    lst_metadata = soup_obj.find_all('meta')

    paper_info = {}
    lst_dict_author = []
    str_title = ''
    str_abstract = ''

    #TODO: get "Additional Material"information.
    lst_additional_materials = []
    
    #TODO: get category text information.
    lst_dict_category = []
    
    lst_jel_categorycode = []


    dict_author = {
            'name': '',
            'institution': ''
        }

    for metadata in lst_metadata:
        name_metadata = metadata.get('name')

        if(name_metadata == 'citation_author'):    
            author_name = metadata.get('content')
            dict_author['name'] = author_name
            continue


        if(name_metadata == 'citation_author_institution'):
            author_institution = metadata.get('content')
            dict_author['institution'] = author_institution

            copy_dict_author = dict_author.copy()        
            lst_dict_author.append(copy_dict_author)
            continue
    
    
        if(name_metadata == 'citation_title'):
            str_title = metadata.get('content')
            continue
    

        if(name_metadata == 'twitter:description'):
            str_abstract = metadata.get('content')
            continue

    # get category codes
    soup_class_code = soup_obj.find_all("strong", {"class":"code"})

    for item in soup_class_code:
        category_code = item.contents[0]
        lst_jel_categorycode.append(category_code)

    
    paper_info["title_str"] = str_title
    paper_info["author_lst"] = lst_dict_author
    paper_info["abstract_str"] = str_abstract
    paper_info["categorycode_lst"] = lst_jel_categorycode
    
    #TODO
    #paper_info["category"] = xxx
    #paper_info["additional_material"] = xxx
    
    
    return paper_info

In [22]:
scrape_aer_info_in_paper("https://www.aeaweb.org/articles?id=10.1257/aer.20210220")

{'title_str': 'Supply Network Formation and Fragility',
 'author_lst': [{'name': 'Elliott, Matthew', 'institution': 'U of Cambridge'},
  {'name': 'Golub, Benjamin', 'institution': 'Northwestern U'},
  {'name': 'Leduc, Matthew V.',
   'institution': 'Paris School of Economics and U of Paris 1 Pantheon-Sorbonne'}],
 'abstract_str': '(August 2022) - We model the production of complex goods in a large supply network. Each firm sources several essential inputs through relationships with other firms. Individual supply relationships are at risk of idiosyncratic failure, which threatens to disrupt production. To protect against this, firms multisource inputs and strategically invest to make relationships stronger, trading off the cost of investment against the benefits of increased robustness. A supply network is called fragile if aggregate output is very sensitive to small aggregate shocks. We show that supply networks of intermediate productivity are fragile in equilibrium, even though this 

# All in Once

In [None]:
#Get paper information from a limited number's issues.

limit = 5

paper_info_lst = []

aer_issues = scrape_aer_issues()
keys_aer_issues = list(aer_issues.keys())

for i in range(limit):
    issue_url = keys_aer_issues[i]
    papers_dict = scrape_aer_papers_in_issue(issue_url)
    
    paper_url_lst = list(papers_dict.keys())
    for paper_url in paper_url_lst:
        paper_info = scrape_aer_info_in_paper(paper_url)
        
        # extract DOI from AER URL.
        keyword = '/articles?id='
        slice_index = paper_url.find(keyword)
        slice_index += len(keyword)
        
        # use DOI(Digital Object Identifier)  
        # replace '/' with '_' because this DOI will be used as dataID in DB. (or an error occur.)
        doi = paper_url[slice_index:]
        doi_replaced = doi.replace('/', '_')
        
        #debug:
        print("DOI before: ", doi)
        print("DOI after: ", doi_replaced)
        
        paper_info_lst.append(paper_info)
