In [3]:
import requests
import json
import re
import bs4
import time
import numpy as np
import pandas as pd

import sys
from os.path import exists
import signal

import CONFIG

In [4]:
def getBSFromURL(url):
    """Fetch ``url`` and return its body parsed by ``getBS``.

    The request is sent with ``CONFIG.HEADER`` as headers and a 10-second
    timeout. A failing request is reported on stdout and then re-raised.

    Raises:
        requests.exceptions.Timeout: the request timed out.
        requests.exceptions.RequestException: any other request failure.
    """
    try:
        r = requests.get(url, headers = CONFIG.HEADER, timeout = 10)
        return getBS(r.text)

    # Timeout derives from RequestException, so it must be handled first;
    # with the broader handler on top the timeout branch is unreachable.
    except requests.exceptions.Timeout:
        print("Timeout")
        raise
    except requests.exceptions.RequestException:
        print("Connection Error")
        raise

def getBS(html):
    """Parse ``html`` with the 'html.parser' backend and return the
    resulting ``bs4.BeautifulSoup`` object."""
    return bs4.BeautifulSoup(html, 'html.parser')

## Get all Issues from selected decades

We scrape all the information of articles first from the main page of JSTOR.

By default, as on https://www.jstor.org/journal/amereconrevi, only the decade titles are listed. Clicking on a decade sends a request to the JSTOR backend, which returns the full list of issues within that decade.

These requests have the form `https://www.jstor.org/journal/amereconrevi/decade/` + `ID`, where `ID` is stored in the `data-filter` attribute of the element with class `expand-arrow`.

The response is a JSON object containing the HTML to append to the `ul` element. We can simply parse that HTML into a `soup` object and extract each issue's link and name, which are the `href` and the `text` of each `a` element, respectively.

In [7]:
# a very simple sleeping function to avoid getting banned
def ImNotRobot(t = 5):
    """Pause for a random duration drawn uniformly from ``[t, t + 5)`` seconds.

    The chosen duration is printed before sleeping.
    """
    pause = np.random.uniform(t, t + 5)
    print(f"sleeping {pause} seconds")
    time.sleep(pause)

# see if list of issue file already exist
def file(path_to_file):
    """Return True when ``path_to_file`` exists, as reported by ``os.path.exists``."""
    found = exists(path_to_file)
    return found


# get url that returns all list if issues within the decade
def getURLinDecade(journal_url, decade_list):
    """Build the per-decade request URLs for a journal.

    The journal page is fetched and its ``.expand-arrow`` elements are
    inspected. Elements whose ``data-decade`` attribute, converted to int, is
    in ``decade_list`` are kept; for each of them the URL
    ``{journal_url}/decade/{data-filter}`` is returned, in page order.
    """
    page = getBSFromURL(journal_url)

    wanted = [
        arrow for arrow in page.select(".expand-arrow")
        if int(arrow['data-decade']) in decade_list
    ]

    return [f"{journal_url}/decade/{arrow['data-filter']}" for arrow in wanted]

# go over the urls and get the url for all issues

def handleIssueText(t):
    """Return the part of ``t`` before the first 'pp.', stripped of
    surrounding whitespace (all of ``t`` when 'pp.' does not occur)."""
    head, _sep, _tail = t.partition('pp.')
    return head.strip()
    

    
def getAllIssuesInDecade(url):
    """Return the issues listed for one decade.

    ``url`` is requested with ``CONFIG.HEADER``; the response body is parsed
    as JSON and its first element is handed to ``getBS`` as HTML. Every
    ``<a>`` tag found there becomes one record.

    Returns:
        list[dict]: one ``{'name': ..., 'url': ...}`` dict per link, where
        ``name`` is the link text cleaned by ``handleIssueText`` and ``url``
        is the link's ``href`` attribute.
    """
    print ("processing " + url)
    # Use a 10-second timeout: without one, requests waits indefinitely
    # on a stalled connection.
    issue_request = requests.get(url, headers = CONFIG.HEADER, timeout = 10)

    issue_html = json.loads(issue_request.text)[0]
    soup = getBS(issue_html)

    return [
        {
            'name': handleIssueText(issue.text),
            'url': issue['href'],
        }
        for issue in soup.select("a")
    ]



In [8]:
# CSV that caches the list of issues; it is read back by the scraping loop.
all_journal_file_name = CONFIG.JOURNAL_NAME + '_all_issue.csv'

# Scrape the issue list only when no cached file exists yet.
if not exists(all_journal_file_name):
    journal_url = CONFIG.URL
    URLinDecade = getURLinDecade(journal_url, CONFIG.SCRAPE_DECADES)

    allIssues = []
    for u in URLinDecade:
        ImNotRobot()
        allIssues += getAllIssuesInDecade(u)

    all_issues_df = pd.json_normalize(allIssues)
    # 0 = the issue's articles have not been collected yet.
    all_issues_df['scraped'] = 0
    # Hand pandas the path rather than a text-mode handle so that it
    # controls newline handling itself.
    all_issues_df.to_csv(all_journal_file_name, header=True, index=False)
        

## Handling allIssues

We want to create a csv file, or xlsx file, that has the following column names


In [16]:
# Leading columns of the article table, in output order.
# The author columns are appended after these.
colNames = [
    "Source URL", "Title",
    "Long Volume", "Issue Vol.", "Issue No.",
    "Month", "Year",
    "Abstract",
]

I store the results for each issue in another file.

In [6]:
def getArticles(url):
    """Download an issue page and pull out its label and contributor blocks.

    Returns a tuple ``(issue_name, articlesWithAuthor)``: the stripped text of
    the first ``div.issue`` element, and the list of all ``div.contrib``
    elements on the page.

    Raises ``Blocked.BlockedError`` when the page has no ``div.issue`` element.
    """
    page = getBSFromURL(url)

    issue_headers = page.select('div.issue')
    if issue_headers:
        return issue_headers[0].text.strip(), page.select('div.contrib')

    # No issue header on the page: treat the response as a block page.
    import Blocked
    raise Blocked.BlockedError("You might be blocked")

def parseIssue(t):
    """Split an issue label into volume, number, month and year.

    Two layouts are handled:

    * ``"Vol. 77, No. 4, Jul., 2009"`` - comma between month and year
    * ``"Vol. 87, No. 4, July 2019"``  - month and year in a single field

    Args:
        t: the issue label.

    Returns:
        tuple: ``(t, vol, no, month, year)`` with ``t`` unchanged and the
        other fields stripped of surrounding whitespace.

    Raises:
        ValueError: if the label has fewer than three comma-separated fields
            or its date field is empty.
    """
    fields = [field.strip() for field in t.split(',')]
    if len(fields) < 3:
        raise ValueError(f"Unrecognised issue label: {t!r}")

    vol, no = fields[0], fields[1]

    if len(fields) == 3:
        # The year is the last whitespace-separated token and everything
        # before it is the month, so a range such as "July - October 2019"
        # stays intact instead of yielding year "-".
        date = fields[2].rsplit(None, 1)
        if not date:
            raise ValueError(f"Unrecognised issue label: {t!r}")
        year = date[-1]
        month = date[0] if len(date) == 2 else ''
    else:
        # The year is the last field; whatever lies between the issue number
        # and the year is kept as the month part.
        year = fields[-1]
        month = ', '.join(fields[2:-1])

    return t, vol, no, month, year

def parseAuthor(a, max_authors=30):
    """Split a contributor string into individual author names.

    Names are separated at commas and at the word "and"; a comma that is
    directly followed by "Jr." is kept as part of the name.

    Args:
        a: the contributor string.
        max_authors: number of author slots the result is padded to.

    Returns:
        list: ``[a, author_1, author_2, ...]`` padded with ``None`` up to
        ``max_authors`` author slots. Longer lists are not truncated.
    """
    # The optional leading comma lets the first alternative consume all of
    # ", and ", so "A, B, and C" no longer produces a third name "and C".
    authors = re.split(r',?\s+and\s+|,(?!\s*Jr\.)\s*', a)
    split_authors = [e.strip() for e in authors]

    # A negative multiplier yields an empty list, so nothing is appended when
    # there are already more than max_authors names.
    split_authors += [None] * (max_authors - len(split_authors))
    return [a] + split_authors

In [13]:
# Reload the issue list and keep only the issues that are not scraped yet.
all_issues_df = pd.read_csv(all_journal_file_name)
needed_issues = all_issues_df[all_issues_df['scraped'] ==0 ]
display(needed_issues)

# Articles are appended to this file issue by issue; it has no header row.
all_articles_file_name = CONFIG.JOURNAL_NAME + "_orig.csv"


counter = 1
for index, issue in needed_issues.iterrows():
    url = CONFIG.DOMAIN_URL + issue['url']
    try:
        issue_name, articlesWithAuthor = getArticles(url)
        print(f"Processing : {counter}/{len(needed_issues)} \t {issue_name} ...")

        full_vol, vol, no, month, year = parseIssue(issue_name)

        tempArticles = []
        for article in articlesWithAuthor:
            # The first link inside the contributor element's parent carries
            # both the article title and its URL.
            article_link = article.parent.find('a')
            article_title = article_link.text.strip()
            article_url = CONFIG.DOMAIN_URL + article_link['href']
            article_authors = parseAuthor( article.text.strip() )

            tempArticles.append([
                article_url,
                article_title,
                full_vol, vol, no, month, year,
                ''   # abstract column, left empty here
            ] +
            article_authors)
            print("Article Name  :  " + article_title)

        ## save
        # Write the articles BEFORE flagging the issue as scraped: if the run
        # dies between the two writes the issue is simply scraped again,
        # instead of being skipped with its articles missing.
        pd.DataFrame(tempArticles).to_csv(all_articles_file_name,
                                          mode='a',
                                          header=False,
                                          index=False)

        all_issues_df.loc[index, 'scraped'] = 1
        all_issues_df.to_csv(all_journal_file_name,
                             header=True,
                             index=False)

        print("="*30)
        # Take a long extra pause after every 50th issue.
        if counter % 50 == 0 :
            ImNotRobot(120)
        counter += 1
        ImNotRobot()

    except Exception as e:
        # Stop at the first failure; progress is already saved issue by issue,
        # so a re-run resumes here. Re-raise so the traceback stays visible
        # instead of exiting with a success status.
        print(e)
        raise

Unnamed: 0,name,url,scraped
309,"No. 3/4 Jul. - Oct., 1967",/stable/10.2307/i332542,0
310,"No. 2 Apr., 1967",/stable/10.2307/i332630,0
311,"No. 1 Jan., 1967",/stable/10.2307/i332629,0
312,"No. 4 Oct., 1966",/stable/10.2307/i332656,0
313,"No. 3 Jul., 1966",/stable/10.2307/i332655,0
...,...,...,...
373,"No. 1 Jan., 1951",/stable/10.2307/i332597,0
374,"No. 4 Oct., 1950",/stable/10.2307/i332596,0
375,"No. 3 Jul., 1950",/stable/10.2307/i332530,0
376,"No. 2 Apr., 1950",/stable/10.2307/i332592,0


Processing : 1/69 	 Vol. 35, No. 3/4, Jul. - Oct., 1967 ...
Article Name  :  An Experimental Study of Cooperative Duopoly
Article Name  :  A Model of the United Kingdom's Monetary Sector
Article Name  :  Estimation of Returns to Scale and the Elasticity of Substitution
Article Name  :  Utility Maximization and the Demand for New Zealand Meats
Article Name  :  Error Produced by Linearization in Mathematical Programming
Article Name  :  A Test for the Product Life Cycle
Article Name  :  Additive von Neumann-Morgenstern Utility Functions
Article Name  :  Elimination of Management Bias from Production Functions Fitted to Cross-Section Data: A Model and an Application to African Agriculture
Article Name  :  A Comparative Study of Alternative Estimators in a Distributed Lag Model
Article Name  :  A Continuous Leontief Production Model with Quadratic Objective Function
Article Name  :  A Note on Orderings
Article Name  :  Reply to "A Note on Orderings"
Article Name  :  Computation of Expansio

Processing : 5/69 	 Vol. 34, No. 3, Jul., 1966 ...
Article Name  :  Maximization by Quadratic Hill-Climbing
Article Name  :  Factor Analysis and Regression
Article Name  :  Simplification of Economic Models
Article Name  :  Pooling Cross Section and Time Series Data in the Estimation of a Dynamic Model: The Demand for Natural Gas
Article Name  :  A General Theory of Rational Behavior in Game Situations
Article Name  :  Revealed Preference Theory
Article Name  :  Testing Single-Equation Least Squares Regression Models for Autocorrelated Disturbances
Article Name  :  A Complete System of Consumer Demand Equations for the Australian Economy Fitted by a Model of Additive Preferences
Article Name  :  Stock Market Prices and Volumes of Sales
Article Name  :  Factor Market Distortions and the Shape of the Transformation Curve
Article Name  :  Household Production and Consumer Demand Functions
Article Name  :  Formal Structure of Majority Decision
Article Name  :  Comment on "The Distributed L

Processing : 9/69 	 Vol. 33, No. 3, Jul., 1965 ...
Article Name  :  A Survey of the Theory of International Trade: Part 1, The Classical Theory
Article Name  :  Existence and Uniqueness of Equilibrium Points for Concave N-Person Games
Article Name  :  International Comparisons of Prices and Quantities Consumed
Article Name  :  Optimal Programme of Capital Accumulation in a Multi-Sector Economy
Article Name  :  Further Analysis of the Short-Run Consumption Function with Emphasis on the Role of Liquid Assets
Article Name  :  Schooling and the Farm Problem
Article Name  :  Weak Gross Substitutability and the Existence of Competitive Equilibrium
Article Name  :  Measuring the Role of Price in International Trade: Some Further Tests
Article Name  :  A Tchebychev Inequality for the Convergence of a Generalized Classical Linear Estimator, Sample Size Being Fixed
Article Name  :  Consistency of Fisher's Tests
Article Name  :  The Relationship Between Transitive Preference and the Structure of 

Processing : 13/69 	 Vol. 32, No. 3, Jul., 1964 ...
Article Name  :  Spectral Analysis of Seasonal Adjustment Procedures
Article Name  :  Technology and Scale in Electricity Generation
Article Name  :  On the Economic Welfare Function
Article Name  :  A Stock-Adjustment Investment Model
Article Name  :  Optimal Savings in a Two-Sector Model of Growth
Article Name  :  Regional Programming of Efficient Agricultural Production Patterns
Article Name  :  A Note on Separability in Demand Analysis
Article Name  :  Multiple (s, S) Policies
Article Name  :  A Zeuthen-Hicks Theory of Bargaining
Article Name  :  Dynamic Utility
Article Name  :  Comments on "A Case Study in Prediction"
Article Name  :  A Case Study in Prediction: A Reply
Article Name  :  A Note on Optimum Savings
Article Name  :  Readings In Mathematical Programming
     by S. Vajda
Article Name  :  The Standard of Living: Some Problems of Analysis and of International Comparison
     by M. Mód, L. Drechsler, Zs. Ferge, L. Lengyel

Processing : 16/69 	 Vol. 31, No. 3, Jul., 1963 ...
Article Name  :  An Exploratory Quarterly Econometric Model of Effective Demand in the Postwar U. S. Economy
Article Name  :  Utility, Liquidity, and Debt Management
Article Name  :  A Monte Carlo Study of Alternative Estimates of the Cobb-Douglas Production Function
Article Name  :  A Monte Carlo Study of Alternative Estimates of the Cobb-Douglas Production Function: Reply
Article Name  :  A Monte Carlo Study of Alternative Estimates of the Cobb-Douglas Production Function: A Rejoinder
Article Name  :  Some Observations on the Index Number Problem
Article Name  :  Sales Forecasts and the Inventory Cycle
Article Name  :  Bayesian Statisticians and Remote Clients
Article Name  :  The Causal Interpretation of Non-Triangular Systems of Economic Relations
Article Name  :  The Causal Interpretability of Structural Parameters: A Reply
Article Name  :  On the Causal Interpretation of Non-Triangular Systems of Economic Relations: A Rejoinder


Processing : 20/69 	 Vol. 30, No. 2, Apr., 1962 ...
Article Name  :  United States Imports, 1947-1958
Article Name  :  Investment, Innovation, and Growth
Article Name  :  A Game Theory Model for Agricultural Crop Selection
Article Name  :  Buffer Stocks, Sales Expectations, and Stability: A Multi-Sector Analysis of the Inventory Cycle
Article Name  :  On Devising Unbiased Estimators for the Parameters of the Cobb-Douglas Production Function
Article Name  :  Preference Scales and Expenditure Systems
Article Name  :  Partial Trace Correlations
Article Name  :  A Note on the Evaluation of the Marginal Efficiency of Capital
Article Name  :  Note on Program Uncertainty in the Dynamic Programming Problem
Article Name  :  Note on the Computation of Full-Information Maximum-Likelihood Estimates of Coefficients of a Simultaneous System
Article Name  :  An Alternate Proof and Extension of Solow's Theorem on Nonnegative Square Matrices
Article Name  :  The Efficiency of the Coal Industry. An Appl

Processing : 24/69 	 Vol. 29, No. 2, Apr., 1961 ...
Article Name  :  Aggregation of Variables in Dynamic Systems
Article Name  :  On the Cost of Approximate Specification in Simultaneous Equation Estimation
Article Name  :  The Graduation of Income Distributions
Article Name  :  Behavioristic Foundations of Utility
Article Name  :  Aggregation in Leontief Matrices and the Labour Theory of Value
Article Name  :  A Note on "Aggregation in Leontief Matrices and the Labour Theory of Value"
Article Name  :  Nonlinear Programming by the Simplex Method
Article Name  :  A Note on the Residual Variance Estimation in Simultaneous Equations
Article Name  :  A Note on the General Possibility Theorem of the Social Welfare Function
Article Name  :  On the Existence of General Equilibrium: Some Corrections
Article Name  :  Letter to the Editor
Article Name  :  Ekonomicheski raschet nailuchshego ispol' zovania resursov (Economic Calculation of the Best Utilization of Resources)
     by L. V. Kantorovi

Processing : 28/69 	 Vol. 28, No. 2, Apr., 1960 ...
Article Name  :  Ragnar Frisch and the Founding of the Econometric Society
Article Name  :  The Work of Ragnar Frisch, Econometrician
Article Name  :  The Foundations of Utility
Article Name  :  Mathematical Proofs of the Breakdown of Capitalism
Article Name  :  Additive Preferences
Article Name  :  Rules of Thumb for the Expansion of Industries in a Process of Economic Growth
Article Name  :  Some Theoretical Issues in the Measurement of Capacity
Article Name  :  Stationary Ordinal Utility and Impatience
Article Name  :  The Output-Investment Ratio and Input-Output Analysis
Article Name  :  A Method of Fractile Graphical Analysis
Article Name  :  Economic Expansion and the Interest Rate in Generalized von Neumann Models
Article Name  :  A Short Note on the Transmission of Shocks in Simultaneous Models
Article Name  :  An Extension of the Lechatelier Principle
Article Name  :  Hans von Mangoldt on Price Theory: A Contribution to the H

Processing : 32/69 	 Vol. 27, No. 2, Apr., 1959 ...
Article Name  :  Substitution versus Fixed Production Coefficients in the Theory of Economic Growth: A Synthesis
Article Name  :  A Complete Scheme for Computing All Direct and Cross Demand Elasticities in a Model with Many Sectors
Article Name  :  The Validity of Cross-Sectionally Estimated Behavior Equations in Time Series Applications
Article Name  :  Programmes d'Expansion et Taux d'Interet
Article Name  :  A Model of Seasonal Inventories
Article Name  :  Simultaneous Equations and Canonical Correlation Theory
Article Name  :  The Aggregation Problem in Input-Output Analysis
Article Name  :  A Statistical Model of Friction in Economics
Article Name  :  Theoretical Welfare Economics
     by J. de V. Graaff
Article Name  :  Das Rechnungswesen im Dienste der Leitung
     by Hendrik Virkkunen
Article Name  :  Wahrscheinlichkeitstheorie
     by Hans Richter
Article Name  :  International Bibliography of Economics
Article Name  :  Théor

Processing : 37/69 	 Vol. 26, No. 1, Jan., 1958 ...
Article Name  :  Utilities, Attitudes, Choices: A Review Note
Article Name  :  Estimation of Relationships for Limited Dependent Variables
Article Name  :  A Sector Model--The Poultry Industry of the U.S.A
Article Name  :  A Linear Programming Model of the U. S. Petroleum Refining Industry
Article Name  :  The Impact of Changes in the Terms of Trade on Western Europe's Balance of Payments
Article Name  :  A Monte Carlo Study of Estimates of Simultaneous Linear Structural Equations
Article Name  :  Decision and Team Problems in Airline Reservations
Article Name  :  Sur une correlation possible entre production, importations et emploi dans les pays industriels
Article Name  :  Threshold in Choice and the Theory of Demand
Article Name  :  Gross Substitutes and the Dynamic Stability of General Equilibrium
Article Name  :  Letters to the Editor
Article Name  :  The Structural Interdependences of the Economy: Proceedings of an International

Processing : 42/69 	 Vol. 24, No. 4, Oct., 1956 ...
Article Name  :  Resource Allocation for Economic Development
Article Name  :  A Fundamental Theorem for the Aggregation Problem of Input-Output Analysis
Article Name  :  The Application of Linear Programming to Competitive Bond Bidding
Article Name  :  Complementarity and Long-Range Projections
Article Name  :  An Eclectic Approach to the Pure Theory of Consumer Behavior
Article Name  :  The Theory of Capital and Its Time Measures
Article Name  :  The Theory of Capital and Its Time Measures: A Note on Mr. Blyth's Article
Article Name  :  On Hatanaka's Note on Consolidation
Article Name  :  On the Stability of Certain Economic Systems
Article Name  :  The Alphabet of Economic Science
     by Philip H. Wicksteed
Article Name  :  Elementi di politica economica razionale
     by Eraldo Fossati
Article Name  :  Business Concentration and Price Policy
Article Name  :  Einführung in die Betriebswirtschaftslehre
     by Martin Lohmann
Articl

Processing : 47/69 	 Vol. 23, No. 3, Jul., 1955 ...
Article Name  :  Equality of Factor Prices in World Trade
Article Name  :  An Econometric Study of Supply and Demand for New Zealand's Exports
Article Name  :  Income Distributions: A New Model
Article Name  :  Optimal Solution of a Dynamic Leontief Model with Substitution
Article Name  :  Sur l'importance en économétrie de la distinction entre les probabilités rationnelles et irrationnelles
Article Name  :  A Model for Optimizing Production by Reference to Cost Surrogates
Article Name  :  A History of Economic Analysis
     by J. A. Schumpeter
Article Name  :  Sampling Techniques
     by William G. Cochran
Article Name  :  Readings in the Philosophy of Science
     by Herbert Feigl, May Brodbeck
Article Name  :  World Population and Production
     by W. S. Woytinsky, E. S. Woytinsky
Article Name  :  Wirschaftswissenschaft von heute. Ein Ueberblick über moderne ökonomische Forschungen
     by Wilhelm Weber
Article Name  :  A Survey o

Processing : 52/69 	 Vol. 22, No. 2, Apr., 1954 ...
Article Name  :  Report of the Evaluative Committee for Econometrica
Article Name  :  On Equilibrium in Graham's Model of World Trade and Other Competitive Systems
Article Name  :  Causality and Econometrics
Article Name  :  Standard Errors of Forecast of a Complete Econometric Model
Article Name  :  A Model for Programming and Sensitivity Analysis in an Integrated Oil Company
Article Name  :  An Example of Autocorrelated Disturbances in Linear Regression
Article Name  :  Autoregression in the United States Economy, 1870-1929
Article Name  :  An Inventory Problem
Article Name  :  An Introduction to Linear Programming
     by A. Charnes, W. W. Cooper, A. Henderson
Article Name  :  L'efficacité sociale du système économique
     by Roger Dehem
Article Name  :  Utility and All That
     by D. H. Robertson
Article Name  :  Soviet National Income and Product in 1937
     by Abram Bergson
Article Name  :  The Theory of Inventory Management


Processing : 58/69 	 Vol. 20, No. 4, Oct., 1952 ...
Article Name  :  A Survey of the Theory of Rationing
Article Name  :  On a Quantitative Method in Production Planning and Scheduling
Article Name  :  The Graduation of Income Distributions
Article Name  :  A Dynamic Model: I. Principles of Model Structure
Article Name  :  A Continuous Model of Transportation
Article Name  :  Ordinal Preferences or Cardinal Utility?
Article Name  :  The Strong Independence Assumption--Gasoline Blends and Probability Mixtures
Article Name  :  Probability, Utility, and the Independence Axiom
Article Name  :  Note on von Neumann-Morgenstern's Strong Independence Axiom
Article Name  :  A Set of Independent Necessary and Sufficient Conditions for Simple Majority Decision
Article Name  :  Comments on Solow's "Structure of Linear Models"
Article Name  :  A Note on Pierre Gorra's Contribution on Index Numbers
Article Name  :  Logical Foundations of Probability
     by Rudolph Carnap
Article Name  :  Economie e

Processing : 64/69 	 Vol. 19, No. 2, Apr., 1951 ...
Article Name  :  Some Personal Reminiscences on a Great Man
Article Name  :  The Rate of Interest
Article Name  :  Le "Revenu Distribuable" et les Pertes Economiques
Article Name  :  The Invalidity of Classical Monetary Theory
Article Name  :  Inconsistency and Indeterminacy in Classical Economics
Article Name  :  The Application of Pareto's Law of Income to Japanese Data
Article Name  :  Two Consequences of the Transposition Theorem on Linear Inequalities
Article Name  :  A Note on Motzkin's Transposition Theorem
Article Name  :  An Interesting General Form for a Production Function
sleeping 7.115564125543813 seconds
Processing : 65/69 	 Vol. 19, No. 1, Jan., 1951 ...
Article Name  :  The Nonlinear Accelerator and the Persistence of Business Cycles
Article Name  :  Consumer Substitutions between Butter and Margarine
Article Name  :  Equilibrium among Spatially Separated Markets: Solution by Electric Analogue
Article Name  :  Report o

## Filter Unwanted Articles

Some articles containing words like 
* Report on
* Report of
* annual report
don't have abstracts, and I exclude them

Also, there are 'Book Reviews', for which the author string contains 'Review by:'; these are excluded as well.

In [14]:
# The articles file has no header row, so columns are numbered 0..N.
arts = pd.read_csv(all_articles_file_name, header = None)
print(len(arts))
exclude_words = ['Report of','Report on', 'Annual Reports','Criticism Invited']
# Column 1 is the article title and column 8 the full author string.
# na=False keeps the mask strictly boolean when a title or author is missing;
# otherwise NaN entries break `~dont_need` and boolean indexing.
dont_need = (arts[1].str.contains('|'.join(exclude_words),regex=True, case=False, na=False) |
            arts[8].str.contains('Review by:', regex=False, na=False))
print("Articles to exclude : "+ str(dont_need.sum()))

# Independent copy, so renaming its columns later never touches `arts`.
articles_remain = arts[~dont_need].copy()

## save those don't need into another file
arts[dont_need].to_csv('ECA_skipped.csv',index = False, header = False)

5713
Articles to exclude : 1110


We add the column name and create a new file with it

In [19]:
total_column_number = len(articles_remain.columns)
print(total_column_number)

# Every column after colNames and 'Full Author' is one per-author slot.
# Deriving the count from the lists keeps it right if colNames changes.
fixed_columns = colNames + ['Full Author']
col_name_with_author = fixed_columns + [
    f"Author {i+1}" for i in range(total_column_number - len(fixed_columns))
]

39


In [None]:
# articles_remain.columns = col_name_with_author
# articles_remain['scraped'] = [0]*len(articles_remain)
# articles_remain.to_csv(f'{CONFIG.JOURNAL_NAME}.csv', index = False)

In [20]:
# Label the columns of the filtered table, then write it with a header row.
articles_remain.columns = col_name_with_author
author_fixed_file_name = f'{CONFIG.JOURNAL_NAME}_author_fixed.csv'
articles_remain.to_csv(author_fixed_file_name, index = False)