# Get the Data to be Regression-ready

There are three parts in this notebook.

* Configuration
* Create Gender Information
* Get the data to be ready for regression

In [1]:
import pandas as pd
import numpy as np
import json
import re
import itertools

In [2]:
# Load the combined article table and keep only the rows that have an abstract.
with_abs = pd.read_csv('all_combined.csv').dropna(subset=['Abstract'])

In [3]:
# Display the loaded articles (only rows with a non-null abstract remain).
with_abs

Unnamed: 0,Journal,Source URL,Title,Long Volume,Issue Vol.,Issue No.,Month,Year,Abstract,Full Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,Author 7,Author 8
0,AER,https://www.jstor.org/stable/116860,The Value of Weather Information Services for ...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The U.S. government established a national wea...,Erik D. Craft,Erik D. Craft,,,,,,,
1,AER,https://www.jstor.org/stable/116861,The Rise and Fall of Bank Control in the Unite...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,This article studies how equity ownership and ...,Miguel Cantillo Simon,Miguel Cantillo Simon,,,,,,,
2,AER,https://www.jstor.org/stable/116862,Winners and Losers in Russia's Economic Transi...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The transition to a market economy has produce...,Elizabeth Brainerd,Elizabeth Brainerd,,,,,,,
3,AER,https://www.jstor.org/stable/116863,Unemployment and the Social Safety Net during ...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,We investigate the remarkably short unemployme...,"John C. Ham, Jan Svejnar and Katherine Terrell",John C. Ham,Jan Svejnar,Katherine Terrell,,,,,
4,AER,https://www.jstor.org/stable/116864,Federalism and the Soft Budget Constraint,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The government's incentives to bail out ineffi...,Yingyi Qian and Gérard Roland,Yingyi Qian,Gérard Roland,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22301,RES,https://doi.org/10.1093/restud/rdaa033,Development Projects and Economic Networks: Le...,,Volume 88,Issue 3,May,2021.0,This article investigates the effects of devel...,,Simon Heß,Dany Jaimovich,Matthias Schündeln,,,,,
22302,RES,https://doi.org/10.1093/restud/rdaa054,Haste or Waste? Peer Pressure and Productivity...,,Volume 88,Issue 3,May,2021.0,Motivated by wide cross-sectional variations i...,,David Silver,,,,,,,
22303,RES,https://doi.org/10.1093/restud/rdaa045,Monetary Policy for a Bubbly World,,Volume 88,Issue 3,May,2021.0,What is the role of monetary policy in a bubbl...,,Vladimir Asriyan,Luca Fornaro,Alberto Martin,Jaume Ventura,,,,
22304,RES,https://doi.org/10.1093/restud/rdaa052,"Stability, Strategy-Proofness, and Cumulative ...",,Volume 88,Issue 3,May,2021.0,We characterize when a stable and strategy-pro...,,John William Hatfield,Scott Duke Kominers,Alexander Westkamp,,,,,


In [4]:
# Read the author table stored in the sibling Authors directory
# (presumably name/gender records, judging by the file name — TODO confirm).
# NOTE(review): `authors` is not referenced again in the visible cells — confirm this load is still needed.
authors = pd.read_csv('../Authors/author_sex.csv')



In [5]:
# Restrict the sample to articles published up to 2015, excluding the RES journal.
published_by_2015 = with_abs['Year'] <= 2015
not_res_journal = with_abs['Journal'] != 'RES'
with_abs_2015 = with_abs[published_by_2015 & not_res_journal]

In [7]:
# Collect every distinct author name from the first seven author columns.
# NOTE(review): "Author 8" is not scanned here although the frame has that column
# and mutate_author() checks it — confirm that this is intended.
author_cols = [f"Author {i+1}" for i in range(7)]
author_values = with_abs_2015.loc[:, author_cols].values.flatten()

# Drop missing entries explicitly. Slicing off the first element of a set only
# removes NaN if NaN happens to be iterated first, which set ordering does not
# guarantee; a real author could be dropped and NaN left in, breaking sorted().
all_authors = sorted(
    {name for name in author_values if pd.notna(name) and name != ''}
)
all_authors  ## all authors in the list that need to match the gender

['A. A. Ryvkin',
 'A. A. Walters',
 'A. Abigail Payne',
 'A. Anastasopoulos',
 'A. Andrew John',
 'A. Araujo',
 'A. B. Atkinson',
 'A. B. Z. Salem',
 'A. Belloni',
 'A. Ben-Israel',
 'A. Ben-Tal',
 'A. C. Harvey',
 'A. Charnes',
 'A. Colin Cameron',
 'A. D. Woodland',
 'A. F. Shorrocks',
 'A. G. Doig',
 'A. H. Land',
 'A. Holly',
 'A. Joshua Strickland',
 'A. K. Skiba',
 'A. Khan',
 'A. Kirman',
 'A. L. Nagar',
 'A. Lans Bovenberg',
 'A. M. Buoncristiani',
 'A. M. Kshirsagar',
 'A. M. Robert Taylor',
 'A. Mas-Colell',
 'A. McLennan',
 'A. Michael Spence',
 'A. Mitchell Polinsky',
 'A. Monfort',
 'A. Mushfiq Mobarak',
 'A. Nataf',
 'A. P. Barten',
 'A. Pakes',
 'A. R. Bergstrom',
 'A. R. Ferguson',
 'A. R. Pagan',
 'A. Robert Nobay',
 'A. Rodney Dobell',
 'A. Ronald Gallant',
 'A. S. Deaton',
 'A. S. Goldberger',
 'A. S. Rao',
 'A. Simonovits',
 'A. T. Denzau',
 'A. Thomas King',
 'A. Trognon',
 'A. Zabalza',
 'A. Zellner',
 'A. de Palma',
 'A. ten Kate',
 'Aanund Hylland',
 'Aaron Roth

## Create Gender Information

This section tries to match each author appearing in the combined dataset to his/her gender, according to Hangel's gender list.

I constructed a transformation rule that handles issues such as abbreviated names and omitted middle names, so that as many authors as possible can be matched to a gender.

In [10]:
# Load the reference name -> gender mapping.
# JSON text is UTF-8 by specification and author names may contain non-ASCII
# characters, so decode explicitly instead of relying on the platform default.
with open('sex_json.json', 'r', encoding='utf-8') as j:
    erin_author_sex = json.load(j)
    

In [11]:
# abbreviations
# Expand every reference name into the alternative spellings that may appear in
# the article data, so abbreviated names can still be matched to a gender:
#   * each given/middle name either spelled out or reduced to its initial
#   * for three-part names, the form with the middle name dropped
new_sex = dict()
exact_sex = dict()
for name, sex in erin_author_sex.items():
    # Split on single spaces but keep a trailing "Jr." attached to the word
    # before it. The dot is escaped so only a literal "Jr." is protected (an
    # unescaped "." would also glue surnames that merely start with "Jr").
    # Empty sections caused by repeated spaces are dropped so that taking the
    # first character below cannot raise IndexError.
    name_sections = [s for s in re.split(r' (?!Jr\.)', name) if s]
    name_section_num = len(name_sections)

    # A single-word (or empty) name has no given name to abbreviate.
    if name_section_num < 2:
        continue

    last_name = name_sections[-1]
    other_than_lastname = name_sections[:-1]
    abbr_other = [f"{s[0]}." for s in other_than_lastname]

    # One (full, initial) pair per leading section; the product enumerates all
    # 2 ** (name_section_num - 1) spellings, the unabbreviated one included.
    for variant in itertools.product(*zip(other_than_lastname, abbr_other)):
        new_sex[' '.join(variant + (last_name,))] = sex

    if name_section_num == 3:
        no_middle_name = f"{name_sections[0]} {name_sections[2]}"
        new_sex[no_middle_name] = sex

    exact_sex[' '.join(name_sections)] = sex

# A name listed verbatim in the reference data must keep its own gender even if
# a variant derived from a different name collides with it.
new_sex.update(exact_sex)
    

In [12]:
# Look up every collected author in the expanded gender table;
# authors without an entry get None.
sexes = [new_sex.get(author_name) for author_name in all_authors]

In [15]:
# Pair each author name with its matched gender (None when unmatched).
name_gender_pairs = list(zip(all_authors, sexes))
sex_df = pd.DataFrame(name_gender_pairs, columns=['Name', 'Gender'])
sex_df

Unnamed: 0,Name,Gender
0,A. A. Ryvkin,1
1,A. A. Walters,1
2,A. Abigail Payne,0
3,A. Anastasopoulos,1
4,A. Andrew John,1
...,...,...
7207,Òscar Jordà,1
7208,Ö. Éltetö,
7209,Ľuboš Pástor,1
7210,Şevin Yeltekin,


In [207]:
# Persist the name/gender table. With pandas' default index=True the row index
# is written as the first, unnamed CSV column.
sex_df.to_csv('sex_noRes.csv')

In [278]:
# Save the expanded name -> gender lookup as JSON.
# NOTE(review): a later cell reads 'sex_json_full.json', not this '_temp' file —
# presumably the file is curated/renamed by hand in between; confirm.
with open("sex_json_full_temp.json", 'w') as out_file:
    json.dump(new_sex, out_file)

## Get the data to be ready for regression

Assume that the data are now complete: the required articles have been filtered and a gender has been assigned to every author.

Rerun the configuration section to load the data.

In [208]:
# Display the filtered article sample (Year <= 2015, RES excluded).
with_abs_2015  

Unnamed: 0,Journal,Source URL,Title,Long Volume,Issue Vol.,Issue No.,Month,Year,Abstract,Full Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,Author 7,Author 8
0,AER,https://www.jstor.org/stable/116860,The Value of Weather Information Services for ...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The U.S. government established a national wea...,Erik D. Craft,Erik D. Craft,,,,,,,
1,AER,https://www.jstor.org/stable/116861,The Rise and Fall of Bank Control in the Unite...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,This article studies how equity ownership and ...,Miguel Cantillo Simon,Miguel Cantillo Simon,,,,,,,
2,AER,https://www.jstor.org/stable/116862,Winners and Losers in Russia's Economic Transi...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The transition to a market economy has produce...,Elizabeth Brainerd,Elizabeth Brainerd,,,,,,,
3,AER,https://www.jstor.org/stable/116863,Unemployment and the Social Safety Net during ...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,We investigate the remarkably short unemployme...,"John C. Ham, Jan Svejnar and Katherine Terrell",John C. Ham,Jan Svejnar,Katherine Terrell,,,,,
4,AER,https://www.jstor.org/stable/116864,Federalism and the Soft Budget Constraint,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The government's incentives to bail out ineffi...,Yingyi Qian and Gérard Roland,Yingyi Qian,Gérard Roland,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20203,QJE,https://doi.org/10.1093/qje/qjv025,Human Capital and Industrialization: Evidence ...,,Volume 130,Issue 4,November,2015.0,While human capital is a strong predictor of e...,,Mara P. Squicciarini,Nico Voigtländer,,,,,,
20204,QJE,https://doi.org/10.1093/qje/qjv030,Radio and the Rise of The Nazis in Prewar Germany,,Volume 130,Issue 4,November,2015.0,How do the media affect public support for dem...,,Maja Adena,Ruben Enikolopov,Maria Petrova,Veronica Santarosa,Ekaterina Zhuravskaya,,,
20205,QJE,https://doi.org/10.1093/qje/qjv023,The Unfavorable Economics of Measuring the Ret...,,Volume 130,Issue 4,November,2015.0,Twenty-five large field experiments with major...,,Randall A. Lewis,Justin M. Rao,,,,,,
20206,QJE,https://doi.org/10.1093/qje/qjv026,Why you Can’t Find a Taxi in the Rain and Othe...,,Volume 130,Issue 4,November,2015.0,I replicate and extend the seminal work of Cam...,,Henry S. Farber,,,,,,,


In [209]:
from textatistic import Textatistic, punct_clean

In [279]:
# Load the final author name -> gender lookup used by mutate_author().
# Decode explicitly as UTF-8 (the JSON standard encoding) instead of relying on
# the platform default, since author names may contain non-ASCII characters.
with open('sex_json_full.json', 'r', encoding='utf-8') as j:
    author_sex_dict = json.load(j)

In [308]:
def mutate_author(row, sex_lookup=None):
    """Attach per-author gender columns and a mean gender score to one article row.

    Parameters
    ----------
    row : pandas.Series
        One article row with the columns "Author 1" ... "Author 8"; an empty
        string marks an unused author slot.
    sex_lookup : dict, optional
        Mapping of author name to gender code. Defaults to the notebook-level
        ``author_sex_dict``.

    Returns
    -------
    pandas.Series
        The same row, with "Sex <i>" set for every author found and
        "Sex Score" set to the mean of their gender codes (NaN when the row
        has no authors). If an author is missing from the lookup,
        "Author Error" is set to 1 and the row is returned immediately,
        without a "Sex Score".
    """
    if sex_lookup is None:
        sex_lookup = author_sex_dict

    sex_rate = []
    for i in range(8):
        author = row[f"Author {i+1}"]
        if author == '':
            continue

        if author not in sex_lookup:
            row['Author Error'] = 1
            return row

        sex = sex_lookup[author]
        # Column names must match the "Sex <i>" / "Sex Score" names that the
        # export column list selects; "Gender <i>" would make it raise KeyError.
        row[f"Sex {i+1}"] = sex
        sex_rate.append(int(sex))

    # np.mean([]) emits "Mean of empty slice" RuntimeWarnings; produce the same
    # NaN result for author-less rows without the warning.
    row['Sex Score'] = np.mean(sex_rate) if sex_rate else np.nan
    return row
        

        

In [292]:
## "with_abs_2015" stands for the final combined result; it is incomplete for now.
## Once the final list of articles has been constructed, switch to that frame.

# Turn every missing value into an empty string so that unused author slots
# compare equal to '' in mutate_author().
with_abs_2015 = with_abs_2015.replace(to_replace=np.nan, value='')
with_abs_2015

Unnamed: 0,Journal,Source URL,Title,Long Volume,Issue Vol.,Issue No.,Month,Year,Abstract,Full Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,Author 7,Author 8
0,AER,https://www.jstor.org/stable/116860,The Value of Weather Information Services for ...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The U.S. government established a national wea...,Erik D. Craft,Erik D. Craft,,,,,,,
1,AER,https://www.jstor.org/stable/116861,The Rise and Fall of Bank Control in the Unite...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,This article studies how equity ownership and ...,Miguel Cantillo Simon,Miguel Cantillo Simon,,,,,,,
2,AER,https://www.jstor.org/stable/116862,Winners and Losers in Russia's Economic Transi...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The transition to a market economy has produce...,Elizabeth Brainerd,Elizabeth Brainerd,,,,,,,
3,AER,https://www.jstor.org/stable/116863,Unemployment and the Social Safety Net during ...,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,We investigate the remarkably short unemployme...,"John C. Ham, Jan Svejnar and Katherine Terrell",John C. Ham,Jan Svejnar,Katherine Terrell,,,,,
4,AER,https://www.jstor.org/stable/116864,Federalism and the Soft Budget Constraint,"Vol. 88, No. 5, Dec., 1998",Vol. 88,No. 5,Dec.,1998.0,The government's incentives to bail out ineffi...,Yingyi Qian and Gérard Roland,Yingyi Qian,Gérard Roland,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20203,QJE,https://doi.org/10.1093/qje/qjv025,Human Capital and Industrialization: Evidence ...,,Volume 130,Issue 4,November,2015.0,While human capital is a strong predictor of e...,,Mara P. Squicciarini,Nico Voigtländer,,,,,,
20204,QJE,https://doi.org/10.1093/qje/qjv030,Radio and the Rise of The Nazis in Prewar Germany,,Volume 130,Issue 4,November,2015.0,How do the media affect public support for dem...,,Maja Adena,Ruben Enikolopov,Maria Petrova,Veronica Santarosa,Ekaterina Zhuravskaya,,,
20205,QJE,https://doi.org/10.1093/qje/qjv023,The Unfavorable Economics of Measuring the Ret...,,Volume 130,Issue 4,November,2015.0,Twenty-five large field experiments with major...,,Randall A. Lewis,Justin M. Rao,,,,,,
20206,QJE,https://doi.org/10.1093/qje/qjv026,Why you Can’t Find a Taxi in the Rain and Othe...,,Volume 130,Issue 4,November,2015.0,I replicate and extend the seminal work of Cam...,,Henry S. Farber,,,,,,,


In [309]:
# Run mutate_author on every article row to add the gender columns.
with_sex = with_abs_2015.apply(mutate_author, axis="columns")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [330]:
# Inspect which columns exist after the row-wise gender assignment.
with_sex.columns

Index(['Abstract', 'Author 1', 'Author 2', 'Author 3', 'Author 4', 'Author 5',
       'Author 6', 'Author 7', 'Author 8', 'Author Error', 'Full Author',
       'Issue No.', 'Issue Vol.', 'Journal', 'Long Volume', 'Month', 'Sex 1',
       'Sex 2', 'Sex 3', 'Sex 4', 'Sex 5', 'Sex 6', 'Sex 7', 'Sex Score',
       'Source URL', 'Title', 'Year'],
      dtype='object')

In [315]:
def readability_columns(row):
    """Add the readability scores of the row's abstract as new columns.

    Parameters
    ----------
    row : pandas.Series
        One article row with at least the "Abstract" and "Title" entries.

    Returns
    -------
    pandas.Series
        The same row with "Dale-Chall", "Flesch Reading Ease",
        "Flesch-Kincaid", "Gunning Fog" and "SMOG" set from the Textatistic
        result. If scoring fails, the title is printed and
        "Readability Score Err" is set to 1 instead.
    """
    try:
        r = Textatistic(row['Abstract'])
        row['Dale-Chall'] = r.dalechall_score
        row['Flesch Reading Ease'] = r.flesch_score
        row['Flesch-Kincaid'] = r.fleschkincaid_score
        row['Gunning Fog'] = r.gunningfog_score
        row['SMOG'] = r.smog_score
    except Exception:
        # Catch Exception rather than everything: a bare except would also
        # swallow KeyboardInterrupt/SystemExit, so a long apply() could not
        # be interrupted.
        print(row['Title'])
        row['Readability Score Err'] = 1
    return row

In [316]:
# Score the abstract of every article; titles of rows that fail are printed.
with_read = with_sex.apply(readability_columns, axis="columns")

A Dual-Self Model of Impulse Control
Inherited Control and Firm Performance
Cognition and Behavior in Two-Person Guessing Games: An Experimental Study
The Speed of Learning in Noisy Games: Partial Reinforcement and the Sustainability of Cooperation
Individual Preferences, Monetary Gambles, and Stock Market Participation: A Case for Narrow Framing
Shocks and Government Beliefs: The Rise and Fall of American Inflation
Stock Prices, News, and Economic Fluctuations


In [None]:
# Columns of the exported regression table, in output order.
col_names = [
    "Journal",
    "Source URL",
    "Title",
    "Long Volume", "Issue Vol.", "Issue No.",
    "Month", "Year",
    "Abstract",
    "Full Author",
    *(f"Author {n}" for n in range(1, 8)),
    *(f"Sex {n}" for n in range(1, 8)),
    "Sex Score",
    "Dale-Chall", "Flesch Reading Ease", "Flesch-Kincaid", "Gunning Fog", "SMOG",
    "Readability Score Err",
]
# Keep only these columns, in this order (a missing column raises KeyError).
with_read = with_read.loc[:, col_names]

In [327]:
# Write the regression-ready table without the row index.
with_read.to_csv('reg_dat.csv', index=False)