# Analysing gender in ecology publications

### Loading the data

In [19]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import csv
from sklearn.metrics import mean_squared_error

%matplotlib inline

Let us load the data as a pandas dataframe.

In [20]:
data = pd.read_csv("new_journals_data.csv", sep = ",")
len(data)

2532

### Exploring the data
Now take a look at the dataset and its features:

In [21]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,TI,SO,LeadAuthorFirst,LeadAuthorLast,LeadInitOnly,first.gender.manual,first.gender,first.gender.manual.method,EditorFirst,EditorLast,editor.gender.manual,FirstInst2,Editor.Institution
0,1,"Ecology Letters, and Transparency and Openness...",ECOLOGY LETTERS,,daniels,no,,,,,,,,
1,2,The relative importance of trait vs. genetic d...,ECOLOGY,Jessica,hoareau,no,female,female,,Jonathan,Grabowski,male,University of California Davis,Northeastern University
2,3,Test of biotic and abiotic correlates of latit...,JOURNAL OF ECOLOGY,Luis,morard,no,male,male,,Jennifer,Lau,female,Universidad Autonoma de Yucatan,Michigan State University
3,4,Multi-trophic consequences of plant genetic va...,ECOLOGY,Luis,soul,no,male,male,,James,Cronin,male,Universidad Autonoma de Yucatan,Louisiana State University
4,5,Parameterisation and validation of a resource ...,ECOLOGY LETTERS,Tomoyuki,farrell,no,,,,Liza,Comita,female,,Yale University


In [22]:
data.dtypes

Unnamed: 0                     int64
TI                            object
SO                            object
LeadAuthorFirst               object
LeadAuthorLast                object
LeadInitOnly                  object
first.gender.manual           object
first.gender                  object
first.gender.manual.method    object
EditorFirst                   object
EditorLast                    object
editor.gender.manual          object
FirstInst2                    object
Editor.Institution            object
dtype: object

### Clear the data from fields we don't need
Delete all data about gender

In [23]:
data = data.drop(columns=["LeadInitOnly",
                             "first.gender.manual",
                             "first.gender",
                             "first.gender.manual.method",
                             "editor.gender.manual",
                            ])

### Fix columns naming
For the simplicity let's fix the naming to a more logical one

In [24]:
data.rename(columns={'Unnamed': 'index',
                     'TI':'title',
                     'SO':'type',
                     'LeadAuthorFirst':'author_first_name',
                     'LeadAuthorLast':'author_last_name',
                     'EditorFirst':'editor_first_name',
                     'EditorLast':'editor_last_name',
                     'FirstInst2':'author_institution',
                     'Editor.Institution':'editor_institution',
                    }, 
                 inplace=True)

In [25]:
data.dtypes

Unnamed: 0             int64
title                 object
type                  object
author_first_name     object
author_last_name      object
editor_first_name     object
editor_last_name      object
author_institution    object
editor_institution    object
dtype: object

Take only those records where author_first_name & editor_first_name are valid

In [26]:
len(data)

2532

In [27]:
data = data[data['author_first_name'].notnull()]
# data = data[not(len(data['author_first_name']) == 2 and data['author_first_name'][1] == ".")]
data = data[data['editor_first_name'].notnull()]
# data = data[len(data['editor_first_name']) > 2 and data['editor_first_name'][1] != "."]

In [28]:
len(data)

2425

### TODO: label women or men using genderize.io API and (cp + 4 / p + 2) normalization

In [29]:
hashmap = {}
rows_to_del = []

indx = 0

for index, row in data.iterrows():
    if len(row['author_first_name']) == 2 and row['author_first_name'][1] == ".":
        rows_to_del.append(indx)
    elif row['author_first_name'] in hashmap:
        hashmap[row['author_first_name']] += 1
    else:
        hashmap[row['author_first_name']] = 1
    indx += 1

for index, row in data.iterrows():
    if len(row['editor_first_name']) == 2 and row['editor_first_name'][1] == ".":
        rows_to_del.append(indx)
    elif row['editor_first_name'] in hashmap:
        hashmap[row['editor_first_name']] += 1
    else:
        hashmap[row['editor_first_name']] = 1
    indx += 1

for idx in rows_to_del: 
    data.drop(data.index[[idx]])
print(hashmap.keys())

dict_keys(['Jessica', 'Luis', 'Tomoyuki', 'Andre', 'Peter', 'Dean', 'Vanessa', 'Michelle', 'Anurag', 'Ken', 'Dirk', 'Masahiro', 'Christina', 'Marcelo', 'Susanne', 'Patrick', 'Diogo', 'Arnaud', 'Christoffer', 'Joerg', 'Helen', 'Adam', 'Hamid', 'Eric', 'Daniel', 'Maximilian', 'Jacob', 'Andrew', 'David', 'Maria', 'Florian', 'Karie', 'Diego', 'Isabel', 'Priyanga', 'Roberto', 'Tomotsune', 'Aitor', 'Francis', 'Sarah', 'Mette', 'Marti', 'Matthew', 'Thomas', 'Weston', 'Sonia', 'Dvir', 'Patricia', 'Yimen', 'Gerardo', 'Miguel', 'Alicia', 'Hector', 'Jonathan', 'Xavier', 'Jennifer', 'Paula', 'Samuel', 'Sonya', 'Marie', 'Gabriel', 'Willem', 'Amy', 'Colin', 'Tal', 'Jose', 'Ranen', 'Eliran', 'Sandro', 'Benura', 'Benedicte', 'Ruxandra', 'Guy', 'Sara', 'Lander', 'Robert', 'Wenming', 'Ida', 'Liam', 'Diane', 'Andy', 'EmilyClare', 'Timothy', 'Jesus', 'Aabir', 'Natalie', 'Gyoergy', 'Marion', 'Shannon', 'Nicholas', 'Albert', 'Allison', 'Lewis', 'Felipe', 'Luke', 'Ceres', 'Frederic', 'Louise', 'Andres', 'Mat

In [30]:
print("There are {} unique names".format(len(hashmap.keys())))

There are 1485 unique names


In [31]:
import json
import requests
# Example of API output
requests.get(url='https://api.genderize.io/?name=' + 'tanya').content

b'{"name":"tanya","gender":"female","probability":1,"count":960}'

In [32]:
# Commented sending requests to genderize API
#for name in hashmap.keys(): 
#    hashmap[name] = requests.get(url='https://api.genderize.io/?name=' + name.lower()).content

In [33]:
print(hashmap)

{'Jessica': 17, 'Luis': 5, 'Tomoyuki': 1, 'Andre': 3, 'Peter': 59, 'Dean': 1, 'Vanessa': 2, 'Michelle': 3, 'Anurag': 1, 'Ken': 10, 'Dirk': 4, 'Masahiro': 1, 'Christina': 3, 'Marcelo': 2, 'Susanne': 4, 'Patrick': 9, 'Diogo': 1, 'Arnaud': 1, 'Christoffer': 1, 'Joerg': 3, 'Helen': 6, 'Adam': 14, 'Hamid': 1, 'Eric': 18, 'Daniel': 55, 'Maximilian': 2, 'Jacob': 3, 'Andrew': 43, 'David': 115, 'Maria': 22, 'Florian': 4, 'Karie': 1, 'Diego': 5, 'Isabel': 3, 'Priyanga': 5, 'Roberto': 10, 'Tomotsune': 1, 'Aitor': 1, 'Francis': 4, 'Sarah': 20, 'Mette': 2, 'Marti': 2, 'Matthew': 42, 'Thomas': 56, 'Weston': 1, 'Sonia': 6, 'Dvir': 1, 'Patricia': 11, 'Yimen': 1, 'Gerardo': 1, 'Miguel': 9, 'Alicia': 3, 'Hector': 6, 'Jonathan': 48, 'Xavier': 4, 'Jennifer': 27, 'Paula': 6, 'Samuel': 6, 'Sonya': 1, 'Marie': 1, 'Gabriel': 4, 'Willem': 1, 'Amy': 31, 'Colin': 1, 'Tal': 6, 'Jose': 14, 'Ranen': 1, 'Eliran': 1, 'Sandro': 1, 'Benura': 1, 'Benedicte': 1, 'Ruxandra': 1, 'Guy': 1, 'Sara': 3, 'Lander': 1, 'Robert': 

In [34]:
# Store current results to the file
# import json
# with open('data.json', 'w') as outfile:
#     json.dump(hashmap, outfile)

In [36]:
# Get results from file. Fix some names were not loaded because of requests limit reached
with open('data.json') as f:
    hashmap = json.load(f)

In [50]:
io27 = 0
for key in hashmap.keys():
    if hashmap[key] == '{"error":"Request limit reached"}':
        hashmap[key] = requests.get(url='https://api.genderize.io/?name=' + key.lower()).content
        io27 += 1
        print(io27)
        print(key)

In [60]:
with open('data_final.json') as f:
    hashmap = json.load(f)

In [65]:
data['author_gender'] = pd.Series(np.random.randn(len(data)), index=data.index)
data['editor_gender'] = pd.Series(np.random.randn(len(data)), index=data.index)
data.head(5)

Unnamed: 0.1,Unnamed: 0,title,type,author_first_name,author_last_name,editor_first_name,editor_last_name,author_institution,editor_institution,author_gender,editor_gender
1,2,The relative importance of trait vs. genetic d...,ECOLOGY,Jessica,hoareau,Jonathan,Grabowski,University of California Davis,Northeastern University,-0.090737,-0.072297
2,3,Test of biotic and abiotic correlates of latit...,JOURNAL OF ECOLOGY,Luis,morard,Jennifer,Lau,Universidad Autonoma de Yucatan,Michigan State University,-1.294677,-1.585269
3,4,Multi-trophic consequences of plant genetic va...,ECOLOGY,Luis,soul,James,Cronin,Universidad Autonoma de Yucatan,Louisiana State University,-0.648089,0.220762
4,5,Parameterisation and validation of a resource ...,ECOLOGY LETTERS,Tomoyuki,farrell,Liza,Comita,,Yale University,0.425052,-0.728908
5,6,An Efficient Independence Sampler for Updating...,SYSTEMATIC BIOLOGY,Andre,grogan,Peter,Foster,Heidelberg University,University of Edinburgh,-1.851933,-0.065474


In [93]:
for index, row in data.iterrows():
    for col_name, col_gender in [['editor_first_name', 'editor_gender'], ['author_first_name', 'author_gender']]:
        if row[col_name] in hashmap:
            curr_item = json.loads(hashmap[row[col_name]])

            if curr_item["gender"] == None:
                data[col_gender][index] = -1
            else: 
                p = curr_item["probability"]
                c = curr_item["count"]
                new_prob = (p*c + 2) / (p + 4)
                if new_prob > 0.85:
                    data[col_gender][index] = curr_item["gender"]
                else: data[col_gender][index] = -1
        else:
            print('Error. No name in hashmap')
            print(row)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Error. No name in hashmap
Unnamed: 0                                                           32
title                 Social living mitigates the costs of a chronic...
type                                                    ECOLOGY LETTERS
author_first_name                                                    E.
author_last_name                                                  heard
editor_first_name                                                 Marco
editor_last_name                                         Festa-Bianchet
author_institution                                           Penn State
editor_institution                             University of Sherbrooke
author_gender                                                 -0.147273
editor_gender                                                      male
Name: 31, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Error. No name in hashmap
Unnamed: 0                                                           73
title                 Disentangling the biogeography of ship biofoul...
type                                    GLOBAL ECOLOGY AND BIOGEOGRAPHY
author_first_name                                                    G.
author_last_name                                                peoples
editor_first_name                                           Marie-Josee
editor_last_name                                                 Fortin
author_institution                                                  NaN
editor_institution                                University of Toronto
author_gender                                                -0.0477614
editor_gender                                                    female
Name: 72, dtype: object
Error. No name in hashmap
Unnamed: 0                                                          112
title                 fuzzySim: applying fuzzy logic to bina

Error. No name in hashmap
Unnamed: 0                                                          475
title                 Effects of ocean acidification on Posidonia oc...
type                                                 JOURNAL OF ECOLOGY
author_first_name                                                    T.
author_last_name                                        razafindratsima
editor_first_name                                                 Brian
editor_last_name                                               Silliman
author_institution                                  Universite Paris 06
editor_institution                                      Duke University
author_gender                                                 -0.615712
editor_gender                                                      male
Name: 474, dtype: object
Error. No name in hashmap
Unnamed: 0                                                          487
title                 Energetic costs of mange in wolves es

Error. No name in hashmap
Unnamed: 0                                                          638
title                 Eutrophication triggers contrasting multilevel...
type                                                            ECOLOGY
author_first_name                                                    W.
author_last_name                                                bennett
editor_first_name                                                Joseph
editor_last_name                                                 Yavitt
author_institution                                University of Antwerp
editor_institution                                   Cornell University
author_gender                                                   -1.0891
editor_gender                                                      male
Name: 637, dtype: object
Error. No name in hashmap
Unnamed: 0                                                          648
title                 Fire evolution in the radioactive for

Error. No name in hashmap
Unnamed: 0                                                          940
title                 Invaders do not require high resource levels t...
type                                                            ECOLOGY
author_first_name                                                    J.
author_last_name                                                 janzen
editor_first_name                                                Daniel
editor_last_name                                               Laughlin
author_institution                                  Syracuse University
editor_institution                                University of Waikato
author_gender                                                 -0.554377
editor_gender                                                      male
Name: 939, dtype: object
Error. No name in hashmap
Unnamed: 0                                                          942
title                 Tree of Life Reveals Clock-Like Speci

Error. No name in hashmap
Unnamed: 0                                                         1081
title                 Climate tolerances and trait choices shape con...
type                                    GLOBAL ECOLOGY AND BIOGEOGRAPHY
author_first_name                                                    G.
author_last_name                                zechmeister-boltenstern
editor_first_name                                                  Karl
editor_last_name                                                  Evans
author_institution                   University of California Riverside
editor_institution                              University of Sheffield
author_gender                                                  0.515589
editor_gender                                                      male
Name: 1080, dtype: object
Error. No name in hashmap
Unnamed: 0                                                         1088
title                 Relatively stable response of fruiti

Error. No name in hashmap
Unnamed: 0                                                         1491
title                 Exploring the spatially explicit predictions o...
type                                    GLOBAL ECOLOGY AND BIOGEOGRAPHY
author_first_name                                                    D.
author_last_name                                               hardesty
editor_first_name                                                  Jose
editor_last_name                                            Diniz-Filho
author_institution                                College of Charleston
editor_institution                        Universidade Federal de Goias
author_gender                                                   1.14915
editor_gender                                                      male
Name: 1490, dtype: object
Error. No name in hashmap
Unnamed: 0                                                         1528
title                 Avoiding the crowds: the evolution o

Error. No name in hashmap
Unnamed: 0                                                         1707
title                 Suppression of savanna ants alters invertebrat...
type                                                            ECOLOGY
author_first_name                                                    C.
author_last_name                                               cardillo
editor_first_name                                                Nathan
editor_last_name                                                Sanders
author_institution                              University of Liverpool
editor_institution                              University of Tennessee
author_gender                                                  0.292642
editor_gender                                                      male
Name: 1706, dtype: object
Error. No name in hashmap
Unnamed: 0                                                         1754
title                 Variation in recruitment and the est

Error. No name in hashmap
Unnamed: 0                                                         2106
title                 Analysis of stable states in global savannas: ...
type                                    GLOBAL ECOLOGY AND BIOGEOGRAPHY
author_first_name                                                    A.
author_last_name                                                 hansen
editor_first_name                                               Niklaus
editor_last_name                                             Zimmermann
author_institution                                      Yale University
editor_institution                     Swiss Federal Research Institute
author_gender                                                  0.324901
editor_gender                                                      male
Name: 2105, dtype: object
Error. No name in hashmap
Unnamed: 0                                                         2216
title                 Crop rotational diversity enhances b

### Data preprocessing

Now let's cleanup and set the probabilities + perform (cp + 4 / p + 2) normalization

In [94]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,type,author_first_name,author_last_name,editor_first_name,editor_last_name,author_institution,editor_institution,author_gender,editor_gender
1,2,The relative importance of trait vs. genetic d...,ECOLOGY,Jessica,hoareau,Jonathan,Grabowski,University of California Davis,Northeastern University,female,male
2,3,Test of biotic and abiotic correlates of latit...,JOURNAL OF ECOLOGY,Luis,morard,Jennifer,Lau,Universidad Autonoma de Yucatan,Michigan State University,male,female
3,4,Multi-trophic consequences of plant genetic va...,ECOLOGY,Luis,soul,James,Cronin,Universidad Autonoma de Yucatan,Louisiana State University,male,male
4,5,Parameterisation and validation of a resource ...,ECOLOGY LETTERS,Tomoyuki,farrell,Liza,Comita,,Yale University,-1,female
5,6,An Efficient Independence Sampler for Updating...,SYSTEMATIC BIOLOGY,Andre,grogan,Peter,Foster,Heidelberg University,University of Edinburgh,male,male


In [98]:
print("Ratio of female / male in authors", len(data[data['author_gender'] == "female"]) / len(data[data['author_gender'] == "male"]))

Ratio of female / male in authors 0.62004662004662


In [99]:
print("Ratio of female / male in editors", len(data[data['editor_gender'] == "female"]) / len(data[data['editor_gender'] == "male"]))

Ratio of female / male in editors 0.2918502202643172
