# Analysing gender in ecology publications

### Loading the data

In [110]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import csv
from sklearn.metrics import mean_squared_error

%matplotlib inline

Let us load the data as a pandas dataframe.

In [111]:
data = pd.read_csv("new_journals_data.csv", sep = ",")
len(data)

2532

### Exploring the data
Now take a look at the dataset and its features:

In [112]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,TI,SO,LeadAuthorFirst,LeadAuthorLast,LeadInitOnly,first.gender.manual,first.gender,first.gender.manual.method,EditorFirst,EditorLast,editor.gender.manual,FirstInst2,Editor.Institution
0,1,"Ecology Letters, and Transparency and Openness...",ECOLOGY LETTERS,,daniels,no,,,,,,,,
1,2,The relative importance of trait vs. genetic d...,ECOLOGY,Jessica,hoareau,no,female,female,,Jonathan,Grabowski,male,University of California Davis,Northeastern University
2,3,Test of biotic and abiotic correlates of latit...,JOURNAL OF ECOLOGY,Luis,morard,no,male,male,,Jennifer,Lau,female,Universidad Autonoma de Yucatan,Michigan State University
3,4,Multi-trophic consequences of plant genetic va...,ECOLOGY,Luis,soul,no,male,male,,James,Cronin,male,Universidad Autonoma de Yucatan,Louisiana State University
4,5,Parameterisation and validation of a resource ...,ECOLOGY LETTERS,Tomoyuki,farrell,no,,,,Liza,Comita,female,,Yale University


In [113]:
data.dtypes

Unnamed: 0                     int64
TI                            object
SO                            object
LeadAuthorFirst               object
LeadAuthorLast                object
LeadInitOnly                  object
first.gender.manual           object
first.gender                  object
first.gender.manual.method    object
EditorFirst                   object
EditorLast                    object
editor.gender.manual          object
FirstInst2                    object
Editor.Institution            object
dtype: object

### Clear the data from fields we don't need
Delete all data about gender

In [114]:
data = data.drop(columns=["LeadInitOnly",
                             "first.gender.manual",
                             "first.gender",
                             "first.gender.manual.method",
                             "editor.gender.manual",
                            ])

### Fix columns naming
For the simplicity let's fix the naming to a more logical one

In [115]:
data.rename(columns={'Unnamed': 'index',
                     'TI':'title',
                     'SO':'type',
                     'LeadAuthorFirst':'author_first_name',
                     'LeadAuthorLast':'author_last_name',
                     'EditorFirst':'editor_first_name',
                     'EditorLast':'editor_last_name',
                     'FirstInst2':'author_institution',
                     'Editor.Institution':'editor_institution',
                    }, 
                 inplace=True)

In [116]:
data.dtypes

Unnamed: 0             int64
title                 object
type                  object
author_first_name     object
author_last_name      object
editor_first_name     object
editor_last_name      object
author_institution    object
editor_institution    object
dtype: object

Take only those records where author_first_name & editor_first_name are valid

In [117]:
len(data)

2532

In [118]:
data = data[data['author_first_name'].notnull()]
# data = data[not(len(data['author_first_name']) == 2 and data['author_first_name'][1] == ".")]
data = data[data['editor_first_name'].notnull()]
# data = data[len(data['editor_first_name']) > 2 and data['editor_first_name'][1] != "."]

In [119]:
len(data)

2425

### TODO: label women or men using genderize.io API and (cp + 4 / p + 2) normalization

In [121]:
hashmap = {}
rows_to_del = []

indx = 0

for index, row in data.iterrows():
    if len(row['author_first_name']) == 2 and row['author_first_name'][1] == ".":
        rows_to_del.append(indx)
    elif row['author_first_name'] in hashmap:
        hashmap[row['author_first_name']] += 1
    else:
        hashmap[row['author_first_name']] = 1
    indx += 1

for index, row in data.iterrows():
    if len(row['editor_first_name']) == 2 and row['editor_first_name'][1] == ".":
        rows_to_del.append(indx)
    elif row['editor_first_name'] in hashmap:
        hashmap[row['editor_first_name']] += 1
    else:
        hashmap[row['editor_first_name']] = 1
    indx += 1

for idx in rows_to_del: 
    data.drop(data.index[[idx]])
print(hashmap.keys())

['Andreas', 'Stuart', 'Bailey', 'Erfan', 'Madelon', 'Claus', 'Ronald', 'Henri', 'Jean-Benoist', 'Valeriano', 'Shuli', 'Bret', 'Thomas', 'Guarino', 'Kohtaro', 'Ge', 'Jon', 'Guray', 'Paul', 'Leanne', 'Mitch', 'Claude', 'Joana', 'Hai-Jing', 'Matthew', 'Sally', 'Imroze', 'Stephanie', 'Asier', 'Zdenka', 'Angie', 'Dominique', 'Benedicte', 'Xiao', 'Elisabeth', 'Andre', 'Niall', 'Jes', 'Arik', 'Joerg', 'Witawas', 'Jef', 'Romain', 'Clotilde', 'Vernon', 'Stan', 'Johanna', 'Christina', 'Christine', 'Arndt', 'Olga', 'Coleen', 'Dimitri', 'Lynette', 'Ramiro', 'Stefanie', 'Oriol', 'Junjie', 'Le', 'Sara', 'Li', 'Brian', 'Bruno', 'Hui-Ling', 'Annalis', 'Jesus', 'Jiangxiao', 'Benoit', 'Deli', 'Kai', 'Josef', 'Jordan', 'Miranda', 'Monika', 'Josep', 'Neus', 'Renee', 'Dylan', 'Joscha', 'Lucas', 'Kangchon', 'Emanuel', 'Carlos', 'Ali', 'Zhenying', 'Maxwell', 'Ruxandra', 'Liliane', 'Eric', 'Yazmin', 'DevinW.', 'Shimon', 'Irene', 'Matti', 'Gabriela', 'Gabriele', 'Lovisa', 'Laurence', 'Menno', 'Drew', 'Sin-Yeon

In [122]:
print("There are {} unique names".format(len(hashmap.keys())))

There are 1485 unique names


In [4]:
import json
import requests
# Example of API output
requests.get(url='https://api.genderize.io/?name=' + 'tanya').content

'{"name":"ranen","gender":"male","probability":1,"count":1}'

In [124]:
for name in hashmap.keys(): 
    hashmap[name] = requests.get(url='https://api.genderize.io/?name=' + name.lower()).content

In [125]:
print(hashmap)

{'Andreas': '{"name":"andreas","gender":"male","probability":1,"count":1021}', 'Stuart': '{"name":"stuart","gender":"male","probability":0.99,"count":759}', 'Bailey': '{"name":"bailey","gender":"female","probability":0.85,"count":215}', 'Erfan': '{"name":"erfan","gender":"male","probability":1,"count":9}', 'Madelon': '{"name":"madelon","gender":"female","probability":1,"count":2}', 'Claus': '{"name":"claus","gender":"male","probability":0.88,"count":119}', 'Ronald': '{"name":"ronald","gender":"male","probability":1,"count":755}', 'Henri': '{"name":"henri","gender":"male","probability":0.98,"count":138}', 'Jean-Benoist': '{"name":"jean-benoist","gender":null}', 'Valeriano': '{"name":"valeriano","gender":"male","probability":1,"count":3}', 'Shuli': '{"name":"shuli","gender":"female","probability":0.88,"count":8}', 'Bret': '{"name":"bret","gender":"male","probability":0.99,"count":88}', 'Thomas': '{"name":"thomas","gender":"male","probability":1,"count":3753}', 'Guarino': '{"name":"guarin

In [126]:
import json
with open('data.json', 'w') as outfile:
    json.dump(hashmap, outfile)

### Data preprocessing

Now let's cleanup and set the probabilities + perform (cp + 4 / p + 2) normalization

### Group by

### Further Analysis