In [95]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
%matplotlib inline

In [96]:
# Define the function that makes the call to the ProPublica Congress API.
# The API key is read from the PROPUBLICA_API_KEY environment variable so that the secret
# never lives in the notebook. NOTE: the key that used to be hardcoded here should be
# treated as leaked and rotated.
key = os.environ.get('PROPUBLICA_API_KEY')


def make_call(chamber, congress_number):
    """Fetch the member list of one chamber for one congress.

    Parameters
    ----------
    chamber : str
        Chamber segment of the URL, e.g. 'house' or 'senate'.
    congress_number : int or str
        Congress number segment of the URL.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    RuntimeError
        If no API key is configured.
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    if not key:
        raise RuntimeError('Set the PROPUBLICA_API_KEY environment variable before calling the API')

    s = 'https://api.propublica.org/congress/v1/' + str(congress_number) + '/' + chamber + '/members.json'

    # requests has no default timeout: without one a stalled connection hangs the kernel forever
    response = requests.get(s, headers={'X-API-Key': key}, timeout=30)
    # Fail here with a clear HTTP error instead of a KeyError later when the payload is parsed
    response.raise_for_status()

    return json.loads(response.content.decode('utf-8'))


In [97]:
# Empty accumulator frame: one row per member, per chamber, per congress
results = pd.DataFrame(
    columns='first_name last_name party gender chamber congress_number DOB'.split()
)


# Map each congress number to the chambers that are queried for it:
# both chambers for congresses 102-116, senate only for congresses 80-101

bar = {congress: ['house', 'senate'] for congress in range(102, 117)}
bar.update({congress: ['senate'] for congress in range(80, 102)})


In [98]:
# Make the call, looping through both house and senate for congresses 102 - 116 inclusive
# Looping through only senate for congresses 80-101 inclusive.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
records = []
for i in range(80, 117):
    for c in bar[i]:
        call = make_call(c, i)

        for element in call['results'][0]['members']:
            records.append({
                'first_name': element['first_name'],
                'last_name': element['last_name'],
                'party': element['party'],
                'gender': element['gender'],
                'chamber': c,
                'congress_number': i,
                # Keep only the year. `or ''` guards against a null date_of_birth,
                # which would otherwise raise a TypeError on slicing.
                'DOB': (element['date_of_birth'] or '')[0:4],
            })

    # Progress indicator: one line per congress fetched
    print(i)

# Rebuild `results` from scratch so that re-running this cell does not duplicate rows
results = pd.DataFrame(
    records,
    columns=['first_name', 'last_name', 'party', 'gender', 'chamber', 'congress_number', 'DOB'],
)

80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116


In [99]:
# Save the API results to disk as they are, before any cleaning is applied
results.to_csv('raw_results_from_propublica.csv')

In [100]:
# Remove duplicate members, then unify the label used for independents.
# A member is identified by first name + last name within a chamber and a congress:
#   - 'chamber' is part of the key in case someone is both a senator and a member of the house
#     (not technically possible)
#   - 'congress_number' is part of the key so that members serving several congresses are kept
# Duplicates arise when people make it to congress and change party.
# Independents are labelled both 'ID' and 'I'; 'ID' is folded into 'I'.

results = (
    results
    .drop_duplicates(subset=['first_name', 'last_name', 'chamber', 'congress_number'])
    .replace({'party': {'ID': 'I'}})
)


In [101]:
# Approx 1e4 rows
results.shape

(10567, 7)

In [102]:
# Empty / whitespace-only cells are not detected by pandas' isna(), so turn them into NaN
results.replace(r'^\s*$', np.nan, regex=True, inplace=True)
# Also replace Python's None with NaN. `np.nan` is used directly: the `pd.np` alias
# was deprecated in pandas 1.0 and removed in pandas 2.0.
results.fillna(value=np.nan, inplace=True)

In [103]:
# Now we have the dataframe that we want
results.head(10)

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB
0,George,Aiken,R,M,senate,80,1892.0
1,Raymond,Baldwin,R,M,senate,80,1893.0
2,Joseph,Ball,R,M,senate,80,1905.0
3,Alben,Barkley,D,,senate,80,1877.0
4,Theodore,Bilbo,D,M,senate,80,
5,Ralph,Brewster,R,M,senate,80,1888.0
6,John,Bricker,R,M,senate,80,
7,Henry,Bridges,R,M,senate,80,1898.0
8,Charles,Brooks,R,M,senate,80,1897.0
9,Joseph,Broughton,D,M,senate,80,1888.0


In [104]:
# Annoyingly, there are four people for whom we have neither the DOB nor the gender:
# Scott Lucas, Elmer Thomas, Lyndon Johnson and Norris Cotton.

results[results[['DOB', 'gender']].isna().all(axis=1)]

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB
52,Scott,Lucas,D,,senate,80,
88,Elmer,Thomas,D,,senate,80,
150,Lyndon,Johnson,D,,senate,81,
162,Scott,Lucas,D,,senate,81,
198,Elmer,Thomas,D,,senate,81,
259,Lyndon,Johnson,D,,senate,82,
340,Norris,Cotton,R,,senate,83,
375,Lyndon,Johnson,D,,senate,83,
451,Norris,Cotton,R,,senate,84,
480,Lyndon,Johnson,D,,senate,84,


In [105]:
# The members missing both DOB and gender were checked by hand and are all men,
# so their gender is set to 'M'; otherwise the gender_df function will throw an error

results.loc[results.DOB.isna() & results.gender.isna(), 'gender'] = 'M'


In [106]:
# No row is missing both DOB and gender anymore: this returns an empty dataframe
results[results[['DOB', 'gender']].isna().all(axis=1)]

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB


In [107]:
# How many people are missing gender?
int(results.gender.isna().sum())
# 500

500

In [108]:
# Count the members whose gender is missing but whose DOB is known;
# this should equal the number of members missing gender (cell above)
len(results[results.gender.isna() & results.DOB.notna()])

# It does: we have DOB info for all those whose gender is missing, which is good

500

In [109]:
# Prepare the inputs that the gender() function in R will use to predict gender.
# The DOB series holds strings, so it is first converted to numeric.
# A 'method' column is then added: 'ssa' for DOBs of 1930 and above, 'ipums' for everything else
# (rows with a missing DOB therefore also get 'ipums').
# NOTE(review): the lowest year accepted by gender_df is said to be 1880, but no clamping
# is applied here -- DOB values are passed through unchanged.

results['DOB'] = pd.to_numeric(results['DOB'])

results['method'] = results['DOB'].ge(1930).map({True: 'ssa', False: 'ipums'})

In [110]:
# Preview the first rows only instead of rendering the whole (~10k row) frame
results.head(10)

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method
0,George,Aiken,R,M,senate,80,1892.0,ipums
1,Raymond,Baldwin,R,M,senate,80,1893.0,ipums
2,Joseph,Ball,R,M,senate,80,1905.0,ipums
3,Alben,Barkley,D,,senate,80,1877.0,ipums
4,Theodore,Bilbo,D,M,senate,80,,ipums
5,Ralph,Brewster,R,M,senate,80,1888.0,ipums
6,John,Bricker,R,M,senate,80,,ipums
7,Henry,Bridges,R,M,senate,80,1898.0,ipums
8,Charles,Brooks,R,M,senate,80,1897.0,ipums
9,Joseph,Broughton,D,M,senate,80,1888.0,ipums


In [111]:
# Rows that have a DOB but no gender yet: these are written to a csv file for R to read
gender_to_be_determined = results[results.gender.isna() & results.DOB.notna()]

gender_to_be_determined.to_csv('gender_to_be_determined.csv')
len(gender_to_be_determined)

500

In [112]:
# Rows whose (first_name, last_name, DOB) occurs more than once, sorted by first name
(
    gender_to_be_determined
    .loc[gender_to_be_determined.duplicated(subset=['first_name', 'last_name', 'DOB'], keep=False)]
    .sort_values(by='first_name')
)

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method
1580,Adlai,Stevenson,D,,senate,94,1930.0,ssa
1367,Adlai,Stevenson,D,,senate,92,1930.0,ssa
1692,Adlai,Stevenson,D,,senate,95,1930.0,ssa
1794,Adlai,Stevenson,D,,senate,96,1930.0,ssa
1472,Adlai,Stevenson,D,,senate,93,1930.0,ssa
1260,Adlai,Stevenson,D,,senate,91,1930.0,ssa
2785,Alan,Cranston,D,,senate,102,1914.0,ipums
1176,Alan,Bible,D,,senate,91,1909.0,ipums
2131,Alan,Cranston,D,,senate,100,1914.0,ipums
1188,Alan,Cranston,D,,senate,91,1914.0,ipums


In [137]:

# R commands
# new <- read.csv('gender_to_be_determined.csv', stringsAsFactors = FALSE)

# The snippet below generates a lot of duplicate rows, so its output has far more rows than its input.
# Those duplicates have to be dropped before merging back with the original df.
# > test <- new %>% rowwise() %>% do(results = gender(.$first_name, years = .$DOB, method = .$method)) 
# %>% do(bind_rows(.$results))

# > write.csv(test, 'propublica_gender_determined.csv')


# Don't use the code below
### > gender_to_be_determined <- read.csv('gender_to_be_determined.csv', stringsAsFactors = FALSE)
### > test <- gender_df(gender_to_be_determined, name_col = 'first_name', year_col = c('DOB', 'DOB'), method = 'ssa')
### > write.csv(test, 'gender_determined.csv')

In [113]:
# Now let's open propublica_gender_determined.csv (the file written by the R commands) and modify it
# to make the merging with the original dataset somewhat easier

gender_determined = pd.read_csv('propublica_gender_determined.csv', index_col = 0)
gender_determined.shape


(440, 6)

In [114]:
gender_determined.head()

Unnamed: 0,name,proportion_male,proportion_female,gender,year_min,year_max
1,Alben,1.0,0.0,male,1877,1877
2,Clayton,0.9743,0.0257,male,1890,1890
3,Dennis,1.0,0.0,male,1888,1888
4,Forrest,1.0,0.0,male,1884,1884
5,Sheridan,1.0,0.0,male,1884,1884


In [115]:
# Tidy the R output so it can be merged on (first_name, DOB):
#   - drop year_min (same as year_max)
#   - rename year_max -> DOB, name -> first_name, gender -> predicted_gender
#   - recode 'male' / 'female' as 'M' / 'F'
gender_determined = (
    gender_determined
    .drop(columns='year_min')
    .rename(columns={'year_max': 'DOB', 'name': 'first_name', 'gender': 'predicted_gender'})
    .replace({'predicted_gender': {'male': 'M', 'female': 'F'}})
)

In [116]:
# At this point, probably a good idea to inspect whether there are any false positives or false negatives.
# Shown here: rows predicted female with a proportion_female below 0.90

#gender_determined[gender_determined.predicted_gender == 'F']
gender_determined[(gender_determined.predicted_gender == 'F') & (gender_determined.proportion_female < 0.90)]

# Beryl is actually a false positive! Need to change that in the merged dataframe

Unnamed: 0,first_name,proportion_male,proportion_female,predicted_gender,DOB
38,Francis,0.4519,0.5481,F,1896
57,Francis,0.4519,0.5481,F,1896
79,Francis,0.4519,0.5481,F,1896
98,Francis,0.4519,0.5481,F,1896
116,Francis,0.4519,0.5481,F,1896
137,Francis,0.4519,0.5481,F,1896
384,Beryl,0.2107,0.7893,F,1938


In [117]:
# The R output repeats the same prediction many times, so duplicate rows need to be removed.
# Keep one row per (first_name, DOB): these are exactly the keys used for the left join with
# gender_to_be_determined, and a right-hand side that is unique on the join keys cannot multiply rows.
# (De-duplicating on an extra column such as proportion_male does not give that guarantee.)
# NOTE(review): the row counts noted here previously (~17000 rows dropping to 2720) do not match
# the 440 rows loaded from the csv; re-check them on the next full run.

gender_determined.drop_duplicates(subset = ['first_name', 'DOB'], inplace= True)
#gender_determined.shape

In [119]:
# Spot check: rows of the R output whose first name is 'Ed'
gender_determined[gender_determined.first_name == 'Ed']

Unnamed: 0,first_name,proportion_male,proportion_female,predicted_gender,DOB
428,Ed,1.0,0.0,M,1948


In [120]:
# Spot check: rows of the frame that was sent to R whose first name is 'Ed'
gender_to_be_determined[gender_to_be_determined.first_name == 'Ed']

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method
3464,Ed,Bryant,R,,house,104,1948.0,ssa
4022,Ed,Bryant,R,,house,105,1948.0,ssa
4575,Ed,Bryant,R,,house,106,1948.0,ssa
5120,Ed,Bryant,R,,house,107,1948.0,ssa


In [117]:
# I think we need to take the df returned from R, remove the duplicates, then merge it back with the dataframe
# that was given to R.
# Keep only the rows from the dataframe that was given to R, adding to it the relevant rows from 
# the gender_determined dataframe. The resulting df should have the same length as the original dataframe

10567

In [None]:
# Need to do a left join to make sure we don't get any duplicates. 
# The R function is throwing a bunch of duplicates (probably why it's slow), but can't quite figure out why.
# It gives the data we want though, and fixed easily enough with a left join once you've removed duplicates

In [121]:
# Row count of the left join: it should equal the length of gender_to_be_determined (500 rows)
gender_to_be_determined.merge(gender_determined, how='left', on=['first_name', 'DOB']).shape[0]


500

In [122]:
# This dataset will have to be re-merged with the original one it came from, 'results'
# Let's call this to_merge, and look at NAs, false positives.
# validate='many_to_one' makes pandas raise a MergeError if (first_name, DOB) is not unique in
# gender_determined, i.e. if the left join could silently add rows.
to_merge = gender_to_be_determined.merge(
    gender_determined, on=['first_name', 'DOB'], how='left', validate='many_to_one'
)
to_merge.head()

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
0,Alben,Barkley,D,,senate,80,1877.0,ipums,1.0,0.0,M
1,Clayton,Buck,R,,senate,80,1890.0,ipums,0.9743,0.0257,M
2,Dennis,Chavez,D,,senate,80,1888.0,ipums,1.0,0.0,M
3,Forrest,Donnell,R,,senate,80,1884.0,ipums,1.0,0.0,M
4,Sheridan,Downey,D,,senate,80,1884.0,ipums,1.0,0.0,M


In [123]:
# Of the 500 rows we set up for analysis, how many don't have a predicted gender?

to_merge.predicted_gender.isna().value_counts()

# 60, which isn't too bad

False    440
True      60
Name: predicted_gender, dtype: int64

In [124]:

# Let's take a look at the rows that have no predicted gender, to see if there's anything odd there
to_merge[to_merge['predicted_gender'].isna()]


Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
5,Zales,Ecton,R,,senate,80,1898.0,ipums,,,
7,Bourke,Hickenlooper,R,,senate,80,1896.0,ipums,,,
9,Spessard,Holland,D,,senate,80,1892.0,ipums,,,
14,Burnet,Maybank,D,,senate,80,1899.0,ipums,,,
27,Zales,Ecton,R,,senate,81,1898.0,ipums,,,
29,Bourke,Hickenlooper,R,,senate,81,1896.0,ipums,,,
31,Spessard,Holland,D,,senate,81,1892.0,ipums,,,
36,Burnet,Maybank,D,,senate,81,1899.0,ipums,,,
50,Zales,Ecton,R,,senate,82,1898.0,ipums,,,
53,Bourke,Hickenlooper,R,,senate,82,1896.0,ipums,,,


In [125]:
# It looks like just a few members: list the distinct first names without a prediction
set(to_merge.loc[to_merge['predicted_gender'].isna(), 'first_name'])
# Turns out they're all male, let's put that information in

{'Adlai',
 'Bourke',
 'Brockman',
 'Burnet',
 'Druie',
 'Kaneaster',
 'Mervyn',
 'Sedgwick',
 'Spark',
 'Spessard',
 'Thruston',
 'Wes',
 'Wyche',
 'Zales'}

In [126]:
# Every member without a prediction is male (just with an unusual name), so fill the gaps with 'M'
to_merge['predicted_gender'] = to_merge['predicted_gender'].fillna('M')

In [127]:
# No more missing values
to_merge.predicted_gender.isna().value_counts()

False    500
Name: predicted_gender, dtype: int64

In [128]:
# Let's take a look at who's predicted to be female:
to_merge[to_merge.predicted_gender == 'F']
# Francis Case, Almer Monroney, Beryl Anthony and Joan Horn
# Low confidence in Francis and Beryl , with a proportion female of 0.55 and 0.79,
# i.e. lower than the cutoff of 0.9 that we used in the previous dataset. 

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
45,Francis,Case,R,,senate,82,1896.0,ipums,0.4519,0.5481,F
61,Almer,Monroney,D,,senate,82,1902.0,ipums,0.0,1.0,F
68,Francis,Case,R,,senate,83,1896.0,ipums,0.4519,0.5481,F
84,Almer,Monroney,D,,senate,83,1902.0,ipums,0.0,1.0,F
93,Francis,Case,R,,senate,84,1896.0,ipums,0.4519,0.5481,F
106,Almer,Monroney,D,,senate,84,1902.0,ipums,0.0,1.0,F
114,Francis,Case,R,,senate,85,1896.0,ipums,0.4519,0.5481,F
125,Almer,Monroney,D,,senate,85,1902.0,ipums,0.0,1.0,F
135,Francis,Case,R,,senate,86,1896.0,ipums,0.4519,0.5481,F
148,Almer,Monroney,D,,senate,86,1902.0,ipums,0.0,1.0,F


In [129]:
# Spot check: rows of to_merge whose first name is 'Joan'
to_merge[to_merge.first_name == 'Joan']

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
458,Joan,Horn,D,,house,102,1936.0,ssa,0.0047,0.9953,F


In [130]:
# Manual review of the rows predicted female:
#   - Francis Case and Beryl Anthony were male
#   - Almer is male too, even though the prediction is 100% female! For that name the ssa and ipums
#     methods give completely opposite answers (100% female in one, 100% male in the other)
#   - only Joan is female
# So every prediction is set to 'M' except for rows whose first name is 'Joan' (i.e. Joan Horn).
# NOTE(review): the override keys on first name only.

to_merge['predicted_gender'] = to_merge['predicted_gender'].where(to_merge['first_name'] == 'Joan', 'M')

In [131]:
# Only one predicted female. No false positives, there could be some false negatives
to_merge.predicted_gender.value_counts()

M    499
F      1
Name: predicted_gender, dtype: int64

In [132]:
# Check for false negatives, but all those are male
to_merge[(to_merge.predicted_gender == 'M') & (to_merge.proportion_male < 0.6)]

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
17,Francis,Myers,D,,senate,80,1901.0,ipums,0.5379,0.4621,M
39,Francis,Myers,D,,senate,81,1901.0,ipums,0.5379,0.4621,M
45,Francis,Case,R,,senate,82,1896.0,ipums,0.4519,0.5481,M
61,Almer,Monroney,D,,senate,82,1902.0,ipums,0.0,1.0,M
68,Francis,Case,R,,senate,83,1896.0,ipums,0.4519,0.5481,M
84,Almer,Monroney,D,,senate,83,1902.0,ipums,0.0,1.0,M
93,Francis,Case,R,,senate,84,1896.0,ipums,0.4519,0.5481,M
106,Almer,Monroney,D,,senate,84,1902.0,ipums,0.0,1.0,M
114,Francis,Case,R,,senate,85,1896.0,ipums,0.4519,0.5481,M
125,Almer,Monroney,D,,senate,85,1902.0,ipums,0.0,1.0,M


In [133]:
# We now have a dataset that we're happy to merge back with the original one that it was extracted from.
# Easiest thing is probably to append this dataset to the previous one, and then remove the
# un-annotated duplicates (done further down with dropna on the gender / predicted_gender columns)

results.head()
# Next: give `results` the prediction columns that `to_merge` has, then append to_merge to it

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method
0,George,Aiken,R,M,senate,80,1892.0,ipums
1,Raymond,Baldwin,R,M,senate,80,1893.0,ipums
2,Joseph,Ball,R,M,senate,80,1905.0,ipums
3,Alben,Barkley,D,,senate,80,1877.0,ipums
4,Theodore,Bilbo,D,M,senate,80,,ipums


In [134]:
# Add the three prediction columns to `results`, filled with NaN
for new_col in ('proportion_male', 'proportion_female', 'predicted_gender'):
    results[new_col] = np.nan

In [135]:
results.head()

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
0,George,Aiken,R,M,senate,80,1892.0,ipums,,,
1,Raymond,Baldwin,R,M,senate,80,1893.0,ipums,,,
2,Joseph,Ball,R,M,senate,80,1905.0,ipums,,,
3,Alben,Barkley,D,,senate,80,1877.0,ipums,,,
4,Theodore,Bilbo,D,M,senate,80,,ipums,,,


In [136]:
to_merge.head()

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
0,Alben,Barkley,D,,senate,80,1877.0,ipums,1.0,0.0,M
1,Clayton,Buck,R,,senate,80,1890.0,ipums,0.9743,0.0257,M
2,Dennis,Chavez,D,,senate,80,1888.0,ipums,1.0,0.0,M
3,Forrest,Donnell,R,,senate,80,1884.0,ipums,1.0,0.0,M
4,Sheridan,Downey,D,,senate,80,1884.0,ipums,1.0,0.0,M


In [137]:
# Stack the annotated rows under the original ones; the existing index labels are kept, as before.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
results = pd.concat([results, to_merge])

In [138]:
# Now, results will have duplicates, Alben for instance
results[results.first_name == 'Alben']

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
3,Alben,Barkley,D,,senate,80,1877.0,ipums,,,
105,Alben,Barkley,D,,senate,81,1877.0,ipums,,,
433,Alben,Barkley,D,,senate,84,1877.0,ipums,,,
0,Alben,Barkley,D,,senate,80,1877.0,ipums,1.0,0.0,M
21,Alben,Barkley,D,,senate,81,1877.0,ipums,1.0,0.0,M
89,Alben,Barkley,D,,senate,84,1877.0,ipums,1.0,0.0,M


Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender


In [141]:
# Every row of to_merge carries a predicted gender, so the stale copies left over from the
# original frame are exactly the rows where both gender and predicted_gender are NaN. Drop those.
# We started off with 10567 rows, should end up with that again

results = results.dropna(how='all', subset=['gender', 'predicted_gender'])

In [142]:
# Preview the first rows only instead of rendering the whole (~10k row) frame
results.head(10)

Unnamed: 0,first_name,last_name,party,gender,chamber,congress_number,DOB,method,proportion_male,proportion_female,predicted_gender
0,George,Aiken,R,M,senate,80,1892.0,ipums,,,
1,Raymond,Baldwin,R,M,senate,80,1893.0,ipums,,,
2,Joseph,Ball,R,M,senate,80,1905.0,ipums,,,
4,Theodore,Bilbo,D,M,senate,80,,ipums,,,
5,Ralph,Brewster,R,M,senate,80,1888.0,ipums,,,
6,John,Bricker,R,M,senate,80,,ipums,,,
7,Henry,Bridges,R,M,senate,80,1898.0,ipums,,,
8,Charles,Brooks,R,M,senate,80,1897.0,ipums,,,
9,Joseph,Broughton,D,M,senate,80,1888.0,ipums,,,
11,Vera,Bushfield,R,F,senate,80,1889.0,ipums,,,


In [None]:
# We finally have a dataset with all the info we need! Jeez that took long. 
# Let's create a new column, final gender, that consolidates the gender and predicted gender columns

In [154]:
def f(x):
    """Pick the gender to use for one row.

    `x` is indexed by 'predicted_gender' and 'gender'. The predicted gender is returned
    when it is 'M' or 'F'; for any other value the 'gender' entry is returned instead.
    """
    predicted = x['predicted_gender']
    return predicted if predicted in ('M', 'F') else x['gender']

In [160]:
# Row by row: use predicted_gender when it is 'M' or 'F', otherwise fall back to gender (see f)
results['final_gender'] = results.apply(f, axis = 1)

In [162]:
results.final_gender.isna().value_counts()

False    10567
Name: final_gender, dtype: int64

In [163]:
# Save the cleaned dataset, including the consolidated final_gender column
results.to_csv('cleaned_propublica_data.csv')

In [165]:
# Count members of each final_gender within every (congress_number, party, chamber) group
results.groupby(['congress_number', 'party', 'chamber']).final_gender.value_counts()

congress_number  party  chamber  final_gender
80               D      senate   M                49
                 R      senate   M                52
                                 F                 1
81               D      senate   M                61
                 I      senate   M                 1
                 R      senate   M                46
                                 F                 1
82               D      senate   M                50
                 I      senate   M                 1
                 R      senate   M                53
                                 F                 1
83               D      senate   M                54
                 I      senate   M                 2
                 R      senate   M                55
                                 F                 3
84               D      senate   M                49
                 I      senate   M                 3
                 R      senate   M                48


In [None]:
# Let's see if a streamgraph is doable on this, would look very cool. 
# Seems to be something you can do in R.