# Retrieving Data with ProPublica API

In [1]:
import numpy as np
import pandas as pd
import pickle

import requests
import json
import configparser

from bs4 import BeautifulSoup
import urllib.request
import re

In [None]:
# Config API
# Read the ProPublica API key from the local 'config.ini' file
# (section 'propublica', option 'PROPUBLICA_API_KEY') so the key is not
# hard-coded in the notebook.
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config.get('propublica', 'PROPUBLICA_API_KEY')

## Retrieve Senators
Through ProPublica, a list of senators can be retrieved for each congress starting with the 80th Congress.  For this project, we will start from the 101st Congress (1989-1991) and end with the 116th Congress (2019-2021).  Each API call returns a response that can be parsed as JSON.

In [None]:
# Set parameters
# range() excludes its stop value, so 117 makes the 116th congress the last one
congresses = range(101, 117) # from the 101st congress to the 116th congress

In [None]:
# Get all congresses
# One request per congress; each entry appended to list_of_congresses is the
# raw list of member records returned for that congress.
list_of_congresses = []

for n in congresses:
    r = requests.get(
        f'https://api.propublica.org/congress/v1/{n}/senate/members.json',
        headers={'X-API-Key': api_key}
    )
    # Fail loudly on an HTTP error (e.g. bad key or rate limit) instead of
    # running into an opaque KeyError / JSON decoding error below
    r.raise_for_status()

    # The member list sits in the first element of 'results'
    results = r.json()['results'][0]
    members = results['members']
    list_of_congresses.append(members)

In [None]:
# Function to grab data for senators
def get_senators(members):
    """Reduce raw member records to the fields used in this project.

    Args:
        members: iterable of dicts, each containing at least the keys
            'id', 'first_name', 'last_name', 'party', 'gender' and 'state'.

    Returns:
        A list with one dict per member holding only those six keys, in the
        same order as the input.

    Raises:
        KeyError: if a member record lacks one of the six keys.
    """
    wanted = ('id', 'first_name', 'last_name', 'party', 'gender', 'state')
    return [{field: member[field] for field in wanted} for member in members]

In [None]:
# One trimmed senator list per congress, in the same order as list_of_congresses
senates = [get_senators(roster) for roster in list_of_congresses]

In [None]:
# Create list of all unique senators across 101st to 116th congress
# A senator who served in several congresses appears once, using the record
# from the first congress in which they were seen.
senator_ids = []
all_senators = []
seen_ids = set()  # constant-time membership test; senator_ids keeps first-seen order
for senate in senates:
    for senator in senate:
        if senator['id'] in seen_ids:
            continue
        seen_ids.add(senator['id'])
        senator_ids.append(senator['id'])
        all_senators.append(senator)

In [None]:
# Load previously saved senator records from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced locally by this project.
# presumably this is the all_senators list built above - verify.
with open('temp_senators.p', 'rb') as f:
    dict_senators = pickle.load(f)

## Web Scrape Total Roll Call Votes
The API only returns the 20 most recent votes by a senator, so searching votes by senator is not possible.  Instead, votes will be retrieved by roll call.  Each senate session (2 sessions per congress) has a varying number of bills voted on, so the total number of roll call votes needs to be web scraped per session.  This is done on the [US Senate](https://www.senate.gov/) website.

In [None]:
# Create BeautifulSoup object to find number of roll call votes for each session
# The context manager closes the HTTP response once the page has been parsed.
with urllib.request.urlopen('https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_101_1.htm') as html:
    soup = BeautifulSoup(html, 'lxml')

In [None]:
# Element holding last roll call vote in session
# Text of the first link inside the first table cell on the page
last = soup.find('td').find('a').text
last

In [None]:
# Last roll call vote in session
# Take the leading run of digits rather than a fixed three characters, so the
# result is correct for any number of digits and never includes trailing
# whitespace.
re.match(r'\s*(\d+)', last).group(1)

In [None]:
# Script to pull total roll call votes per session
# roll_calls maps '<congress>_<session>' (e.g. '101_1') to the number of the
# last roll call vote listed for that session.
roll_calls = {}
for congress in congresses:
    for session in (1, 2):  # first and second session of each congress
        url = (
            'https://www.senate.gov/legislative/LIS/roll_call_lists/'
            f'vote_menu_{congress}_{session}.htm'
        )
        # Close the HTTP response as soon as the page has been parsed
        with urllib.request.urlopen(url) as html:
            soup = BeautifulSoup(html, 'lxml')

        # Text of the first link in the first table cell (last roll call vote)
        last = soup.find('td').find('a').text
        # Leading run of digits is the roll call number, whatever its length
        last_roll_call = re.match(r'\s*(\d+)', last).group(1)
        roll_calls[f'{congress}_{session}'] = int(last_roll_call)

In [None]:
# Load the per-session roll call totals from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced locally by this project.
with open('roll_calls.p', 'rb') as f:
    roll_calls = pickle.load(f)

## Retrieve Votes by Roll Calls
There is a limit of 5000 API calls per day for ProPublica.  So we will make API calls in chunks: 101st-105th Congress (3443 roll calls), 106th-110th Congress (3282), and 111th-116th Congress (3509).  Each API call returns a response that can be parsed as JSON.

In [None]:
# Set range of congresses to make API calls
rc_1 = list(range(101, 106))
rc_2 = list(range(106, 111))
rc_3 = list(range(111, 117))

In [None]:
# Make API calls for each roll call vote, for each session, for each congress
new_set = []
for key, value in roll_calls.items():
    c = int(key[:3])
    s = int(key[-1])
    if c in rc_3: # Set the range here
        for n in range(1, value+1):
            html = f'https://api.propublica.org/congress/v1/{c}/senate/sessions/{s}/votes/{n}.json'
            r = requests.get(
                html,
                headers={'X-API-Key': api_key}
            )
            new_set.append(r)
    else:
        continue

In [None]:
# Function to extract relevant information from json file
def clean_votes(votes):
    """Extract the fields of interest from roll call vote responses.

    Args:
        votes: iterable of response objects exposing a ``json()`` method
            whose payload is expected to contain
            ``['results']['votes']['vote']``.

    Returns:
        A list of dicts with the keys 'congress', 'session', 'roll_call',
        'bill_id', 'date' and 'positions', in input order. Responses whose
        body is not valid JSON or that lack any of the expected fields are
        skipped.
    """
    all_votes = []
    for v in votes:
        try:
            results = v.json()['results']['votes']['vote']
            vote = {
                'congress': results['congress'],
                'session': results['session'],
                'roll_call': results['roll_call'],
                'bill_id': results['bill']['bill_id'],
                'date': results['date'],
                'positions': results['positions']
            }
        except (KeyError, TypeError, ValueError):
            # KeyError: missing field; TypeError: unexpected payload shape
            # (e.g. a null 'bill'); ValueError: body is not valid JSON.
            continue
        all_votes.append(vote)
    return all_votes

In [None]:
# Load the three batches of saved roll call votes from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced locally by this project.
# presumably each file holds the clean_votes() output for one batch - verify.
with open('c101_105.p', 'rb') as f:
    dict_votes1 = pickle.load(f)
    
with open('c106_110.p', 'rb') as f:
    dict_votes2 = pickle.load(f)
    
with open('c111_116.p', 'rb') as f:
    dict_votes3 = pickle.load(f)

In [None]:
# Merge all roll call votes
# Concatenate the three batches in congress order
# (assumes each batch is a list - TODO confirm against the pickled data)
all_votes = dict_votes1 + dict_votes2 + dict_votes3

In [None]:
# Function to create columns of data
def roll_call_vote(vote):
    """Split one cleaned vote record into the values stored per roll call.

    Args:
        vote: dict with the keys 'congress', 'session', 'roll_call', 'date'
            and 'positions'; each entry of 'positions' is a dict with the
            keys 'vote_position' and 'member_id'.

    Returns:
        A tuple ``(congress, session, roll_call, date, positions,
        member_ids)`` where the last two items are parallel lists, one
        element per entry of ``vote['positions']``.
    """
    header = (vote['congress'], vote['session'], vote['roll_call'], vote['date'])

    # Single pass over the positions, collecting (vote cast, voter id) pairs
    pairs = [
        (entry['vote_position'], entry['member_id'])
        for entry in vote['positions']
    ]
    positions = [cast for cast, _ in pairs]
    member_ids = [voter for _, voter in pairs]

    return (*header, positions, member_ids)

In [None]:
# Create data columns for DataFrame
# Six parallel lists, one element per kept roll call.
congs, sessions, rcs, dates, vote_positions, voters = [], [], [], [], [], []
columns = (congs, sessions, rcs, dates, vote_positions, voters)

for vote in all_votes:
    record = roll_call_vote(vote)
    # record[4] is the list of vote positions; skip roll calls that have none
    # (Remove roll calls where bills were killed on the floor)
    if not record[4]:
        continue
    # record is ordered exactly like `columns`
    for column, value in zip(columns, record):
        column.append(value)

## Storing data as a DataFrame (final product)
Temporarily, the data will be stored as a DataFrame.  In future iterations senate votes will be periodically updated such that an online/cloud database will be necessary to store the data.  Here we will simply load all the data onto a DataFrame such that it can be used for exploratory data analysis, data modeling, and data visualizations.

In [None]:
# DataFrame with bill information
# One row per kept roll call; columns are created in the order listed.
df = pd.DataFrame(
    {
        'congress': congs,
        'session': sessions,
        'roll_call': rcs,
        'date': dates,
    },
    index=range(len(congs)),
)

In [None]:
# Make a senators dictionary (id:name)
# Names are formatted as 'Last, First'; a later record with the same id
# overwrites an earlier one.
id2name = {
    senator['id']: f'{senator["last_name"]}, {senator["first_name"]}'
    for senator in dict_senators
}

In [None]:
# Add senator votes and names to DataFrame
# For each slot n (0-99) add a 'vote_n' column with the n-th recorded vote
# position of every roll call and a 'senator_n' column with that voter's name.
for n in range(100):
    sen_votes = []
    sen = []
    for positions, member_ids in zip(vote_positions, voters):
        try:
            position = positions[n]
            member_id = member_ids[n]
        except IndexError:
            # Fewer than n+1 recorded positions for this roll call
            sen_votes.append(np.nan)
            sen.append(np.nan)
            continue
        # Append exactly one value to each list per roll call so both columns
        # always match the DataFrame length; an id missing from id2name keeps
        # the vote and gets NaN as the name.
        sen_votes.append(position)
        sen.append(id2name.get(member_id, np.nan))

    df[f'vote_{n}'] = sen_votes
    df[f'senator_{n}'] = sen

In [2]:
# Load the saved DataFrame from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced locally by this project.
with open('senate_bills.p', 'rb') as f:
    df = pickle.load(f)

In [3]:
# Summary of the DataFrame: row count, column count, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10004 entries, 0 to 10003
Columns: 204 entries, congress to senator_99
dtypes: int64(3), object(201)
memory usage: 15.6+ MB


In [4]:
# Preview the first five rows
df.head()

Unnamed: 0,congress,session,roll_call,date,vote_0,senator_0,vote_1,senator_1,vote_2,senator_2,...,vote_95,senator_95,vote_96,senator_96,vote_97,senator_97,vote_98,senator_98,vote_99,senator_99
0,101,1,1,1989-01-25,Yes,"Adams, Brockman",Yes,"Armstrong, William",Yes,"Baucus, Max",...,Yes,"Thurmond, Strom",Yes,"Wallop, Malcolm",Yes,"Warner, John",Yes,"Wilson, Pete",Yes,"Wirth, Timothy"
1,101,1,2,1989-01-25,Yes,"Adams, Brockman",Yes,"Armstrong, William",Yes,"Baucus, Max",...,Yes,"Thurmond, Strom",Yes,"Wallop, Malcolm",Yes,"Warner, John",Yes,"Wilson, Pete",Yes,"Wirth, Timothy"
2,101,1,3,1989-01-25,Yes,"Adams, Brockman",Yes,"Armstrong, William",Yes,"Baucus, Max",...,Yes,"Thurmond, Strom",Yes,"Wallop, Malcolm",Yes,"Warner, John",Yes,"Wilson, Pete",Yes,"Wirth, Timothy"
3,101,1,4,1989-01-31,Yes,"Adams, Brockman",Yes,"Armstrong, William",Yes,"Baucus, Max",...,Yes,"Thurmond, Strom",Yes,"Wallop, Malcolm",Yes,"Warner, John",Yes,"Wilson, Pete",Yes,"Wirth, Timothy"
4,101,1,5,1989-01-31,Yes,"Adams, Brockman",Yes,"Armstrong, William",Yes,"Baucus, Max",...,Yes,"Thurmond, Strom",Yes,"Wallop, Malcolm",Yes,"Warner, John",Yes,"Wilson, Pete",Yes,"Wirth, Timothy"
