# Retrieving Data with the ProPublica API

In [11]:
import numpy as np
import pandas as pd
import pickle

import requests
import json
import configparser

from bs4 import BeautifulSoup
import urllib.request
import re

In [44]:
# Config API
# Read the ProPublica API key from the [propublica] section of config.ini.
config = configparser.ConfigParser()
# ConfigParser.read() skips missing files silently and returns the list of
# files it actually parsed, so fail here with a clear message instead of with
# a NoSectionError on the lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini not found; it must define PROPUBLICA_API_KEY under [propublica]"
    )
api_key = config.get('propublica', 'PROPUBLICA_API_KEY')

## Retrieve Senators
Through ProPublica, a list of senators can be retrieved for every Congress from the 101st (1989-1991) through the 116th (2019-2021).

In [33]:
# Set parameters
congresses = range(101, 117) # from the 101st congress to the 116th congress

In [None]:
# Get all congresses
# One request per congress; each response's member list is appended in
# congress order.
list_of_congresses = []

for n in congresses:
    r = requests.get(
        f'https://api.propublica.org/congress/v1/{n}/senate/members.json',
        headers={'X-API-Key': api_key}
    )
    # Surface HTTP errors here rather than as a confusing KeyError or JSON
    # decoding error when the body is indexed below.
    r.raise_for_status()

    results = r.json()['results'][0]
    members = results['members']
    list_of_congresses.append(members)

In [None]:
# Function to grab data for senators
def get_senators(members):
    """Reduce each member record to the handful of fields kept for a senator.

    Parameters
    ----------
    members : iterable of dict
        Member records; each must contain the keys 'id', 'first_name',
        'last_name', 'party', 'gender' and 'state'.

    Returns
    -------
    list of dict
        One dict per member, holding only those six keys, in input order.

    Raises
    ------
    KeyError
        If a member record lacks one of the required keys.
    """
    kept_fields = ('id', 'first_name', 'last_name', 'party', 'gender', 'state')
    return [
        {field: record[field] for field in kept_fields}
        for record in members
    ]

In [None]:
# One trimmed senator list per congress, in the same order as list_of_congresses.
senates = [get_senators(members) for members in list_of_congresses]

In [None]:
# Create list of all unique senators across 101st to 116th congress
# A dict keyed by member id keeps first-seen order and makes the duplicate
# check a constant-time lookup instead of a scan over a growing list.
unique_senators = {}
for senate in senates:
    for senator in senate:
        # setdefault keeps the first record seen for an id and ignores repeats.
        unique_senators.setdefault(senator['id'], senator)

senator_ids = list(unique_senators)
all_senators = list(unique_senators.values())

In [None]:
# Report how many entries ended up in all_senators.
print('Senators:', len(all_senators))

In [None]:
# with open('temp_senators.p', 'wb') as f:
#     pickle.dump(all_senators, f)

## Web Scrape Total Roll Call Votes
The API only returns the 20 most recent votes by a senator, so searching votes by senator is not possible.  Instead, votes will be retrieved by roll call.  Each senate session (2 sessions per congress) has a varying number of roll call votes, so the total number of roll call votes per session needs to be web scraped.  This is done on the [US Senate](https://www.senate.gov/) website.

In [2]:
# Create BeautifulSoup object to find number of roll call votes for each session
# The context manager closes the HTTP response once the page has been parsed;
# BeautifulSoup reads the whole document when it is constructed.
with urllib.request.urlopen('https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_101_1.htm') as html:
    soup = BeautifulSoup(html, 'lxml')

In [13]:
# Element holding last roll call vote in session
# Takes the text of the first <a> inside the first <td> that BeautifulSoup finds in the page.
last = soup.find('td').find('a').text
# Bare expression so the notebook displays the extracted text.
last

'312\xa0(99-0)'

In [32]:
# Last roll call vote in session: the leading run of digits in the link text.
# The earlier pattern r'^...(?=\\)*' always captured exactly three characters,
# because its lookahead was quantified with `*` and so was allowed to match
# zero times; it only worked for three-digit roll call numbers.
re.match(r'\d+', last).group(0)

'312'

In [34]:
# Script to pull total roll call votes per session
def _last_roll_call(congress, session):
    """Return the highest roll call number shown on one session's vote menu.

    Parameters
    ----------
    congress : int
        Congress number used in the vote-menu URL.
    session : int
        Session number (1 or 2) used in the vote-menu URL.

    Returns
    -------
    int
        The leading integer of the first link in the first table cell of the
        page (e.g. 312 for link text '312\xa0(99-0)').

    Raises
    ------
    AttributeError
        If the page has no <td>/<a> element or the link text does not start
        with a digit.
    """
    url = (
        'https://www.senate.gov/legislative/LIS/roll_call_lists/'
        f'vote_menu_{congress}_{session}.htm'
    )
    # Close the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen(url) as response:
        page = BeautifulSoup(response, 'lxml')
    link_text = page.find('td').find('a').text
    # Leading digits only, so the count is right for any number of digits
    # (the old pattern grabbed exactly the first three characters).
    return int(re.match(r'\d+', link_text).group(0))


# Keys are '<congress>_<session>'; values are the roll call totals per session.
roll_calls = {}
for congress in congresses:
    for session in (1, 2):
        roll_calls[f'{congress}_{session}'] = _last_roll_call(congress, session)

In [41]:
# Add up the per-session counts stored in roll_calls.
sum(roll_calls.values())

10234

## Retrieve Votes by Roll Calls
There is a limit of 5000 API calls per day for ProPublica.  So in order to be safe, the roll calls will be split into chunks of 101st-105th Congress (3443 roll calls), 106th-110th Congress (3282), and 111th-116th Congress (3509).

In [52]:
# Congress numbers for the three download batches: 101-105, 106-110, 111-116.
rc_1, rc_2, rc_3 = (
    list(range(first, stop))
    for first, stop in ((101, 106), (106, 111), (111, 117))
)

In [55]:
# Request a single vote (101st Congress, session 1, roll call 1) and keep the response in `r`.
# Despite its name, `html` holds the request URL string.
# NOTE(review): the f-prefix is unnecessary here because the URL has no placeholders.
html = f'https://api.propublica.org/congress/v1/101/senate/sessions/1/votes/1.json'
r = requests.get(
    html,
    headers={'X-API-Key': api_key}
)

In [72]:
# Download every roll call vote for the congresses listed in rc_1.
# roll_calls already has one entry per congress/session, so it is walked
# exactly once. The earlier version wrapped this in an unused
# `for congress in congresses` loop, which repeated every request once per
# congress and filled first_set with duplicates.
first_set = []
for key, value in roll_calls.items():
    # Keys look like '<congress>_<session>', e.g. '101_1'.
    c, s = (int(part) for part in key.split('_'))
    if c not in rc_1:
        continue
    # Roll calls are numbered from 1 up to the scraped total for the session.
    for n in range(1, value + 1):
        html = f'https://api.propublica.org/congress/v1/{c}/senate/sessions/{s}/votes/{n}.json'
        r = requests.get(
            html,
            headers={'X-API-Key': api_key}
        )
        # The whole response object is kept; it is parsed later via .json().
        first_set.append(r)

In [73]:
# Pickle the list of collected responses to temp_1st_set.p.
with open('temp_1st_set.p', 'wb') as f:
    pickle.dump(first_set, f)

In [78]:
# Show the vote payload of the last response collected in first_set.
first_set[-1].json()['results']['votes']['vote']

{'congress': 105,
 'session': 2,
 'chamber': 'Senate',
 'roll_call': 314,
 'source': 'https://www.senate.gov/legislative/LIS/roll_call_votes/vote1052/vote_105_2_00314.xml',
 'url': 'https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=105&session=2&vote=00314',
 'bill': {'bill_id': 'hr4328-105',
  'number': 'H.R.4328',
  'api_uri': 'https://api.propublica.org/congress/v1/105/bills/hr4328.json',
  'title': 'Making omnibus consolidated and emergency appropriations for the fiscal year ending September 30, 1999, and for other purposes.',
  'short_title': 'Omnibus Consolidated and Emergency Supplemental Appropriations Act, 1999',
  'latest_action': 'Became Public Law No: 105-277.'},
 'amendment': {},
 'question': 'On the Conference Report',
 'question_text': '',
 'description': "Conference Report H.R. 4328; Women's Health and Cancer Rights Act of 1998",
 'vote_type': '1/2',
 'date': '1998-10-21',
 'time': '09:02:00',
 'result': 'Agreed to',
 'tie_breaker': 

In [86]:
def clean_votes(votes):
    """Extract the fields of interest from a sequence of vote responses.

    Parameters
    ----------
    votes : iterable
        Objects exposing a ``json()`` method whose result contains the vote
        record under ``['results']['votes']['vote']``.

    Returns
    -------
    list of dict
        One dict per response with the keys 'congress', 'session',
        'roll_call', 'bill_id', 'date' and 'positions', in input order.

    Raises
    ------
    KeyError
        If a response payload lacks any of the expected keys.
    """
    def _summarize(response):
        # Pull the nested vote record out of the decoded payload.
        record = response.json()['results']['votes']['vote']
        summary = {
            field: record[field]
            for field in ('congress', 'session', 'roll_call')
        }
        # bill_id lives one level deeper, under the 'bill' entry.
        summary['bill_id'] = record['bill']['bill_id']
        summary['date'] = record['date']
        summary['positions'] = record['positions']
        return summary

    return [_summarize(response) for response in votes]

In [87]:
# Reduce the stored responses in first_set to the summary dicts built by clean_votes.
c101_105 = clean_votes(first_set)

In [88]:
# Pickle the cleaned vote records to c101_105.p.
with open('c101_105.p', 'wb') as f:
    pickle.dump(c101_105, f)

In [89]:
# Show the first cleaned record.
c101_105[0]

{'congress': 101,
 'session': 1,
 'roll_call': 1,
 'bill_id': '-101',
 'date': '1989-01-25',
 'positions': [{'member_id': 'A000031',
   'name': 'Brockman Adams',
   'party': 'D',
   'state': 'WA',
   'vote_position': 'Yes',
   'dw_nominate': None},
  {'member_id': 'A000219',
   'name': 'William Lester Armstrong',
   'party': 'R',
   'state': 'CO',
   'vote_position': 'Yes',
   'dw_nominate': None},
  {'member_id': 'B000243',
   'name': 'Max Baucus',
   'party': 'D',
   'state': 'MT',
   'vote_position': 'Yes',
   'dw_nominate': None},
  {'member_id': 'B000401',
   'name': 'Lloyd Millard Bentsen',
   'party': 'D',
   'state': 'TX',
   'vote_position': 'Yes',
   'dw_nominate': None},
  {'member_id': 'B000444',
   'name': 'Joseph R. Biden Jr.',
   'party': 'D',
   'state': 'DE',
   'vote_position': 'Yes',
   'dw_nominate': None},
  {'member_id': 'B000468',
   'name': 'Jeff Bingaman',
   'party': 'D',
   'state': 'NM',
   'vote_position': 'Yes',
   'dw_nominate': None},
  {'member_id': 'B0