# Initial Data Acquisition Notebook
- Data from https://projects.propublica.org/api-docs/congress-api/
- Notebook for code to be refactored in the ```db-build``` directory
- The database will hold the two most recent Congresses

## Initial Configuration
- ProPublica API and Firebase/Firestore Configuration
- Class object built in ```auth.py```

In [1]:
# Config reader
import configparser

# Database connection and authentication
import firebase_admin
from firebase_admin import credentials, firestore

# Read/Scrape data
import requests
import json
import re
from datetime import datetime

In [2]:
# Read config.ini file (supplies the API key and the credentials file path)
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# config.ini only surfaces as an error at the config.get() calls below.
config.read('./auth/config.ini')

# Get ProPublica Auth
PROPUBLICA_API_KEY = config.get('propublica', 'PROPUBLICA_API_KEY')
API_ROOT = 'https://api.propublica.org/congress/v1/'
# Header passed to every requests.get() call made against API_ROOT
header = {'X-API-Key': f'{PROPUBLICA_API_KEY}'}

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)
app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

## Members of US House of Representatives Data
- Script to build the US Representatives collection, ```reps```, in the Google Firestore database: ```build_house_db.py```

### Get Initial Congress and Clean Data/Select Features

In [3]:
def get_house_ids(congress):
    '''
    Function to get house members' ProPublica ID by congress number

    Parameters:
        congress: Congress number interpolated into the request URL
            (e.g. 116).

    Returns:
        List holding the 'id' of every entry found under
        results[0]['members'] in the JSON response.

    Raises:
        requests.exceptions.RequestException: on connection failure,
            timeout, or a non-2xx HTTP status.
    '''

    call_string = API_ROOT + f'{congress}/house/members.json'
    # Bound the wait so an unresponsive server cannot hang the caller forever
    r = requests.get(call_string, headers=header, timeout=30)
    # Fail with the HTTP error itself instead of a confusing KeyError below
    r.raise_for_status()
    members = r.json()['results'][0]['members']

    return [member['id'] for member in members]

In [4]:
# House member ids of the 116th and 117th Congresses (2019-2023)
ids_116 = get_house_ids(116)
ids_117 = get_house_ids(117)

In [5]:
# Merge both congresses into one duplicate-free list so that a member who
# served in both is only requested from the API once (order is arbitrary)
all_ids = list({*ids_116, *ids_117})

In [6]:
def get_member_data(member):
    '''
    Function to get house member's data

    Parameters:
        member: Member ID interpolated into the request URL
            (e.g. 'H001088').

    Returns:
        The first element of 'results' in the JSON response.

    Raises:
        requests.exceptions.RequestException: on connection failure,
            timeout, or a non-2xx HTTP status.
    '''

    call_string = API_ROOT + f'members/{member}.json'
    # Bound the wait so an unresponsive server cannot hang the caller forever
    r = requests.get(call_string, headers=header, timeout=30)
    # Fail with the HTTP error itself instead of a confusing KeyError below
    r.raise_for_status()

    return r.json()['results'][0]

In [7]:
# Sample: fetch one member and print the first five fields of the response
# (all_ids is built from a set, so which member comes first is arbitrary)
sample_id = all_ids[0]
sample = get_member_data(sample_id)
for k, v in list(sample.items())[:5]:
    print(f'{k}: {v}')

id: H001088
member_id: H001088
first_name: Jim
middle_name: None
last_name: Hagedorn


In [8]:
# Lazy iterators over the two halves of the ID list; each half is inserted
# separately to ensure staying under the batch write limit. No API request
# is made until an iterator is consumed.
half = len(all_ids) // 2
members1 = map(get_member_data, all_ids[:half])
members2 = map(get_member_data, all_ids[half:])

In [9]:
def to_date(s):
    '''
    Function to parse a 'YYYY-MM-DD' string into a datetime object
    '''
    return datetime.strptime(s, '%Y-%m-%d')

In [10]:
def member_cleaner(member):
    '''
    Function to keep relevant information on congress member

    Parameters:
        member: Dict for one member; must contain the keys read below
            and a 'roles' list whose items carry 'congress' and
            'fec_candidate_id'.

    Returns:
        Dict with the selected fields. 'dob' is parsed with to_date,
        'congresses' lists each role's congress from most recent to
        oldest, and 'fec_id' comes from the most recent role (None when
        the member has no roles).
    '''

    # sorted() builds a new list, leaving the caller's member['roles'] as is.
    # int() keeps the ordering numeric in case congress numbers arrive as
    # strings (a plain string sort would place '99' above '116').
    roles = sorted(
        member['roles'],
        key=lambda role: int(role['congress']),
        reverse=True,
    )
    # Most recent FEC candidate ID
    fec_id = roles[0]['fec_candidate_id'] if roles else None
    congresses = [role['congress'] for role in roles]

    mem_dict = {
        '_id': member['id'],
        'first_name': member['first_name'],
        'middle_name': member['middle_name'],
        'last_name': member['last_name'],
        'dob': to_date(member['date_of_birth']),
        'gender': member['gender'],
        'current_party': member['current_party'],
        'google_id': member['google_entity_id'],
        'votesmart_id': member['votesmart_id'],
        'govtrack_id': member['govtrack_id'],
        'cspan_id': member['cspan_id'],
        'crp_id': member['crp_id'],
        'fec_id': fec_id,
        'congresses': congresses,
    }

    return mem_dict

### Inserting into Google Firestore

In [11]:
# Lazily clean each fetched member record; nothing runs until iterated
members_insert1 = (member_cleaner(raw) for raw in members1)
members_insert2 = (member_cleaner(raw) for raw in members2)

In [12]:
def batch_insert_members(members, db):
    '''
    Function to write every member into the "reps" collection in one batch

    Parameters:
        members: Iterable of member dicts; each needs an '_id' key, which
            becomes the document name.
        db: Database client providing batch() and collection().

    All writes are queued on a single batch and committed once. The number
    of write results is then asserted to equal the number of members queued.
    '''

    writer = db.batch()
    queued = 0
    for queued, record in enumerate(members, start=1):
        doc_ref = db.collection("reps").document(f"{record['_id']}")
        writer.set(doc_ref, record)

    writer.commit()

    committed = len(writer.write_results)
    assert committed == queued, (
        f'Batch Length: {committed}, Members Length: {queued}'
    )

    print(f'{committed} members inserted')

In [13]:
# batch_insert_members(members_insert1, db)
# batch_insert_members(members_insert2, db)

## US House Roll Call Data
- Script to build the US House roll call collection, ```bills```, in the Firestore database: ```build_bills_db.py``` (note: the notebook code below writes to a collection named ```votes```)

### Get Initial Congress Roll Calls

In [14]:
def get_roll_call_vote(congress, session, roll_call_number):
    '''
    Function to get singular roll call vote

    Parameters:
        congress: Congress number used in the request URL.
        session: Session number used in the request URL.
        roll_call_number: Roll call number used in the request URL.

    Returns:
        The dict found at results -> votes -> vote in the JSON response, or
        None when the vote has an empty 'bill', is a QUORUM call, or the
        response does not have the expected structure.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout (the request itself is not wrapped in the try block).
    '''

    call_string = API_ROOT + f'{congress}/house/sessions/{session}/votes/{roll_call_number}.json'
    # Bound the wait so an unresponsive server cannot hang the caller forever
    r = requests.get(call_string, headers=header, timeout=30)

    # Ignore nominations, quorum, and non-bill actions
    try:
        result = r.json()['results']['votes']['vote']
        bill = result['bill']
        if bill == {} or bill['number'] == 'QUORUM':
            return None
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError: body is not valid JSON; the others: payload lacks the
        # expected shape. Anything else (e.g. KeyboardInterrupt) propagates.
        return None

    return result

In [15]:
def rc_clean(rc_vote):
    '''
    Function to keep relevant information from roll call

    Parameters:
        rc_vote: Roll call vote dict, or None.

    Returns:
        None when rc_vote is None; otherwise a dict with the selected vote
        and bill fields plus one list of member IDs per vote position
        ('yes', 'no', 'not voting', 'present', 'speaker').

    Raises:
        KeyError: if a position's lower-cased 'vote_position' is not one of
            the five position keys, or an expected field is missing.
    '''

    # Guard clause: nothing to clean for a filtered-out roll call
    if rc_vote is None:
        return None

    bill = rc_vote['bill']
    vote_dict = {
        'congress': rc_vote['congress'],
        'session': rc_vote['session'],
        'roll_call': rc_vote['roll_call'],
        'bill_id': bill['bill_id'],
        # Bill number with the dots removed and lower-cased
        'api_call_id': ''.join(bill['number'].split('.')).lower(),
        'title': bill['title'],
        'description': rc_vote['description'],
        'date': to_date(rc_vote['date']),
        'result': rc_vote['result'],
        'yes': [],
        'no': [],
        'not voting': [],
        'present': [],
        'speaker': [],
    }
    # File each member ID under the list named after their vote position
    for mem in rc_vote['positions']:
        mem_id = mem['member_id']
        pos = mem['vote_position'].lower()
        vote_dict[pos].append(mem_id)

    return vote_dict

In [16]:
# Sample: fetch and clean roll call 4 of the 116th Congress, 1st session,
# then print the first five fields of the cleaned record
# NOTE: rc_clean returns None when the vote was filtered out, in which case
# sample.items() below raises AttributeError
sample_rc = get_roll_call_vote(116, 1, 4)
sample = rc_clean(sample_rc)
for k, v in list(sample.items())[:5]:
    print(f'{k}: {v}')

congress: 116
session: 1
roll_call: 4
bill_id: hres5-116
api_call_id: hres5


### Set Initial Roll Call Range
- Roll calls: https://clerk.house.gov/Votes

In [17]:
# Roll call numbers start at 1; range() excludes its stop value, so the
# stop is written as "last roll call + 1"

# 116th congress: 1st session (701 roll calls), 2nd session (253 roll calls)
rc_116_1 = range(1, 701 + 1)
rc_116_2 = range(1, 253 + 1)

# 117th congress: 1st session (18 roll calls as of 2021-02-02)
rc_117_1 = range(1, 18 + 1)

In [18]:
# Generators for batch insert, one per congress/session.
# They are lazy: the API request for a roll call is only made when the
# generator is consumed, and each generator can be consumed only once.
# Filtered-out roll calls come through as None.
votes_116_1 = ( rc_clean(get_roll_call_vote(116, 1, rc)) for rc in rc_116_1 )
votes_116_2 = ( rc_clean(get_roll_call_vote(116, 2, rc)) for rc in rc_116_2 )
votes_117_1 = ( rc_clean(get_roll_call_vote(117, 1, rc)) for rc in rc_117_1 )

In [19]:
def batch_insert_rc(members, batch_size=400):
    '''
    Function to batch insert cleaned roll call votes into the "votes"
    collection

    Parameters:
        members: Iterable of cleaned roll call dicts (each with 'congress',
            'session' and 'roll_call' keys); None entries are skipped.
        batch_size: Number of writes queued before a batch is committed.
            The default of 400 matches the previous hard-coded threshold
            (presumably chosen to stay under the Firestore batch write
            limit - verify before raising it).

    Raises:
        ValueError: if batch_size is smaller than 1.

    Uses the module-level ``db`` client. Documents are named
    '<congress>_<session>_<roll_call>'. Progress is printed per batch.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    batch = db.batch()
    mem_num = 0  # writes queued in the current batch
    b_num = 1    # 1-based number of the current batch
    total = 0    # writes queued across all batches
    for mem in members:
        if mem is None:
            continue

        _id = f"{mem['congress']}_{mem['session']}_{mem['roll_call']}"
        insert_ref = db.collection("votes").document(_id)
        batch.set(insert_ref, mem)
        mem_num += 1
        total += 1

        # Flush a full batch and start a fresh one
        if mem_num >= batch_size:
            batch.commit()
            print(f'{mem_num} bills inserted in batch #{b_num}')
            mem_num = 0
            b_num += 1
            batch = db.batch()

    # Commit whatever is left over (may be an empty batch)
    batch.commit()
    print(f'{mem_num} bills inserted in batch #{b_num}')
    print(f'Total inserted: {total}')

In [20]:
# batch_insert_rc(votes_116_1)