# Data Acquisition Notebook
- Notebook detailing `database.py`
- Database will hold 2 most recent congresses

## ProPublica Data
- Functions and parameters

In [1]:
# API Key
import configparser

# Read/Scrape data
import requests
import json
import re
from bs4 import BeautifulSoup

In [2]:
# Load credentials from a local INI file so the API key is never written
# into the notebook itself; the key is read from the [propublica] section.
config = configparser.ConfigParser()
# NOTE: read() silently skips a missing file; the get() below then raises
config.read('auth/config.ini')
PROPUBLICA_API_KEY = config.get('propublica', 'PROPUBLICA_API_KEY')

In [3]:
# Base URL that every ProPublica request in this notebook is built on
API_ROOT = 'https://api.propublica.org/congress/v1/'
# Authentication header sent with each request
header = {'X-API-Key': PROPUBLICA_API_KEY}

In [4]:
def get_house_ids(congress):
    '''
    Function to get house members' ProPublica ID by congress number

    Parameters:
        congress: congress number, interpolated into the request path
            (e.g. 116)

    Returns:
        list with the 'id' value of every entry under
        results[0]['members'] in the API response

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
    '''

    call_string = f'{API_ROOT}{congress}/house/members.json'
    r = requests.get(call_string, headers=header)
    # Surface HTTP failures here instead of as an opaque KeyError or
    # JSON decoding error on the lines below
    r.raise_for_status()
    members = r.json()['results'][0]['members']

    return [member['id'] for member in members]

In [5]:
def get_member_data(member):
    '''
    Function to get house member's data

    Parameters:
        member: member id, interpolated into the request path (the ids
            returned by get_house_ids)

    Returns:
        the first entry of 'results' in the API response

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
    '''

    call_string = f'{API_ROOT}members/{member}.json'
    r = requests.get(call_string, headers=header)
    # Surface HTTP failures here instead of as an opaque KeyError or
    # JSON decoding error on the line below
    r.raise_for_status()

    return r.json()['results'][0]

## Get Initial Congress and Clean Data/Select Features

In [16]:
# House members of the 116th Congress (January 2019 - January 2021)
ids_116 = get_house_ids(116)
# One API request per member id, so this cell issues len(ids_116) calls
members_116 = [ get_member_data(member) for member in ids_116 ]

In [55]:
from pprint import pprint
from datetime import datetime

In [144]:
def to_date(s):
    '''
    Function to parse a 'YYYY-MM-DD' string into a naive datetime at midnight
    '''
    return datetime.strptime(s, '%Y-%m-%d')

In [145]:
def member_cleaner(member):
    '''
    Function to keep relevant information on congress member

    Parameters:
        member: dict for one member as returned by get_member_data; it
            must carry the keys read below, including a 'roles' list
            whose entries have 'congress' and 'fec_candidate_id'

    Returns:
        dict with '_id', 'bio', 'activity', 'other_ids' and 'roles'
        (newest congress first)
    '''

    # Sort a copy so the caller's dict is not reordered as a side effect.
    # int() makes the comparison numeric: if the API delivers congress
    # numbers as strings, a plain sort would rank '99' above '116'.
    roles = sorted(
        member['roles'],
        key=lambda role: int(role['congress']),
        reverse=True,
    )
    # FEC id comes from the most recent role; None when there are no roles
    fec_id = roles[0]['fec_candidate_id'] if roles else None

    mem_dict = {
        '_id': member['id'],
        'bio': {
            'first_name': member['first_name'],
            'middle_name': member['middle_name'],
            'last_name': member['last_name'],
            'dob': to_date(member['date_of_birth']),
            'gender': member['gender'],
            'current_party': member['current_party'],
        },
        'activity': {
            'last_updated': to_date(member['last_updated'][:10]), # Ignore time
            'in_office': member['in_office'],
        },
        'other_ids': {
            'google_id': member['google_entity_id'],
            'votesmart_id': member['votesmart_id'],
            'govtrack_id': member['govtrack_id'],
            'cspan_id': member['cspan_id'],
            'crp_id': member['crp_id'],
            'fec_id': fec_id,
        },
        'roles': roles,
    }

    return mem_dict

## Inserting into Google Firestore

In [10]:
# Database connection and authentication
import firebase_admin
from firebase_admin import credentials, firestore
from google.cloud.firestore_v1.batch import WriteBatch

In [11]:
# The credential file path is read from the [firebase] section of the config
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)
# NOTE: initialize_app() raises ValueError if the default app already
# exists, so re-running this cell in the same kernel fails
app = firebase_admin.initialize_app(cred)

In [138]:
# Database
db = firestore.client()

In [154]:
# NOTE: map() is lazy and single-use; once batch_insert_members has
# iterated it, re-run this cell before inserting again
members_insert = map(member_cleaner, members_116)

In [155]:
def batch_insert_members(members, db):
    '''
    Function to batch insert house members into database

    Parameters:
        members: iterable of dicts, each with an '_id' key; a one-shot
            iterator such as map() works because it is walked only once
        db: Firestore client providing batch() and collection()

    Returns:
        str of the form '<n> members inserted'

    Raises:
        AssertionError: if the number of write results reported by the
            commit differs from the number of members queued
    '''

    batch = db.batch()
    members_len = 0
    for member in members:
        # Each member becomes document reps/<_id>
        insert_ref = db.collection("reps").document(str(member['_id']))
        batch.set(insert_ref, member)
        members_len += 1

    # commit() returns one write result per queued operation
    batch_len = len(batch.commit())

    # Raised explicitly rather than via `assert`, which is stripped under
    # `python -O` and would let a short write pass silently
    if batch_len != members_len:
        raise AssertionError(
            f'Batch Length: {batch_len}, Members Length: {members_len}'
        )

    return f'{batch_len} members inserted'

In [156]:
batch_insert_members(members_insert, db)

'451 members inserted'

## Creating Query Functions
- Google Firestore requires composite indices to be created when querying across multiple fields.

In [159]:
reps_ref = db.collection('reps')

In [None]:
# Scratch lookup of a single document by id (id left blank here);
# the reference is named reps_ref, not reps_red
reps_ref.document('')

In [161]:
date = datetime(1980, 1, 1)

In [162]:
date

datetime.datetime(1980, 1, 1, 0, 0)

In [177]:
result = reps_ref.where('bio.dob', '>=', date).order_by('bio.dob').stream()

In [178]:
young_reps = [ r.to_dict()['bio'] for r in result ]

In [179]:
young_reps

[{'gender': 'M',
  'current_party': 'R',
  'first_name': 'Lee',
  'dob': DatetimeWithNanoseconds(1980, 1, 30, 0, 0, tzinfo=<UTC>),
  'last_name': 'Zeldin',
  'middle_name': None},
 {'current_party': 'I',
  'gender': 'M',
  'middle_name': None,
  'dob': DatetimeWithNanoseconds(1980, 4, 18, 0, 0, tzinfo=<UTC>),
  'last_name': 'Amash',
  'first_name': 'Justin'},
 {'dob': DatetimeWithNanoseconds(1980, 5, 22, 0, 0, tzinfo=<UTC>),
  'gender': 'F',
  'middle_name': None,
  'first_name': 'Sharice',
  'last_name': 'Davids',
  'current_party': 'D'},
 {'last_name': 'Pappas',
  'current_party': 'D',
  'gender': 'M',
  'dob': DatetimeWithNanoseconds(1980, 6, 4, 0, 0, tzinfo=<UTC>),
  'middle_name': None,
  'first_name': 'Chris'},
 {'gender': 'M',
  'dob': DatetimeWithNanoseconds(1980, 6, 16, 0, 0, tzinfo=<UTC>),
  'first_name': 'Jason',
  'middle_name': '',
  'current_party': 'R',
  'last_name': 'Smith'},
 {'dob': DatetimeWithNanoseconds(1980, 7, 10, 0, 0, tzinfo=<UTC>),
  'current_party': 'R',
  '

In [158]:
def where_(ref, field, operator, value):
    '''
    Function to apply one where filter to a reference and return the result
    '''
    return ref.where(field, operator, value)

In [None]:
def order_by_(ref, field):
    '''
    Function to apply an ordering to a reference and return the result

    Parameters:
        ref: object exposing order_by(field)
        field: field path to order on

    Returns:
        whatever ref.order_by(field) returns, so calls can be chained
        the same way as with where_
    '''
    # The ordered query must be handed back; a bare `return` would give
    # the caller None and discard it
    return ref.order_by(field)

In [109]:
result = query('reps', 'bio.gender', '==', 'F')

In [111]:
result = result.stream()

In [115]:
a = next(result)

In [119]:
a = a.to_dict()

In [142]:
# Drop the time component of the stored timestamp
b = a['activity']['last_updated'].date()

In [143]:
b

datetime.date(2021, 1, 28)

In [70]:
reps_ref = db.collection('reps')
# Date of birth is stored inside the nested 'bio' map, so the ordering
# field must be 'bio.dob'; ordering on a top-level 'dob' that no document
# has would match nothing. Filtering on one field and ordering on another
# needs a composite index.
query = (
    reps_ref
    .where('bio.current_party', '==', 'D')
    .order_by('bio.dob')
    .limit(10)
    .stream()
)

## Get Number of Current Congress

In [None]:
# Year of 1st Congress
begin = 1789

# Determine year to use: a new Congress begins on January 3rd, so only the
# first two days of January still count towards the previous year.
# (Comparing just the day of the month would misfire in every month.)
now = datetime.now().date()
if (now.month, now.day) >= (1, 3):
    year = now.year
else:
    year = now.year - 1

# Determine current congress: each Congress spans two years, starting
# in odd years
current_cong_num = (year - begin) // 2 + 1

In [None]:
print('Current Congress:', current_cong_num)

In [None]:
# Ids of the House members in the current congress, then one detail
# record fetched per id
current_members = get_house_ids(current_cong_num)
members = list(map(get_member_data, current_members))

In [None]:
member = members[0]
sample_dict(member)

In [None]:
def get_roll_call_vote(congress, session_number, roll_call_num):
    '''
    Function to get a single house roll call vote

    Parameters:
        congress: congress number, interpolated into the request path
        session_number: session number within that congress
        roll_call_num: roll call number within that session

    Returns:
        the value under results['votes'] in the API response

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status
    '''
    query = f'{congress}/house/sessions/{session_number}/votes/{roll_call_num}.json'
    call_string = API_ROOT + query
    r = requests.get(call_string, headers=header)
    # Surface HTTP failures here instead of as an opaque KeyError or
    # JSON decoding error on the line below
    r.raise_for_status()

    return r.json()['results']['votes']