# Development Database Notebook

## Configuration
- ```config.ini``` is used to store configuration values and private keys.

In [1]:
import configparser

In [2]:
# Get config file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty list means the path did not resolve.
# The path is relative, so it depends on the notebook's working directory.
CONFIG_PATH = '../database-dev/auth/config.ini'
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read configuration file: {CONFIG_PATH}')

# Get ProPublica config
PROPUBLICA_KEY = config.get('propublica', 'PROPUBLICA_API_KEY')
API_ROOT = config.get('propublica', 'API_ROOT')
# Header sent with every ProPublica request; carries the API key
PROPUBLICA_HEADER = {'X-API-Key': PROPUBLICA_KEY}

## Acquire US House of Representatives Biographical Data
- Current US House of Representatives *(117th Congress, 2021-2022)* pulled utilizing ProPublica's [Congress API](https://projects.propublica.org/api-docs/congress-api/).
- Retrieve Wikipedia URL for future educational background scraping.

### ProPublica API - Members Endpoint
- Utilize ```requests``` to make API calls for JSON responses.
- ProPublica member IDs correspond to the IDs used by the [Biographical Directory of the United States Congress](https://bioguide.congress.gov/).

In [3]:
import requests

In [4]:
def get_house_ids(congress, api_root, header, timeout=30):
    '''
    Function to retrieve all member IDs for a particular house in congress

    Parameters:
        congress: Congress number placed in the request path (e.g. 117)
        api_root: Base URL of the API; the path is appended without a
            separator, so include any trailing slash
        header: Dictionary of HTTP headers sent with the request
        timeout: Seconds to wait for the server before giving up

    Returns:
        List with the 'id' value of every member in the first result set

    Raises:
        requests.exceptions.HTTPError: the API answered with a 4xx/5xx status
        requests.exceptions.Timeout: no response arrived within `timeout`
    '''

    call_string = api_root + f'{congress}/house/members.json'
    # Without a timeout, requests can wait indefinitely on an unresponsive server
    r = requests.get(call_string, headers=header, timeout=timeout)
    # Surface HTTP failures directly instead of as a KeyError on the payload
    r.raise_for_status()
    result = r.json()['results'][0]['members']

    return [member['id'] for member in result]

In [5]:
# Get the member IDs of all house members of the 117th congress
# (get_house_ids returns a list of ID values; later cells read `members`)
members = get_house_ids(117, API_ROOT, PROPUBLICA_HEADER)

# Sample: show the first ID returned
print(members[0])

A000370


In [6]:
# Function to retrieve JSON for a specified member of the house
def get_mem_json(member, api_root, header, timeout=30):
    '''
    Function to retrieve JSON for a specified member of the house

    Parameters:
        member: Member ID placed in the request path
        api_root: Base URL of the API; the path is appended without a
            separator, so include any trailing slash
        header: Dictionary of HTTP headers sent with the request
        timeout: Seconds to wait for the server before giving up

    Returns:
        The first element of the 'results' list in the JSON response

    Raises:
        requests.exceptions.HTTPError: the API answered with a 4xx/5xx status
        requests.exceptions.Timeout: no response arrived within `timeout`
    '''
    call_string = api_root + f'members/{member}.json'
    # Without a timeout, requests can wait indefinitely on an unresponsive server
    r = requests.get(call_string, headers=header, timeout=timeout)
    # Surface HTTP failures directly instead of as a KeyError on the payload
    r.raise_for_status()

    return r.json()['results'][0]

In [7]:
def clean_role(role):
    '''
    Function to trim a role dictionary down to a fixed set of fields
    '''

    # Copy the scalar fields first; this tuple fixes the output key order
    trimmed = {
        field: role[field]
        for field in ('congress', 'state', 'party', 'district')
    }

    # Committees keep only their name and code
    committees = []
    for entry in role['committees']:
        committees.append({'name': entry['name'], 'code': entry['code']})
    trimmed['committees'] = committees

    # Subcommittees additionally keep the parent committee ID as 'parent_code'
    subcommittees = []
    for entry in role['subcommittees']:
        subcommittees.append({
            'name': entry['name'],
            'code': entry['code'],
            'parent_code': entry['parent_committee_id'],
        })
    trimmed['subcommittees'] = subcommittees

    return trimmed

In [8]:
def get_member(member_id, api_root, header):
    '''
    Function to build a flat python dictionary describing one house member
    '''

    payload = get_mem_json(member_id, api_root, header)

    # roles[0] is treated as the most recent role; it supplies the
    # FEC candidate ID and the state represented
    latest_role = payload['roles'][0]
    latest_fec_id = latest_role['fec_candidate_id']
    latest_state = latest_role['state']

    # Fields are added in a fixed order so the record layout stays stable
    record = {'_id': payload['id']}
    for key, source in (
        ('first_name', 'first_name'),
        ('middle_name', 'middle_name'),
        ('last_name', 'last_name'),
        ('dob', 'date_of_birth'),
        ('gender', 'gender'),
        ('current_party', 'current_party'),
    ):
        record[key] = payload[source]
    record['state'] = latest_state

    # External identifiers copied from the member payload
    for key, source in (
        ('google_id', 'google_entity_id'),
        ('votesmart_id', 'votesmart_id'),
        ('govtrack_id', 'govtrack_id'),
        ('cspan_id', 'cspan_id'),
        ('crp_id', 'crp_id'),
    ):
        record[key] = payload[source]
    record['fec_id'] = latest_fec_id
    record['in_office'] = payload['in_office']

    # Every role is reduced with clean_role
    record['roles'] = list(map(clean_role, payload['roles']))

    return record

In [9]:
# Sample: build the record for the first member ID and preview it
sample_rep = get_member(members[0], API_ROOT, PROPUBLICA_HEADER)
for k, v in list(sample_rep.items())[:10]: # First 10 key-value pairs
    print(f'{k}: {v}')

_id: A000370
first_name: Alma
middle_name: 
last_name: Adams
dob: 1946-05-27
gender: F
current_party: D
state: NC
google_id: /m/02b45d
votesmart_id: 5935


### Google Knowledge Graph API and MediaWiki - Retrieve Wikipedia URLs
- Utilize ```googleapiclient``` and ```mediawiki``` to retrieve Wikipedia URLs of Representatives.
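- Wrapper function that captures the type of any exception raised and returns it alongside the result, instead of raising.
- Fallback functions retrieve the Wikipedia URL of a US House Representative when an earlier lookup returns an error.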

In [10]:
import functools
import re
import googleapiclient
from googleapiclient.discovery import build
from mediawiki import MediaWiki

In [11]:
# Get Google Knowledge Graph (GKG) settings from the 'gcpkeys' config section
GKG_API_KEY = config.get('gcpkeys', 'GKG_API_KEY')
GKG = config.get('gcpkeys', 'GKG')
GKG_VERSION = config.get('gcpkeys', 'GKG_VERSION')

# Instantiate service connection (GKG and GKG_VERSION are passed to build()
# positionally, GKG_API_KEY as the developer key)
service = build(GKG, GKG_VERSION, developerKey=GKG_API_KEY)
# `entities` is the object whose search() method the lookup functions call
entities = service.entities()

# Instantiate wikipedia object (no arguments, so the library defaults apply)
wikipedia = MediaWiki()

In [12]:
# Wrapper for error logging
def error_logging(func):
    '''
    Decorator that runs a function and reports failure instead of raising

    The decorated function returns a (data, error) tuple:
        data: return value of func, or None if func raised
        error: class of the exception raised, or None on success

    Only Exception subclasses are captured. Despite the name, nothing is
    written to a log; the caller decides what to do with the error class.
    '''
    @functools.wraps(func)
    def wrapper_error(*args, **kwargs):
        # Keyword arguments are forwarded so decorated functions keep
        # their full call signature
        try:
            return func(*args, **kwargs), None
        except Exception as e:
            # Only the exception class is reported, not the instance
            return None, type(e)
    return wrapper_error

In [13]:
@error_logging
def get_wiki_url(rep, entities):
    '''
    Function to look up a wikipedia URL in Google Knowledge Graph by Google entity ID
    '''

    # Query by the entity ID stored on the representative record
    entity_id = rep['google_id']
    response = entities.search(ids=entity_id).execute()

    # The URL is read from the detailed description of the first result
    top_hit = response['itemListElement'][0]['result']

    return top_hit['detailedDescription']['url']

In [14]:
@error_logging
def gkg_search(rep, entities):
    '''
    Function to get google_id and wikipedia URL from Google Knowledge Graph with search term query

    Parameters:
        rep: Dictionary with 'first_name' and 'last_name' keys
        entities: Object whose search(query=...).execute() returns the response

    Returns:
        (gid, wiki_url) taken from the first search result; wiki_url is None
        when that result has no detailedDescription URL
    '''

    query = f"{rep['first_name']} {rep['last_name']} politician"
    r = entities.search(query=query).execute()
    result = r['itemListElement'][0]['result']
    _id = result['@id']
    # Keep everything after the first ':' in the '@id' value
    gid = re.search('(?<=:).*', _id)[0]
    try:
        wiki_url = result['detailedDescription']['url']
    except (KeyError, TypeError):
        # A missing or malformed description is expected; any other failure
        # (including KeyboardInterrupt) must not be swallowed here
        wiki_url = None

    return gid, wiki_url

In [15]:
@error_logging
def mediawiki_search(rep, wikipedia):
    '''
    Function to look up a wikipedia URL through a MediaWiki page query
    '''

    # Search term is "<first name> <last name> politician"
    search_term = '{} {} politician'.format(rep['first_name'], rep['last_name'])
    page = wikipedia.page(search_term)

    return page.url

In [16]:
def get_rep_data(member_id, api_root, header, entities, wikipedia):
    '''
    Function to retrieve data for US Representative

    The wikipedia URL is resolved through a chain of fallbacks:
        1. Google Knowledge Graph lookup by the google_id on the record
        2. Google Knowledge Graph search by name, if step 1 failed with
           HttpError or IndexError (this also replaces google_id)
        3. MediaWiki page query, if no URL has been found so far

    Parameters:
        member_id: Member ID passed to get_member
        api_root: Base URL passed to get_member
        header: HTTP headers passed to get_member
        entities: Google Knowledge Graph entities object used for lookups
        wikipedia: MediaWiki object used for the final fallback

    Returns:
        The member dictionary from get_member with a 'wiki_url' key added
        (None if every lookup failed)
    '''

    # Retrieve from ProPublica representative JSON
    rep = get_member(member_id, api_root, header) # Outside function

    # Initial attempt to retrieve wikipedia URL
    wiki_url, error = get_wiki_url(rep, entities) # Outside function
    rep['wiki_url'] = wiki_url

    # Missing or wrong google_id in ProPublica data.
    # A membership test is required: `HttpError or IndexError` evaluates to
    # HttpError alone, so IndexError would never be matched.
    if error in (googleapiclient.errors.HttpError, IndexError):
        data, error = gkg_search(rep, entities) # Outside function
        # data is None when the search itself failed; keep the existing
        # values and let the MediaWiki fallback below take over
        if data is not None:
            gid, wiki_url = data
            rep['google_id'] = gid
            rep['wiki_url'] = wiki_url

    # Missing wikipedia URL in Google Knowledge Graph
    if (error is KeyError) or (rep['wiki_url'] is None):
        wiki_url, error = mediawiki_search(rep, wikipedia) # Outside function
        rep['wiki_url'] = wiki_url

    return rep

In [17]:
# Sample: member whose wikipedia URL is initially missing in Google Knowledge
# Graph, so the URL shown comes from a fallback lookup in get_rep_data
sample_rep = get_rep_data('D000624', API_ROOT, PROPUBLICA_HEADER, entities, wikipedia)
sample_rep['wiki_url']

'https://en.wikipedia.org/wiki/Debbie_Dingell'

## MongoDB Bulk Write

In [18]:
import pymongo
from pymongo import InsertOne

In [19]:
# Get MongoDB config from the 'mongodb' section
# (MONGO_LOCAL is passed to MongoClient, MONGO_DB is the database name)
MONGO_LOCAL = config.get('mongodb', 'MONGO_LOCAL')
MONGO_DB = config.get('mongodb', 'MONGO_DB')
client = pymongo.MongoClient(MONGO_LOCAL)

# Connect to database
db = client.get_database(MONGO_DB)

# Instantiate connection to collection ('reps' is the target of the bulk write below)
collection = db['reps']

In [20]:
# Bulk write insert statements
# inserts = []
# for member in members:
#     data = get_rep_data(member, API_ROOT, PROPUBLICA_HEADER, entities, wikipedia)
#     inserts.append(InsertOne(data))

In [21]:
# Bulk write to collection
# result = collection.bulk_write(inserts)

In [22]:
# print(result.bulk_api_result)

In [23]:
# Number of member IDs retrieved for the 117th congress house
len(members) 

440