In [302]:
# Database connection and authentication
import configparser
import firebase_admin
from firebase_admin import credentials, firestore

# Scraping
import re
import requests
import time
from bs4 import BeautifulSoup

In [2]:
# Read config.ini file
CONFIG_PATH = './auth/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means the config is missing.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read config file: {CONFIG_PATH}')

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)

# initialize_app() raises ValueError when the default app already exists,
# which happens whenever this cell is re-run in a live kernel. Reuse the
# existing app in that case so the cell is safe to execute repeatedly.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

In [4]:
# Reference to the Firestore collection that the queries below read from
REPS_COLLECTION = "reps"
ref = db.collection(REPS_COLLECTION)

In [337]:
# Query ProPublica and VoteSmart IDs of representatives:
# keep documents whose first_name is not empty, project only the two ID
# fields, and open a stream over the matching documents.
named_reps = ref.where("first_name", "!=", "")
id_fields_only = named_reps.select(["_id", "votesmart_id"])
query = id_fields_only.stream()

In [338]:
# Retrieve IDs: convert every streamed document into a plain dict
rep_ids = list(snapshot.to_dict() for snapshot in query)

In [339]:
def get_box_titles(vs_id, timeout=10):
    """Fetch a VoteSmart biography page and return its bold ("b") elements.

    Parameters
    ----------
    vs_id : str or int
        VoteSmart candidate ID; appended to the biography base URL.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Without it a stalled connection would block forever.

    Returns
    -------
    list
        Every "b" tag found in the page, in document order.
    """
    vs_url = 'https://justfacts.votesmart.org/candidate/biography/'
    # str() lets numeric IDs be used without a TypeError on concatenation.
    response = requests.get(vs_url + str(vs_id), timeout=timeout)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all("b")

In [340]:
def get_bio_data(card, box_titles):
    """Return the paragraph texts of the biography card titled `card`.

    Parameters
    ----------
    card : str
        Exact title text to look for, e.g. 'Education'.
    box_titles : iterable
        Title elements to search. Each must expose `.text` plus the
        `find_parent` / `find_next_sibling` / `find_all` navigation
        methods used below.

    Returns
    -------
    list of str
        Text of every "p" element in the body of the first card whose
        title matches; an empty list when no title matches.
    """
    wanted_title = str(card)
    for box in box_titles:
        if box.text == wanted_title:
            # The card body is the next sibling of the title's grandparent.
            card_body = box.find_parent().find_parent().find_next_sibling()
            return [p.text for p in card_body.find_all("p")]
    # No matching card: return an empty list instead of falling through to
    # None, so callers can always iterate over the result.
    return []

In [341]:
# Get education and professional experience
for rep in rep_ids:
    # .get() tolerates records that have no 'votesmart_id' key at all, and
    # the truthiness test skips both None and empty-string IDs, so no
    # request is made without a usable ID.
    vs_id = rep.get('votesmart_id')
    if vs_id:
        box_titles = get_box_titles(vs_id)
        edu = get_bio_data('Education', box_titles)
        pro = get_bio_data('Professional Experience', box_titles)
        rep['education'] = edu
        rep['profession'] = pro
        # Short pause after each page fetch (presumably to go easy on the
        # VoteSmart server).
        time.sleep(0.1)
    else:
        # No ID to look up: record empty placeholders.
        rep['education'] = {}
        rep['profession'] = {}

In [342]:
# Standardize null data
# Scraped values that all mean "no education on file".
null_list = [[], [''], ['No education information on file.']]
for rep in rep_ids:
    # None is checked separately: it is what the scrape leaves behind when
    # the page had no Education card, and it would otherwise pass through
    # unnormalized because it is not an element of null_list.
    if rep['education'] is None or rep['education'] in null_list:
        rep['education'] = {}

In [437]:
def degree_dict():
    """Build the lookup from raw degree tokens to normalized categories.

    Returns
    -------
    dict
        A new dict on every call. Keys are degree tokens as they appear in
        the scraped text (including some misspelled variants); values are
        the category label each token is filed under.
    """
    # (category label, tokens filed under it) - order fixes insertion order.
    categories = (
        ('Bachelor', (
            'BA', 'BS', "Bachelor's", 'BSBA', 'BABS', 'AB',
            'BBA', 'Bachelors', 'LLB', 'BSIE', 'BSBAGE', 'BSFS',
            'ALB', 'BD', 'BSN', 'BM', 'BS/BA', 'BSA', 'BSE',
        )),
        ('Graduated', (
            'Graduated', 'Certificate', 'Certified', 'Licensed',
            'Gradated', 'Sixth-Year Degree', 'Ceritifed',
        )),
        ('Associate', (
            'AA', 'AS', 'AAS', "Associate's", 'Associates',
        )),
        ('Attended', (
            'Attended',
        )),
        ('Master', (
            'MA', 'MS', 'MSW', 'MPhil', "Master's", 'Masters', 'MEd', 'ML', 'MF',
            'MSEd', 'BA/MA', 'MHS', 'MPH', 'MSPA', 'MPP', 'MPA', 'MFA', 'MDiv',
            'MDV', 'ThM', 'MEM', 'Master\x92s', 'MC',
        )),
        ('MBA', (
            'MBA',
        )),
        ('Medical', (
            'MD', 'DO', 'DPM',
        )),
        ('Law', (
            'JD',
        )),
        ('Dental', (
            'DDS', 'DMD',
        )),
        ('Nursing', (
            'RN', 'PA', 'MSN', 'MSN/MPH',
        )),
        ('Veterinary', (
            'DVM',
        )),
        ('Doctorate', (
            'PhD', 'DMin', 'EdD',
        )),
    )

    return {
        token: label
        for label, tokens in categories
        for token in tokens
    }

In [438]:
def clean_education(rep):
    """Turn a representative's raw education entries into {category: institution}.

    Each raw entry is a comma-separated string whose first field is a degree
    token (looked up via degree_dict()) and whose remaining fields may include
    the institution name. Entries without a comma are ignored.

    Parameters
    ----------
    rep : dict
        Record with an 'education' key holding a list of raw entry strings,
        an already-cleaned dict, or None.

    Returns
    -------
    dict
        Normalized degree category -> institution name (None when no field
        of the entry looks like an institution). When several entries share
        a category, the last one wins.

    Raises
    ------
    KeyError
        If an entry's degree token is not present in degree_dict().
    """
    edu = rep['education']

    # An education value that is already a dict has been cleaned (or
    # null-standardized) before. Return a copy unchanged so that running the
    # cleaning step twice does not erase the data: iterating a cleaned dict
    # would yield comma-less keys and produce an empty result.
    if isinstance(edu, dict):
        return dict(edu)

    clean_edu = {}
    # None or an empty list: nothing to parse.
    if not edu:
        return clean_edu

    deg_dict = degree_dict()
    for entry in edu:
        fields = entry.split(', ')
        # Covers '' as well as any text with no ', ' separator.
        if len(fields) == 1:
            continue
        deg = deg_dict[fields[0]]
        ins = None
        for field in fields[1:]:
            # Plain alternation; matches exactly when the field contains one
            # of these words. No break: the last matching field is kept.
            if re.search('School|College|University|Institute|Center', field):
                ins = field
        clean_edu[deg] = ins

    return clean_edu

In [439]:
# Find all KeyErrors for dictionary:
# dry-run the cleaner on every record and report each failure together
# with the record that caused it, without stopping at the first one.
for rep in rep_ids:
    try:
        clean_education(rep)
    except Exception as e:
        print(e, rep, sep='\n')

In [442]:
# Clean educational data: overwrite each record's 'education' entry with
# the output of clean_education for that record
for rep in rep_ids:
    rep.update(education=clean_education(rep))

In [445]:
# Sample: show the education value of the first few records
SAMPLE_SIZE = 10
for rep in rep_ids[:SAMPLE_SIZE]:
    education = rep['education']
    print(education)

{'Dental': 'Medical College of Georgia', 'Graduated': 'The University of Georgia'}
{'Master': 'Virginia Union University Samuel DeWitt Proctor School of Theology', 'Law': 'University of Virginia School of Law', 'Bachelor': 'American University'}
{'Bachelor': 'Drake University'}
{'Bachelor': 'University of Virginia', 'MBA': 'German International School of Management and Administration (GISMA)/Purdue University'}
{'Bachelor': 'Illinois State University'}
{'Attended': 'Western Washington University', 'Law': 'University of Washington School of Law', 'Bachelor': 'Fordham University'}
{'Law': 'Harvard University Law School', 'Bachelor': 'Stanford University'}
{'Attended': 'Liberty University', 'Bachelor': 'University of Nebraska at Lincoln'}
{'Attended': 'New York University', 'Bachelor': 'Queens College'}
{'Attended': 'Florida Agricultural and Mechanical University', 'Law': 'Texas Southern University'}


In [446]:
# Missing educational data: count records whose education is an empty dict
i = 0
for rep in rep_ids:
    if rep['education'] != {}:
        continue
    i += 1

print(i)

58
