In [1]:
# Database connection and authentication
import configparser
import firebase_admin
from firebase_admin import credentials, firestore

# Scraping
import re
import requests
import time
from bs4 import BeautifulSoup

In [2]:
# Read config.ini file
CONFIG_PATH = './auth/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read config file: {CONFIG_PATH}')

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)
# initialize_app() raises ValueError if the default app already exists, which
# happens whenever this cell is re-run in a live kernel; reuse the app instead.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

In [3]:
# Reference to the "reps" collection; every Firestore query in this notebook
# is built from it
ref = db.collection("reps")

In [None]:
# Query ProPublica and VoteSmart IDs of representatives.
# Filters on first_name != "" and projects only the `_id` and `votesmart_id`
# fields. The returned stream is consumed by the list comprehension in the
# next cell.
query = ref.where("first_name", "!=", "").select(["_id", "votesmart_id"]).stream()

In [None]:
# Retrieve IDs: materialize the streamed documents as a list of plain dicts.
# Later cells add 'education' and 'profession' keys to these dicts in place.
rep_ids = [ doc.to_dict() for doc in query ]

In [None]:
def get_box_titles(vs_id, timeout=30):
    '''
    Function to fetch a VoteSmart biography page and collect its <b> tags

    Parameters:
        vs_id: VoteSmart candidate ID, appended to the biography URL.
        timeout: seconds to wait for the server before requests raises a
            timeout error (without it a stalled connection hangs forever).

    Returns:
        The result of soup.find_all("b") for the downloaded page. These tags
        are what get_bio_data() scans for card titles.
    '''
    vs_url = 'https://justfacts.votesmart.org/candidate/biography/'
    # f-string instead of "+" so a numeric ID does not raise TypeError
    r = requests.get(f'{vs_url}{vs_id}', timeout=timeout)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # is installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup.find_all("b")

In [None]:
def get_bio_data(card, box_titles):
    '''
    Function to extract the paragraphs of one biography card

    Parameters:
        card: title of the card to look for (e.g. 'Education'); compared
            against each tag's text as a string.
        box_titles: iterable of tags as returned by get_box_titles().

    Returns:
        List with the text of every <p> found in the element that follows the
        matching title's grandparent. Returns an empty list when no title
        matches or the expected surrounding structure is absent, so callers
        always receive a list rather than None.
    '''
    wanted = str(card)
    for box in box_titles:
        if box.text != wanted:
            continue
        # The card body is the next sibling of the title's grandparent; any
        # link in that chain may be missing on an unexpected page layout.
        parent = box.find_parent()
        grandparent = parent.find_parent() if parent is not None else None
        body = grandparent.find_next_sibling() if grandparent is not None else None
        if body is None:
            continue
        return [p.text for p in body.find_all("p")]
    return []

In [None]:
# Get education and professional experience
for rep in rep_ids:
    vs_id = rep['votesmart_id']
    if vs_id is None:
        # No VoteSmart profile to scrape: store the empty placeholder
        rep['education'] = {}
        rep['profession'] = {}
        continue

    box_titles = get_box_titles(vs_id)
    rep['education'] = get_bio_data('Education', box_titles)
    rep['profession'] = get_bio_data('Professional Experience', box_titles)
    # Short pause between requests (presumably to avoid hammering the site)
    time.sleep(0.1)

In [None]:
# Standardize null data: every "no information" shape becomes {}.
# None is included because get_bio_data() yields None when the page has no
# Education card; left as-is it would break iteration in clean_education().
null_list = [None, [], [''], ['No education information on file.']]
for rep in rep_ids:
    if rep['education'] in null_list:
        rep['education'] = {}

In [None]:
def degree_dict():
    '''
    Function to build the lookup from raw degree labels to degree categories

    Returns a new dict on every call. Keys are the labels as they appear at
    the start of a scraped education entry (including misspellings such as
    'Gradated' and 'Ceritifed'); values are the normalized category names.
    '''
    categories = (
        ('Bachelor', (
            'BA', 'BS', "Bachelor's", 'BSBA', 'BABS', 'AB',
            'BBA', 'Bachelors', 'LLB', 'BSIE', 'BSBAGE', 'BSFS',
            'ALB', 'BD', 'BSN', 'BM', 'BS/BA', 'BSA', 'BSE',
        )),
        ('Graduated', (
            'Graduated', 'Certificate', 'Certified', 'Licensed',
            'Gradated', 'Sixth-Year Degree', 'Ceritifed',
        )),
        ('Associate', (
            'AA', 'AS', 'AAS', "Associate's", 'Associates',
        )),
        ('Attended', (
            'Attended',
        )),
        ('Master', (
            'MA', 'MS', 'MSW', 'MPhil', "Master's", 'Masters', 'MEd', 'ML', 'MF',
            'MSEd', 'BA/MA', 'MHS', 'MPH', 'MSPA', 'MPP', 'MPA', 'MFA', 'MDiv',
            'MDV', 'ThM', 'MEM', 'Master\x92s', 'MC',
        )),
        ('MBA', ('MBA',)),
        ('Medical', ('MD', 'DO', 'DPM')),
        ('Law', ('JD',)),
        ('Dental', ('DDS', 'DMD')),
        ('Nursing', ('RN', 'PA', 'MSN', 'MSN/MPH')),
        ('Veterinary', ('DVM',)),
        ('Doctorate', ('PhD', 'DMin', 'EdD')),
    )

    # Flatten category -> labels into label -> category, keeping group order
    return {
        label: category
        for category, labels in categories
        for label in labels
    }

In [None]:
def clean_education(rep, deg_dict=None):
    '''
    Function to turn a rep's scraped education entries into {category: school}

    Parameters:
        rep: dict with an 'education' key holding either a list of scraped
            strings such as 'BA, History, Yale University, 1990', an empty
            placeholder ({} / [] / None), or an already-cleaned dict.
        deg_dict: optional label -> category mapping. Defaults to
            degree_dict(); pass one in to avoid rebuilding it per call.

    Returns:
        Dict mapping each degree category to the last comma-separated part
        that names a School/College/University/Institute/Center, or None when
        no part does. Entries without ', ' are skipped. An already-cleaned
        dict is returned as a copy, so cleaning twice does not wipe the data.

    Raises:
        KeyError: when an entry's first part is not a known degree label.
    '''
    edu = rep['education']
    if isinstance(edu, dict):
        # Already cleaned (or the {} placeholder): nothing left to parse
        return dict(edu)
    if not edu:
        return {}
    if deg_dict is None:
        deg_dict = degree_dict()

    institution = re.compile('School|College|University|Institute|Center')
    clean_edu = {}
    for entry in edu:
        parts = entry.split(', ')
        if len(parts) == 1:
            # No degree/detail separator (also covers empty strings)
            continue
        deg = deg_dict[parts[0]]
        ins = None
        for part in parts[1:]:
            if institution.search(part):
                ins = part
        clean_edu[deg] = ins

    return clean_edu

In [None]:
# Dry run of the cleaner: report every rep whose education cannot be parsed
# (e.g. a degree label missing from degree_dict) without changing any data
for rep in rep_ids:
    try:
        clean_education(rep)
    except Exception as err:
        print(err, rep, sep='\n')

In [None]:
# Clean educational data
for rep in rep_ids:
    # Dict values are either the {} placeholder or the output of a previous
    # run of this cell; skipping them makes the cell safe to re-run.
    if not isinstance(rep['education'], dict):
        rep['education'] = clean_education(rep)

In [None]:
# Sample: print the education value of the first 10 reps as a quick check
for rep in rep_ids[:10]:
    print(rep['education'])

In [None]:
def simple_ed_query(query, reps=None):
    '''
    Function to print how common a degree category is among the reps

    Parameters:
        query: degree category to look for (e.g. 'Dental'); matched against
            the keys of each rep's 'education' dict.
        reps: optional list of rep dicts to summarize. Defaults to the
            notebook-level rep_ids.

    Prints three lines: the share of all reps with the degree, the share of
    reps with educational data that have it, and the number of reps with
    missing data. A share whose denominator is zero is reported as 0.0
    instead of raising ZeroDivisionError.
    '''
    if reps is None:
        reps = rep_ids

    wanted = str(query)
    total = found = missing = 0
    for rep in reps:
        total += 1
        education = rep['education']
        if not education:
            missing += 1
        elif wanted in education:
            found += 1
    not_missing = total - missing

    def share(part, whole):
        # Percentage rounded to 2 decimals; 0.0 when there is nothing to divide by
        return round(100*part/whole, 2) if whole else 0.0

    print(f'{share(found, total)}% of reps have a {query} degree')
    print(f'{share(found, not_missing)}% of reps with educational data have a {query} degree')
    print(f'{missing} of {total} have missing educational data')

In [None]:
# Print the summary for the 'Dental' category
simple_ed_query('Dental')

In [None]:
# Split reps without education data by cause:
#   missing_vs   -> `_id` of every rep that has no VoteSmart ID at all
#   missing_pull -> full rep dict for reps with a VoteSmart ID but empty education
missing_vs = []
missing_pull = []
for rep in rep_ids:
    if rep['votesmart_id'] is None:
        missing_vs.append(rep['_id'])
    elif rep['education'] == {}:
        missing_pull.append(rep)

In [None]:
# Split missing_vs into consecutive batches of at most CHUNK_SIZE IDs; each
# batch feeds one "in" filter in the next cell (presumably sized to stay within
# Firestore's cap on values per "in" filter - confirm the current limit).
# Slicing keeps the final partial batch, including a lone trailing ID.
CHUNK_SIZE = 10
chopped = [
    missing_vs[start:start + CHUNK_SIZE]
    for start in range(0, len(missing_vs), CHUNK_SIZE)
]

In [None]:
# One streamed Firestore query per batch of IDs
queries = [ref.where("_id", "in", id_batch).stream() for id_batch in chopped]

In [None]:
# Flatten the per-batch result streams into one list of rep dicts
no_vs = [
    snapshot.to_dict()
    for batch_stream in queries
    for snapshot in batch_stream
]

In [30]:
import json
from mediawiki import MediaWiki

In [5]:
# API key sent as the `key` parameter of the Knowledge Graph Search requests
# below; read from the [gcpkeys] section of the already-loaded config
GCP_API_KEY = config.get('gcpkeys', 'GCP_API_KEY')

In [6]:
def get_wiki_url(google_id, timeout=30):
    '''
    Function to get wikipedia page from Google Knowledge Graph API

    Parameters:
        google_id: entity ID sent as the `ids` parameter of the search call.
        timeout: seconds to wait for the API before requests raises a
            timeout error (without it a stalled connection hangs forever).

    Returns:
        The `url` of the first result's `detailedDescription`.

    Raises:
        IndexError / KeyError: when the response has no results or the first
            result carries no detailed description.
    '''

    params = {
        'ids': google_id,
        'limit': 10,
        'indent': True,
        'key': GCP_API_KEY,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    # requests builds the query string from `params`; no manual '?' is needed
    r = requests.get(service_url, params=params, timeout=timeout)
    result = r.json()
    wiki_url = result['itemListElement'][0]['result']['detailedDescription']['url']

    return wiki_url

In [7]:
def get_education(wiki_url, timeout=30):
    '''
    Function to get education from wikipedia infobox

    Parameters:
        wiki_url: URL of the page to download.
        timeout: seconds to wait for the server before requests raises a
            timeout error.

    Returns:
        List with the text of every link in the element that follows the
        infobox's 'Education' header cell. Returns an empty list when the
        page has no 'infobox vcard' table, no 'Education' header, or no
        element after that header.
    '''

    r = requests.get(wiki_url, timeout=timeout)
    # Name the parser explicitly so results do not depend on which optional
    # parser happens to be installed
    soup = BeautifulSoup(r.text, 'html.parser')
    box = soup.find('table', attrs={'class': 'infobox vcard'})
    if box is None:
        return []

    # `string=` is the current name of the deprecated `text=` filter
    header = box.find('th', string='Education')
    if header is None:
        return []

    # find_next_sibling() skips whitespace text nodes, which have no find_all()
    edus = header.find_next_sibling()
    if edus is None:
        return []

    return [a.text for a in edus.find_all('a')]

In [10]:
# Fetch the ID and name fields of every rep whose `_id` is not the empty
# string. Note: this rebinds `query`, the name used earlier for the
# VoteSmart-ID stream.
query = ref.where("_id", "!=", "").select(["_id", "google_id", "first_name", "middle_name", "last_name"]).stream()
reps = [ doc.to_dict() for doc in query ]

In [73]:
def find_google_id(rep, timeout=30):
    '''
    Function to find Google Entity ID

    Parameters:
        rep: dict with 'first_name' and 'last_name' keys; the search text is
            "<first_name> <last_name> politician".
        timeout: seconds to wait for the API before requests raises a
            timeout error.

    Returns:
        The `@id` of the first search result with its first three characters
        removed (presumably the "kg:" prefix - confirm), or None when the
        search returns no results, so one unmatched rep does not abort a
        whole batch of lookups.

    Raises:
        KeyError: when the response lacks the expected fields (for example an
            error payload without 'itemListElement').
    '''

    name = f"{rep['first_name']} {rep['last_name']} politician"
    params = {
        'query': name,
        'limit': 10,
        'indent': True,
        'key': GCP_API_KEY,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    # requests builds the query string from `params`; no manual '?' is needed
    r = requests.get(service_url, params=params, timeout=timeout)
    items = r.json()['itemListElement']
    if not items:
        return None

    _id = items[0]['result']['@id'][3:]

    return _id

In [74]:
# Find Google Entity IDs
for rep in reps:
    # Only reps without an ID are looked up; .get() also covers documents in
    # which the google_id field is absent altogether
    if rep.get('google_id') is None:
        rep['google_id'] = find_google_id(rep)

In [31]:
# MediaWiki client created with the library's default settings
# (presumably pointing at English Wikipedia - confirm)
wikipedia = MediaWiki()