In [22]:
# Config Reader
import configparser

# Database Connection
import firebase_admin
from firebase_admin import credentials, firestore

# Read/Scrape data
import requests
import json
import re
from bs4 import BeautifulSoup

In [2]:
# Read config.ini file
CONFIG_PATH = './auth/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so fail early with a clear message if it is empty.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError('Could not read config file: ' + CONFIG_PATH)

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)

# Reuse the default app when this cell is re-run in the same kernel:
# initialize_app() raises ValueError if the default app already exists,
# and get_app() raises ValueError if it does not exist yet.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

In [3]:
# References to the two Firestore collections used throughout the notebook
reps_ref, edu_ref = (db.collection(name) for name in ("reps", "edu"))

In [76]:
# Fetch every edu document whose degree is 'HS', projecting only the '_id' field
hs_query = (
    edu_ref
    .where("degree", "==", "HS")
    .select(["_id"])
    .stream()
)

# Pair each document's '_id' field with its Firestore document ID
hs = []
for snapshot in hs_query:
    hs.append((snapshot.get('_id'), snapshot.id))

In [78]:
# Show one ('_id' field, document ID) pair and how many HS records were found
first_hs = hs[0]
hs_count = len(hs)
print('Sample:', first_hs)
print('Number of Reps:', hs_count)

Sample: ('G000577', '09jDHasMSchzzQECYt0j')
Number of Reps: 23


In [79]:
# Create lists of max length 10 for query ("in" comparison operator only accepts arrays <= 10)
IN_QUERY_LIMIT = 10
hs_ids = [rep_id for rep_id, _ in hs]

# Slice-based chunking never yields an empty chunk, so no "in" query is issued
# with an empty array when len(hs) is 0 or an exact multiple of the limit.
query_lists = [
    hs_ids[start:start + IN_QUERY_LIMIT]
    for start in range(0, len(hs_ids), IN_QUERY_LIMIT)
]

# List of reps with 'HS' degrees
reps = []
for q_list in query_lists:
    q = reps_ref.where("_id", "in", q_list).stream()
    reps.extend(doc.to_dict() for doc in q)

In [80]:
# Keywords that mark a comma-separated segment as an institution name
_INSTITUTION_PATTERN = re.compile('College|University|School|Institute')


def get_vs_edu(rep, timeout=30):
    """Scrape degree/institution pairs from a rep's Vote Smart biography page.

    Parameters
    ----------
    rep : dict
        Rep record; only ``rep['votesmart_id']`` is read.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    list of [degree, institution] or None
        One pair for every institution-like segment found in an education
        paragraph whose first comma-separated field is shorter than five
        characters (treated as a degree abbreviation). ``None`` when the page
        has no Education card or no pair could be extracted.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeout.
    """
    # str() so the URL can be built even if the ID is stored as a number
    url = 'https://justfacts.votesmart.org/candidate/biography/' + str(rep['votesmart_id'])
    r = requests.get(url, timeout=timeout).content
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(r, 'html.parser')

    # Collapsible card object: three levels above the bold "Education" label
    edu_label = soup.find('b', text='Education')
    if edu_label is None:
        # Page has no Education section
        return None
    edu_card = edu_label.parent.parent.parent
    if edu_card is None:
        return None

    # Education paragraph objects
    edu = [p.text for p in edu_card.find_all('p')]

    edus = []
    for e in edu:
        entry = e.split(',')
        # Only short leading fields are treated as degree abbreviations
        if len(entry[0]) < 5:
            degree = entry[0]
            for s in entry[1:]:
                if _INSTITUTION_PATTERN.search(s):
                    edus.append([degree, s.strip()])

    return edus if edus else None

In [81]:
# Get all degrees to be inserted
for_update = []
for rep in reps:
    # Skip reps whose Vote Smart ID is missing or None; there is no page to scrape
    if rep.get('votesmart_id') is None:
        continue
    # get_vs_edu returns None when no degrees were found
    for degree, institution in get_vs_edu(rep) or []:
        for_update.append({
            '_id': rep['_id'],
            'degree': degree,
            'institution': institution,
        })

In [83]:
# Find docs that need to be deleted: HS edu docs of reps that have replacement degrees
# Build the ID set once instead of rebuilding a list for every element of hs
updated_ids = {entry['_id'] for entry in for_update}
for_delete = [doc_id for rep_id, doc_id in hs if rep_id in updated_ids]

In [95]:
# Queue a delete for every outdated edu document, then commit them in one batch
edu_collection = db.collection("edu")
batch = db.batch()
for doc_id in for_delete:
    batch.delete(edu_collection.document(doc_id))

com = batch.commit()

In [100]:
# Re-run the HS query to confirm the outdated documents are gone
new_query = (
    edu_ref
    .where("degree", "==", "HS")
    .select(["_id"])
    .stream()
)

new_hs = []
for snapshot in new_query:
    new_hs.append((snapshot.get('_id'), snapshot.id))

print('Number of reps with HS:', len(new_hs)) # Originally 23

Number of reps with HS: 20


In [101]:
# Sample: first record queued for insertion into the edu collection
for_update[0]

{'_id': 'E000215', 'degree': 'AA', 'institution': 'Cañada College'}

In [102]:
# Queue one new edu document per scraped degree, then commit them in one batch
edu_collection = db.collection("edu")
batch = db.batch()
for record in for_update:
    # document() is called without an ID, so each record gets a fresh reference
    batch.set(edu_collection.document(), record)

com = batch.commit()

In [108]:
# Number of results returned by the most recent batch commit
len(com)

3