# Additional Data Acquisition and Cleaning
- The following code lives only in this notebook; it has not been moved into a script
- Future checks and cleaning steps will be integrated into the `db-update` folder

In [11]:
# Config Reader
import configparser

# Database Connection
import firebase_admin
from firebase_admin import credentials, firestore

# EDA for Data Cleaning and Reacquisition
import numpy as np
import pandas as pd

# Read/Scrape data
import requests
import json
import re
from bs4 import BeautifulSoup

In [3]:
# Read config.ini file
CONFIG_PATH = './auth/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so an empty result means the config was not found.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read config file: {CONFIG_PATH}')

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)
# initialize_app() raises ValueError if the default app already exists, which
# happens whenever this cell is re-run in the same kernel; reuse the app then.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

In [4]:
# Collection refs
# Handles to the "reps" and "edu" collections, obtained from the `db` client.
# The query cells below read through these two references.
reps_ref = db.collection("reps")
edu_ref = db.collection("edu")

## Find Missing Educational Backgrounds
- 6 US Representatives are missing an educational background (508 reps, but only 502 have at least one degree record)

In [19]:
# Run an unfiltered .get() on each collection; the results are iterated as
# documents (via .to_dict()) when the DataFrames are built.
all_degrees = edu_ref.get()
all_reps = reps_ref.get()

In [36]:
def _docs_to_frame(docs):
    """Return a DataFrame with one row per document, built from each doc's to_dict()."""
    records = [snapshot.to_dict() for snapshot in docs]
    return pd.DataFrame(records)


degrees = _docs_to_frame(all_degrees)
reps = _docs_to_frame(all_reps)

In [37]:
# Collapse the edu rows to one row per rep `_id`, with all of that rep's
# degrees gathered into a single list.
degree_groups = degrees.groupby(['_id'])['degree']
df = degree_groups.apply(list).reset_index()
print(df.shape)

(502, 2)


In [38]:
# Row/column count of the reps frame, for comparison with the grouped degrees frame.
print(reps.shape)

(508, 16)


In [39]:
# Left-join every rep to its list of degrees; reps without any edu record keep
# a null `degree`. The grouped frame is rebuilt from `degrees` here instead of
# being read from `df`, so re-running this cell never merges an already-merged
# frame (which would produce degree_x / degree_y columns).
degrees_by_rep = degrees.groupby(['_id'])['degree'].apply(list).reset_index()
df = reps.merge(degrees_by_rep, on='_id', how='left')

## Notes
- The 6 Representatives with missing educational backgrounds were checked manually
- Filtering degree codes by string length < 5 gave an incorrect result for Representative Cori Bush (GrDip - Nursing Degree), because 'GrDip' is 5 characters long
- In addition, the function that grabs educational institutions for reps without finished degrees did not return the correct 'HS' coding

In [56]:
# Reps whose `degree` is null after the left merge.
has_no_degree = df['degree'].isna()
missing_df = df.loc[has_no_degree]

In [57]:
# Display the reps that have no degree entry after the merge.
missing_df

Unnamed: 0,google_id,congresses,state,fec_id,gender,wiki_url,govtrack_id,votesmart_id,cspan_id,middle_name,crp_id,dob,last_name,current_party,first_name,_id,degree
19,/g/11cntkcbkt,[117],MO,H8MO01143,F,https://en.wikipedia.org/wiki/Cori_Bush,456829,,,,,1976-07-21 00:00:00+00:00,Bush,D,Cori,B001224,
49,/m/09ry4tm,"[117, 116, 115]",OH,H8OH12180,M,https://en.wikipedia.org/wiki/Troy_Balderson,412747,102781.0,,,N00042194,1962-01-16 00:00:00+00:00,Balderson,R,Troy,B001306,
209,/m/0_qf84y,[117],NM,H8NM02156,F,https://en.wikipedia.org/wiki/Yvette_Herrell,456834,121681.0,,,,1964-03-16 00:00:00+00:00,Herrell,R,Yvette,H001084,
273,/m/0ds17_9,"[117, 116, 115, 114, 113, 112]",MO,H0MO07113,M,https://en.wikipedia.org/wiki/Billy_Long,412445,123401.0,61880.0,,N00030676,1955-08-11 00:00:00+00:00,Long,R,Billy,L000576,
339,/m/024v0s,"[117, 116, 115, 114, 113, 112, 111, 110, 109, ...",CA,H8CA34068,F,https://en.wikipedia.org/wiki/Grace_Napolitano,400290,8393.0,57873.0,F.,N00006789,1936-12-04 00:00:00+00:00,Napolitano,D,Grace,N000179,
435,/m/05t08cn,"[117, 116, 115]",PA,H6PA16320,M,https://en.wikipedia.org/wiki/Lloyd_Smucker,412722,102454.0,103540.0,,N00038781,1964-01-23 00:00:00+00:00,Smucker,R,Lloyd,S001199,


## 'HS' Encoding

In [58]:
# Get ProPublica and document IDs of reps where degree = 'HS' in edu collection
# Each element of `hs` is a (rep _id, edu document id) tuple.
hs_filter = edu_ref.where("degree", "==", "HS")
hs_query = hs_filter.select(["_id"]).stream()
hs = [ (snapshot.get('_id'), snapshot.id) for snapshot in hs_query ]

In [59]:
# Sample
# Show the first (rep _id, edu document id) pair and the number of pairs found.
print('Sample:', hs[0])
print('Number of Reps:', len(hs))

Sample: ('W000827', '4qzTARSp1eH92Vl7Au4F')
Number of Reps: 23


In [60]:
# Create lists of max length 10 for query ("in" comparison operator only accepts arrays <= 10)
# Slicing never produces an empty chunk, whereas appending a trailing list
# unconditionally does when len(hs) is 0 or a multiple of the limit -- and an
# empty array is not a valid operand for an "in" filter.
IN_QUERY_LIMIT = 10
hs_ids = [ rep_id for rep_id, _doc_id in hs ]
query_lists = [
    hs_ids[start:start + IN_QUERY_LIMIT]
    for start in range(0, len(hs_ids), IN_QUERY_LIMIT)
]

# List of reps with 'HS' degrees
# NOTE: this rebinds `reps` (until now the DataFrame of all reps) to a list of
# dicts; the cells that follow iterate over it in that form.
reps = []
for q_list in query_lists:
    q = reps_ref.where("_id", "in", q_list).stream()
    reps.extend(doc.to_dict() for doc in q)

In [61]:
# Words that mark a comma-separated fragment as an institution name.
_INSTITUTION_RE = re.compile('College|University|School|Institute')


def get_vs_edu(rep, degree_max_len=4, timeout=30):
    """Scrape degree/institution pairs from a rep's Vote Smart biography page.

    Parameters
    ----------
    rep : dict
        Rep record; only ``rep['votesmart_id']`` is read.
    degree_max_len : int, optional
        Longest first token (in characters, after stripping whitespace) that is
        accepted as a degree code. The default of 4 matches the earlier
        ``len(...) < 5`` rule; pass 5 to also accept codes such as 'GrDip'.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list or None
        A list of ``[degree, institution]`` lists, one per institution-like
        fragment found in an education paragraph that starts with a degree
        code. ``None`` when the page has no 'Education' section or no
        paragraph yields a pair.

    Raises
    ------
    requests.HTTPError
        If the biography page responds with an error status.
    """
    vs_id = rep['votesmart_id']
    # A whole-number float id (e.g. 102781.0) must not leak ".0" into the URL.
    if isinstance(vs_id, float) and vs_id.is_integer():
        vs_id = int(vs_id)
    url = 'https://justfacts.votesmart.org/candidate/biography/' + str(vs_id)

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so parsing does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collapsable card object: three levels above the bold 'Education' heading
    edu_card = soup.find('b', string='Education')
    for _ in range(3):
        if edu_card is None:
            break
        edu_card = edu_card.parent
    if edu_card is None:
        # No education section on this page
        return None

    edus = []
    # Education paragraph objects
    for paragraph in edu_card.find_all('p'):
        parts = paragraph.text.split(',')
        degree = parts[0].strip()
        # Only paragraphs that lead with a short degree code are kept
        if not degree or len(degree) > degree_max_len:
            continue
        for part in parts[1:]:
            if _INSTITUTION_RE.search(part):
                edus.append([degree, part.strip()])

    return edus or None

In [62]:
# Get all fixed degrees to be inserted
for_update = []
for rep in reps:
    # Without a Vote Smart id there is no page to scrape; .get() also covers
    # rep documents that lack the field altogether.
    if rep.get('votesmart_id') is None:
        continue
    result = get_vs_edu(rep)
    if result is None:
        continue
    # One edu record per [degree, institution] pair returned by the scraper
    for degree, institution in result:
        for_update.append({
            '_id': rep['_id'],
            'degree': degree,
            'institution': institution
        })

In [64]:
# Find docs that need to be deleted
# An 'HS' edu doc is dropped when its rep has at least one replacement record
# queued in for_update. The id set is built once rather than rebuilding a list
# for every element of `hs`.
updated_ids = { entry['_id'] for entry in for_update }
for_delete = [ doc_id for rep_id, doc_id in hs if rep_id in updated_ids ]

In [69]:
# Append missing_df data
# Manually assigned (degree, institution) per rep _id; every other rep in
# missing_df is coded as 'HS'.
MANUAL_DEGREES = {
    'B001224': ('GrDip', 'Lutheran School of Nursing'),  # Cori Bush
}
m_update = []
for rep in missing_df['_id']:
    degree, institution = MANUAL_DEGREES.get(rep, ('HS', 'HS'))
    m_update.append({
        '_id': rep,
        'degree': degree,
        'institution': institution
    })

# Add missing data to batch
# Skip records that are already queued so that re-running this cell does not
# add duplicates to the insert batch.
for_update += [ entry for entry in m_update if entry not in for_update ]

In [71]:
# Batch delete edu docs
# Queue one delete per document id in for_delete, then commit them together.
batch = db.batch()
edu_collection = db.collection("edu")
for doc_id in for_delete:
    batch.delete(edu_collection.document(doc_id))

com = batch.commit()

In [73]:
# Sample
# Peek at the first record queued for insertion.
for_update[0]

{'_id': 'N000188', 'degree': 'AS', 'institution': 'Camden County College'}

In [74]:
# Batch insert new docs
# Each record in for_update is written to a new edu document reference
# (document() is called without an id), all in a single commit.
batch = db.batch()
edu_collection = db.collection("edu")
for record in for_update:
    batch.set(edu_collection.document(), record)

com = batch.commit()

In [75]:
# Number of items returned by the insert batch's commit()
len(com)

9

In [76]:
# Check to see if rep educations have been updated
# Re-run the 'HS' lookup against the edu collection and count the matches.
hs_filter = edu_ref.where("degree", "==", "HS")
new_query = hs_filter.select(["_id"]).stream()
new_hs = [ (snapshot.get('_id'), snapshot.id) for snapshot in new_query ]
print('Number of reps with HS:', len(new_hs)) # Originally 23

Number of reps with HS: 25
