# Acquiring Educational Data for Reps

In [1]:
# Database connection and authentication
import configparser
import firebase_admin
from firebase_admin import credentials, firestore

# Scraping
import re
import requests
import time
import json
from bs4 import BeautifulSoup
from mediawiki import MediaWiki

## Database and Credentials

In [2]:
# Read config.ini file
config = configparser.ConfigParser()
config.read('./auth/config.ini')

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)
app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

# Google Knowledge Graph API Key
GCP_API_KEY = config.get('gcpkeys', 'GCP_API_KEY')

In [3]:
# Instantiate collection reference
ref = db.collection("reps")

## Acquire Rep Data

In [4]:
# Retrieve all reps
query = ref.where("_id", "!=", "").select(["_id", "google_id", "first_name", "middle_name", "last_name"]).stream()
reps = [ doc.to_dict() for doc in query ]

### Get Missing Google Entity IDs
- Certain reps have missing Google Entity IDs
- Acquire Google Entity IDs utilizing the Google Knowledge Graph API

In [5]:
def find_google_id(rep):
    '''
    Function to find Google Entity ID
    '''
    
    first_name = rep["first_name"]
    last_name = rep["last_name"]
    name = f'{first_name} {last_name} politician'
    params = {
        'query': name,
        'limit': 10,
        'indent': True,
        'key': GCP_API_KEY,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?'
    r = requests.get(url, params=params)
    result = r.json()['itemListElement'][0]['result']
    _id = result['@id'][3:]
    
    return _id

In [6]:
# Find and assign Google Entity IDs for reps with missing google_ids
for rep in reps:
    if rep['google_id'] == None:
        rep['google_id'] = find_google_id(rep)

### Get Wikipedia Page from Google Knowledge Graph API

In [7]:
def get_google_entity(rep):
    '''
    Function to get Google entity JSON
    '''
    
    google_id = rep['google_id']
    params = {
        'ids': google_id,
        'limit': 10,
        'indent': True,
        'key': GCP_API_KEY,
    }

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    url = service_url + '?'
    r = requests.get(url, params=params)
    result = r.json()
    
    return result

In [8]:
def get_wiki_url(rep):
    '''
    Function to get wiki_url if it exists in Google Knowledge Graph,
    returns tuple (wiki_url, error)
    '''
    
    result = get_google_entity(rep)
    
    wiki_url = None
    try:
        wiki_url = result['itemListElement'][0]['result']['detailedDescription']['url']
        return wiki_url, None
    except Exception as e:
        return wiki_url, e

In [9]:
# Get Wikipedia URLs for reps
for rep in reps:
    rep['wiki_url'], rep['error'] = get_wiki_url(rep)

### Handle Errors

In [10]:
# Check errors
errors = {}
for rep in reps:
    error = rep['error']
    if error == None:
        pass
    else:
        e_type = type(error)
        if e_type not in errors.keys():
            errors[e_type] = []
        
        errors[e_type].append(rep)

In [11]:
# Check number of errors by type
for k in errors.keys():
    print(k, 'errors:', len(errors[k]))

<class 'IndexError'> errors: 16
<class 'KeyError'> errors: 14


#### IndexError: Incorrect Google Entity IDs from ProPublica database
- Use Google Knowledge Graph to get new IDs

In [12]:
# Determine if ProPublica google_id is incorrect
i = 0
for rep in errors[IndexError]:
    if find_google_id(rep) != rep['google_id']:
        i += 1
print(i)

16


In [13]:
# Use new Google Entity ID to get Wikipedia URL
i = 0
for rep in errors[IndexError]:
    rep['google_id'] = find_google_id(rep)
    rep['wiki_url'], rep['error'] = get_wiki_url(rep)
    if rep['error'] != None:
        i += 1
print(i) # Number of reps still with error

0


#### KeyError: Google Entity missing Wikipedia page URL
- Use MediaWiki package to get page URL

In [14]:
wikipedia = MediaWiki()

In [15]:
# Get Wikipedia URLs
i = 0
for rep in errors[KeyError]:
    try:
        name = get_google_entity(rep)['itemListElement'][0]['result']['name']
        p = wikipedia.page(f'{name} politician')
        rep['wiki_url'] = p.url
        rep['error'] = None
    except:
        i += 1
print(i) # Number of reps still with error

0


### Get Educational Background with Wikipedia Scrape
- Note: All rep errors are None

In [16]:
def get_education(wiki_url):
    '''
    Function to get education from wikipedia infobox,
    returns tuple (education, error)
    '''
    
    try:
        r = requests.get(wiki_url).text
        soup = BeautifulSoup(r)
        box = soup.find('table', attrs={'class': 'infobox vcard'})
        sibling = True
        edus = box.find('th', text='Education').next_sibling
        edu = [ a.text for a in edus.find_all('a') ]
        return edu, None
    except Exception as e:
        return None, e

In [17]:
# Get education from Wikipedia with error saving
errors = {}
for rep in reps:
    wiki_url = rep['wiki_url']
    rep['education'], rep['error'] = get_education(wiki_url)
    if rep['error'] == None:
        pass
    else:
        e_type = type(rep['error'])
        if e_type not in errors.keys():
            errors[e_type] = []
        
        errors[e_type].append(rep)

In [18]:
# Check number of errors by type
for k in errors.keys():
    print(k, 'errors:', len(errors[k]))

<class 'AttributeError'> errors: 13


#### Funny Detour
- The Google Entity ID provided by ProPublica for Texas Politician John Carter was the ID for the movie *John Carter*

In [19]:
# John Carter from ProPublica provided Google Entity ID
get_google_entity(errors[AttributeError][2])

{'@context': {'detailedDescription': 'goog:detailedDescription',
  '@vocab': 'http://schema.org/',
  'EntitySearchResult': 'goog:EntitySearchResult',
  'kg': 'http://g.co/kg',
  'resultScore': 'goog:resultScore',
  'goog': 'http://schema.googleapis.com/'},
 '@type': 'ItemList',
 'itemListElement': [{'result': {'detailedDescription': {'articleBody': 'John Carter is a 2012 American science fiction action film directed by Andrew Stanton, written by Stanton, Mark Andrews, and Michael Chabon, and based on A Princess of Mars, the first book in the Barsoom series of novels by Edgar Rice Burroughs. ',
     'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
     'url': 'https://en.wikipedia.org/wiki/John_Carter_(film)'},
    '@id': 'kg:/m/03whyr',
    'url': 'http://movies.disney.com/john-carter',
    '@type': ['Thing', 'Movie'],
    'name': 'John Carter',
    'description': '2012 film'},
   '@type': 'EntitySearchResult',
 

In [26]:
# Find results for John Carter
rep = errors[AttributeError][2]
first_name = rep["first_name"]
last_name = rep["last_name"]
name = f'{first_name} {last_name}'
params = {
    'query': name,
    'limit': 10,
    'indent': True,
    'key': GCP_API_KEY,
}
service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
url = service_url + '?'
r = requests.get(url, params=params)
result = r.json()

In [35]:
# Narrow to only 'Person' entities
possible = []
for r in result['itemListElement']:
    if 'Person' in r['result']['@type']:
        possible.append(r)

In [38]:
# Sort by highest resultScore
possible.sort(key=lambda x: x['resultScore'], reverse=True)
possible[0] # Correct John Carter

{'@type': 'EntitySearchResult',
 'resultScore': 1094.960693359375,
 'result': {'@type': ['Thing', 'Person'],
  'detailedDescription': {'url': 'https://en.wikipedia.org/wiki/John_Carter_(Texas_politician)',
   'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
   'articleBody': "John Rice Carter is the U.S. Representative serving Texas's 31st congressional district since 2003. He is a Republican. The district includes the northern suburbs of Austin, as well as Fort Hood."},
  'description': 'U.S. Representative',
  'url': 'http://carter.house.gov/',
  'name': 'John Carter',
  '@id': 'kg:/m/04m7rg'}}

In [41]:
# Replace John Carter (movie) with John Carter (politician)
errors[AttributeError][2]['google_id'] = possible[0]['result']['@id']
errors[AttributeError][2]['wiki_url'] = possible[0]['result']['detailedDescription']['url']

# Get educational background
rep = errors[AttributeError][2]
rep['education'], rep['error'] = get_education(wiki_url)

In [44]:
# Check
errors[AttributeError].pop(2)

{'_id': 'C001051',
 'middle_name': None,
 'last_name': 'Carter',
 'google_id': 'kg:/m/04m7rg',
 'first_name': 'John',
 'wiki_url': 'https://en.wikipedia.org/wiki/John_Carter_(Texas_politician)',
 'error': None,
 'education': ['State University of New York, Albany',
  'BA',
  'Albany Law School',
  'JD']}

In [46]:
len(errors[AttributeError])

12

#### End Detour
#### AttributeError: No 'Education' Section in Wikipedia Page Infobox
- Some are in 'Personal details' section in 'Alma mater' row
- Some have no tertiary education

In [47]:
# Reassign errors
errors = errors[AttributeError]

In [76]:
def get_alma_mater(rep):
    '''
    Function for alternative education scrape of Wikipedia page
    '''
    
    try:
        url = rep['wiki_url']
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        infobox = soup.find('table', attrs={'class': 'infobox vcard'})
        am = infobox.find('a', attrs={'title': 'Alma mater'})
        edu = [ a.text for a in am.parent.next_sibling.find_all('a') ]
        return edu, None
    except Exception as e:
        return None, e

In [78]:
# No tertiary education
no_edus = []
for rep in errors:
    rep['education'], rep['error'] = get_alma_mater(rep)
    if rep['error'] != None:
        no_edus.append(rep)

# Data Cleaning Education and Insertion

In [92]:
# Remove error keys from rep dictionaries
for rep in reps:
    del rep['error']

### Notes on Function:
- Dictionary ```{degree: institution}``` was not used as the possibility of an individual earning 2 of the same degree types will result in key conflicts.  Additionally, ```{institution: degree}``` dictionary was not used as an individual may earn more than 1 degree from the same institution.
- Tuples were not used as many databases do not accept tuples as data types.

In [121]:
def clean_education(rep):
    '''
    Function to pair degree with institution and standardize non-degrees
    '''
    
    edu = rep['education']
    if edu == None:
        return [['HS', 'HS']]
    if len(edu) < 2:
        return [['HS', 'HS']]
    
    edu_list = []
    for i in range(len(edu)):
        institute = None
        degree = None
        if len(edu[i]) < 5:
            degree = edu[i]
            for j in range(i-1, -1, -1):
                if len(edu[j]) >= 5:
                    institute = edu[j]
                    edu_list.append([degree, institute])
                    break

    return edu_list

In [118]:
# Clean education
for rep in reps:
    rep['education'] = clean_education(rep)

In [124]:
# Sample
reps[100]

{'first_name': 'Jason',
 'google_id': '/g/11gzqdw_wg',
 'last_name': 'Crow',
 '_id': 'C001121',
 'middle_name': None,
 'wiki_url': 'https://en.wikipedia.org/wiki/Jason_Crow',
 'education': [['BA', 'University of Wisconsin, Madison'],
  ['JD', 'University of Denver']]}

## Batch Insert Data

### Inserting Updated Google Entity IDs and Wikipedia URLs

In [141]:
batch = db.batch()
total = 0
insert_len = 0
batch_num = 1
for rep in reps:
    insert_ref = db.collection("reps").document(rep['_id'])
    up_dict = {
        'google_id': rep['google_id'],
        'wiki_url': rep['wiki_url'],
    }
    batch.update(insert_ref, up_dict)
    insert_len += 1
    total += 1
    if insert_len > 200:
        batch.commit()
        print(f'{insert_len} reps updated in batch #{batch_num}')
        insert_len = 0
        batch_num += 1
        
batch.commit()
print(f'{insert_len} reps updated in batch #{batch_num}')
print(f'{total} reps updated in total')

201 reps updated in batch #1
201 reps updated in batch #2
106 reps updated in batch #3
508 reps updated in total


### Inserting into New Collection 'edu'

In [144]:
batch = db.batch()
total = 0
insert_len = 0
batch_num = 1
for rep in reps:
    for edu in rep['education']:
        insert_ref = db.collection("edu").document()
        data = {
            '_id': rep['_id'],
            'degree': edu[0],
            'institution': edu[1]
        }
        batch.set(insert_ref, data)
        insert_len += 1
        total += 1
        if insert_len > 399:
            batch.commit()
            print(f'{insert_len} degrees inserted in batch #{batch_num}')
            insert_len = 0
            batch_num += 1

batch.commit()
print(f'{insert_len} degrees inserted in batch #{batch_num}')
print(f'{total} degrees inserted in total')

400 degrees inserted in batch #1
400 degrees inserted in batch #2
135 degrees inserted in batch #3
935 degrees inserted in total
