# Educational Background Acquisition Notebook
- Webscraping Wikipedia for US House of Representatives Educational Backgrounds
- Vote Smart used as backup

## Configuration

In [1]:
import configparser
import pymongo

In [2]:
config = configparser.ConfigParser()
config.read('../database-dev/auth/config.ini')

MONGO_LOCAL = config.get('mongodb', 'MONGO_LOCAL')
MONGO_DB = config.get('mongodb', 'MONGO_DB')
client = pymongo.MongoClient(MONGO_LOCAL)
db = client.get_database(MONGO_DB)

In [3]:
# Set database collection
collection = db['reps']

## Database Query
- Get representative documents (ID, name, wikipedia URL) from MongoDB.

In [4]:
# Database query returning wikipedia URLs
result = collection.find(
    {},
    {'_id': 1, 'first_name': 1, 'last_name': 1, 'wiki_url': 1, 'votesmart_id': 1}
)
reps = [ rep for rep in result ]

In [5]:
# Check if there are missing wikipedia URLs
for rep in reps:
    assert rep['wiki_url'] != None
    assert rep['wiki_url'] != ''

## Data Acquisition - Beautiful Soup
- Scrape educational backgrounds from wikipedia with Beautiful Soup.
- Educational background located in ```<th>Education</th>``` or ```<th><a>Alma mater</a></th>``` row of ```<table>``` with attribute ```class="infobox"```.
- Manual check of individuals without tertiary educational degrees.

In [6]:
import functools
import requests
import re
from bs4 import BeautifulSoup

In [7]:
# Wrapper for error logging
def error_logging(func):
    @functools.wraps(func)
    def wrapper_error(*args):
        data = None
        error = None
        try:
            data = func(*args)
        except Exception as e:
            error = type(e)
        return data, error
    return wrapper_error

In [8]:
@error_logging
def wiki_edu_scrape(wiki_url):
    '''
    Function to scrape wikipedia by "Education" or "Alma mater" table row
    '''
    
    r = requests.get(wiki_url).text
    soup = BeautifulSoup(r)
    box = soup.find('table', attrs={'class': 'infobox vcard'})
    try:
        edus = box.find('th', text='Education').next_sibling
        edu = [ a.text for a in edus.find_all('a') ]
    except:
        edus = box.find('a', attrs={'title': 'Alma mater'})
        edu = [ a.text for a in edus.parent.next_sibling.find_all('a') ]
    
    return edu

In [9]:
@error_logging
def get_vs_id(rep):
    '''
    Function to retrieve missing Vote Smart ID with query
    '''
    
    call_string = f'https://votesmart.org/search?q={rep["first_name"]}+{rep["last_name"]}'
    r = requests.get(call_string).text
    soup = BeautifulSoup(r)
    anchors = soup.find_all('a')
    for a in anchors:
        if a.text == f'{rep["first_name"]} {rep["last_name"]}':
            _id = re.search('(?<=/).*?(?=(?:/))', str(a))[0]
            break
    
    return _id

In [10]:
@error_logging
def vs_edu_scrape(rep):
    '''
    Function to scrape Vote Smart by "Education" <b> element
    '''
    
    url = 'https://justfacts.votesmart.org/candidate/biography/' + rep['votesmart_id']
    r = requests.get(url).content
    soup = BeautifulSoup(r)

    # Collapsable card object
    edu_card = soup.find('b', text='Education').parent.parent.parent
    
    # Education paragraph objects
    edu = [ p.text for p in edu_card.find_all('p') ]
    
    edus = []
    for e in edu:
        entry = e.split(',')
        if len(entry[0]) < 5:
            degree = entry[0]
            for s in entry[1:]:
                pattern = '(?=.*College)|(?=.*University)|(?=.*School)|(?=.*Institute)'
                if re.search(pattern, s):
                    institution = s.strip()
                    edus.append([degree, institution])
    
    if edus != []:
        return edus
    else:
        return None

In [11]:
@error_logging
def clean_edu(rep):
    '''
    Function to pair degree with institution and standardize non-degrees
    '''
    
    edu = rep['education']
    
    edu_list = []
    for i in range(len(edu)):
        institute = None
        degree = None
        if len(edu[i]) < 10:
            degree = edu[i]
            degree = ''.join(degree.split('.')).upper()
            for j in range(i-1, -1, -1):
                if len(edu[j]) >= 10:
                    institute = edu[j]
                    edu_list.append([degree, institute])
                    break

    return edu_list

In [12]:
# Scrape educational backgrounds of all current (2021) reps
no_wiki_edu = []
no_vs_id = []
no_vs_edu = []
other_errors = []
for rep in reps:
    edus, error = wiki_edu_scrape(rep['wiki_url'])
    rep['education'] = edus
    if error: # No educational background on wikipedia
        no_wiki_edu.append(rep)
        
    elif len(edus) < 2: # No degree shown in wikipedia educational background
        vs_id, error = get_vs_id(rep)
        if error:
            no_vs_id.append(rep)
        else:
            rep['votesmart_id'] = vs_id
            edus, error = vs_edu_scrape(rep)
            if error:
                no_vs_edu.append(rep)
            else:
                rep['education'] = edus
    
    else:
        edus, error = clean_edu(rep)
        if error:
            other_errors.append(rep)
        else:
            rep['education'] = edus

### Manual Check: No tertiary educational degree

In [24]:
no_wiki_edu

[{'_id': 'B000825',
  'first_name': 'Lauren',
  'last_name': 'Boebert',
  'votesmart_id': None,
  'wiki_url': 'https://en.wikipedia.org/wiki/Lauren_Boebert',
  'education': None},
 {'_id': 'B001295',
  'first_name': 'Mike',
  'last_name': 'Bost',
  'votesmart_id': '6302',
  'wiki_url': 'https://en.wikipedia.org/wiki/Mike_Bost',
  'education': None},
 {'_id': 'G000577',
  'first_name': 'Garret',
  'last_name': 'Graves',
  'votesmart_id': '155424',
  'wiki_url': 'https://en.wikipedia.org/wiki/Garret_Graves',
  'education': None},
 {'_id': 'K000395',
  'first_name': 'Fred',
  'last_name': 'Keller',
  'votesmart_id': '119553',
  'wiki_url': 'https://en.wikipedia.org/wiki/Fred_Keller_(politician)',
  'education': None},
 {'_id': 'N000190',
  'first_name': 'Ralph',
  'last_name': 'Norman',
  'votesmart_id': '47930',
  'wiki_url': 'https://en.wikipedia.org/wiki/Karen_Handel',
  'education': None},
 {'_id': 'O000171',
  'first_name': 'Tom',
  'last_name': "O'Halleran",
  'votesmart_id': '28499

### Manual Check: No tertiary educational degree
- Manually assign ```'education': None```

In [20]:
no_vs_id

[{'_id': 'C001104',
  'first_name': 'Madison',
  'last_name': 'Cawthorn',
  'votesmart_id': None,
  'wiki_url': 'https://en.wikipedia.org/wiki/Madison_Cawthorn',
  'education': ['Patrick Henry College']},
 {'_id': 'W000827',
  'first_name': 'Ron',
  'last_name': 'Wright',
  'votesmart_id': '78666',
  'wiki_url': 'https://en.wikipedia.org/wiki/Ron_Wright_(politician)',
  'education': ['University of Texas at Arlington']}]

In [26]:
for rep in no_vs_id:
    rep['education'] = None

## Update Database

In [25]:
from pymongo import UpdateOne

In [28]:
# Create bulk updates
updates = []
for rep in reps:
    update = UpdateOne(
        {'_id': rep['_id']},
        {'$set': {'votesmart_id': rep['votesmart_id'], 'education': rep['education']}}
    )
    updates.append(update)

In [29]:
result = collection.bulk_write(updates)

In [30]:
result.bulk_api_result

{'writeErrors': [],
 'writeConcernErrors': [],
 'nInserted': 0,
 'nUpserted': 0,
 'nMatched': 440,
 'nModified': 440,
 'nRemoved': 0,
 'upserted': []}