# Notebook for Error Corrections

In [123]:
import configparser
import pymongo

import re
import requests
import googleapiclient
from googleapiclient.discovery import build
from mediawiki import MediaWiki
from bs4 import BeautifulSoup

In [101]:
config = configparser.ConfigParser()
config.read('../database-dev/auth/config.ini')

# MongoDB
MONGO_LOCAL = config.get('mongodb', 'MONGO_LOCAL')
MONGO_DB = config.get('mongodb', 'MONGO_DB')
client = pymongo.MongoClient(MONGO_LOCAL)
db = client.get_database(MONGO_DB)
collection = db['reps']

# ProPublica
PROPUBLICA_KEY = config.get('propublica', 'PROPUBLICA_API_KEY')
API_ROOT = config.get('propublica', 'API_ROOT')
PROPUBLICA_HEADER = {'X-API-Key': f'{PROPUBLICA_KEY}'}

# GKG
GKG_API_KEY = config.get('gcpkeys', 'GKG_API_KEY')
GKG = config.get('gcpkeys', 'GKG')
GKG_VERSION = config.get('gcpkeys', 'GKG_VERSION')

# Instantiate service connection
service = build(GKG, GKG_VERSION, developerKey=GKG_API_KEY)
entities = service.entities()

# Instantiate wikipedia object
wikipedia = MediaWiki()

In [102]:
reps = [ r for r in collection.find({}) ]

In [103]:
# Check wikipedia URL errors
errors = []
for rep in reps:
    f_name = rep['first_name']
    l_name = rep['last_name']
    url = rep['wiki_url']
    if re.search(f'.*{f_name}.*', url) or re.search(f'.*{l_name}.*', url):
        pass
    else:
        errors.append(rep)
        print(rep['first_name'], rep['last_name'])
        print(rep['wiki_url'])

Jesús García
https://en.wikipedia.org/wiki/Jes%C3%BAs_%22Chuy%22_Garc%C3%ADa
Ralph Norman
https://en.wikipedia.org/wiki/Karen_Handel
Jefferson Van Drew
https://en.wikipedia.org/wiki/Jeff_Van_Drew


## Ralph Norman
- Notified ProPublica of incorrect Google Entity ID.
- Previously notified ProPublica of incorrect John Carter Google Entity ID.

### Google Entity ID

In [104]:
# Check GKG result
_id = errors[1]['google_id']
r = entities.search(ids=_id).execute()
result = r['itemListElement'][0]['result']
wiki_url = result['detailedDescription']['url']
print(wiki_url)
print('Database Google Entity ID:', _id)

https://en.wikipedia.org/wiki/Karen_Handel
Database Google Entity ID: /m/0g838b


In [105]:
# Check ProPublica Data
call_string = API_ROOT + f"members/{errors[1]['_id']}.json"
r = requests.get(call_string, headers=PROPUBLICA_HEADER)
result = r.json()['results'][0]
print('ProPublica Google Entity ID:', result['google_entity_id'])

ProPublica Google Entity ID: /m/0g838b


In [106]:
# Use GKG search query to get proper ID
query = f"{errors[1]['first_name']} {errors[1]['last_name']} politician"
r = entities.search(query=query).execute()
result = r['itemListElement'][0]['result']
_id = result['@id']
gid = re.search('(?<=:).*', _id)[0]
print('Google Entity Search Result:', result)

Google Entity Search Result: {'url': 'http://www.scstatehouse.gov/member.php?code=1421590739', 'name': 'Ralph Norman', 'description': 'U.S. Representative', '@type': ['Person', 'Thing'], '@id': 'kg:/m/0f9kbx'}


In [119]:
# Retrieve correct wikipedia url
query = f"{errors[1]['first_name']} {errors[1]['last_name']} politician"
wiki_url = wikipedia.page(query).url
wiki_url

'https://en.wikipedia.org/wiki/Ralph_Norman'

### Educational Background

In [129]:
r = requests.get(wiki_url).text
soup = BeautifulSoup(r)
box = soup.find('table', attrs={'class': 'infobox vcard'})
edus = box.find('th', text='Education').next_sibling
edu = [ a.text for a in edus.find_all('a') ]
print(edu)

['Presbyterian College', 'BS']


In [130]:
edu = [[edu[1], edu[0]]]

In [131]:
# Correct errors in database
_id = errors[1]['_id']
result = collection.update({'_id': _id}, {'$set': {'google_id': gid, 'wiki_url': wiki_url, 'education': edu}})
result

  result = collection.update({'_id': _id}, {'$set': {'google_id': gid, 'wiki_url': wiki_url, 'education': edu}})


{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [132]:
# Verify
result = collection.find_one({'_id': _id})
print(result['google_id'])
print(result['wiki_url'])
print(result['education'])

/m/0f9kbx
https://en.wikipedia.org/wiki/Ralph_Norman
[['BS', 'Presbyterian College']]
