# Civics Test and Updates

First things first: download the [2008 version of the civics test](https://www.uscis.gov/sites/default/files/document/questions-and-answers/OoC_100_Questions_2008_Civics_Test_V1.pdf).

Secondly, I saved the state I reside in and my ZIP code in a separate file (my_location.json). This way I can easily rerun everything for any other state and/or ZIP code.

## Imports

Nothing too crazy, I wanted to stick mostly to the basics here. Didn't even import pandas or numpy!

In [1]:
from PyPDF2 import PdfReader
import re
import random
import requests
from bs4 import BeautifulSoup
import json
from datetime import date

And then load the two files containing the 100 civics questions and answers, as well as the state and ZIP.

In [2]:
# Open the PDF with the 100 questions; the bare expression at the end
# makes the notebook display how many pages it has.
pdf_path = '../data/100q.pdf'
reader = PdfReader(pdf_path)
len(reader.pages)

11

In [3]:
# Load the location settings; later cells read the keys "state_full",
# "state_abbr" and "my_zip" from this dictionary.
# The encoding is spelled out (as it is when the result file is written)
# so reading does not depend on the platform's default encoding.
with open('../data/my_location.json', encoding='utf-8') as fi:
    my_location = json.load(fi)

In [4]:
# example page extract
# reader.pages[0].extract_text()

## Reading the Q and A from the PDF

Parsing the PDF had some tricky elements to it:
- One question goes into a second line (so ending on \n didn't work)
- Answers at end of page didn't end with \n
- There's extra white space and some other junk occasionally

In [5]:
# second attempt, connect the qs and as
# civics maps question number -> {question text: [accepted answers]}
civics = {}

# Raw strings, so the backslashes reach the regex engine exactly as written.
# Inside [...] a '|' is a literal pipe rather than "or", so the character
# classes simply list the characters they should match.
Q_START_RE = re.compile(r'\d{1,3}\.')
QUESTION_RE = re.compile(r'\d{1,3}\.[\s\S]*?[▪\[]')
ANSWER_RE = re.compile(r'▪[\s\S]*?[\n\[]')
COMMENT_RE = re.compile(r'\[[\s\S]*?\]')
HUNDRED_RE = re.compile(r'\S+\shundred\s?\S*?\s\(\d+\)')
WORD_NUMBER_RE = re.compile(r'\S+\s\(\d+\)')
NUMBER_RE = re.compile(r'\(\d+\)')


def _number_variants(answer):
    """Return the words-only and digits-only spellings of an answer.

    For fuzzy matching, I want to be able to answer with words or numbers.
    I was not super happy with splitting EVERYTHING, so for example
    ``two (2)`` stays in the answer list as is and this adds ``two`` and ``2``.

    Returns ``[words_only, digits_only]``, or an empty list when the answer
    contains no ``(digits)`` group.
    """
    # hundred is a special case ...because it has at least 2 if not 3 words
    # preceding the digit in ()
    hundred = HUNDRED_RE.search(answer) if 'hundred' in answer else None
    if hundred:
        tmp = hundred.group(0)
        answer_nodigit = answer.replace(tmp, tmp.split('(')[0]).strip()
        answer_digit = answer.replace(tmp, tmp.split('(')[1]).replace(')', '')
        return [answer_nodigit, answer_digit]

    if not NUMBER_RE.search(answer):
        return []

    # start with base and then remove either all digits, or all words
    answer_nodigit = answer
    answer_digit = answer

    # there may be more than one number in ()
    # take whatever word precedes the number in (), it's always going to be like nine (9)
    for tmp in WORD_NUMBER_RE.findall(answer):
        answer_nodigit = answer_nodigit.replace(tmp, tmp.split()[0])
        answer_digit = answer_digit.replace(tmp, tmp.split()[1])

    answer_nodigit = answer_nodigit.strip()
    answer_digit = re.sub(r'[()]', '', answer_digit)
    return [answer_nodigit, answer_digit]


for page in reader.pages:
    text = page.extract_text()

    # get the start of questions
    q_starts = [q.start(0) for q in Q_START_RE.finditer(text)]

    # every question runs up to the start of the next one; the last question
    # on the page has no next one (end is None) and runs to the end of the text
    for start, end in zip(q_starts, q_starts[1:] + [None]):
        q_and_a = text[start:end]
        if end is None:
            # adding endline for easier parsing later
            q_and_a += '\n'

        # extract and clean the question
        # NOTE: .group(0) raises AttributeError when the chunk contains
        # neither a bullet nor a '[' after the number
        question = QUESTION_RE.search(q_and_a).group(0)
        question = question.replace('[', '')
        question = re.sub(r'\s+', ' ', question)
        question = question.replace('▪', '')
        question = question.replace('*', '')

        # extract question number, for dictionary
        q_num, question = question.split('.', maxsplit=1)
        q_num = int(q_num)
        question = question.strip()

        # extract the answers: each one starts at a bullet and stops at the
        # first newline or '['
        answers = []
        for answer in ANSWER_RE.findall(q_and_a):
            answer = answer.replace('[', '')
            answer = answer.rstrip()
            answer = re.sub(r'▪\s+', '', answer)

            answers.append(answer)
            answers.extend(_number_variants(answer))

        # these comments will later be replaced, but I collected them initially
        comments = COMMENT_RE.search(q_and_a)
        if comments:
            answers.append(comments.group(0))

        # save in dictionary of dictionaries
        # I decided it's going to be easier to access by #, since spelling can differ online
        civics[q_num] = {question: answers}

In [6]:
# base_answer = 'twenty-five (25) and nine (9)'
# print('original', base_answer)

# no_digits = base_answer
# digits_only = base_answer
# for tmp in re.findall('\S+\s\(\d+\)', base_answer):
#     no_digits = no_digits.replace(tmp, tmp.split()[0])
#     digits_only = digits_only.replace(tmp, tmp.split()[1])

# digits_only = re.sub('[(|)]', '', digits_only)

# print('digits', digits_only.strip())
# print('no digits', no_digits.strip())

In [7]:
# to print it out
# civics

## Scraping the updated questions and State-specific info

Get the governor, house rep., senators and the capital of your state (again, obtained from the file my_location.json)

In [8]:
# Build the last URL segment from the state name. A raw space is not valid
# in a URL path, so multi-word states get hyphens instead.
# NOTE(review): usa.gov appears to use hyphenated names (e.g. new-york) -
# confirm for your state.
state_slug = my_location["state_full"].lower().replace(' ', '-')
r = requests.get(f'https://www.usa.gov/states/{state_slug}')

if r.status_code != 200:
    print('there was a problem with loading the page!')

soup = BeautifulSoup(r.text)
governor_tag = soup.find('span',
            {'class': "field field--name-field-governor field--type-link field--label-hidden field__item line-height-2"}
            )

# fail with a readable message instead of "'NoneType' object has no attribute 'text'"
if governor_tag is None:
    raise ValueError(f'could not find the governor on the usa.gov page for {state_slug}')

governor = governor_tag.text

# remove "Governor" from name
governor = re.sub('Governor ', '', governor)

In [9]:
# Ask the House ZIP lookup for the representative of the ZIP code stored
# in my_location.
zip_lookup_url = 'https://ziplook.house.gov/htbin/findrep_house?ZIP={}'.format(my_location["my_zip"])
r = requests.get(zip_lookup_url)

# a failed request is only reported; the parsing below is still attempted
if r.status_code != 200:
    print('there was a problem with loading the page!')

soup = BeautifulSoup(r.text)

# the text of the first link inside the "PossibleReps" box is used as the name
reps_box = soup.find('div', {'id': 'PossibleReps'})
house_rep = reps_box.find('a').text.strip()

In [10]:
# Download the Senate contact list (XML) and keep the senators whose
# <state> matches the abbreviation stored in my_location.
r = requests.get('https://www.senate.gov/general/contact_information/senators_cfm.xml')

# a failed request is only reported; the parsing below is still attempted
if r.status_code != 200:
    print('there was a problem with loading the page!')

soup = BeautifulSoup(r.text, features='xml')

senators = []
for state in soup.find_all('state'):
    if state.get_text() != my_location['state_abbr'].upper():
        continue

    # the name fields are looked up under the parent element of <state>
    member = state.parent
    senators.append(f"{member.find('first_name').get_text()} {member.find('last_name').get_text()}")

In [11]:
r = requests.get('https://www.britannica.com/topic/list-of-state-capitals-in-the-United-States-2119210')

if r.status_code != 200:
    print('there was a problem with loading the page!')

soup = BeautifulSoup(r.text)
# I could use pandas here as well ...
# but essentially, once I hit the state, the next cell is going to be the capital
table_cells = soup.find_all('td')

# title() rather than capitalize(): capitalize() lower-cases everything after
# the first letter, so "New York" would turn into "New york" and never match
state_name = my_location['state_full'].title()

for i, cell in enumerate(table_cells):
    # strip so stray whitespace around the cell text cannot prevent a match
    if cell.get_text().strip() == state_name:

        #print(table_cells[i+1].get_text())
        capital = table_cells[i+1].get_text().strip()
        break
else:
    # without this, `capital` would be undefined (or left over from an
    # earlier run) and the problem would only surface in a later cell
    raise ValueError(f'could not find {state_name} in the table of state capitals')

In [12]:
r = requests.get('https://www.uscis.gov/citizenship/find-study-materials-and-resources/check-for-test-updates')

if r.status_code != 200:
    print('there was a problem with loading the page!')

soup = BeautifulSoup(r.text)

# civics_updates maps question number -> {question text: [accepted answers]}
civics_updates = {}

# the question currently being read; None until the first numbered <p> shows
# up, so a list can never be filed under values left over from another cell
q_num = None
question = None

# the logic here is that a question will be <p>
# followed by a list
# and then possibly another <p> which just says which page to visit
for child in soup.find('div', {'class': 'accordion__panel'}).children:

    # whitespace between the tags is a child as well (a bare string whose
    # name is None); it is not a new kind of content, so skip it quietly
    if child.name is None and not str(child).strip():
        continue

    if child.name == 'p':

        # first p will be a question
        p_text = child.get_text()

        # second will be just "visit senate.gov"; leave the current question
        # untouched in that case
        if not re.search(r'\d{1,3}\.', p_text):
            continue

        p_text = re.sub(r'\*', '', p_text)
        # same logic as before
        number, p_text = p_text.split('.', maxsplit=1)
        q_num = int(number)
        question = p_text.strip()

    elif child.name == 'ul':
        if q_num is None:
            print('uh oh, a list of answers before any question!')
            continue

        answers = []
        # only the <li> items themselves; the whitespace between them is not an answer
        for li in child.find_all('li', recursive=False):
            answer = li.get_text().strip()
            answers.append(answer)

            # easier than above since there's currently no instances of hundreds
            # or multiple digits in different ()
            answer_digit = re.findall(r'\((\d+)\)', answer)
            if answer_digit:
                answers.extend(answer_digit)

                answer_nodigit = re.sub(r'\s*\(\d+\)\s*', ' ', answer)
                answers.append(answer_nodigit.strip())

        # when we get the answers we can save them
        civics_updates[q_num] = {question: answers}
    else:
        print('uh oh, new unrecognized type!')

In [13]:
# civics_updates

## Putting it all together

First, replace the original questions with the updates from USCIS.

In [14]:
# Start from a (shallow) copy of the questions parsed from the PDF, then let
# every USCIS update replace the entry with the same question number
# (an update with a number that is not in the PDF yet is simply added).
civics_final = dict(civics)
civics_final.update(civics_updates)

Second, add the names for governor, senators, representative and capital of state.

In [15]:
# Fill in the answers that depend on where you live. Each entry holds a
# single question, so its first key is the question text.
for q_num, q_and_a in civics_final.items():
    prompt = list(q_and_a)[0]

    if 'U.S. Senators now?' in prompt:
        local_answer = senators

    # note just for consistency, make house_rep, gov and capital a list
    elif 'Name your U.S. Representative.' in prompt:
        local_answer = [house_rep]

    elif 'Who is the Governor of your state now?' in prompt:
        local_answer = [governor]

    elif 'What is the capital of your state?' in prompt:
        local_answer = [capital]

    else:
        # not a location-specific question, keep it as it is
        continue

    civics_final[q_num] = {prompt: local_answer}


One more fix! There are a few questions that ask for 2 answers, but the answer is provided on a single line, separated by "and".

In [16]:
# A question that mentions 'two' but has a single answer line gets that
# line split on ' and ' so each part becomes its own answer.
for q_num, q_and_a in civics_final.items():

    # each entry holds one question with its list of answers
    q, a = list(q_and_a.items())[0]

    if 'two' not in q or len(a) != 1:
        continue

    # make that 2 lines
    combined = a[0]
    civics_final[q_num] = {q: combined.split(' and ')}

    # possibly go back and fix this?
    #if q == 'Where is the Statue of Liberty?':
    #    print(q)
    #    print(a)

    

In [17]:
# to print it out
#civics_final

Save it to a file, including today's date!

In [18]:
# Write the final dictionary as indented JSON (non-ASCII characters kept
# as they are) to a file whose name ends in today's date.
output_path = f'../data/civics{date.today()}.json'
serialized = json.dumps(civics_final, ensure_ascii=False, indent=4)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write(serialized)