In [1]:
# dependencies
import io
from urllib.parse import quote_plus

import PyPDF2
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser

In [2]:
# Ignore every warning category for the rest of the session.
# NOTE(review): presumably done to keep cell output uncluttered; this also hides
# deprecation warnings raised by the libraries used in this notebook.
import warnings
warnings.filterwarnings('ignore')

## Use splinter to access College Board archives

In [3]:
# Open a visible (non-headless) Chrome window through splinter, driven by
# the chromedriver.exe binary.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [4]:
# Print the archive URL of every year so each link can be checked by hand.
arr = [
    '2010', '2011', '2012', '2013',
    '2014', '2015', '2016',
]
url = 'https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-'
print(*(url + year for year in arr), sep='\n')

https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2010
https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2011
https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2012
https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2013
https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2014
https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2015
https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-2016


## Grab reading scores from 75th percentile per state

In [5]:
# Grab the 75th-percentile critical reading score for each state and year.
# Every link found in the archive page's table is treated as one state's PDF
# report; the scores are read from the sixth page of that PDF (zero-based index 5).
arr = ['2010','2011','2012','2013','2014','2015','2016']
url = 'https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-'

# Seconds allowed per PDF download; requests waits indefinitely when no timeout is given.
REQUEST_TIMEOUT = 30

crit_scores = []  # one {state name: score string} dict per year, in the order of arr
for year in arr:
    browser.visit(url+year)
    html = browser.html
    soup = bs(html,'html.parser')

    table = soup.find('table',class_='table')
    states = table.find_all('a')

    print(f'Grabbing scores from {year}...')
    print('----------------------')

    year_holder = {}
    for state in states:
        # drop non-breaking spaces and trailing whitespace from the link text
        state_name = state.text.replace('\xa0','').rstrip()
        state_url = state['href']
        response = requests.get(state_url, timeout=REQUEST_TIMEOUT)
        # stop on an HTTP error instead of handing an error page to the PDF parser
        response.raise_for_status()

        with io.BytesIO(response.content) as open_pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(open_pdf_file)
            score_page = pdf_reader.getPage(5)
            text = score_page.extractText().split('\n')

            # the line right after the second 'Critical Reading' label holds the
            # scores run together, three characters per score
            label_lines = [idx for idx, line in enumerate(text) if 'Critical Reading' in line]
            number_str = text[label_lines[1] + 1]
            scores = [number_str[pos:pos+3] for pos in range(0, len(number_str), 3)]

        # the first group is critical reading (the second is math)
        year_holder[state_name] = scores[0]

    crit_scores.append(year_holder)

print('Finished.')

Grabbing scores from 2010...
----------------------
Grabbing scores from 2011...
----------------------
Grabbing scores from 2012...
----------------------
Grabbing scores from 2013...
----------------------
Grabbing scores from 2014...
----------------------
Grabbing scores from 2015...
----------------------
Grabbing scores from 2016...
----------------------
Finished.


## Grab math scores from 75th percentile per state

In [6]:
# Grab the 75th-percentile math score for each state and year.
# Every link found in the archive page's table is treated as one state's PDF
# report; the scores are read from the sixth page of that PDF (zero-based index 5).
arr = ['2010','2011','2012','2013','2014','2015','2016']
url = 'https://research.collegeboard.org/programs/sat/data/archived/cb-seniors-'

# Seconds allowed per PDF download; requests waits indefinitely when no timeout is given.
REQUEST_TIMEOUT = 30

math_scores = []  # one {state name: score string} dict per year, in the order of arr
for year in arr:
    browser.visit(url+year)
    html = browser.html
    soup = bs(html,'html.parser')

    table = soup.find('table',class_='table')
    states = table.find_all('a')

    print(f'Grabbing scores from {year}...')
    print('----------------------')

    year_holder = {}
    for state in states:
        # drop non-breaking spaces and trailing whitespace from the link text
        state_name = state.text.replace('\xa0','').rstrip()
        state_url = state['href']
        response = requests.get(state_url, timeout=REQUEST_TIMEOUT)
        # stop on an HTTP error instead of handing an error page to the PDF parser
        response.raise_for_status()

        with io.BytesIO(response.content) as open_pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(open_pdf_file)
            score_page = pdf_reader.getPage(5)
            text = score_page.extractText().split('\n')

            # the score line is located via the 'Critical Reading' label: it is the
            # line right after the label's second occurrence, three characters per score
            label_lines = [idx for idx, line in enumerate(text) if 'Critical Reading' in line]
            number_str = text[label_lines[1] + 1]
            scores = [number_str[pos:pos+3] for pos in range(0, len(number_str), 3)]

        # the second group is math (the first is critical reading)
        year_holder[state_name] = scores[1]

    math_scores.append(year_holder)

print('Finished.')

Grabbing scores from 2010...
----------------------
Grabbing scores from 2011...
----------------------
Grabbing scores from 2012...
----------------------
Grabbing scores from 2013...
----------------------
Grabbing scores from 2014...
----------------------
Grabbing scores from 2015...
----------------------
Grabbing scores from 2016...
----------------------
Finished.


In [7]:
# Scraping is done: shut down the splinter-driven browser session started earlier.
browser.quit()

## Display dataframes

In [30]:
# dependencies
import pandas as pd

In [58]:
# One row per state, one column per year: each per-year dict becomes a row
# labelled with its year, and the transpose turns those rows into columns.
crit_df = pd.DataFrame(crit_scores, index=arr).T
crit_df.head()

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016
Alabama,640,630,620,630,630,630,640
Alaska,590,590,590,580,580,580,560
Arizona,580,590,590,590,590,590,600
Arkansas,650,650,640,650,650,650,650
California,580,580,570,570,570,570,570


In [59]:
# One row per state, one column per year: each per-year dict becomes a row
# labelled with its year, and the transpose turns those rows into columns.
math_df = pd.DataFrame(math_scores, index=arr).T
math_df.head()

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016
Alabama,630,630,620,620,630,630,640
Alaska,590,580,580,580,570,580,550
Arizona,590,600,600,600,600,600,600
Arkansas,640,640,640,650,650,650,650
California,600,600,600,590,590,590,580


## Create connection to database

In [33]:
# dependencies
from sqlalchemy import create_engine
import pymysql
# Register PyMySQL under the MySQLdb module name — presumably so the plain
# `mysql://` engine URL used below is served by PyMySQL; confirm against the
# SQLAlchemy MySQL dialect documentation.
pymysql.install_as_MySQLdb()

In [34]:
# load credentials
from config import username,password

In [35]:
# Build the SQLAlchemy engine for the local `etl_project` MySQL database.
# The credentials are percent-encoded first: characters such as '@', ':' or '/'
# in a raw username or password would otherwise be read as URL delimiters.
engine = create_engine(
    f'mysql://{quote_plus(username)}:{quote_plus(password)}@localhost/etl_project'
)

In [36]:
# List the tables currently present in the etl_project database before loading.
engine.table_names()

['crime_rate', 'crit_scores', 'housing_price', 'math_scores', 'population']

## Additional transformation

In [44]:
# Move the state-name index into an ordinary column (pandas names it 'index').
crit_df = crit_df.reset_index()
math_df = math_df.reset_index()

In [50]:
# Label the former index column 'state' and cast the row labels to strings.
renames = {'index': 'state'}
crit_df, math_df = (
    frame.rename(index=str, columns=renames) for frame in (crit_df, math_df)
)

In [51]:
# Preview the first rows of the reading-score table, now carrying a `state` column.
crit_df.head()

Unnamed: 0,state,2010,2011,2012,2013,2014,2015,2016
0,Alabama,640,630,620,630,630,630,640
1,Alaska,590,590,590,580,580,580,560
2,Arizona,580,590,590,590,590,590,600
3,Arkansas,650,650,640,650,650,650,650
4,California,580,580,570,570,570,570,570


In [52]:
# Preview the first rows of the math-score table, now carrying a `state` column.
math_df.head()

Unnamed: 0,state,2010,2011,2012,2013,2014,2015,2016
0,Alabama,630,630,620,620,630,630,640
1,Alaska,590,580,580,580,570,580,550
2,Arizona,590,600,600,600,600,600,600
3,Arkansas,640,640,640,650,650,650,650
4,California,600,600,600,590,590,590,580


## Load dataframes into database

In [53]:
# Append the reading scores to the `crit_scores` table; the DataFrame index is not written.
# NOTE(review): with if_exists='append', re-running this cell inserts the same rows again.
crit_df.to_sql(name='crit_scores', con=engine, if_exists='append', index=False)

In [54]:
# Append the math scores to the `math_scores` table; the DataFrame index is not written.
# NOTE(review): with if_exists='append', re-running this cell inserts the same rows again.
math_df.to_sql(name='math_scores', con=engine, if_exists='append', index=False)

## Save as CSV files

In [55]:
# dependencies
import os

In [56]:
# Write the math scores to output/math_scores.csv (the frame's index is written
# as the first, unnamed column).
path = os.path.join('output','math_scores.csv')
# to_csv does not create missing directories, so make sure output/ exists first
os.makedirs(os.path.dirname(path), exist_ok=True)
math_df.to_csv(path)

In [57]:
# Write the reading scores to output/crit_scores.csv (the frame's index is written
# as the first, unnamed column).
path = os.path.join('output','crit_scores.csv')
# to_csv does not create missing directories, so make sure output/ exists first
os.makedirs(os.path.dirname(path), exist_ok=True)
crit_df.to_csv(path)