In [None]:
import os
import sqlite3
import sys

import requests


In [None]:
# key and documentation for the dataset available from
# https://collegescorecard.ed.gov/data/documentation/
# Prefer a key supplied through the COLLEGE_SCORECARD_API_KEY environment
# variable so the credential does not have to live in the notebook; the
# literal below is only a fallback that keeps existing runs working.
# NOTE(review): a key stored in a shared notebook should be treated as
# exposed -- rotate it and drop the fallback once the variable is in use.
api_key = os.environ.get('COLLEGE_SCORECARD_API_KEY') or 'K8LmDhBpijHbIRP2fgSPm9KEMMxTMkk939bkHwAs'

In [None]:
# open the local SQLite file (sqlite3 creates it when it is absent) and
# obtain the cursor through which the later cells issue their statements
connection = sqlite3.connect(database='scorecard.db')
cursor = connection.cursor()

In [None]:
# create a table for 4 year colleges with chosen features
# IF NOT EXISTS keeps this cell re-runnable: scorecard.db persists on disk,
# so a plain CREATE TABLE fails with "table fouryear already exists" on the
# second run of the notebook.
cursor.execute('''
        CREATE TABLE IF NOT EXISTS fouryear (
            id INTEGER PRIMARY KEY,
            name TEXT,
            size INTEGER,
            part_time_share REAL,
            cost REAL,
            pell_grant_rate REAL,
            federal_loan_rate REAL,
            completion_rate_4yr_150nt REAL,
            retention_rate_full_time REAL,
            retention_rate_part_time REAL,
            share_25_older REAL,
            two_yr_default_rate REAL,
            three_yr_default_rate REAL,
            share_firstgeneration REAL,
            parents_middleschool REAL,
            parents_highschool REAL,
            parents_somecollege REAL,
            median_hh_income REAL
        )''')

In [None]:
# create a table for 2 year colleges with chosen features
# IF NOT EXISTS keeps this cell re-runnable: scorecard.db persists on disk,
# so a plain CREATE TABLE fails with "table twoyear already exists" on the
# second run of the notebook.
# The completion column keeps its double underscore because the insert
# helper refers to it by exactly this name.
cursor.execute('''
        CREATE TABLE IF NOT EXISTS twoyear (
            id INTEGER PRIMARY KEY,
            name TEXT,
            size INTEGER,
            part_time_share REAL,
            cost REAL,
            pell_grant_rate REAL,
            federal_loan_rate REAL,
            completion_rate__less_than_4yr_150nt REAL,
            retention_rate_full_time REAL,
            retention_rate_part_time REAL,
            share_25_older REAL,
            two_yr_default_rate REAL,
            three_yr_default_rate REAL,
            share_firstgeneration REAL,
            parents_middleschool REAL,
            parents_highschool REAL,
            parents_somecollege REAL,
            median_hh_income REAL
        )''')

In [None]:
def update_table_4yr(*arg):
    '''
    Insert one school's values into the local SQLite table.

    Despite the name, this serves both tables; the first positional value
    selects which one.

    Positional values, in order:
        arg[0]    table name, 'fouryear' or 'twoyear'
        arg[1]    school id (primary key)
        arg[2]    school name
        arg[3:19] the 16 numeric features, in table column order

    Missing-value handling:
        * a missing id (None) is stored as -999
        * a missing name is stored as 'Missing'
        * a feature that is None or not a number (for example a text
          placeholder returned by the API) is stored as -999 so it can be
          filtered out later

    The statement is executed through the module-level `cursor`; committing
    is left to the caller.

    Raises:
        ValueError: if the table name is not one of the two known tables or
            fewer than 18 values follow it.
    '''
    missing = -999
    # the table name cannot be bound as a parameter, so only known names are
    # ever interpolated into the statement
    completion_columns = {
        'twoyear': 'completion_rate__less_than_4yr_150nt',
        'fouryear': 'completion_rate_4yr_150nt',
    }
    if not arg or arg[0] not in completion_columns:
        raise ValueError(
            f'unknown table {arg[0] if arg else None!r}; '
            f'expected one of {sorted(completion_columns)}')
    table = arg[0]
    columns = (
        'id', 'name', 'size', 'part_time_share', 'cost', 'pell_grant_rate',
        'federal_loan_rate', completion_columns[table],
        'retention_rate_full_time', 'retention_rate_part_time',
        'share_25_older', 'two_yr_default_rate', 'three_yr_default_rate',
        'share_firstgeneration', 'parents_middleschool',
        'parents_highschool', 'parents_somecollege', 'median_hh_income',
    )
    values = arg[1:1 + len(columns)]
    if len(values) != len(columns):
        raise ValueError(
            f'expected {len(columns)} values after the table name, '
            f'got {len(values)}')

    school_id = missing if values[0] is None else values[0]
    name = values[1]
    name = 'Missing' if name is None or name == missing else str(name)
    # clean each feature by position; looking values up with list.index()
    # can hit an earlier slot that happens to hold an equal value
    features = [value if isinstance(value, (int, float)) else missing
                for value in values[2:]]
    row = [school_id, name, *features]

    column_list = ', '.join(columns)
    placeholders = ', '.join('?' for _ in columns)
    statement = f'INSERT INTO {table} ({column_list}) VALUES ({placeholders})'
    try:
        # bound parameters keep names containing quotes (e.g. "St. John's")
        # from breaking the statement
        cursor.execute(statement, row)
    except sqlite3.IntegrityError:
        # a row with this id is already stored (e.g. the notebook was
        # re-run); keep the existing row
        pass
    except sqlite3.OperationalError as error:
        # report the whole offending row so it can be inspected
        print(f'could not insert {row!r} into {table}: {error}')


# the download functions refer to this helper as `update_table`
update_table = update_table_4yr

In [None]:
# define functions to gather data for 2 and 4 year programs
# different functions necessary based on some variation in feature names

def get_scorecard_2yr():
    '''
    Download the chosen features for the two year schools and store them.

    Queries the College Scorecard API for schools matching
    school.degrees_awarded.predominant=2, 100 results per page, and passes
    every result to `update_table_4yr` for insertion into the `twoyear`
    table. Rows are inserted but not committed; the caller commits.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    '''
    per_page = 100
    # order matters: it must match the column order expected by the insert helper
    fields = (
        'id',
        'school.name',
        'latest.student.size',
        'latest.student.part_time_share',
        'latest.cost.tuition.in_state',
        'latest.aid.pell_grant_rate',
        'latest.aid.federal_loan_rate',
        'latest.completion.completion_rate_less_than_4yr_150nt',
        'latest.student.retention_rate.lt_four_year.full_time',
        'latest.student.retention_rate.lt_four_year.part_time',
        'latest.student.share_25_older',
        'latest.repayment.2_yr_default_rate',
        'latest.repayment.3_yr_default_rate',
        'latest.student.share_firstgeneration',
        'latest.student.share_firstgeneration_parents.middleschool',
        'latest.student.share_firstgeneration_parents.highschool',
        'latest.student.share_firstgeneration_parents.somecollege',
        'latest.student.demographics.median_hh_income',
    )
    url = ('https://api.data.gov/ed/collegescorecard/v1/schools.json'
           '?school.degrees_awarded.predominant=2'
           '&_fields=' + ','.join(fields)
           + f'&_per_page={per_page}'
           + '&api_key=' + api_key)

    def fetch_json(page_url):
        # fail loudly on HTTP errors instead of on a confusing KeyError later
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        return response.json()

    def store(results):
        # a field absent from a result is handed over as None (missing value)
        for result in results:
            update_table_4yr('twoyear', *(result.get(field) for field in fields))

    # the request without `_page` returns the first page plus the total count
    first_page = fetch_json(url)
    store(first_page['results'])
    # index of the last page that still holds results, counting the first
    # page as 0; avoids requesting an empty page when the total is an exact
    # multiple of the page size
    last_page = (first_page['metadata']['total'] - 1) // per_page
    for page in range(1, last_page + 1):
        store(fetch_json(url + f'&_page={page}')['results'])

In [None]:
def get_scorecard_4yr():
    '''
    Download the chosen features for the four year schools and store them.

    Queries the College Scorecard API for schools matching
    school.degrees_awarded.predominant=3, 100 results per page, and passes
    every result to `update_table_4yr` for insertion into the `fouryear`
    table. Rows are inserted but not committed; the caller commits.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    '''
    per_page = 100
    # order matters: it must match the column order expected by the insert helper
    fields = (
        'id',
        'school.name',
        'latest.student.size',
        'latest.student.part_time_share',
        'latest.cost.tuition.in_state',
        'latest.aid.pell_grant_rate',
        'latest.aid.federal_loan_rate',
        'latest.completion.completion_rate_4yr_150nt',
        'latest.student.retention_rate.four_year.full_time',
        'latest.student.retention_rate.four_year.part_time',
        'latest.student.share_25_older',
        'latest.repayment.2_yr_default_rate',
        'latest.repayment.3_yr_default_rate',
        'latest.student.share_firstgeneration',
        'latest.student.share_firstgeneration_parents.middleschool',
        'latest.student.share_firstgeneration_parents.highschool',
        'latest.student.share_firstgeneration_parents.somecollege',
        'latest.student.demographics.median_hh_income',
    )
    # built inside parentheses: the former `/` line continuations were a
    # syntax error that kept this function from being defined at all
    url = ('https://api.data.gov/ed/collegescorecard/v1/schools.json'
           '?school.degrees_awarded.predominant=3'
           '&_fields=' + ','.join(fields)
           + f'&_per_page={per_page}'
           + '&api_key=' + api_key)

    def fetch_json(page_url):
        # fail loudly on HTTP errors instead of on a confusing KeyError later
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        return response.json()

    def store(results):
        # a field absent from a result is handed over as None (missing value)
        for result in results:
            update_table_4yr('fouryear', *(result.get(field) for field in fields))

    # the request without `_page` returns the first page plus the total count
    first_page = fetch_json(url)
    store(first_page['results'])
    # index of the last page that still holds results, counting the first
    # page as 0; avoids requesting an empty page when the total is an exact
    # multiple of the page size
    last_page = (first_page['metadata']['total'] - 1) // per_page
    for page in range(1, last_page + 1):
        store(fetch_json(url + f'&_page={page}')['results'])

In [None]:
# Request both datasets from the API and insert them into the local tables.
# Each call issues one HTTP request per result page, so this cell is the
# slow step of the notebook.
get_scorecard_2yr()
get_scorecard_4yr()
# Commit the changes: the insert helper executes statements without
# committing, so this call is what persists the downloaded rows.
connection.commit()