In [1]:
# Scraper configuration.
#
# Every BASE_URL_* is a URL prefix; the numeric school id is appended to it
# to obtain the page to download. Every page*_labels dict maps an output
# column name to the table-header text whose value should be read from that
# page. A trailing "*" on a header text addresses the second header with
# that text on the same page (scrape() disambiguates repeats that way).

# "Overview" page.
BASE_URL_1 = (
    "http://www.collegedata.com/cs/data/college/"
    "college_pg01_tmpl.jhtml?schoolId="
)
page1_labels = dict(
    control_type='Institution Type',
    coed_type='Coeducational',
    ugrads='Undergraduate Students',
    ugrads_women='Women',
    ugrads_men='Men',
    grads='Graduate Students',
    gpa='Average GPA',
    sat='SAT Math',
    cost='Cost of Attendance',
    need_met_pct='Average Percent of Need Met',
    award='Average Freshman Award',
    debt='Average Indebtedness of 2016 Graduates',
    on_campus_pct='Students in College Housing',
)

# "Admissions" page.
BASE_URL_2 = (
    "http://www.collegedata.com/cs/data/college/"
    "college_pg02_tmpl.jhtml?schoolId="
)
page2_labels = dict(
    adm_rate='Overall Admission Rate',
    adm_women='Women',
    adm_men='Men',
    enrolled='Students Enrolled',
    enrolled_women='Women*',
    enrolled_men='Men*',
    ed_rate='Early Decision Admission Rate',
    ea_rate='Early Action Admission Rate',
    hs_rank='High School Class Rank',
)

# "Students" page.
BASE_URL_6 = (
    "http://www.collegedata.com/cs/data/college/"
    "college_pg06_tmpl.jhtml?schoolId="
)
page6_labels = dict(
    ugrads_fulltime='Full-Time Undergraduates',
    returning_pct='First-Year Students Returning',
    grad_6_yrs_pct='Students Graduating Within 6 Years',
)

# Parallel lists: page_labels[i] holds the labels to read from BASE_URLS[i].
BASE_URLS = [BASE_URL_1, BASE_URL_2, BASE_URL_6]
page_labels = [page1_labels, page2_labels, page6_labels]

In [2]:
from bs4 import BeautifulSoup

def scrape(page, labels):
    """Read the values of selected table rows from a parsed page.

    Every ``<th>`` on the page is indexed by its text (inner strings stripped
    and joined with single spaces). If the same text occurs more than once,
    each later occurrence gets one more ``*`` appended (``Women``, ``Women*``,
    ``Women**``, ...), so ``labels`` can address repeated headers.

    Parameters
    ----------
    page
        Parsed document exposing ``find_all`` (a BeautifulSoup object in this
        notebook).
    labels : dict
        Maps an output key to the header text to look up.

    Returns
    -------
    dict
        Output key -> text of the ``<td>`` following the matching header.
        Labels that are absent from the page, or whose header has no
        following ``<td>`` sibling, are left out.
    """
    headers = {}
    for th_tag in page.find_all('th'):
        th_string = " ".join(th_tag.stripped_strings)
        # Disambiguate repeated header texts by appending asterisks.
        while th_string in headers:
            th_string += "*"
        headers[th_string] = th_tag

    scraped = {}
    for key, label in labels.items():
        th_tag = headers.get(label)
        if th_tag is None:
            continue
        # `next_sibling` may be a whitespace text node or None; ask for the
        # next <td> sibling explicitly and skip headers that have none.
        td_tag = th_tag.find_next_sibling('td')
        if td_tag is None:
            continue
        scraped[key] = " ".join(td_tag.stripped_strings)

    return scraped

In [30]:
import pandas as pd
from IPython.core.display import clear_output
from requests import get
from os.path import isfile

# Scrape every school id in [START_ID, END_ID] and append the results to a
# CSV in chunks, so an interrupted run can be resumed where it stopped.
START_ID = 1
END_ID = 5000
CHUNKSIZE = 10          # Flush to disk after this many page ids.
REQUEST_TIMEOUT = 30    # Seconds to wait on a stalled request before failing.

path = 'data/scraped_collegedata.csv'

# Chunks are appended without a header, so every chunk must be written with
# exactly the same columns in the same order.
columns = ['name'] + [key for labels in page_labels for key in labels]

if isfile(path):
    saved = pd.read_csv(path)
    # Appended rows have to line up with the header that is already on disk.
    saved_columns = [col for col in saved.columns if col != 'page_id']
    if saved_columns:
        columns = saved_columns
    # Resume after the last school id saved by a previous run.
    saved_ids = saved['page_id'].dropna()
    if not saved_ids.empty:
        START_ID = int(saved_ids.max()) + 1


def _flush(records, page_ids):
    """Append the buffered school records to the CSV (header only once)."""
    if not records:
        # Nothing scraped in this chunk; do not create a header-only file.
        return
    chunk = pd.DataFrame(records, index=page_ids).reindex(columns=columns)
    chunk.to_csv(path, mode='a', header=not isfile(path),
                 index_label='page_id')


records, record_ids = [], []
try:
    for page_id in range(START_ID, END_ID + 1):
        scraped_school = {}
        for base_url, labels in zip(BASE_URLS, page_labels):
            url = base_url + str(page_id)

            print(url)   # Print a status update.

            response = get(url, timeout=REQUEST_TIMEOUT)
            page = BeautifulSoup(response.text, "lxml")
            # Only pages that have an <h1> contribute data (presumably the
            # site serves a heading-less page for unknown ids -- confirm).
            if page.h1:
                scraped_school['name'] = page.h1.string
                scraped_school.update(scrape(page, labels))

            clear_output(wait=True)   # Clear the status update.

        # One row per school, holding the fields from all of its pages.
        if scraped_school:
            records.append(scraped_school)
            record_ids.append(page_id)

        if (page_id - START_ID + 1) % CHUNKSIZE == 0:
            _flush(records, record_ids)
            records, record_ids = [], []
finally:
    # Save the trailing partial chunk, also when the run is interrupted or a
    # request fails, so the next run resumes without losing scraped schools.
    _flush(records, record_ids)

http://www.collegedata.com/cs/data/college/college_pg06_tmpl.jhtml?schoolId=5000
