In [1]:
import re
from os.path import isfile

import pandas as pd
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from requests import get
from requests.exceptions import RequestException

# Browser-like request headers sent with every page request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/12.0 Safari/605.1.15"
    ),
}

# A page URL is BASE_URL_1 + <page id> + BASE_URL_2 + <school id>.
BASE_URL_1 = "https://www.collegedata.com/cs/data/college/college_pg0"
BASE_URL_2 = "_tmpl.jhtml?schoolId="

# <h1> text of the error page that carries no school data.
EMPTY_H1_HEADING = "Retrieve a Saved Search"

# Inclusive range of school ids scraped by default.
SCHOOL_ID_START = 1
SCHOOL_ID_END = 5000

# CSV file that scraped schools are appended to.
PATH = "data/test.csv"

In [2]:
def scrape_collegedata(start = SCHOOL_ID_START, stop = SCHOOL_ID_END):
    """Scrape every school id in ``[start, stop]`` and append it to ``PATH``.

    Each school for which ``scrape_school`` returns a non-empty dict is
    written as one CSV row, indexed by its school id under the column
    label ``SchoolId``. School ids that yield no data are skipped.

    Args:
        start: First school id to scrape (inclusive).
        stop: Last school id to scrape (inclusive).

    Returns:
        None. The result is the file written at ``PATH``.
    """
    for school_id in range(start, stop + 1):
        school = scrape_school(school_id)
        if not school:
            continue

        df = pd.DataFrame(school, index = [school_id])

        # Emit the header row only when the file does not exist yet.
        # `not` is required here: `~` on a bool is bitwise (-1 or -2, both
        # truthy), which wrote a header line before every single row.
        #
        # NOTE(review): rows are appended positionally under the first
        # school's header. scrape_school() can return a partial dict, so
        # schools with differing field sets will not line up with it.
        df.to_csv(PATH, mode = 'a+', header = not isfile(PATH),
                  index_label = 'SchoolId')
    return None

In [3]:
def scrape_school(school_id):
    """Collect the fields of one school from its pages 1 through 6.

    Pages are scraped in order and merged into a single dict, with later
    pages overwriting earlier ones on duplicate keys. Scraping stops at
    the first page that yields no fields; whatever was gathered before
    that point is returned (an empty dict if page 1 yields nothing).
    """
    merged = {}
    page_id = 1
    while page_id <= 6:
        fields = scrape_page(school_id, page_id)
        if fields:
            merged.update(fields)
            page_id += 1
        else:
            # An empty page ends the scrape for this school.
            break
    return merged

In [4]:
def scrape_page(school_id, page_id):
    """Scrape one page of one school into a flat ``{label: value}`` dict.

    The page is downloaded with ``get_soup``, normalised with
    ``clean_soup`` and then read twice: once for its header-less
    label/value rows and once for its tables that have a ``<thead>``.

    Args:
        school_id: Id of the school to scrape.
        page_id: Number of the page to scrape.

    Returns:
        Dict of scraped fields; empty when the page could not be loaded.
        On duplicate keys the table value overrides the row value.
    """
    soup = get_soup(school_id, page_id)
    cleaned_soup = clean_soup(soup)

    # Order matters: scrape_rows() deletes every <thead> table from the
    # soup it is handed, so the tables must be read *before* the rows.
    # Reading rows first left scrape_tables() with nothing to find.
    tables = scrape_tables(cleaned_soup)
    rows = scrape_rows(cleaned_soup)

    # Merge in the original order so tables still win on key collisions.
    page = {}
    page.update(rows)
    page.update(tables)
    return page

In [5]:
def get_soup(school_id, page_id, timeout = 30):
    """Download one page of one school and return it parsed, or None.

    Args:
        school_id: Id appended to the URL after ``BASE_URL_2``.
        page_id: Page number inserted between ``BASE_URL_1`` and
            ``BASE_URL_2``.
        timeout: Value passed to the HTTP request as its ``timeout``
            (seconds), so a stalled server cannot hang the scrape.

    Returns:
        The parsed page, or None when any of the following holds: the
        request raised, the status code is not 200, the page has no
        ``<h1>``, the ``<h1>`` equals ``EMPTY_H1_HEADING``, or the
        string ``'Content END'`` is absent from the page.
    """

    # Build URL and send request.
    url = BASE_URL_1 + str(page_id) + BASE_URL_2 + str(school_id)
    print("Scraping {}".format(url))
    try:
        result = get(url, headers = HEADERS, timeout = timeout)
    except RequestException:
        # Connection errors and timeouts are treated like every other
        # failure below: give up on this page instead of on the whole run.
        # TO DO: Add a proper way to handle or at least record this error.
        return None
    finally:
        # Clear the progress line whether or not the request succeeded.
        clear_output(wait = True)

    # Abort if received an unusual status code.
    if result.status_code != 200:
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    soup = BeautifulSoup(result.text, "lxml")

    # Abort if page does not have a <h1> header.
    if not soup.h1:
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    # Abort if <h1> header matches that of the error page with no school data.
    if soup.h1.string == EMPTY_H1_HEADING:
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    # Abort if the entire page HTML did not load.
    if not soup.find(string = 'Content END'):
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    return soup

In [6]:
def clean_soup(soup):
    """Edit a parsed page in place so scraped field labels are unambiguous.

    Deletes the sections and tables listed below and rewrites labels that
    would otherwise collide with each other or vary between schools. A
    section or label that a page does not contain is skipped rather than
    raising, so one unexpected page layout cannot abort a long scrape.

    Args:
        soup: Parsed page as returned by ``get_soup``, or None.

    Returns:
        The same ``soup`` object after modification, or None when
        ``soup`` is falsy.
    """

    if not soup:
        return None

    def relabel(tag, label):
        # Overwrite a tag's text; a label missing from the page is skipped.
        if tag is not None:
            tag.string = label

    def body_headers(table):
        # All <th> tags in the table's <tbody>; [] when either is absent.
        tbody = table.tbody if table is not None else None
        return tbody.find_all('th') if tbody is not None else []

    # Delete certain div sections by their id tags.
    div_ids = ['section19']
    for tag in soup.find_all(id = div_ids):
        tag.decompose()

    # Delete certain redundant tables by their caption strings.
    captions = ['Selection of Students',
                'Grade Point Average of Enrolled Freshmen',
                'SAT Scores of Enrolled Freshmen',
                'ACT Scores of Enrolled Freshmen',
                'Financial Aid Office',
                'Undergraduate Majors',
                'Intercollegiate Sports Offered']
    regexs = [re.compile(caption) for caption in captions]
    for tag in soup.find_all('caption', string = regexs):
        tag.parent.decompose()

    # Relabel the varying '{city_name} Population' tag with a constant label.
    # find(string=...) returns the text node itself, not its tag; assigning
    # to `.string` on a text node does not edit the document, so the node
    # has to be swapped out with replace_with().
    tag = soup.find(string = re.compile('Population'))
    if tag is not None:
        tag.replace_with('City Population')

    # Relabel some labels on first page to remove ambiguity with duplicates.
    tag = soup.find('th', string = 'Undergraduate Students')
    if tag:
        # Look everything up before relabelling anything, so the new
        # label texts cannot influence the searches.
        women = tag.find_next('th', string = re.compile('Women'))
        men = tag.find_next('th', string = re.compile('Men'))
        grads = tag.find_next('th', string = re.compile('Graduate Students'))

        relabel(tag, 'All Undergraduates')
        relabel(women, 'Undergrads (women)')
        relabel(men, 'Undergrads (men)')
        relabel(grads, 'All Graduate Students')

    # Add prefix to table labels to remove ambiguity with other fields.
    tag = soup.find('div', id = 'section7')
    if tag:
        for th_tag in body_headers(tag.table):
            label = " ".join(th_tag.stripped_strings)
            th_tag.string = 'Factor - ' + label

    # Add gender suffixes to duplicate field names.
    tag = soup.find('div', id = 'section8')
    if tag:
        women_regex = re.compile('Women')
        men_regex = re.compile('Men')

        # The first Women/Men pair is the admission rate; the next
        # Women/Men pair after it is the enrolled count.
        adm_rate_w = tag.find_next('th', string = women_regex)
        adm_rate_m = tag.find_next('th', string = men_regex)
        enrolled_w = None
        if adm_rate_w is not None:
            enrolled_w = adm_rate_w.find_next('th', string = women_regex)
        enrolled_m = None
        if adm_rate_m is not None:
            enrolled_m = adm_rate_m.find_next('th', string = men_regex)

        relabel(adm_rate_w, 'Overall Admission Rate (women)')
        relabel(adm_rate_m, 'Overall Admission Rate (men)')
        relabel(enrolled_w, 'Students Enrolled (women)')
        relabel(enrolled_m, 'Students Enrolled (men)')

    # Add appropriate markup to ambiguous need-based award labels.
    div_tag = soup.find('div', id = 'section11')
    if div_tag:
        captions = ['Freshmen', 'All Undergraduates']
        for caption in captions:
            cap_tag = div_tag.find('caption', string = re.compile(caption))
            if cap_tag is None:
                continue
            for tag in body_headers(cap_tag.parent):
                text = tag.string
                if text is None:
                    # `.string` is None when the cell has several children;
                    # fall back to its joined text instead of crashing.
                    text = " ".join(tag.stripped_strings)
                tag.string = text + ' (' + caption + ')'
                if tag.attrs == {'class': ['sub']}:
                    tag.string = 'Average Award - ' + tag.string

    # Add appropriate markup to ambiguous non-need based award labels.
    div_tag = soup.find('div', id = 'section12')
    if div_tag:
        caption = re.compile('Non-Need Awards')
        cap_tag = div_tag.find('caption', string = caption)
        if cap_tag is not None:
            for tag in body_headers(cap_tag.parent):
                if tag.attrs != {'class': ['sub']}:
                    tag.string = " ".join(tag.stripped_strings)
                    # Prefix the two headers that follow with this label.
                    # NOTE(review): [:-12] drops the label's last 12
                    # characters - presumably a fixed trailing word; confirm.
                    subtags = tag.find_all_next('th')[:2]
                    for subtag in subtags:
                        subtag.string = " ".join(subtag.stripped_strings)
                        subtag.string = tag.string[:-12] + " - " + subtag.string

    # Delete duplicate values.
    tag = soup.find('div', id = 'section26')
    if tag:
        strings = ['All Undergraduates','Women','Men']
        for string in strings:
            regex = re.compile(string)
            duplicate = tag.find_next('th', string = regex)
            if duplicate is not None and duplicate.parent is not None:
                duplicate.parent.decompose()

    return soup

In [7]:
def scrape_rows(soup):
    """Read the plain label/value rows of a page into a dict.

    The school name is taken from the page's ``<h1>`` and stored under
    ``'Name'``. Every ``<tr>`` inside the ``tabcontwrap`` div that has
    both a ``<th>`` and a ``<td>`` then contributes one entry, keyed by
    the text of its first ``<th>`` with the text of its first ``<td>``
    as the value.

    WARNING: this function modifies ``soup``. Every table that has a
    ``<thead>`` is deleted from it, and the contents of every remaining
    ``<th>``/``<td>`` are replaced by their whitespace-joined text. Code
    that needs the ``<thead>`` tables must read them before calling this.

    Args:
        soup: Parsed page, or None.

    Returns:
        Dict of scraped fields; empty when ``soup`` is falsy, and holding
        only ``'Name'`` when the page has no ``tabcontwrap`` div.
    """
    rows = {}
    if not soup:
        return rows

    rows['Name'] = soup.h1.string

    content = soup.find('div', id = 'tabcontwrap')
    if content is None:
        # No data wrapper on this page: nothing further to read.
        return rows

    # Tables with a <thead> are multi-column tables, not label/value rows;
    # remove them so their rows are not picked up below.
    for tag in content('thead'):
        tag.parent.decompose()

    # Collapse every cell to a single whitespace-normalised string.
    for tag in content(['th','td']):
        tag.string = " ".join(tag.stripped_strings)

    for tr in content('tr'):
        th_tag = tr.find('th')
        td_tag = tr.find('td')
        if th_tag is not None and td_tag is not None:
            rows[th_tag.string] = td_tag.string
    return rows

In [8]:
def scrape_tables(soup):
    """Read every table that has a ``<thead>`` into a flat dict.

    Column labels come from the ``<td>`` cells of the ``<thead>``; each
    ``<tr>`` of the table's ``<tbody>`` supplies a row label (its
    ``<th>``) and row values (its ``<td>`` cells). A row is stored in one
    of two ways:

    * If its values consist of exactly two distinct strings, one of them
      ``'X'``, the row is treated as a categorical variable:
      ``{row label: label of the first column marked 'X'}``.
    * Otherwise each value is stored under ``'<row label> - <column
      label>'``, or under the bare row label when the column label is
      empty.

    Tables without a ``<tbody>``, rows without a ``<th>`` and rows with an
    empty label are skipped. Cells beyond the last header column are
    treated as having an empty column label.

    Args:
        soup: Parsed page, or None.

    Returns:
        Dict of scraped fields; empty when ``soup`` is falsy or the page
        has no ``tabcontwrap`` div.
    """
    tables = {}
    if not soup:
        return tables

    content = soup.find('div', id = 'tabcontwrap')
    if content is None:
        return tables

    for thead in content('thead'):

        # Get column labels.
        col_labels = [" ".join(td_tag.stripped_strings)
                      for td_tag in thead('td')]

        tbody = thead.parent.tbody
        if tbody is None:
            continue

        # Get row labels and cell values.
        for tr_tag in tbody('tr'):

            # Get the row label; rows without one cannot be keyed.
            th_tag = tr_tag.th
            if th_tag is None:
                continue
            row_label = " ".join(th_tag.stripped_strings)
            if not row_label:
                continue

            # Get the row values.
            row_values = [" ".join(td_tag.stripped_strings)
                          for td_tag in tr_tag('td')]

            # Pad with empty labels so a row wider than the header
            # cannot index past the end of the column labels.
            padding = [''] * (len(row_values) - len(col_labels))
            labels = col_labels + padding

            # Determine if row val should be saved as categorical var.
            unique_vals = set(row_values)
            if (len(unique_vals) == 2) and ('X' in unique_vals):
                tables[row_label] = labels[row_values.index('X')]

            # Or else, append the column label to the row label.
            else:
                for col_label, row_value in zip(labels, row_values):
                    label = row_label
                    if col_label:
                        label = label + " - " + col_label
                    tables[label] = row_value

    return tables

In [None]:
# Run the scrape over the default id range (SCHOOL_ID_START through
# SCHOOL_ID_END), appending each school found to the CSV file at PATH.
scrape_collegedata()