In [1]:
import pandas as pd
import re
from os.path import isfile
from requests import get
from IPython.core.display import clear_output
from bs4 import BeautifulSoup

# Browser-like headers sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/12.0 Safari/605.1.15"
    ),
}

# A page URL is built as BASE_URL_1 + page id + BASE_URL_2 + school id.
BASE_URL_1 = "https://www.collegedata.com/cs/data/college/college_pg0"
BASE_URL_2 = "_tmpl.jhtml?schoolId="

# <h1> text of the page that is served when an id has no school behind it.
EMPTY_H1_HEADING = "Retrieve a Saved Search"

# Inclusive range of school ids to try.
SCHOOL_ID_START, SCHOOL_ID_END = 1, 5000

# Where scraped schools are accumulated, one row per school.
COLLEGEDATA_RAW_PATH = "data/collegedata_raw.csv"

---
## Main idea

Scrape a school, save the scraped data to CSV, repeat for all schools.

If some scraping has already been done, pick up where we last left off.

In [2]:
def scrape_collegedata(start = SCHOOL_ID_START, stop = SCHOOL_ID_END):
    """Scrape every school id from `start` to `stop` (inclusive) and save each one.

    If COLLEGEDATA_RAW_PATH already exists and holds at least one SchoolId,
    scraping resumes at the id after the largest one recorded there; in that
    case the `start` argument is overridden.

    Args:
        start: First school id to scrape when no previous results exist.
        stop: Last school id to scrape (inclusive).

    Returns:
        None. Results are written to COLLEGEDATA_RAW_PATH by save_to_csv().
    """
    # If a previously scraped csv exists, start the scraper where it left off.
    if isfile(COLLEGEDATA_RAW_PATH):
        # `usecols` must be list-like; selecting the column afterwards gives a
        # Series, so .max() is a scalar rather than a one-element Series.
        saved_ids = pd.read_csv(COLLEGEDATA_RAW_PATH,
                                usecols = ['SchoolId'])['SchoolId'].dropna()
        # A file with a header but no rows has no maximum; keep `start` then.
        if len(saved_ids):
            # range() needs a plain int, not a numpy scalar/float.
            start = int(saved_ids.max()) + 1

    # Scrape and save each school.
    for school_id in range(start, stop + 1):
        school = scrape_school(school_id)
        save_to_csv(school)

    return None

---
## Saving the scraped school
When saving, if a scraped file already exists, append the scraped school as a row. If the file doesn't exist (this is the first school), write column headers during the save, too.

If there was no school data scraped, then do nothing.

In [3]:
def save_to_csv(school):
    """Append one scraped school to COLLEGEDATA_RAW_PATH as a single CSV row.

    The header row is written only when the file does not exist yet. The
    school id is written as the row index (labelled 'SchoolId') in addition to
    the 'SchoolId' field already present in `school`.

    Args:
        school: Dict mapping field labels to scraped values; must contain the
            key 'SchoolId'. An empty dict (or None) means nothing was scraped.

    Returns:
        None.
    """
    if not school:
        return None

    # A dict of scalars needs a list-like index to become a one-row frame.
    df = pd.DataFrame(school, index = [school['SchoolId']])

    # Only the very first write creates the file, so only it emits the header.
    # NOTE(review): rows are appended positionally; schools whose field sets
    # differ from the first school will not line up with that header.
    first_write = not isfile(COLLEGEDATA_RAW_PATH)
    df.to_csv(COLLEGEDATA_RAW_PATH, mode = 'a', header = first_write,
              index_label = 'SchoolId')

    return None

---
## Scraping a school

Great, so all we have to do now is figure out how to scrape a school.

It turns out, a school's worth of information on CollegeData.com is actually six pages of data. To scrape a school, we must scrape all six of these pages.

If any of those pages fail to scrape, we'll abandon the whole thing and return nothing.

In [4]:
def scrape_school(school_id):
    """Collect the fields of pages 1 through 6 for one school into one dict.

    Pages are scraped in ascending order and merged, so a later page
    overwrites a label it shares with an earlier one. As soon as any page
    yields nothing, the whole school is abandoned.

    Args:
        school_id: Numeric id of the school to scrape.

    Returns:
        Dict of all scraped fields plus 'SchoolId', or an empty dict if any of
        the six pages came back empty.
    """
    merged = {}
    page_number = 1
    while page_number <= 6:
        fields = scrape_page(school_id, page_number)
        if fields:
            merged.update(fields)
        else:
            # One missing page invalidates the school.
            return {}
        page_number += 1

    merged['SchoolId'] = school_id
    return merged

---
## Scraping a page

To scrape a page, we'll need to get the Beautiful Soup object containing all the page's raw HTML.

Then we'll "clean" the soup with some preprocessing to make the extraction a bit easier.

We'll scrape the data coming from the common "rows" part of the pages that look like this:

And we'll separately handle the data coming from these "tables" parts of the pages that look like this:

We'll return what we found (which could be nothing).

In [22]:
def scrape_page(school_id, page_id):
    """Scrape one page of one school into a dict of label -> value.

    Args:
        school_id: Numeric id of the school.
        page_id: Which of the school's pages to scrape.

    Returns:
        Dict combining the row fields and the table fields of the page. On a
        shared label the table value wins. Empty if the page could not be
        fetched or held no data.
    """
    soup = get_soup(school_id, page_id)
    cleaned_soup = clean_soup(soup)

    # Order matters: scrape_rows() decomposes every table that has a <thead>
    # and flattens all cells in this very soup object, so the tables have to
    # be read before the rows are scraped or they are never found.
    tables = scrape_tables(cleaned_soup)
    rows = scrape_rows(cleaned_soup)

    # Merge rows first and tables second so tables keep precedence.
    page = {}
    page.update(rows)
    page.update(tables)
    return page

---
## Getting the soup

We'll need to construct the URL for the page given the school's id and the current page's id.

If we get a status code other than 200, we'll return None. Otherwise, we'll use Beautiful Soup to convert the response to a soup object.

If for some reason the page doesn't have a `<h1>` header tag (it should!), then this isn't a normal CollegeData page with useful data, so we'll stop and return None.

If the page does have an `<h1>` tag but it says "Retrieve a Saved Search", we know we've hit a CollegeData page corresponding to no school, and since there is no useful data, we'll stop and return None.

A fully loaded normal page with CollegeData content should, toward the end of the document, contain an HTML comment tag that says 'Content END'. If for some reason we were not served the entire page, we won't see this tag. If that happens, we should stop and return None.

If the request and soup pass all of these checks, we'll return the soup.

In [23]:
def get_soup(school_id, page_id, timeout = 30):
    """Request one CollegeData page and return it as a BeautifulSoup object.

    Args:
        school_id: Numeric id of the school.
        page_id: Which of the school's pages to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole crawl.

    Returns:
        The parsed soup, or None when the response status is not 200, the page
        has no <h1>, the <h1> text equals EMPTY_H1_HEADING, or the string
        'Content END' is not found in the page.

    Raises:
        Whatever `get` raises (including on timeout). It is deliberately not
        caught here: stopping lets the crawl be re-run and resumed instead of
        silently skipping a school.
    """

    # Build URL and send request.
    url = BASE_URL_1 + str(page_id) + BASE_URL_2 + str(school_id)
    print("Scraping {}".format(url))
    result = get(url, headers = HEADERS, timeout = timeout)
    # wait = True delays clearing until the next message is printed.
    clear_output(wait = True)

    # Abort if received an unusual status code.
    if result.status_code != 200:
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    soup = BeautifulSoup(result.text, "lxml")

    # Abort if page does not have a <h1> header.
    if not soup.h1:
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    # Abort if <h1> header matches that of the error page with no school data.
    if soup.h1.string == EMPTY_H1_HEADING:
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    # Abort if the entire page HTML did not load.
    # NOTE(review): this is an exact match on the string's text; confirm the
    # marker on the live pages carries no surrounding whitespace.
    if not soup.find(string = 'Content END'):
        # TO DO: Add a proper way to handle or at least record this error.
        return None

    return soup

---
## Cleaning the soup

First, if there is no soup, then return None.

We're going to delete some sections we don't need.

We also want to rename some labels that are either ambiguous or shared.

In [5]:
def clean_soup(soup):
    """Prepare a page's soup for scraping.

    Unwanted sections are removed first, then ambiguous labels are renamed.

    Args:
        soup: The soup of a page, or a falsy value when no page was fetched.

    Returns:
        The cleaned soup; a falsy `soup` is handed back untouched.
    """
    # Nothing to clean.
    if not soup:
        return soup

    return rename_labels(delete_tags(soup))

---
## Deleting parts of the HTML
First, we'll see if the page has this section (found by the unique  `id = 'section19'` in the HTML):


If it does, we'll delete it because...

In addition to that section, we'll also delete these sections:


I decided this because ...

These sections can be identified and removed by the unique strings in their `<caption>` tags.

In [None]:
def delete_tags(soup):
    """Remove the parts of a page that should not be scraped.

    Three things are removed, each only if present:
      * every element whose id is 'section19';
      * after the 'section26' div, the parent of the next <th> matching each
        of 'All Undergraduates', 'Women' and 'Men' (duplicate values);
      * the parent of every <caption> matching one of the listed table titles.

    Args:
        soup: The soup of a page. It is modified in place.

    Returns:
        The same soup object.
    """
    # Delete certain div sections by their id tags.
    div_ids = ['section19']
    for tag in soup.find_all(id = div_ids):
        tag.decompose()

    # Delete duplicate values.
    section = soup.find('div', id = 'section26')
    if section:
        for label in ['All Undergraduates', 'Women', 'Men']:
            th_tag = section.find_next('th', string = re.compile(label))
            # A page may lack one of these labels; skip it rather than crash.
            if th_tag is not None and th_tag.parent is not None:
                th_tag.parent.decompose()

    # Delete certain redundant tables by their caption strings.
    captions = ['Selection of Students',
                'Grade Point Average of Enrolled Freshmen',
                'SAT Scores of Enrolled Freshmen',
                'ACT Scores of Enrolled Freshmen',
                'Financial Aid Office',
                'Undergraduate Majors',
                'Intercollegiate Sports Offered']
    patterns = [re.compile(caption) for caption in captions]
    for cap_tag in soup.find_all('caption', string = patterns):
        # The parent can already be gone if an enclosing tag was decomposed.
        if cap_tag.parent is not None:
            cap_tag.parent.decompose()

    return soup

---
## Renaming labels

Consider this label:


Since this label changes every page depending on the city name, we'll want to standardize it by renaming it to 'City Population'.


We'll also want to rename these labels to remove ambiguity:


On this table, we'll want to add some prefixes to avoid ambiguity with other fields:


These labels need gender suffixes to avoid clashing:


The awards section needs serious relabelling:



In [24]:
def rename_labels(soup):
    """Rename page labels that vary between schools or clash with each other.

    Every step is applied only when the elements it needs are present, so a
    page that lacks a section or a label is left as it is for that step.

    Args:
        soup: The soup of a page. It is modified in place.

    Returns:
        The same soup object.
    """

    def _relabel(tag, text):
        # Set a tag's label, ignoring tags that were not found.
        if tag is not None:
            tag.string = text

    def _body_headers(table_tag):
        # <th> tags inside a table's <tbody>, or [] if either is missing.
        body = table_tag.tbody if table_tag is not None else None
        return body.find_all('th') if body is not None else []

    # Relabel the varying '{city_name} Population' tag with a constant label.
    # find(string=...) returns the text node itself, and assigning `.string`
    # on a text node does not change the document, so it has to be replaced.
    text_node = soup.find(string = re.compile('Population'))
    if text_node is not None:
        text_node.replace_with('City Population')

    # Relabel some labels on first page to remove ambiguity with duplicates.
    undergrads = soup.find('th', string = 'Undergraduate Students')
    if undergrads:
        # Look everything up before renaming anything.
        women = undergrads.find_next('th', string = re.compile('Women'))
        men = undergrads.find_next('th', string = re.compile('Men'))
        grads = undergrads.find_next('th',
                                     string = re.compile('Graduate Students'))

        _relabel(undergrads, 'All Undergraduates')
        _relabel(women, 'Undergrads (women)')
        _relabel(men, 'Undergrads (men)')
        _relabel(grads, 'All Graduate Students')

    # Add prefix to table labels to remove ambiguity with other fields.
    section = soup.find('div', id = 'section7')
    if section:
        for th_tag in _body_headers(section.table):
            label = " ".join(th_tag.stripped_strings)
            th_tag.string = 'Factor - ' + label

    # Add gender suffixes to duplicate field names.
    section = soup.find('div', id = 'section8')
    if section:
        women_re = re.compile('Women')
        men_re = re.compile('Men')
        adm_rate_w = section.find_next('th', string = women_re)
        adm_rate_m = section.find_next('th', string = men_re)
        # The enrolment labels are the next matches after the admission ones.
        enrolled_w = None
        if adm_rate_w is not None:
            enrolled_w = adm_rate_w.find_next('th', string = women_re)
        enrolled_m = None
        if adm_rate_m is not None:
            enrolled_m = adm_rate_m.find_next('th', string = men_re)

        _relabel(adm_rate_w, 'Overall Admission Rate (women)')
        _relabel(adm_rate_m, 'Overall Admission Rate (men)')
        _relabel(enrolled_w, 'Students Enrolled (women)')
        _relabel(enrolled_m, 'Students Enrolled (men)')

    # Add appropriate markup to ambiguous need-based award labels.
    section = soup.find('div', id = 'section11')
    if section:
        for caption in ['Freshmen', 'All Undergraduates']:
            cap_tag = section.find('caption', string = re.compile(caption))
            if cap_tag is None:
                continue
            for th_tag in _body_headers(cap_tag.parent):
                text = th_tag.string
                if text is None:
                    # .string is None when the header has several children.
                    text = " ".join(th_tag.stripped_strings)
                label = text + ' (' + caption + ')'
                if th_tag.attrs == {'class': ['sub']}:
                    label = 'Average Award - ' + label
                th_tag.string = label

    # Add appropriate markup to ambiguous non-need based award labels.
    section = soup.find('div', id = 'section12')
    if section:
        cap_tag = section.find('caption',
                               string = re.compile('Non-Need Awards'))
        if cap_tag is not None:
            for th_tag in _body_headers(cap_tag.parent):
                if th_tag.attrs != {'class': ['sub']}:
                    th_tag.string = " ".join(th_tag.stripped_strings)
                    # The two headers that follow get this header as prefix.
                    # NOTE(review): [:-12] drops the last 12 characters of
                    # the header; presumably a fixed trailing phrase - confirm.
                    for subtag in th_tag.find_all_next('th')[:2]:
                        sub_label = " ".join(subtag.stripped_strings)
                        subtag.string = (th_tag.string[:-12] + " - "
                                         + sub_label)

    return soup

---
## Scraping rows

With our soup finally cleaned, we're ready to scrape all the rows.

First, we'll save the `<h1>` header string as the 'Name' of the school.

We'll descend into the `tabcontwrap` `<div>` part of the page, which contains all the data.

We'll delete any tables (since we're only scraping data from rows).

For all remaining `<th>` headers and `<td>` data tags, we'll compress their potentially complicated and split up string contents into a single simple string.

Then for each `<tr>` row they are in, we'll set the `<th>` string as the label and the `<td>` string as the value and save them to the rows dictionary, which we'll return.

In [25]:
def scrape_rows(soup):
    """Extract label/value pairs from the plain rows of a page.

    Note that `soup` is modified: the parent of every <thead> is removed and
    the content of every <th>/<td> is replaced by its text joined with single
    spaces.

    Args:
        soup: The cleaned soup of a page, or a falsy value.

    Returns:
        Dict mapping the first <th> text of each row to its first <td> text,
        for rows that have both. Empty when `soup` is falsy.
    """
    rows = {}
    if not soup:
        return rows

    # Tables (anything owning a <thead>) are not row data; drop them.
    for thead in soup('thead'):
        thead.parent.decompose()

    # Collapse each cell into one simple string.
    for cell in soup(['th', 'td']):
        cell.string = " ".join(cell.stripped_strings)

    # A row contributes only when it has a header cell and a data cell.
    for tr in soup('tr'):
        header = tr.find('th')
        data = tr.find('td')
        if header is None or data is None:
            continue
        rows[header.string] = data.string

    return rows

---
## Scraping tables



In [26]:
def scrape_tables(soup):
    """Extract the values of every table (element with a <thead>) on a page.

    Column labels come from the <td> cells of the <thead>; each body row's
    label comes from its <th>. A row whose cells hold exactly two distinct
    values, one of them 'X', is stored as `row label -> label of the column
    holding the first 'X'`. Any other row is stored cell by cell under
    `row label - column label` (just the row label when the column label is
    empty or missing).

    Args:
        soup: The cleaned soup of a page, or a falsy value.

    Returns:
        Dict of label -> value. Empty when `soup` is falsy or the page has no
        div with id 'tabcontwrap'. Tables without a <tbody> and rows without
        a <th> or with an empty label are skipped.
    """
    tables = {}
    if not soup:
        return tables

    content = soup.find('div', id = 'tabcontwrap')
    if content is None:
        return tables

    for thead in content('thead'):

        # Get column labels.
        col_labels = [" ".join(td_tag.stripped_strings)
                      for td_tag in thead('td')]

        table = thead.parent
        tbody = table.tbody if table is not None else None
        if tbody is None:
            continue

        for tr_tag in tbody('tr'):

            # Get the row label.
            th_tag = tr_tag.th
            if th_tag is None:
                continue
            row_label = " ".join(th_tag.stripped_strings)
            if not row_label:
                continue

            # Get the row values.
            row_values = [" ".join(td_tag.stripped_strings)
                          for td_tag in tr_tag('td')]

            # Determine if row val should be saved as categorical var.
            unique_vals = set(row_values)
            marked = None
            if (len(unique_vals) == 2) and ('X' in unique_vals):
                marked = row_values.index('X')

            if marked is not None and marked < len(col_labels):
                tables[row_label] = col_labels[marked]

            # Or else, append the column label to the row label.
            else:
                for j, row_value in enumerate(row_values):
                    label = row_label
                    # A row may have more cells than there are column labels.
                    if j < len(col_labels) and col_labels[j]:
                        label = label + " - " + col_labels[j]
                    tables[label] = row_value

    return tables

In [76]:
# Run the scrape over the default id range (SCHOOL_ID_START to SCHOOL_ID_END);
# if COLLEGEDATA_RAW_PATH already exists the function resumes from it.
scrape_collegedata()

Scraping https://www.collegedata.com/cs/data/college/college_pg01_tmpl.jhtml?schoolId=5000


In [4]:
USNEWS_HTML_PATH = 'data/usnews.html'
USNEWS_RAW_CSV_PATH = 'data/usnews_raw.csv'
# Attribute that identifies a result row in the saved rankings page.
ROW_ID_ATTR = {"data-view":"colleges-search-results-table-row"}

# Hand Beautiful Soup the raw bytes so the encoding is detected from the
# document instead of depending on this machine's default text encoding.
with open(USNEWS_HTML_PATH, 'rb') as file:
    page = BeautifulSoup(file, "lxml")

scraped = []
for row in page('tr', attrs = ROW_ID_ATTR):
    # One entry per non-empty piece of text in the row, in document order.
    vals = list(row.stripped_strings)
    # Drop the first stand-alone '(tie)' and '1' entries so they do not shift
    # the columns. NOTE(review): '1' is presumably a footnote marker - confirm.
    if '(tie)' in vals:
        vals.remove('(tie)')
    if '1' in vals:
        vals.remove('1')
    # Keep only the first three entries of each row.
    scraped.append(vals[0:3])

ranks_df = pd.DataFrame(scraped, columns = ['Name','Location','Rank Info'])
ranks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 3 columns):
Name         1400 non-null object
Location     1400 non-null object
Rank Info    1400 non-null object
dtypes: object(3)
memory usage: 32.9+ KB


Save the raw scraped rankings to a .csv for later cleaning:

In [5]:
# index = False leaves the frame's row index out of the saved file.
ranks_df.to_csv(USNEWS_RAW_CSV_PATH, index = False)