In [22]:
import numpy as np
import pandas as pd
import re
from os.path import isfile
from requests import get
from IPython.core.display import clear_output
from bs4 import BeautifulSoup

# Request headers sent with every page request; the User-Agent string
# identifies the client as desktop Safari.
HEADERS = {
    "User-Agent": "".join((
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) ",
        "AppleWebKit/605.1.15 (KHTML, like Gecko) ",
        "Version/12.0 Safari/605.1.15",
    )),
}

# A page URL is assembled as BASE_URL_1 + <page id> + BASE_URL_2 + <school id>.
BASE_URL_1 = "https://www.collegedata.com/cs/data/college/college_pg0"
BASE_URL_2 = "_tmpl.jhtml?schoolId="

# <h1> text that scrape_page treats as "this school id has no data".
EMPTY_TITLE = "Retrieve a Saved Search"

# Range of school ids and batch size.
# NOTE(review): none of these three are referenced in the visible cells.
SCHOOL_ID_START, SCHOOL_ID_END = 1, 20

BATCH_SIZE = 1


def get_clean_string(tag):
    """Return the text found under ``tag`` as one space-separated string.

    Every descendant ``<div>`` is first unwrapped and then decomposed;
    descendants with any other tag name are left untouched. The stripped
    text fragments of ``tag`` are then joined with single spaces.

    Note that ``tag`` is modified in place.
    """
    # find_all(True) yields every descendant tag, whatever its name.
    for descendant in tag.find_all(True):
        if descendant.name != 'div':
            continue
        descendant.unwrap()
        descendant.decompose()

    return " ".join(tag.stripped_strings)


def scrape_row(tr_tag):
    """Scrape one table row into a single-entry ``{label: [values]}`` dict.

    The label is the cleaned text of the first ``<th>`` found under
    ``tr_tag``. The values are the cleaned text of every ``<td>`` found
    under that ``<th>``'s parent, in the order ``find_all`` returns them.

    Returns:
        ``{label: [value, ...]}``, or ``None`` when there is no ``<th>``,
        the label is empty, the ``<th>`` has no parent, or the parent
        holds no ``<td>``.
    """
    header_cells = tr_tag.find_all('th')
    if not header_cells:
        return None

    first_header = header_cells[0]
    label = get_clean_string(first_header)
    if not label:
        return None

    # The data cells are looked up from the header's own parent rather
    # than from tr_tag directly.
    row_tag = first_header.parent
    if not row_tag:
        return None

    data_cells = row_tag.find_all('td')
    if not data_cells:
        return None

    return {label: [get_clean_string(cell) for cell in data_cells]}

    
def scrape_table(thead_tag):
    """Scrape a table that has a ``<thead>``/``<tbody>`` into a flat dict.

    Column labels come from the ``<td>`` cells of ``thead_tag``. A cell
    with no string gets the label ``None``; a label that repeats an
    earlier one gets ``'*'`` appended until it is unique.

    Each ``<tr>`` of the sibling ``<tbody>`` is read with ``scrape_row``
    and flattened in one of two ways:

    * Checkmark row: the cells hold exactly two distinct values, one of
      them ``'X'``. The row label maps to the column label of the cell
      that holds ``'X'`` (the last one, if several do).
    * Any other row: each cell is stored under
      ``"<row label> - <column label>"``, or under the bare row label
      when the column label is empty.

    Args:
        thead_tag: the ``<thead>`` tag of the table to scrape.

    Returns:
        The flattened dict, or ``None`` when the header has no ``<td>``,
        the ``<thead>`` has no parent, the parent has no ``<tbody>``, or
        the ``<tbody>`` has no ``<tr>``.

    Raises:
        IndexError: if a body row holds more cells than the header.
    """
    header_cells = thead_tag.find_all('td')
    if not header_cells:
        return None

    column_labels = []
    for header_cell in header_cells:
        raw_label = header_cell.string
        # Convert to a plain str so the result does not keep a reference
        # to the parse tree through a NavigableString.
        column_label = str(raw_label) if raw_label else None
        while column_label in column_labels:
            column_label = column_label + '*' if column_label else '*'
        column_labels.append(column_label)

    # A detached <thead> has no table to read rows from.
    table_tag = thead_tag.parent
    if not table_tag or not table_tag.tbody:
        return None
    body_rows = table_tag.tbody.find_all('tr')
    if not body_rows:
        return None

    scraped_table = {}
    for tr_tag in body_rows:
        raw_scraped_row = scrape_row(tr_tag)
        if not raw_scraped_row:
            continue

        unique_vals = {
            value
            for values in raw_scraped_row.values()
            for value in values
        }
        is_checkmark_row = len(unique_vals) == 2 and 'X' in unique_vals

        for raw_label, raw_values in raw_scraped_row.items():
            for i, raw_value in enumerate(raw_values):
                if is_checkmark_row:
                    if raw_value == 'X':
                        scraped_table[raw_label] = column_labels[i]
                    continue

                column_label = column_labels[i]
                if column_label:
                    label = raw_label + " - " + column_label
                else:
                    label = raw_label
                scraped_table[label] = raw_value

    return scraped_table

In [23]:
def preprocess_soup(soup):
    """Relabel and prune ``soup`` in place before it is scraped.

    Three passes are made, in this order:

    1. Every string matching ``Population`` is replaced with the constant
       label ``'City Population'``.
    2. The elements with id ``section10`` and ``section19`` are removed.
    3. The parent of every ``<caption>`` whose string matches one of the
       unwanted captions is removed.

    Returns:
        ``None``; ``soup`` itself is modified.
    """
    # 1. Give the variable '{CITY} Population' label a constant name.
    population_pattern = re.compile('Population')
    for text_node in soup.find_all(string=population_pattern):
        text_node.string.replace_with('City Population')

    # 2. Drop whole sections that are not wanted for analysis.
    for section in soup.find_all(id=['section10', 'section19']):
        section.decompose()

    # 3. Drop individual tables, located through their captions.
    unwanted_captions = (
        'Selection of Students',
        'Undergraduate Majors',
        'Intercollegiate Sports Offered',
    )
    caption_patterns = [re.compile(text) for text in unwanted_captions]
    for caption in soup.find_all('caption', string=caption_patterns):
        caption.parent.decompose()

    return None

In [24]:
def scrape_page(school_id, page_id):
    """Request one page for one school and scrape it into a flat dict.

    The URL is built as BASE_URL_1 + page_id + BASE_URL_2 + school_id.
    Tables that have a ``<thead>`` are scraped with ``scrape_table`` and
    then removed from the soup. Every ``<tr>`` still left under the
    ``tabcontwrap`` element is scraped with ``scrape_row``, keeping only
    the first value of each row.

    Args:
        school_id: id placed in the ``schoolId`` query parameter.
        page_id: page number inserted into the URL.

    Returns:
        A dict that always holds ``'School Id'``. On failure it also
        holds ``'Error'`` with a message of the form ``"Page <id>: ..."``
        (and ``'Name'`` if the heading had been read by then). On success
        it holds ``'Name'`` plus one entry per scraped label.
    """
    scraped_page = {'School Id': school_id}

    def fail(message):
        # Report the problem, record it on the page dict and hand the
        # dict back so callers can simply `return fail(...)`.
        error = "Page {}: {}".format(page_id, message)
        print("ERROR {} with {}".format(error, result.url))
        scraped_page['Error'] = error
        return scraped_page

    # Build URL and send request.
    url = BASE_URL_1 + str(page_id) + BASE_URL_2 + str(school_id)
    result = get(url, headers=HEADERS)

    # Print status update.
    print("Scraping {}".format(result.url))

    # Check request errors.
    if result.status_code != 200:
        return fail("Status Code {}".format(result.status_code))

    soup = BeautifulSoup(result.text, "lxml")

    # The heading must be checked before it is read; reading it first
    # raises AttributeError on pages that have no <h1>.
    if not soup.h1:
        return fail("No heading tag.")

    # Store the name as a plain str (or None) rather than as a
    # NavigableString, which would keep the whole parse tree alive.
    heading = soup.h1.string
    name = str(heading) if heading is not None else None
    scraped_page['Name'] = name

    if name == EMPTY_TITLE:
        return fail("No school data.")

    if not soup.find(string='Content END'):
        return fail("Page not fully loaded.")

    # Clean the soup.
    preprocess_soup(soup)

    container = soup.find(id='tabcontwrap')
    if container is None:
        return fail("No data container.")

    # Scrape data from tables with <thead> and <tbody> tags, if any.
    for tag in container.find_all('thead'):
        scraped_table = scrape_table(tag)
        # scrape_table returns None when it finds nothing to scrape.
        if scraped_table:
            scraped_page.update(scraped_table)

        # Delete table from soup so its rows are not scraped again below.
        if tag.parent:
            tag.parent.decompose()

    # Scrape remaining table rows, keeping the first value of each.
    for tag in container.find_all('tr'):
        scraped_row = scrape_row(tag)
        if scraped_row:
            for label, values in scraped_row.items():
                scraped_page[label] = values[0]

    # Clear status update.
    clear_output(wait=True)

    return scraped_page

In [25]:
def merge_dicts(dict_1, dict_2):
    """Merge ``dict_2`` into ``dict_1`` without overwriting any value.

    A key that is new to ``dict_1`` is copied across. A key present in
    both with equal values is left alone. A key present in both with
    different values is stored under the key with ``'*'`` appended as
    many times as needed to reach a name not yet used in ``dict_1``.

    Returns:
        ``None`` if both arguments are empty or falsy; the other argument
        unchanged if exactly one is; otherwise ``dict_1``, which is
        updated in place.
    """
    if not dict_1:
        return dict_2 if dict_2 else None
    if not dict_2:
        return dict_1

    for key, incoming in dict_2.items():
        if key not in dict_1:
            dict_1[key] = incoming
            continue

        if dict_1[key] != incoming:
            # Conflict: keep both values by storing the new one under a
            # starred variant of the key.
            free_key = key
            while free_key in dict_1:
                free_key += '*'
            dict_1[free_key] = incoming

    return dict_1

In [28]:
def scrape_school(school_id):
    """Scrape pages 1 through 6 of one school and merge them into one dict.

    All pages are scraped first; the per-page dicts are then folded
    together, in page order, with ``merge_dicts``.
    """
    pages = [
        scrape_page(school_id, page_id)
        for page_id in np.arange(1, 7)  # stop is exclusive: pages 1..6
    ]

    merged = {}
    for page in pages:
        merged = merge_dicts(merged, page)

    return merged

In [35]:
# Scrape schools 6 through 9 (the range end is exclusive), keyed by
# school id. NOTE: the range is hard-coded here; the SCHOOL_ID_START /
# SCHOOL_ID_END constants are not consulted.
scraped = dict(
    (school_id, scrape_school(school_id))
    for school_id in range(6, 10)
)

Scraping https://www.collegedata.com/cs/data/college/college_pg06_tmpl.jhtml?schoolId=9


In [41]:
# One row per scraped school; the 'School Id' column is moved out of the
# columns and becomes the index.
df = (
    pd.DataFrame.from_dict(scraped, orient='index')
    .set_index('School Id')
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 6 to 9
Columns: 234 entries, Name to Disciplines Pursued
dtypes: object(234)
memory usage: 7.3+ KB
