In [41]:
import numpy as np
import pandas as pd
import re
from os.path import isfile
from requests import get
from IPython.core.display import clear_output
from bs4 import BeautifulSoup

# Headers sent with every request; the User-Agent is a desktop Safari string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/12.0 Safari/605.1.15"
    ),
}

# A page URL is built as BASE_URL_1 + <page number> + BASE_URL_2 + <school ID>.
BASE_URL_1 = "https://www.collegedata.com/cs/data/college/college_pg0"
BASE_URL_2 = "_tmpl.jhtml?schoolId="

# Bounds for the school IDs to scrape.
SCHOOL_ID_START = 1
SCHOOL_ID_END = 20

# Step between consecutive batch starts in the driver loop.
BATCH_SIZE = 1


def get_clean_string(tag):
    """Return the text inside ``tag`` as a single space-separated string.

    Every descendant ``<div>`` is unwrapped in place (replaced by its own
    contents). All other descendant tags are left untouched, so their text
    (for example the text of a link) is part of the result.

    Parameters
    ----------
    tag
        An element offering ``find_all(True)`` and ``stripped_strings``
        (used in this notebook with BeautifulSoup tags). It is modified in
        place when it contains ``<div>`` descendants.

    Returns
    -------
    str
        The stripped strings of ``tag`` joined with single spaces; an empty
        string when ``tag`` holds no text.
    """
    # find_all() hands back a list, so unwrapping while looping over it is
    # safe. An unwrapped <div> is already empty and detached from the tree,
    # hence nothing further has to be done with it.
    for child_tag in tag.find_all(True):
        if child_tag.name == 'div':
            child_tag.unwrap()

    return " ".join(tag.stripped_strings)


def scrape_row(tr_tag):
    """Scrape one table row into a single-entry dict ``{label: [values]}``.

    The label is the cleaned text of the first ``<th>`` found in ``tr_tag``.
    The values are the cleaned texts of every ``<td>`` found under that
    ``<th>``'s parent, in the order ``find_all`` returns them.

    Returns ``None`` when there is no ``<th>``, when the label is empty, when
    the ``<th>`` has no parent, or when no ``<td>`` is found.
    """
    headers = tr_tag.find_all('th')
    if not headers:
        return None

    header = headers[0]
    label = get_clean_string(header)
    if not label:
        return None

    # The cells are looked up from the <th>'s parent, not from tr_tag itself.
    row = header.parent
    if not row:
        return None

    cells = row.find_all('td')
    if not cells:
        return None

    return {label: [get_clean_string(cell) for cell in cells]}

    
def scrape_table(thead_tag):
    """Scrape a table that has ``<thead>`` and ``<tbody>`` into a flat dict.

    Column labels are taken from the ``<td>`` cells of ``thead_tag``. Each
    body row is read with ``scrape_row`` and stored in one of two ways:

    * Marker rows: when the row's cells hold exactly two distinct values and
      one of them is ``'X'``, the row label maps to the column label of the
      cell(s) holding ``'X'`` (the last one wins).
    * All other rows: every cell is stored under
      ``"<row label> - <column label>"``, or under the bare row label when
      the column has no label.

    Parameters
    ----------
    thead_tag
        The ``<thead>`` element of the table; its parent must provide the
        ``<tbody>``.

    Returns
    -------
    dict or None
        The scraped values, or ``None`` when the header has no ``<td>``, the
        ``<thead>`` has no parent, or the table has no ``<tbody>`` or no
        ``<tr>`` inside it.

    Raises
    ------
    IndexError
        If a body row has more ``<td>`` cells than the header.
    """
    # Column labels from <thead>. A header cell without a single string gets
    # None; a label that was already used is made unique by appending '*'.
    column_labels = []
    thead_td_tags = thead_tag.find_all('td')
    if not thead_td_tags:
        return None
    for thead_td_tag in thead_td_tags:
        column_label = thead_td_tag.string if thead_td_tag.string else None
        while column_label in column_labels:
            column_label = column_label + '*' if column_label else '*'
        column_labels.append(column_label)

    # A <thead> without a parent has no <tbody> to read; treat it like any
    # other table with missing structure instead of failing further down.
    table_tag = thead_tag.parent
    if not table_tag or not table_tag.tbody:
        return None
    tbody_tr_tags = table_tag.tbody.find_all('tr')
    if not tbody_tr_tags:
        return None

    scraped_table = {}
    for tr_tag in tbody_tr_tags:
        raw_scraped_row = scrape_row(tr_tag)
        if not raw_scraped_row:
            continue

        # Decide once per row whether it is a marker ('X') row.
        unique_vals = {
            raw_value
            for raw_values in raw_scraped_row.values()
            for raw_value in raw_values
        }
        is_marker_row = (len(unique_vals) == 2) and ('X' in unique_vals)

        scraped_row = {}
        for raw_label, raw_values in raw_scraped_row.items():
            for i, raw_value in enumerate(raw_values):
                column_label = column_labels[i]
                if is_marker_row:
                    if raw_value == 'X':
                        scraped_row[raw_label] = column_label
                else:
                    label = raw_label
                    if column_label:
                        label = raw_label + " - " + column_label
                    scraped_row[label] = raw_value

        scraped_table.update(scraped_row)

    return scraped_table




def add_dict(dict_1, dict_2):
    """Merge ``dict_2`` into ``dict_1`` in place without overwriting values.

    For each item of ``dict_2``:

    * if the key is not in ``dict_1``, the item is added unchanged;
    * if the key is in ``dict_1`` with an equal value, the item is skipped;
    * if the key is in ``dict_1`` with a different value, ``'*'`` is appended
      to the key until it is unused and the value is stored under that key.

    Items are written to ``dict_1`` as they are processed, so a starred key
    created for one item can never collide with another item of ``dict_2``.

    Parameters
    ----------
    dict_1 : dict
        Destination; modified in place.
    dict_2 : dict or None
        Source; ``None`` or an empty dict leaves ``dict_1`` untouched.

    Returns
    -------
    None
    """
    if not dict_2:
        return None

    for key, value in dict_2.items():
        if key in dict_1:
            if value != dict_1[key]:
                # Conflicting value: find the first free starred variant.
                while key in dict_1:
                    key += '*'
                dict_1[key] = value
        else:
            dict_1[key] = value

    return None

In [None]:
def preprocess_soup(soup):
    """Relabel and prune ``soup`` in place before it is scraped.

    Three passes are made, in this order:

    1. every string matching ``'Population'`` is replaced with the constant
       label ``'City Population'`` (the page labels it '{CITY} Population');
    2. the elements with id ``section10`` and ``section19`` are removed;
    3. for every ``<caption>`` matching one of the unwanted table captions,
       the caption's parent is removed.

    Returns ``None``; ``soup`` itself is modified.
    """
    # 1. Constant label for the city-dependent population row.
    for text_node in soup.find_all(string=re.compile('Population')):
        text_node.string.replace_with('City Population')

    # 2. Whole sections that are extraneous or irrelevant for analysis.
    for section in soup.find_all(id=['section10', 'section19']):
        section.decompose()

    # 3. Individual tables, identified by their caption text.
    caption_patterns = [
        re.compile(caption)
        for caption in ('Selection of Students',
                        'Undergraduate Majors',
                        'Intercollegiate Sports Offered')
    ]
    for caption_tag in soup.find_all('caption', string=caption_patterns):
        caption_tag.parent.decompose()

    return None

In [None]:
def scrape_page(soup):
    """Scrape one page into a flat ``{label: value}`` dict.

    The result always holds ``'Name'`` (the string of the page's ``<h1>``).
    Inside the element with id ``tabcontwrap``:

    1. every table that has a ``<thead>`` is scraped with ``scrape_table``
       and then removed from ``soup``, so that its rows are not scraped a
       second time by step 2;
    2. every remaining ``<tr>`` is scraped with ``scrape_row``, keeping only
       the first value of the row.

    Values are merged with ``add_dict``, i.e. existing entries are never
    overwritten.

    Parameters
    ----------
    soup
        Parsed page; it must have an ``<h1>``. It is modified in place.

    Returns
    -------
    dict
        The scraped values. When the page has no ``tabcontwrap`` element the
        dict holds only ``'Name'``.
    """
    scraped_page = {'Name': soup.h1.string}

    container = soup.find(id='tabcontwrap')
    if container is None:
        # Nothing to scrape besides the name.
        return scraped_page

    # Tables with <thead> and <tbody> tags, if any.
    for thead_tag in container.find_all('thead'):
        add_dict(scraped_page, scrape_table(thead_tag))

        # Delete the table from the soup. A <thead> nested in a table that
        # was removed in an earlier iteration no longer has a parent.
        if thead_tag.parent is not None:
            thead_tag.parent.decompose()

    # Remaining table rows.
    for tr_tag in container.find_all('tr'):
        scraped_row = scrape_row(tr_tag)
        if scraped_row:
            first_values = {label: values[0]
                            for label, values in scraped_row.items()}
            add_dict(scraped_page, first_values)

    return scraped_page

In [45]:
# (url, status code or error message) for every page that could not be scraped.
errors = []


def scrape_school(school_id):
    """Scrape pages 1 to 6 of one school and merge them into a single dict.

    Each page is requested, checked, cleaned with ``preprocess_soup`` and
    scraped with ``scrape_page``; the results are merged with ``add_dict``.
    Scraping of the school stops at the first page that fails a check, and
    the failure is printed and appended to the module-level ``errors`` list.

    Parameters
    ----------
    school_id
        ID of the school; it is converted with ``str`` and appended to the
        page URL.

    Returns
    -------
    dict
        The data of all pages scraped before the first failure; empty when
        the first page already failed.
    """
    scraped_school = {}

    for page_id in range(1, 7):

        # Build URL and send request.
        url = BASE_URL_1 + str(page_id) + BASE_URL_2 + str(school_id)
        result = get(url, headers=HEADERS)

        # Print status update.
        print("Scraping {}".format(result.url))

        # Check request errors. Here the status code itself is recorded.
        if result.status_code != 200:
            error = "Status Code {}".format(result.status_code)
            print("ERROR {} with {}".format(error, result.url))
            errors.append((result.url, result.status_code))
            break

        soup = BeautifulSoup(result.text, "lxml")

        # Check errors in returned HTML soup; the first failing check wins.
        # NOTE(review): EMPTY_TITLE is not defined in the cells shown in this
        # notebook; it has to exist in the kernel before this function runs.
        error = None
        if not soup.h1:
            error = "No heading tag."
        elif soup.h1.string == EMPTY_TITLE:
            error = "No school data."
        elif not soup.find(string='Content END'):
            error = "Page not fully loaded."

        if error:
            print("ERROR {} with {}".format(error, result.url))
            errors.append((result.url, error))
            break

        # Clean the soup.
        preprocess_soup(soup)

        # Scrape.
        scraped_page = scrape_page(soup)

        # Add current page data to all data for current school.
        add_dict(scraped_school, scraped_page)

        # Clear status update.
        clear_output(wait=True)

    # Return this school's data (not the unrelated global `scraped_batch`).
    return scraped_school




# Collect the result of every batch in one dict, then build a DataFrame from it.
scraped = {}
# NOTE(review): the range starts at 0 and SCHOOL_ID_START is not used here —
# confirm that this is intended.
for batch_start in range(0, SCHOOL_ID_END + 1, BATCH_SIZE):
    # NOTE(review): `scrape_batch` is not defined in the cells shown here; it
    # presumably returns a dict keyed by school ID — confirm.
    scraped_batch = scrape_batch(batch_start)
    scraped.update(scraped_batch)

# orient='index' makes every key of `scraped` a row label of the frame.
df = pd.DataFrame.from_dict(scraped, orient='index')

# Show the frame as the cell output.
df

Unnamed: 0,Name,Web Site,Institution Type,Coeducational,Undergraduate Students,Women,Men,Graduate Students,Entrance Difficulty,Overall Admission Rate,...,Activities and Organizations,ROTC,All Undergraduates,Full-Time Undergraduates,Average Age,All Graduate Students,Students Graduating Within 5 Years,Students Graduating Within 6 Years,Average Starting Salary,Disciplines Pursued
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,Bryn Athyn College,www.brynathyn.edu/,Private,Yes,326.0,165 (50.6%),161 (49.4%),4,Minimally difficult,88% of 241 applicants were admitted,...,"choral groups, drama theatre, c.a.r.e. (commun...","Army ROTC is offered off campus, Air Force ROT...",326.0,314.0,21.0,4,45.7%,48.0%,Not reported,Not reported
7,Adelphi University,www.adelphi.edu/,Private,Yes,5266.0,"3,603 (68.4%)","1,663 (31.6%)",2712,Moderately difficult,"73% of 11,851 applicants were admitted",...,"choral groups, drama theatre, radio station, s...","Army ROTC is offered off campus, Air Force ROT...",5266.0,4874.0,22.0,2712,66%,67.5%,"$57,000 per year",Not reported
8,Albany College of Pharmacy and Health Sciences,www.acphs.edu/,Private,Yes,1078.0,646 (59.9%),432 (40.1%),481,Moderately difficult,"67% of 1,583 applicants were admitted",...,"choral groups, national fraternities, american...",,1078.0,1055.0,21.0,481,76.7%,77.0%,Not reported,Not reported
9,Albertus Magnus College,www.albertus.edu/,Private,Yes,1220.0,814 (66.7%),406 (33.3%),335,Moderately difficult,67% of 780 applicants were admitted,...,"choral groups, drama theatre, student alumni a...",,1220.0,1034.0,30.0,335,40.4%,44.0%,Not reported,Not reported
10,Albright College,www.albright.edu/,Private,Yes,2015.0,"1,195 (59.3%)",820 (40.7%),21,Moderately difficult,"50% of 8,332 applicants were admitted",...,"choral groups, drama theatre, radio station, s...",,2015.0,1993.0,20.0,21,52.2%,53.0%,Not reported,Business 5% Law 9% Arts and Sciences 12% Medic...
