# Scraping collegedata.com with Qt Webkit


In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

import requests
import time
from bs4 import BeautifulSoup
import re
import TIdatabase as ti

Collegedata.com is a website where we can find a lot of data in almost standardized form. The website has a page for each university, containing a list of profiles of students who applied there. An example of such a page can be found [here](http://www.collegedata.com/cs/admissions/admissions_tracker_result.jhtml?schoolId=444&classYear=2020). The website also has a [page](http://www.collegedata.com/cs/admissions/admissions_profile_view.jhtml?profileName=FutureMD98) for each student profile. These pages contain the information we need: where the student applied and whether they got in, personal information like gender and race, and academic information like test scores. Our scraping process consists of two parts. First, we visit the pages for each university in our list to get a list of student profile names. In part two, we visit each student page, scrape the information and add it to a dataframe.

The URL of a university page is as follows:
http://www.collegedata.com/cs/admissions/admissions_tracker_result.jhtml?schoolId=444&classYear=2020  
Every university has an ID number. Since we are considering a list of 25 fixed universities, we will hardcode these ID's in a dictionary. The second parameter for the link is the graduation year of the applicants. 

The URL of a student profile page is as follows:
http://www.collegedata.com/cs/admissions/admissions_profile_view.jhtml?profileName=FutureMD98  
Once we have a list of profile names from part 1, we can easily create each of these urls.

## Part 1: Getting a list of student profile names

Below, we define a dictionary and some lists with information on the universities.
- `college_ids` is a list of university names that serve as university ID in our university dataframe (defined in `TIdatabase.py`).
- `college_urls` is the list of numeric university ID's used by collegedata.com, in the same order as in `college_ids`.
- `college_id_dict` is a dictionary with the university names as key and the numeric ID's as value and can be used to create the collegedata.com urls.  

Finally, we define the base urls for the pages that we want to scrape. 

In [2]:
# University names that serve as university ID in our own dataframes
college_ids = [
    'Princeton', 'Harvard', 'Yale', 'Columbia', 'Stanford',
    'UChicago', 'MIT', 'Duke', 'UPenn', 'CalTech',
    'JohnsHopkins', 'Dartmouth', 'Northwestern', 'Brown', 'Cornell',
    'Vanderbilt', 'WashU', 'Rice', 'NotreDame', 'UCB',
    'Emory', 'Georgetown', 'CarnegieMellon', 'UCLA', 'USC',
]
# Numeric school IDs used by collegedata.com, in the same order as college_ids
college_urls = [
    111, 444, 244, 399, 781,
    327, 186, 1026, 67, 706,
    1509, 403, 1803, 163, 787,
    1562, 1720, 731, 1774, 1090,
    1039, 1182, 204, 1093, 1138,
]
# University name -> numeric collegedata.com ID
college_id_dict = {name: number for name, number in zip(college_ids, college_urls)}
# Building blocks of the urls we scrape: common prefix, university page, student page
baseurl = 'http://www.collegedata.com/cs/admissions/'
tracker_url = 'admissions_tracker_result.jhtml?schoolId='
student_url = 'admissions_profile_view.jhtml?profileName='

Scraping the webpage of a university on collegedata.com requires a special approach, as the data itself is loaded by Javascript and therefore cannot simply be found in the html code that we get when using `requests`.  The following code was found in a [blog post](https://webscraping.com/blog/Scraping-multiple-JavaScript-webpages-with-webkit/) and helps to load the page before we scrape it. When creating an object of the class `Render`, we pass a list of urls as parameter. The object then uses webkit to process the Javascript on each webpage in the list before accessing the resulting html code. We embed the function `scrape`, which immediately obtains the list of profile names from each webpage. Each profile name in the code is part of the `href` in an `<a>` tag as follows:

`<a style="..." rel="..." href="javascript:enterProfileByName('rorygilmore')"></a>`

In [3]:
## Source: https://webscraping.com/blog/Scraping-multiple-JavaScript-webpages-with-webkit/
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Render(QWebPage):
    """Load a list of urls one by one in a QtWebKit page and collect profile names.

    The constructor starts the Qt event loop and only returns once every url
    has been loaded and scraped. Afterwards `profileList` holds the set of
    profile names found in `enterProfileByName('...')` links on those pages.

    Args:
        urls: list of url strings to load. The list is copied, so the caller's
            list is left untouched.
    """

    def __init__(self, urls):
        # Qt allows a single QApplication per process: reuse an existing one so
        # that a second Render can be created in the same session.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        # Work on a copy: crawl() consumes the list with pop(0)
        self.urls = list(urls)
        self.profileList = set()
        # Without any url nothing would ever stop the event loop, so only
        # start it when there is something to load.
        if self.urls:
            self.crawl()
            self.app.exec_()

    def crawl(self):
        """Start loading the next url, or stop the event loop when none is left."""
        if self.urls:
            url = self.urls.pop(0)
            self.mainFrame().load(QUrl(url))
        else:
            self.app.quit()

    def _loadFinished(self, result):
        """Slot for `loadFinished`: scrape the loaded page, then move on.

        Args:
            result: success flag passed by the `loadFinished` signal.
        """
        frame = self.mainFrame()
        url = str(frame.url().toString())
        if not result:
            # Report the failure but still scrape whatever html is available
            sys.stderr.write('Warning: page did not load successfully: %s\n' % url)
        html = frame.toHtml()
        self.scrape(url, html)
        self.crawl()

    # Once we have the html code with processed Javascript, we can parse it to find the profile names
    def scrape(self, url, html):
        """Add every profile name found in `html` to `profileList`.

        Args:
            url: url of the page that was loaded (not used for parsing).
            html: page source as returned by `QWebFrame.toHtml()`.
        """
        soup = BeautifulSoup(str(html.toAscii()), 'html.parser')  # we create a beautiful soup object
        # find all <a> tags whose href contains the string 'enterProfileByName'
        profiles = soup.find_all("a", href=re.compile(r"enterProfileByName"))
        for p in profiles:
            # the profile name is the text between the apostrophes of the href
            self.profileList.add(p.get("href").split("'")[1])

Now, all we have to do to obtain our list of profile names is to create a list of all university page urls. 

The next cell finds the profile names for all 25 universities and years 2009-2019. This code takes about 55 minutes.

In [5]:
%%time
urls=[]
for school in college_id_dict:
    schoolurl=baseurl+tracker_url+str(college_id_dict[school])+'&classYear='
    for year in range(2009,2020):
        urls.append(schoolurl+str(year))
r=Render(urls)
print 'We found ', len(r.profileList), 'students'

We found  5028 students
CPU times: user 22min 18s, sys: 4min 20s, total: 26min 39s
Wall time: 38min 40s


Two of these profile names turn out to be useless as their webpages are empty so we remove them.

In [6]:
# these are bad profiles without any information
r.profileList.remove('orangecat')
r.profileList.remove('j7Wa4')
r.profileList.remove('EDz3k')
print 'We found ', len(r.profileList), 'students'

We found  5026 students


## Part 2: Scraping student profile pages

Now that we have a list of profile names (`r.profileList`), we can easily create the urls of each student profile page. This time, we can find our data directly in the html so we don't need any Javascript processing. 

We start by defining a series of dictionaries that we will need to convert strings we find in the html code to the  information we need. For a full description of the dataframe columns and types, we refer to `TIdatabase.py` and this [Google doc](https://docs.google.com/spreadsheets/d/1dm73Vmov8bhNoVRUtyg6TU-IgE7DPDVlukMkvnaCqAg/edit#gid=0&vpid=A1).

We will create two dataframes. One contains all the information for each individual student. The other combines student IDs and university IDs and contains the information about that student's application to that university. The dataframe columns can be seen below:

In [7]:
# Columns of the dataframe with one row per student
columns_student = [
    'classrank',
    'admissionstest',
    'AP',
    'averageAP',
    'SATsubject',
    'GPA',
    'GPA_w',
    'program',
    'schooltype',
    'intendedgradyear',
    'addInfo',
    'canAfford',
    'female',
    'MinorityGender',
    'MinorityRace',
    'international',
    'firstinfamily',
    'sports',
    'artist',
    'workexp',
]
# Columns of the dataframe with one row per application of a student to a university
columns_uni = [
    'collegeID',
    'earlyAppl',
    'visited',
    'alumni',
    'outofstate',
    'acceptStatus',
    'acceptProb',
]

For indicator columns, we will assign -1 for False, 1 for True and 0 when the information is unavailable or inconclusive. 

We use [this](https://www.act.org/solutions/college-career-readiness/compare-act-sat/) webpage to convert ACT composite scores to SAT critical reading and math (CR+M) scores.

In [8]:
# Gender string -> indicator for the dataframe column 'female'
genderdict = {
    'Male': -1,
    'Female': 1,
}
# High school type string -> indicator for the dataframe column 'schooltype'
highschooldict = {
    'Public': -1,
    'Private': 1,
    'Parochial': 1,
    'Home-Schooled': 1,
}
# Words we associate with underrepresented minority races (dataframe column 'MinorityRace')
minoritylist = [
    'african', 'hispanic', 'latin', 'indian', 'native', 'black',
    'mexican', 'puerto', 'alaska', 'hawai', 'pacific island',
]
# SAT scores from https://www.act.org/solutions/college-career-readiness/compare-act-sat/
# listed for ACT composite scores 36 (first entry) down to 11 (last entry)
sats = [
    1600, 1560, 1510, 1460, 1420, 1380, 1340, 1300, 1260, 1220,
    1190, 1150, 1110, 1070, 1030, 990, 950, 910, 870, 830,
    790, 740, 690, 640, 590, 530,
]
# ACT composite score -> SAT CR+M score: the entry at position i belongs to ACT score 36 - i
act2satdict = {36 - position: sat for position, sat in enumerate(sats)}
# Generic indicator for checkbox-like columns in the webpage, for example 'Athlete' or 'Alumni'
booleandict = {
    '': -1,
    'X': 1,
}
# Admission status string -> indicator for the dataframe column 'acceptStatus'
statusdict = {
    'Will Attend': 1,
    'Accepted': 1,
    'Applied': 0,
    'Deferred': -1,
    'Denied': -1,
    'Not Applied': 0,
    'Wait-Listed': -1,
    'Withdrawn': 0,
    'Pending': -1,
}
# University names as they are written on collegedata.com
uni_list = [
    'Princeton University',
    'Harvard College',
    'Yale University',
    'Columbia University',
    'Stanford University',
    'University of Chicago',
    'Massachusetts Institute of Technology',
    'Duke University',
    'University of Pennsylvania',
    'California Institute of Technology',
    'Johns Hopkins University',
    'Dartmouth College',
    'Northwestern University',
    'Brown University',
    'Cornell University',
    'Vanderbilt University',
    'Washington University in St. Louis',
    'Rice University',
    'University of Notre Dame',
    'University of California, Berkeley',
    'Emory University',
    'Georgetown University',
    'Carnegie Mellon University',
    'University of California, Los Angeles',
    'University of Southern California',
]
# collegedata.com university name -> university name used in our dataframe
uni_name_dict = {site_name: our_name for site_name, our_name in zip(uni_list, college_ids)}
# State abbreviation of every university, in the same order as uni_list
uni_state = [
    'NJ', 'MA', 'CT', 'NY', 'CA',
    'IL', 'MA', 'NC', 'PA', 'CA',
    'MD', 'NH', 'IL', 'RI', 'NY',
    'TN', 'MO', 'TX', 'IN', 'CA',
    'GA', 'DC', 'PA', 'CA', 'CA',
]
# collegedata.com university name -> state abbreviation of that university
uni_state_dict = {site_name: state_abbr for site_name, state_abbr in zip(uni_list, uni_state)}
# Full state name as written on the profile page -> abbreviation.
# 'Other' is kept as is; getColumnValues uses it to flag international students.
states_dict = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'National': 'NA',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    'Other': 'Other',
}

Next, we define some functions to help convert the html strings to the right form for our dataframes.

In [9]:
# getFromDict: general function that looks up a key in a dictionary and returns the value if available and 0 if not available
#              input:      dictionary = any dictionary for indicators
#                          text = key to look up in dictionary
#              output:     if the dictionary has text as key, output = dictionary[text], else output=0
def getFromDict(dictionary,text):
    # dict.get replaces the has_key() check, which only exists in Python 2
    return dictionary.get(text, 0)
# isMinority: tell whether a race description mentions one of the words in minoritylist
#              input:      text = user-supplied string describing a race
#              output:     output = 1 if any entry of minoritylist is a substring of text, else output = -1
def isMinority(text):
    mentions_minority = any(word in text for word in minoritylist)
    return 1 if mentions_minority else -1
# getScores: reduce the scores found in a list of html elements to a single metric
#              input:      scores = list of html elements; the text of each one is a score or empty
#                          fun = function that is applied to the list of numerical scores
#              output:     (fun(list of integer scores), number of scores) when at least one
#                          non-empty score is present, otherwise (None, 0)
def getScores(scores,fun):
    stripped = [cell.get_text().strip() for cell in scores] # text of every element without surrounding whitespace
    numbers = [int(entry) for entry in stripped if entry != ''] # empty entries are dropped, the rest become integers
    if not numbers:
        return None, 0
    return fun(numbers), len(numbers)
# getAdmissionTestScore: obtain one single admission test score, given SAT and ACT scores
#              input:      doc = list of html elements; doc[0:2] hold the SAT CR and M scores,
#                          doc[2] the SAT W score and doc[4] the ACT score
#              output:     the larger of the SAT CR+M score and the ACT score translated with
#                          act2satdict, plus the SAT W score.
#                          output = None if no such score can be formed: the SAT W score is
#                          missing (a warning is printed), or neither an SAT CR+M score nor an
#                          ACT score that occurs in act2satdict is available.
#                          (These cases used to raise a TypeError or KeyError.)
def getAdmissionTestScore(doc):
    satCRM, _ = getScores(doc[0:2], sum) # Get the sum of the SAT CR and M scores
    satW, _ = getScores([doc[2]], np.max) # Get the SAT W score
    act, _ = getScores([doc[4]], np.max) # Get the ACT score
    if satW is None: # every combined score includes the writing score
        print("Warning: no SAT writing score")
        return None
    # Translate the ACT score to an SAT CR+M score; None if there is no ACT score
    # or if it is not one of the scores in act2satdict
    actCRM = act2satdict.get(act) if act is not None else None
    candidates = [score for score in (satCRM, actCRM) if score is not None]
    if not candidates:
        return None
    # if both are available, we use the maximum of the translated ACT score and the SAT CR+M score
    return max(candidates) + satW
# getCanAfford: derive the indicator 'canAfford' from the financial support field
#              input:      text = string from the html that indicates whether a student applied for financial support
#              output:     output = -1 if text contains 'Yes' (the student applied for financial support),
#                          output = 1 if text contains 'No' but not 'Yes' (the student can probably afford tuition),
#                          output = 0 otherwise (information unavailable)
def getCanAfford(text):
    # 'Yes' is checked first, so it wins when both markers occur in the text
    for marker, indicator in (('Yes', -1), ('No', 1)):
        if marker in text:
            return indicator
    return 0
# removePunct: clean up the text of string-based columns
#              input:      text = string to clean
#              output:     the string with every run of punctuation/underscores replaced by a space
#                          and every run of whitespace collapsed to one space, encoded as latin-1
def removePunct(text):
    without_punct = re.sub(r'([^\s\w]|_)+', " ", text) # anything that is not whitespace, a letter or a digit
    single_spaced = re.sub(r'\s+', ' ', without_punct)
    return single_spaced.encode('latin-1')

Next comes the function that does all the work. Given a beautiful soup object, we create two things. One is a dictionary that corresponds to a row in the Students dataframe and the other is a list of dictionaries, one for every application from that student to one of the universities. 

In [10]:
def getColumnValues(soup):
    """Extract the student row and the application rows from a student profile page.

    Args:
        soup: BeautifulSoup object of a student profile page.

    Returns:
        A tuple (values, applications).
        values is a dictionary with one key for every entry of columns_student;
        columns that are not filled in by this function stay None.
        applications is a list with one dictionary for every row of the college
        table whose university is in uni_list. Each dictionary has one key for
        every entry of columns_uni; columns that are not filled in stay None.
    """
    # initialize the dictionary and list
    values = dict.fromkeys(columns_student)
    applications = []
    # We start with the general information box at the top which includes class year, gender and ethnicity
    general = soup.find("div", {"class": "general"})
    heading = general.find("h1").get_text()
    values['intendedgradyear'] = int(re.findall(r'\d{4}', heading.split('Class of')[-1])[0]) # CLASS YEAR
    spans = general.find_all("span")
    values['female'] = getFromDict(genderdict, spans[0].get_text().strip()) # GENDER
    values['MinorityGender'] = 1 if values['female'] == 0 else -1 # Minority gender if no gender found
    values['MinorityRace'] = isMinority(spans[1].get_text().strip().lower()) # MINORITY RACE
    values['program'] = removePunct(spans[2].get_text().strip()) # PROGRAM
    # Now we look at the academics box which includes GPA and high school info
    academics = soup.find("div", {"class": "academicswrap"}).find_all("span")
    values['schooltype'] = getFromDict(highschooldict, academics[0].get_text().strip()) # SCHOOL TYPE
    state = getFromDict(states_dict, academics[1].get_text().strip()) # save the state of the student (0 if not in states_dict)
    values['international'] = 1 if state == 'Other' else -1 # INTERNATIONAL indicator
    values['GPA'] = float(academics[3].get_text()) # unweighted GPA
    weighted_gpa = academics[4].get_text()
    values['GPA_w'] = float(weighted_gpa) if weighted_gpa.strip() != '' else None # Weighted GPA
    # Next, we go to the test score box which includes SAT, ACT and AP info
    values['admissionstest'] = getAdmissionTestScore(soup.find("div", {"class": "testscorewrap"}).find_all("td")) # Admissions test
    values['SATsubject'] = len(soup.find("caption", text="SAT Subject Test Scores").next_sibling.next_sibling.find_all("tr")) # Number of SAT SUBJECT
    ap_table = soup.find("caption", text="AP Examinations").next_sibling.next_sibling # looked up once, used for rows and cells
    ap_num = len(ap_table.find_all("tr")) # Number of AP's
    values['AP'] = ap_num # AP
    if ap_num > 0:
        # AVERAGE AP score; 'AP' is replaced by the number of scores that are actually filled in
        values['averageAP'], values['AP'] = getScores(ap_table.find_all("td"), np.mean)
    # Every webpage also has three text fields for any additional information, which we just save in the 'addInfo' column
    notes = [d.get_text().strip() for d in soup.find_all("div", {"class": "word"})]
    values['addInfo'] = removePunct(notes[0] + notes[1] + notes[2]) # Additional info
    # Next: the colleges applied to and the admission results for the admissions table
    college_table = soup.find("table", {"class": "collchoice"})
    for row in college_table.find("tbody").find_all("tr"): # every university is a row in a table
        uni = row.find("th").find("span").get_text().strip() # get university name
        if uni not in uni_list: # only keep our 25 universities
            continue
        # initialize dictionary with every column of columns_uni. The hard-coded range(6)
        # that was used here before dropped the last column ('acceptProb').
        unirow = dict.fromkeys(columns_uni)
        unirow['collegeID'] = uni_name_dict[uni] # get University ID
        cells = row.find_all("td", {"class": "center"})
        unirow['earlyAppl'] = booleandict[cells[0].get_text().strip()] # Early Admission indicator
        unirow['alumni'] = booleandict[cells[1].get_text().strip()] # Alumni/Legacy indicator
        # student-level indicators are only filled in while they are still unknown (None or 0)
        if values['sports'] is None or values['sports'] == 0:
            values['sports'] = booleandict[cells[2].get_text().strip()] # Athlete indicator
        status = cells[2].next_sibling.find_next("span")
        unirow['acceptStatus'] = getFromDict(statusdict, status.get_text().strip()) # Admission status indicator
        if values['canAfford'] is None or values['canAfford'] == 0:
            values['canAfford'] = getCanAfford(status.find_next("span").get_text().strip()) # can Afford indicator
        unirow['outofstate'] = -1 if state == uni_state_dict[uni] else 1 # compare student state to university state for OUT OF STATE indicator
        applications.append(unirow) # add dictionary to list of applications
    return values, applications

We are finally ready to do the real work. We create 3 dataframes:
- `students` is a dataframe that will contain all student information. 
- `colleges` is a dataframe with 25 rows: one for each university. It is hardcoded in `TIdatabase.py`. 
- `applForm` is a dataframe to combine a student's application with the university to which they are applying. 

We create a url from every profile name in the list that we obtained earlier and use Beautiful Soup and `requests` to get the webpage html. We use the above defined function to extract all information needed and add them to the dataframes. 

In [13]:
%%time
# Remove old dataframes if they still exists
if ('students' in locals()): 
    students.cleanup()
    del students
if ('applForm' in locals()): del applForm
# initialize new dataframes
students=ti.Student()
colleges = ti.College()
applForm = ti.ApplForm()
for p in r.profileList: # For each profile name
    profile_url=baseurl+student_url+p # create url
    soup=BeautifulSoup(requests.get(profile_url).text,'html.parser') #get html
    # Check for empty webpage
    if soup.find("div", {"class": "academicswrap"})==None: 
        print p, ' not Found'
        continue
    # Get information
    newrow, applications = getColumnValues(soup)
    # insert student information to students dataframe. This generates a new student ID string
    studentID=students.insert(newrow)
    for app in applications:
        # add the newly obtained student ID to the applications dictionaries
        app['studentID']=studentID[0]
    # add the applications dictionaries to the applForm dataframe
    applForm.insert(applications)
students.df.head()

EDz3k  not Found
CPU times: user 11min 52s, sys: 37.8 s, total: 12min 30s
Wall time: 1h 4min 38s


We obtain a database of 5025 students and 16094 applications. Note it is 1 less than before running the cell above, because the profile page for student `EDz3k` was incomplete. 

In [14]:
# Dimensions of the students dataframe
students.df.shape

(5025, 21)

In [15]:
# Dimensions of the applications dataframe
applForm.df.shape

(16094, 8)

We save the frames to csv files so they can be used in our classification notebooks.

In [16]:
# Write both dataframes to csv files via the save methods of the TIdatabase objects
students.save('collegedata_students.csv')
applForm.save('collegedata_applications.csv')

As a test, we read these csv files to see if the dataframes are still correct.

In [17]:
# Remove the dataframes that are still in memory, if any
if ('students' in locals()): 
    students.cleanup()
    del students
if ('applForm' in locals()): del applForm
# Start from new, empty objects and fill them from the csv files written above
students=ti.Student()
applForm = ti.ApplForm()
students.read('collegedata_students.csv')
applForm.read('collegedata_applications.csv')
# Show the first rows of the reloaded students dataframe
students.df.head()

Unnamed: 0,studentID,classrank,admissionstest,AP,averageAP,SATsubject,GPA,GPA_w,program,schooltype,intendedgradyear,addInfo,canAfford,female,MinorityGender,MinorityRace,international,firstinfamily,sports,artist,workexp
0,S50C3UECT8,,2290,7,5.0,3,3.8,4.34,Biomedical engineering,-1,2017,Basketball outside of school violin cancer awa...,0,1,-1,-1,-1,,-1,,
0,JTEQOV7ZCB,,2080,5,4.4,4,3.9,4.22,Mechanical Engineering,1,2018,Swimming 3 years Water Polo 3 years Foreign La...,1,1,-1,-1,-1,,-1,,
0,5ZRH2MVO4F,,2200,6,4.833333,3,4.0,4.58,Pre Med,-1,2012,City Government Youth Partnership Advisory Com...,0,-1,-1,-1,-1,,-1,,
0,3I94MHBBCL,,2210,4,4.25,0,3.72,,,-1,2014,,0,1,-1,-1,-1,,-1,,
0,EIO07T1RL7,,2040,6,4.0,0,3.88,,English,-1,2018,,1,-1,-1,-1,-1,,-1,,


In [18]:
# Show the first rows of the reloaded applications dataframe
applForm.df.head()

Unnamed: 0,studentID,collegeID,earlyAppl,visited,alumni,outofstate,acceptStatus,acceptProb
0,S50C3UECT8,Rice,-1,,-1,-1,1,
0,JTEQOV7ZCB,UPenn,-1,,-1,1,-1,
1,JTEQOV7ZCB,Princeton,-1,,-1,1,-1,
2,JTEQOV7ZCB,Harvard,1,,-1,1,-1,
3,JTEQOV7ZCB,Stanford,-1,,-1,-1,-1,


We have successfully scraped admissions data from collegedata.com. The results can be found in `collegedata_students.csv` and `collegedata_applications.csv`. Next, we normalize the data in `normalize.ipynb`.