# Scraping UIUC CS faculty homepages

## System setup 

Before we start, make sure to install the required libraries
    
    pip install bs4
    pip install selenium

Since UIUC's website has some javascript rendered HTML content, we'll be using Selenium for scraping the content loaded dynamically by javascript. For this,you would also need to download a selenium supported browser webdriver.

e.g. For Chrome, download the appropriate webdriver from here: http://chromedriver.chromium.org/downloads, unzip it and save in current directory.

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib

In [2]:
#create a webdriver object and set options for headless browsing
options = Options()
options.headless = True
browser = webdriver.Chrome('./chromedriver',options=options)

In [3]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,browser):
    browser.get(url)
    res_html = browser.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

#tidies extracted text 
def process_bio(bio):
    bio = bio.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    bio = re.sub('\s+',' ',bio)       #repalces repeated whitespace characters with single space
    return bio

''' More tidying
Sometimes the text extracted HTML webpage may contain javascript code and some style elements. 
This function removes script and style tags from HTML so that extracted text does not contain them.
'''
def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup


#Checks if bio_url is a valid faculty homepage
def is_valid_homepage(bio_url,dir_url):
    try:
        #sometimes the homepage url points to the faculty profile page
        #which should be treated differently from an actual homepage
        ret_url = urllib.request.urlopen(bio_url).geturl() 
    except:
        return False       #unable to access bio_url
    urls = [re.sub('((https?://)|(www.))','',url) for url in [ret_url,dir_url]] #removes url scheme (https,http) or www 
    return not(urls[0]== urls[1])

In [4]:
#extracts all Faculty Profile page urls from the Directory Listing Page
def scrape_dir_page(dir_url,browser):
    print ('-'*20,'Scraping directory page','-'*20)
    faculty_links = []
    faculty_base_url = 'https://cpsc.yale.edu'
    #execute js on webpage to load faculty listings on webpage and get ready to parse the loaded HTML 
    soup = get_js_soup(dir_url,browser)     
    for link_holder in soup.find_all('td',class_='views-field views-field-name'): #get list of all <div> of class 'photo nocaption'
        rel_link = link_holder.find('a')['href'] #get url
        #url returned is relative, so we need to add base url
        faculty_links.append(faculty_base_url+rel_link) 
    print ('-'*20,'Found {} faculty profile urls'.format(len(faculty_links)),'-'*20)
    return faculty_links

In [5]:
dir_url = 'https://cpsc.yale.edu/people/faculty' #url of directory listings of CS faculty
faculty_links = scrape_dir_page(dir_url,browser)

-------------------- Scraping directory page --------------------


JavascriptException: Message: javascript error: Cannot read property 'innerHTML' of null
  (Session info: headless chrome=79.0.3945.79)


In [63]:
def scrape_faculty_page(fac_url,browser):
    soup = get_js_soup(fac_url,browser)
    bio_url = ''
    bio = ''
    bio_header=''
    bio_body=''
    bio_header = soup.find('h1',class_='title').get_text(separator=' ')
    bio_body = soup.find('div',class_='profile').get_text(separator=' ')
    bio = bio_header + ' ' + bio_body
    bio_url = fac_url
    return bio_url,bio

In [64]:
#Scrape all faculty homepages using profile page urls
bio_urls, bios = [],[]
tot_urls = len(faculty_links)
for i,link in enumerate(faculty_links):
    print ('-'*20,'Scraping faculty url {}/{}'.format(i+1,tot_urls),'-'*20)
    bio_url,bio = scrape_faculty_page(link,browser)
    bio_urls.append(bio_url)
    bios.append(bio)

-------------------- Scraping faculty url 1/25 --------------------
-------------------- Scraping faculty url 2/25 --------------------
-------------------- Scraping faculty url 3/25 --------------------
-------------------- Scraping faculty url 4/25 --------------------
-------------------- Scraping faculty url 5/25 --------------------
-------------------- Scraping faculty url 6/25 --------------------
-------------------- Scraping faculty url 7/25 --------------------
-------------------- Scraping faculty url 8/25 --------------------
-------------------- Scraping faculty url 9/25 --------------------
-------------------- Scraping faculty url 10/25 --------------------
-------------------- Scraping faculty url 11/25 --------------------
-------------------- Scraping faculty url 12/25 --------------------
-------------------- Scraping faculty url 13/25 --------------------
-------------------- Scraping faculty url 14/25 --------------------
-------------------- Scraping faculty url 1

In [65]:
def write_lst(lst,file_):
    with open(file_,'w') as f:
        for l in lst:
            f.write(l)
            f.write('\n')

In [66]:
bio_urls_file = 'bio_urls.txt'
bios_file = 'bios.txt'
write_lst(bio_urls,bio_urls_file)
write_lst(bios,bios_file)