In [25]:
import re
import time
import urllib
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [26]:
# Create a webdriver object configured for headless browsing.
# The '--headless' flag is passed as a plain Chrome argument instead of through
# the `options.headless` property, which newer Selenium releases deprecated and
# later removed.
options = Options()
options.add_argument('--headless')
# No driver path is passed positionally: newer Selenium releases no longer
# accept it as the first argument, while older ones already default to the
# 'chromedriver' executable found on PATH.
driver = webdriver.Chrome(options=options)

In [27]:
def get_js_soup(url,driver):
    """Load `url` through the webdriver and parse the rendered page body.

    The page is opened with `driver` so that its JavaScript runs first; the
    inner HTML of `document.body` is then read back and handed to
    BeautifulSoup using the 'html.parser' backend.

    Returns the resulting BeautifulSoup object.
    """
    driver.get(url)
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body,'html.parser')

def process_bio(bio):
    """Tidy extracted text.

    Drops every non-ASCII character and collapses each run of whitespace
    (spaces, tabs, newlines) into a single space. Leading/trailing whitespace
    is collapsed too, not stripped.

    bio: the text to clean (str).
    Returns the cleaned str.
    """
    bio = bio.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    bio = re.sub(r'\s+',' ',bio)       #replaces repeated whitespace characters with single space
    return bio

def remove_script(soup):
    """Strip <script> and <style> tags from a parsed page.

    Text extracted from an HTML page can otherwise contain javascript code
    and style rules. Every matching tag is decomposed in place, and the same
    soup object is handed back to the caller.
    """
    for unwanted_tag in soup(["script", "style"]):
        unwanted_tag.decompose()
    return soup


def is_valid_homepage(bio_url,dir_url):
    """Check whether bio_url is a usable homepage distinct from dir_url.

    bio_url: candidate homepage url.
    dir_url: url of the listing/profile page the candidate was found on.

    Returns False when bio_url ends with '.pdf' (pdfs are not parsed), when it
    cannot be opened, or when it resolves (after redirects) to the same page
    as dir_url; returns True otherwise.
    """
    if bio_url.endswith('.pdf'): #we're not parsing pdfs
        return False
    try:
        #sometimes the homepage url points to the same page as the profile page
        #which should be treated differently from an actual homepage
        with urllib.request.urlopen(bio_url) as response:   #close the connection once the final url is known
            ret_url = response.geturl()
    except Exception:
        #deliberately best-effort: any failure to fetch means "not a valid homepage",
        #but KeyboardInterrupt/SystemExit are no longer swallowed
        return False
    #strip only a leading scheme and/or leading 'www.'; the dot is escaped and the
    #pattern anchored so that 'www' followed by any character elsewhere in the url is left alone
    resolved, directory = (re.sub(r'^(https?://)?(www\.)?','',url) for url in (ret_url,dir_url))
    return resolved != directory

Start Scraper

In [28]:
def scrape_default_page(dir_url,driver):
    """Collect the link found in each table row of the directory listing page.

    dir_url: url of the directory listing page.
    driver: selenium webdriver used to render the page.

    Returns a list of absolute urls: the base url followed by the href of the
    first <a> in every <tr>. Rows without a link (or whose link has no href)
    are skipped. Progress banners are printed before and after scraping.
    """
    print ('-'*20,'Scraping directory page','-'*20)
    course_links = []
    course_base_url = 'https://courses.illinois.edu'
    #execute js on webpage to load the listings and get ready to parse the loaded HTML
    soup = get_js_soup(dir_url,driver)
    for link_holder in soup.find_all('tr'): #every table row may hold one link
        anchor = link_holder.find('a')
        rel_link = anchor.get('href') if anchor is not None else None
        if rel_link is None:
            continue    #row without a usable link
        #url returned is relative, so we need to add base url
        course_links.append(course_base_url+rel_link)
    print ('-'*20,'Found {} course profile urls'.format(len(course_links)),'-'*20)
    return course_links

In [29]:
# Directory listing of all subjects; scraping it yields one url per subject
default_url = 'https://courses.illinois.edu/schedule/DEFAULT/DEFAULT'
subject_links = scrape_default_page(dir_url=default_url, driver=driver)
print(subject_links)

-------------------- Scraping directory page --------------------
-------------------- Found 189 course profile urls --------------------
['https://courses.illinois.edu/schedule/2021/fall/AAS', 'https://courses.illinois.edu/schedule/2021/fall/ABE', 'https://courses.illinois.edu/schedule/2021/fall/ACCY', 'https://courses.illinois.edu/schedule/2021/fall/ACE', 'https://courses.illinois.edu/schedule/2021/fall/ACES', 'https://courses.illinois.edu/schedule/2021/fall/ADV', 'https://courses.illinois.edu/schedule/2021/fall/AE', 'https://courses.illinois.edu/schedule/2021/fall/AFAS', 'https://courses.illinois.edu/schedule/2021/fall/AFRO', 'https://courses.illinois.edu/schedule/2021/fall/AFST', 'https://courses.illinois.edu/schedule/2021/fall/AGCM', 'https://courses.illinois.edu/schedule/2021/fall/AGED', 'https://courses.illinois.edu/schedule/2021/fall/AHS', 'https://courses.illinois.edu/schedule/2021/fall/AIS', 'https://courses.illinois.edu/schedule/2021/fall/ALEC', 'https://courses.illinois.edu

In [30]:
def scrape_subject_page(dir_url,driver):
    """Collect the link found in each table row of a subject page.

    dir_url: url of the subject page.
    driver: selenium webdriver used to render the page.

    Returns a list of absolute urls: the base url followed by the href of the
    first <a> in every <tr>. Rows without a link (or whose link has no href)
    are skipped. Unlike scrape_default_page, nothing is printed.
    """
    course_links = []
    course_base_url = 'https://courses.illinois.edu'
    #execute js on webpage to load course listings and get ready to parse the loaded HTML
    soup = get_js_soup(dir_url,driver)
    for link_holder in soup.find_all('tr'): #every table row may hold one link
        anchor = link_holder.find('a')
        rel_link = anchor.get('href') if anchor is not None else None
        if rel_link is None:
            continue    #row without a usable link
        #url returned is relative, so we need to add base url
        course_links.append(course_base_url+rel_link)
    return course_links

In [31]:
# Gather the course page urls of every subject into one flat list
course_links = []
for sub_url in subject_links:
    course_links.extend(scrape_subject_page(sub_url, driver))
print(course_links)

['https://courses.illinois.edu/schedule/2021/fall/AAS/100', 'https://courses.illinois.edu/schedule/2021/fall/AAS/120', 'https://courses.illinois.edu/schedule/2021/fall/AAS/200', 'https://courses.illinois.edu/schedule/2021/fall/AAS/201', 'https://courses.illinois.edu/schedule/2021/fall/AAS/275', 'https://courses.illinois.edu/schedule/2021/fall/AAS/281', 'https://courses.illinois.edu/schedule/2021/fall/AAS/297', 'https://courses.illinois.edu/schedule/2021/fall/AAS/300', 'https://courses.illinois.edu/schedule/2021/fall/AAS/310', 'https://courses.illinois.edu/schedule/2021/fall/AAS/357', 'https://courses.illinois.edu/schedule/2021/fall/AAS/402', 'https://courses.illinois.edu/schedule/2021/fall/AAS/479', 'https://courses.illinois.edu/schedule/2021/fall/ABE/100', 'https://courses.illinois.edu/schedule/2021/fall/ABE/199', 'https://courses.illinois.edu/schedule/2021/fall/ABE/223', 'https://courses.illinois.edu/schedule/2021/fall/ABE/224', 'https://courses.illinois.edu/schedule/2021/fall/ABE/23

In [67]:
def scrape_course_page(fac_url,driver):
    """Scrape a single course page.

    fac_url: url of the course page.
    driver: selenium webdriver used to render the page.

    Returns a tuple (title, name, credit, intro, instructors, prereq):
      title       - text of the <h1 class="app-inline"> element
      name        - text of the <span class="app-label app-text-engage"> element
      credit      - text of the next <p>, minus its first 7 characters, stripped
                    (presumably a "Credit:" label - TODO confirm)
      intro       - text of the <p> following the credit paragraph
      instructors - unique non-empty names, in order of first appearance
      prereq      - link texts from the prerequisite entry (may be empty)

    Missing header elements or a missing course-info section are not guarded
    against and raise AttributeError / IndexError.
    """
    soup = get_js_soup(fac_url,driver)

    title = soup.find('h1', {"class": "app-inline"}).text
    name_tag = soup.find('span', {"class": "app-label app-text-engage"})
    name = name_tag.text

    credit_tag = name_tag.find_next('p')
    # slicing a str never raises, so no try/except is needed here
    credit = credit_tag.text[7:].strip()
    intro = credit_tag.find_next('p').text

    # Section rows carry ids uid1, uid2, ...; stop at the first id that is
    # missing or whose instructor cell cannot be reached.
    instructors = []
    uid = 1
    while True:
        cell = soup.find('tr', {"id": "uid"+str(uid)})
        # the instructor text sits in the 6th <div> following the row
        for _ in range(6):
            if cell is None:
                break
            cell = cell.find_next('div')
        if cell is None:
            break
        instructors += cell.get_text(separator="<br>").split("<br>")
        uid += 1
    # drop empty fragments and de-duplicate deterministically (a set would
    # give an arbitrary order that changes between runs)
    instructors = list(dict.fromkeys(elem for elem in instructors if len(elem) != 0))

    prereq = []
    prereq_section = soup.find("div", {"id": "app-course-info"}).find_all("div", {"class": "col-sm-12"})[2]

    for child in prereq_section:
        child_markup = str(child)
        if "Prerequisite" in child_markup:
            prereq = [ element.get_text() for element in child.find_all("a") ]
            if "Same as" in child_markup:
                # first link is dropped - presumably it belongs to the
                # "Same as" cross-listing, not a prerequisite (TODO confirm)
                prereq = prereq[1:]

    return title, name, credit, intro, instructors, prereq

In [70]:
# Scrape every course page, keeping one parallel list per extracted field
titles, names, credits, intros, instructors, prereq = [],[],[],[],[],[]
tot_urls = len(course_links)
for link in course_links:
    title, name, credit, intro, course_instructors, course_prereq = scrape_course_page(link,driver)
    titles.append(title)
    names.append(name)
    credits.append(credit)
    intros.append(intro)
    instructors.append(course_instructors)
    prereq.append(course_prereq)

# Combine the parallel lists into one record (dict) per course
dictionaries = [
    {
        "title": t,
        "names": n,
        "credit": c,
        "intro": summary,
        "instructors": teachers,
        "prereq": required,
    }
    for t, n, c, summary, teachers, required
    in zip(titles, names, credits, intros, instructors, prereq)
]


print(dictionaries)


[{'title': 'AAS 100', 'names': 'Intro Asian American Studies', 'credit': '3 hours.', 'intro': 'Interdisciplinary introduction to the basic concepts and approaches in Asian American Studies. Surveys the various dimensions of Asian American experiences including history, social organization, literature, arts, and politics.', 'instructors': ['Zheng, R', 'Rosado-Torres, A', 'Wang, Y', 'Gonzalez, D', 'Atienza, P', 'Rana, J'], 'prereq': []}, {'title': 'AAS 120', 'names': 'Intro to Asian Am Pop Culture', 'credit': '3 hours.', 'intro': "Introductory understanding of the way U.S. popular culture has affected Asian Americans and the contributions Asian Americans have made to U.S. media and popular culture since the mid 1880's.", 'instructors': ['Park, D'], 'prereq': []}, {'title': 'AAS 200', 'names': 'U.S. Race and Empire', 'credit': '3 hours.', 'intro': 'Invites students to examine histories and narratives of U.S. race and empire, drawing upon multiple theoretical and methodological works in As

In [71]:
import json

# Write each course record to its own JSON file, named after the course title
for title, record in zip(titles, dictionaries):
    with open('../../data/course/' + title + '.json', 'w', encoding='utf-8') as out_file:
        json.dump(record, out_file, ensure_ascii=False, indent=4)

In [76]:
# Show how many course urls were scraped (tot_urls is len(course_links))
tot_urls

4417

In [78]:
# quit() closes every window and also shuts down the chromedriver process;
# close() would only close the current window and leave the process running.
driver.quit()