In [1]:
import re

In [2]:
import requests

In [3]:
# Root URL of the UNSW handbook site; the relative hrefs scraped below are appended to it.
base_url = 'https://www.handbook.unsw.edu.au'

In [4]:
# Download the handbook landing page (give up after 10 seconds).
resp = requests.get(base_url, timeout=10)
try:
    # Decode the body with the charset detected from its content rather than
    # the one announced in the HTTP headers.
    resp.encoding = resp.apparent_encoding
    # Raises requests.exceptions.HTTPError for 4xx/5xx status codes.
    resp.raise_for_status()
except requests.exceptions.HTTPError as err:
    # Report the failure but keep going so the response can still be inspected
    # in the following cells.
    print("cannot get response", err)

In [5]:
# Peek at the first 100 characters of the decoded body to check that HTML came back.
resp.text[:100]

'<!DOCTYPE html>\n<html lang="en">\n\n<head>\n\t\t\t<meta name="viewport" content="width=device-width, initi'

In [6]:
# HTTP status code of the landing-page request.
resp.status_code

200

In [7]:
from bs4 import BeautifulSoup

In [8]:
# Parse the landing page with Python's built-in HTML parser.
demo = resp.text
soup = BeautifulSoup(demo, 'html.parser')

In [9]:
# Collect the <a> elements carrying tabindex="-1" and one of the listed class values.
# NOTE: despite the name, ``divs`` holds anchor tags, not <div> elements.
divs = soup.find_all('a', {'class': ['a-lettuce level-two', ''], 'tabindex': '-1'})

In [10]:
# Preview the first five matched anchors.
divs[:5]

[<a class="a-lettuce level-two" href="/ArchitectureAndBuilding/browse?interest_value=68b44253db96df002e4c126b3a961980
 " tabindex="-1" target="_self">
 <h4>Architecture and Building</h4>
 </a>,
 <a class="a-lettuce level-two" href="/BusinessAndManagement/browse?interest_value=dd350293db96df002e4c126b3a961909
 " tabindex="-1" target="_self">
 <h4>Business and Management</h4>
 </a>,
 <a class="a-lettuce level-two" href="/CreativeArts/browse?interest_value=c175c653db96df002e4c126b3a9619f3
 " tabindex="-1" target="_self">
 <h4>Creative Arts</h4>
 </a>,
 <a class="a-lettuce level-two" href="/Education/browse?interest_value=f9158253db96df002e4c126b3a961953
 " tabindex="-1" target="_self">
 <h4>Education</h4>
 </a>,
 <a class="a-lettuce level-two" href="/EngineeringAndRelatedTechnologies/browse?interest_value=8a948253db96df002e4c126b3a961950
 " tabindex="-1" target="_self">
 <h4>Engineering and Related Technologies</h4>
 </a>]

In [11]:
# Preview the last five matched anchors.
divs[-5:]

[<a class="a-lettuce level-two" href="/InformationTechnologyElectricalEngineering/browse?sa=a2ce83204f0f5b00eeb3eb4f0310c705" tabindex="-1" target="_self">
 <h4>ZEIT: Information Technology &amp; Electrical Engineering</h4>
 </a>,
 <a class="a-lettuce level-two" href="/UniversityCollegeGeneralEducation/browse?sa=a6ce83204f0f5b00eeb3eb4f0310c716" tabindex="-1" target="_self">
 <h4>ZGEN: University College General Education</h4>
 </a>,
 <a class="a-lettuce level-two" href="/HumanitiesSocialSciences/browse?sa=e2ce83204f0f5b00eeb3eb4f0310c71b" tabindex="-1" target="_self">
 <h4>ZHSS: Humanities &amp; Social Sciences</h4>
 </a>,
 <a class="a-lettuce level-two" href="/UniversityCollegeinterdisciplinary/browse?sa=22ce83204f0f5b00eeb3eb4f0310c720" tabindex="-1" target="_self">
 <h4>ZINT: University College (Interdisciplinary)</h4>
 </a>,
 <a class="a-lettuce level-two" href="/PhysicalEnvironmentalMathematicalSciences/browse?sa=6ece83204f0f5b00eeb3eb4f0310c724" tabindex="-1" target="_self">
 <h

In [12]:
# Inspect the children of one anchor to see where its label lives
# (the index 100 is just an arbitrary sample).
divs[100].contents

['\n', <h4>MDCN: Medicine</h4>, '\n']

In [13]:
# ``divs`` mixes three listings: 'by area of interest', 'by faculty' and
# 'by subject area'. Only subject-area labels start with a four-letter code
# followed by a colon (e.g. 'ACCT: Accounting'), so keep those and filter out
# 'by area of interest' and 'by faculty'.
subject_areas = []
for div in divs:
    # The label sits in the <h4> child at index 1 (index 0 and 2 are newlines).
    name = div.contents[1].string
    # Test the 'ABCD:' prefix only. Constraining the rest of the label to
    # word/space characters would silently drop names containing '&' or
    # parentheses, such as 'ZHSS: Humanities & Social Sciences'.
    # ``name`` is None when the <h4> has no single string child; skip those.
    if name and re.match(r'[A-Z]{4}:', name):
        link = div.get('href')
        subject_areas.append([name, link])

In [14]:
# Preview the first seven [label, href] pairs.
subject_areas[:7]

[['ACCT: Accounting',
  '/Accounting/browse?sa=b4cecfec4fcb5b00eeb3eb4f0310c7eb'],
 ['ACTL: Actuarial Studies',
  '/ActuarialStudies/browse?sa=7cce03204f0f5b00eeb3eb4f0310c709'],
 ['ADAD: Art and Design',
  '/ArtAndDesign/browse?sa=b8ce03204f0f5b00eeb3eb4f0310c70e'],
 ['AERO: Aerospace Engineering',
  '/AerospaceEngineering/browse?sa=c5ce03204f0f5b00eeb3eb4f0310c713'],
 ['ANAT: Anatomy', '/Anatomy/browse?sa=05ce03204f0f5b00eeb3eb4f0310c718'],
 ['ARCH: Architecture',
  '/Architecture/browse?sa=41ce03204f0f5b00eeb3eb4f0310c71d'],
 ['ARTS: Disciplinary and Interdisciplinary Humanities',
  '/DisciplinaryAndInterdisciplinaryHumanities/browse?sa=8dce03204f0f5b00eeb3eb4f0310c721']]

In [15]:
# now we have subject areas and href
# write them into a csv file
import csv
def write_csv(lst: list, filename: str, headers: list):
    """Write ``headers`` followed by every row of ``lst`` to ``<filename>.csv``.

    Args:
        lst: Rows to write; each item is handed to the csv writer as one row.
        filename: Output path without the ``.csv`` suffix, which is appended here.
        headers: Column names, written as the first row.
    """
    # newline='' leaves line endings to the csv module. Without it, platforms
    # that translate '\n' on write (e.g. Windows) produce a blank line after
    # every row.
    with open('{:}.csv'.format(filename), 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(lst)
# headers = ['Subject Area', 'href']
# write_csv(subject_areas, 'subject_areas', headers)

In [2]:
from selenium import webdriver
# Start an Edge browser session through the msedgedriver.exe binary (path is
# relative to the working directory).
# NOTE(review): the recorded run of this cell failed with
# SessionNotCreatedException because this driver build only supports MSEdge 83;
# the driver version has to match the installed browser.
driver = webdriver.Edge('msedgedriver.exe')
# Implicit wait for element lookups on this driver (60, in seconds per Selenium's API).
driver.implicitly_wait(60)

SessionNotCreatedException: Message: session not created: This version of MSEdgeDriver only supports MSEdge version 83


In [None]:
# Open the handbook landing page in the browser. ``base_url`` is the root URL
# defined near the top of the notebook; the name ``base`` does not exist yet
# when the notebook is run from top to bottom.
driver.get(base_url)

In [None]:
# Crawl the course codes listed on every subject-area page.
# ``total_code`` ends up with one list of course codes per entry of
# ``subject_areas``, in the same order.
from time import sleep

from selenium.common.exceptions import WebDriverException

total_code = []

for s in subject_areas:
    # ``base_url`` is the handbook root defined near the top of the notebook;
    # s[1] is the relative href of the subject area.
    url = base_url + str(s[1])
    driver.get(url)

    # Large subject areas page their results behind a "browse more" button.
    # Probe for it once; a failed lookup means the page uses the small layout.
    try:
        driver.find_element_by_class_name('a-browse-more-controls-btn')
        button_exist = True
    except WebDriverException:
        button_exist = False

    if button_exist:
        # Courses are in 'section' elements. Keep expanding the listing until
        # the button reports that there is nothing more to browse.
        while True:
            try:
                button = driver.find_element_by_class_name('a-browse-more-controls-btn')
                if 'No more' in button.text:
                    break
                button.click()
                # Give the page time to load the next batch of results.
                sleep(5)
            except WebDriverException:
                # Button vanished or cannot be clicked: use what is loaded so far.
                break
        courses = driver.find_elements_by_class_name('section')
        small_cs = []
    else:
        small_cs = driver.find_elements_by_class_name('align-left')
        courses = []

    codes = []
    # Only every third 'section' element is read for a course code.
    for i in range(0, len(courses), 3):
        # Read the text once: every ``.text`` access is a round trip to the browser.
        section_text = courses[i].text
        if section_text:
            codes.append(section_text)
    for small in small_cs:
        codes.append(small.text)

    total_code.append(codes)

In [None]:
# Preview the code lists of the first ten subject areas.
total_code[:10]

In [None]:
# Preview the code lists of the last five subject areas.
total_code[-5:]

In [25]:
# write the codes into a csv
# Each inner list of total_code (one per subject area) becomes one row of
# codes.csv, below a single-cell header row.
write_csv(total_code, 'codes', ['Course Codes Grouped by Subject Area'])

In [114]:
# Crawl the handbook page of every course code collected in ``total_code``.
# Each row appended to ``total_info`` is:
#   [code, name, course url, offering, enrolment condition, outline url, overview]
# Fields that cannot be found on a page are stored as a single space.
total_info = []
base = 'https://www.handbook.unsw.edu.au'
path_u = '/undergraduate/courses/2020/'
path_p = '/postgraduate/courses/2020/'
path_r = '/research/courses/2020/'
path = [path_u, path_p, path_r]


def _fetch_course_page(code):
    """Return ``(url, soup)`` for the first handbook path that serves ``code``.

    The undergraduate, postgraduate and research paths are tried in that
    order. A path is rejected when the request fails, when the page has no
    title, or when the title contains 'Error'.

    Returns:
        ``(url, soup)`` of the first usable page, or ``(None, None)`` when no
        path yields one.
    """
    for p in path:
        url = base + p + code + '/'
        try:
            r = requests.get(url, timeout=30)
        except requests.exceptions.RequestException as exc:
            # Never fall through with a missing or stale response: that would
            # attribute another course's page to this code.
            print('request failed:', url, exc)
            continue
        page = BeautifulSoup(r.text, 'html.parser')
        title = page.title.string if page.title is not None else None
        if not title or 'Error' in title:
            continue
        return url, page
    return None, None


for area in total_code:
    for code in area:
        if code == '':
            continue
        url, soup = _fetch_course_page(code)
        if soup is None:
            # None of the three paths served this code; skip it instead of
            # parsing an error page.
            print('no handbook page found for', code)
            continue

        # The course name is the third ' - '-separated part of the page title.
        title_parts = soup.title.string.split(' - ')
        if len(title_parts) > 2:
            name = title_parts[2]
        else:
            print(url)
            name = ' '

        # Enrolment conditions are read from the second read-more text block.
        condi = soup.find_all('div', {'data-hbui': 'readmore__toggle-text'})
        try:
            pre = condi[1].div.string
        except (IndexError, AttributeError):
            pre = ' '

        # Overview: prefer the first paragraph inside the read-more wrapper.
        ovs = soup.find('div', {'class': 'readmore__wrapper'})
        if ovs is None:
            ov = ' '
        elif ovs.p is None:
            ov = ovs.string
        else:
            ov = ovs.p.string

        # Offering: the first attributes-table paragraph that mentions 'Term'.
        offering = ' '
        offerings = soup.find_all('div', {'class': 'o-attributes-table-item'})
        for o in offerings:
            # ``.string`` is None when the paragraph has nested markup.
            term_text = o.p.string if o.p is not None else None
            if term_text and 'Term' in term_text:
                offering = term_text
                break

        outline_link = soup.find('a', {'class': 'a-btn-secondary a-btn-secondary--with-icon'})
        outline = outline_link.get('href') if outline_link is not None else ' '

        info_lst = [code, name, url, offering, pre, outline, ov]
        total_info.append(info_lst)

In [115]:
# Write the scraped course details into courses.csv, one course per row.
import traceback

headers = ['Course Code', 'Course Name', 'Course URL', 'Offering Terms', 'Enrolment Conditions', 'Outline URL', 'Overview']
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write (e.g. Windows) produce a blank line after every row.
with open('courses.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for t in total_info:
        try:
            writer.writerow(t)
        except Exception:
            # Best effort: report the row that could not be written and carry on.
            traceback.print_exc()