In [6]:
import re

In [7]:
import requests

In [3]:
base = 'https://www.handbook.unsw.edu.au'

In [8]:
resp = requests.get(base, timeout=10)
try:
    resp.encoding = resp.apparent.encoding
    resp.raise_for_status()
except:
    pass

In [9]:
resp.text[:100]

'<!DOCTYPE html>\n<html lang="en">\n\n<head>\n\t\t\t<meta name="viewport" content="width=device-width, initi'

In [10]:
from bs4 import BeautifulSoup

In [11]:
demo = resp.text
soup = BeautifulSoup(demo, 'html.parser')

In [12]:
divs = soup.find_all('a', {'class': ['a-lettuce level-two', '']})
divs[:10]
divs[0].contents

['\n', <h4>Architecture and Building</h4>, '\n']

In [13]:
subject_areas = []
for item in divs:
    name = item.contents[1].string
    link = item.get('href')
    subject_areas.append([name, link])
subject_areas[:5]

[['Architecture and Building',
  '/ArchitectureAndBuilding/browse?interest_value=68b44253db96df002e4c126b3a961980\n'],
 ['Business and Management',
  '/BusinessAndManagement/browse?interest_value=dd350293db96df002e4c126b3a961909\n'],
 ['Creative Arts',
  '/CreativeArts/browse?interest_value=c175c653db96df002e4c126b3a9619f3\n'],
 ['Education',
  '/Education/browse?interest_value=f9158253db96df002e4c126b3a961953\n'],
 ['Engineering and Related Technologies',
  '/EngineeringAndRelatedTechnologies/browse?interest_value=8a948253db96df002e4c126b3a961950\n']]

In [14]:
# but subject_areas contains 'by area of interest',
# 'by faculty' and by 'subject area'
# filer out 'by area of interest' nad 'by faculty'
subjects = []
for item in subject_areas:
    if re.search(r'[A-Z]{4}:[\s\w.]+', item[0]):
        subjects.append(item)
subjects[:20]

[['ACCT: Accounting',
  '/Accounting/browse?sa=b4cecfec4fcb5b00eeb3eb4f0310c7eb'],
 ['ACTL: Actuarial Studies',
  '/ActuarialStudies/browse?sa=7cce03204f0f5b00eeb3eb4f0310c709'],
 ['ADAD: Art and Design',
  '/ArtAndDesign/browse?sa=b8ce03204f0f5b00eeb3eb4f0310c70e'],
 ['AERO: Aerospace Engineering',
  '/AerospaceEngineering/browse?sa=c5ce03204f0f5b00eeb3eb4f0310c713'],
 ['ANAT: Anatomy', '/Anatomy/browse?sa=05ce03204f0f5b00eeb3eb4f0310c718'],
 ['ARCH: Architecture',
  '/Architecture/browse?sa=41ce03204f0f5b00eeb3eb4f0310c71d'],
 ['ARTS: Disciplinary and Interdisciplinary Humanities',
  '/DisciplinaryAndInterdisciplinaryHumanities/browse?sa=8dce03204f0f5b00eeb3eb4f0310c721'],
 ['ATSI: Nura Gili (Indigenous Programs)',
  '/NuraGiliindigenousPrograms/browse?sa=c9ce03204f0f5b00eeb3eb4f0310c726'],
 ['AVEN: Aviation', '/Aviation/browse?sa=09ce03204f0f5b00eeb3eb4f0310c72b'],
 ['AVIA: Aviation', '/Aviation/browse?sa=45ce03204f0f5b00eeb3eb4f0310c730'],
 ['AVIF: Aviation', '/Aviation/browse?sa=7

In [None]:
# now we have subject areas and href
# write them into a csv file
import csv
def write_csv(lst: list, filename: str, headers: list):
    with open('{:}.csv'.format(filename), 'w+', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        for i in lst:
            writer.writerow(i)
# headers = ['Subject Area', 'href']
# write_csv(subjects, 'subject_areas', headers)

In [17]:
from selenium import webdriver
driver = webdriver.Edge('msedgedriver.exe')
driver.implicitly_wait(60)

In [18]:
driver.get(base)

In [20]:
# crawl for the course codes given the subject area
# do big subject first
from time import sleep

total_code = []

for s in subjects:
    url = base + str(s[1])
    driver.get(url)
    button_exist = False
    # try find the button once
    try:
        button = driver.find_element_by_class_name('a-browse-more-controls-btn')
        button_exist = True
    except:
        button_exist = False
    # if button exists, courses are in 'section' tags
    if button_exist:
        # click expand button until there is no more to browse
        while True:
            try:
                button = driver.find_element_by_class_name('a-browse-more-controls-btn')
                if 'No more' in button.text:
                    break
                button.click()
                sleep(5)
            except:
                break
        courses = driver.find_elements_by_class_name('section')
        small_cs = []
    else:
        small_cs = driver.find_elements_by_class_name('align-left')
        courses = []
    
    codes = []
    for i in range(0, len(courses), 3):
        if courses[i].text:
            codes.append(courses[i].text)
    for l in small_cs:
        codes.append(l.text)
    
    total_code.append(codes)

In [21]:
total_code[:100]

[['ACCT1501',
  'ACCT1511',
  'ACCT2101',
  'ACCT2507',
  'ACCT2522',
  'ACCT2542',
  'ACCT2672',
  'ACCT3202',
  'ACCT3303',
  'ACCT3563',
  'ACCT3583',
  'ACCT3601',
  'ACCT3610',
  'ACCT3625',
  'ACCT3708',
  'ACCT4796',
  'ACCT4797',
  'ACCT4798',
  'ACCT4809',
  'ACCT4851',
  'ACCT4852',
  'ACCT4897'],
 ['ACTL1101',
  'ACTL2100',
  'ACTL2101',
  'ACTL2102',
  'ACTL2111',
  'ACTL2131',
  'ACTL3141',
  'ACTL3142',
  'ACTL3151',
  'ACTL3162',
  'ACTL3182',
  'ACTL3191',
  'ACTL3192',
  'ACTL3202',
  'ACTL3303',
  'ACTL4001',
  'ACTL4002',
  'ACTL4003',
  'ACTL4010',
  'ACTL4011',
  'ACTL4012',
  'ACTL4301',
  'ACTL4302',
  'ACTL4303',
  'ACTL4305'],
 ['ADAD0600',
  'ADAD1001',
  'ADAD1002',
  'ADAD1100',
  'ADAD2400',
  'ADAD2402',
  'ADAD2610',
  'ADAD3114',
  'ADAD3400',
  'ADAD3402',
  'ADAD4000',
  'ADAD4001',
  'ADAD4010',
  'ADAD4011',
  'ADAD4100'],
 ['AERO3110',
  'AERO3410',
  'AERO3630',
  'AERO3660',
  'AERO4110',
  'AERO4120',
  'AERO4500',
  'AERO4620',
  'AERO9500',
  '

In [24]:
total_code[-5:]

[['ZEIT1102',
  'ZEIT1110',
  'ZEIT1190',
  'ZEIT1191',
  'ZEIT1206',
  'ZEIT1208',
  'ZEIT1290',
  'ZEIT1291',
  'ZEIT1501',
  'ZEIT1503',
  'ZEIT1504',
  'ZEIT1600',
  'ZEIT1690',
  'ZEIT1800',
  'ZEIT1901',
  'ZEIT1902',
  'ZEIT2102',
  'ZEIT2103',
  'ZEIT2104',
  'ZEIT2105',
  'ZEIT2106',
  'ZEIT2190',
  'ZEIT2207',
  'ZEIT2208',
  'ZEIT2209',
  'ZEIT2500',
  'ZEIT2501',
  'ZEIT2502',
  'ZEIT2503',
  'ZEIT2504',
  'ZEIT2601',
  'ZEIT2602',
  'ZEIT2603',
  'ZEIT2700',
  'ZEIT2802',
  'ZEIT2803',
  'ZEIT2901',
  'ZEIT2902',
  'ZEIT3101',
  'ZEIT3102',
  'ZEIT3111',
  'ZEIT3112',
  'ZEIT3113',
  'ZEIT3114',
  'ZEIT3118',
  'ZEIT3119',
  'ZEIT3120',
  'ZEIT3121',
  'ZEIT3190',
  'ZEIT3191',
  'ZEIT3215',
  'ZEIT3216',
  'ZEIT3218',
  'ZEIT3220',
  'ZEIT3221',
  'ZEIT3222',
  'ZEIT3302',
  'ZEIT3404',
  'ZEIT3500',
  'ZEIT3501',
  'ZEIT3502',
  'ZEIT3503',
  'ZEIT3504',
  'ZEIT3505',
  'ZEIT3506',
  'ZEIT3600',
  'ZEIT3601',
  'ZEIT3602',
  'ZEIT3603',
  'ZEIT3605',
  'ZEIT3606',
  'ZEI

In [25]:
# write the codes into a csv
write_csv(total_code, 'codes', ['Course Codes Grouped by Subject Area'])

In [114]:
# start to crawl for each subject
driver.implicitly_wait(60)
total_info = []
base = 'https://www.handbook.unsw.edu.au'
path_u = '/undergraduate/courses/2020/'
path_p = '/postgraduate/courses/2020/'
path_r = '/research/courses/2020/'
path = [path_u, path_p, path_r]
for area in total_code:
    for code in area:
        if code == '':
            continue
        for p in path:
            try:
                url = base + p + code + '/'
                r = requests.get(url, timeout=30)
                r.raise_for_status()
            except:
                pass
            soup = BeautifulSoup(r.text, 'html.parser')
            if 'Error' in soup.title.string:
                continue
            else:
                break
                
        # code, name, course url, offering, enrolment condition, outline url, overview
        try:
            name = soup.title.string.split(' - ')[2]
        except:
            print(url)
        condi = soup.find_all('div', {'data-hbui': 'readmore__toggle-text'})
        try:
            pre = condi[1].div.string
        except:
            pre = ' '
        # find(name, attrs, recursive, text, **kwargs)
        ovs = soup.find('div', {'class': 'readmore__wrapper'})
        if ovs.p is None:
            ov = ovs.string
        else:
            ov = ovs.p.string
        offering = ' '
        offerings = soup.find_all('div', {'class': 'o-attributes-table-item'})
        for o in offerings:
            if not o.p is None and 'Term' in o.p.string:
                offering = o.p.string
                break
            
        outline = soup.find('a', {'class': 'a-btn-secondary a-btn-secondary--with-icon'}).get('href')
        info_lst = [code, name, url, offering, pre, outline, ov]
        total_info.append(info_lst)

In [115]:
# write them into a csv file
headers = ['Course Code', 'Course Name', 'Course URL', 'Offering Terms', 'Enrolment Conditions', 'Outline URL', 'Overview']
with open('courses.csv', 'w+', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for t in total_info:
        try:
            writer.writerow(t)
        except:
            traceback.print_exc()