In [9]:
import os
import requests
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString

# Directory that receives every CSV/JSON file written by this notebook.
base_dir = "uog-courses"
# exist_ok=True makes the call idempotent and avoids the check-then-create race.
os.makedirs(base_dir, exist_ok=True)

In [2]:
# Index page of the course catalogue; fetched below to collect the subject-area links.
browsebysubjectarea_url = "https://www.gla.ac.uk/coursecatalogue/browsebysubjectarea/"

In [3]:
# Scrape the subject-area index page into a list of [link text, href] pairs.
subjectareas = []
# A timeout keeps a stalled connection from hanging the notebook forever.
r = requests.get(browsebysubjectarea_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
r_soup = BeautifulSoup(r.text, "lxml")

# The links are read from the first <ul> inside <div class="maincontent">.
maincontent = r_soup.find('div', {'class':'maincontent'})
li_subjectareas = maincontent.find('ul').find_all('li')

for li in li_subjectareas:
    a = li.find('a')
    # Skip list items without a usable link rather than crashing on them.
    if a is None or not a.get('href'):
        continue
    subjectareas.append([a.text.strip(), a.get('href')])

In [4]:
def get_subject_code(url):
    """Return the raw value of the ``code`` query parameter of ``url``.

    The parameter is looked up by name, so it may appear anywhere in the
    query string and does not need to be followed by another parameter.
    The value is returned exactly as written in the URL (no percent-decoding).

    Args:
        url: URL or href string containing a ``?code=...`` query.

    Returns:
        The text of the ``code`` parameter.

    Raises:
        ValueError: if ``url`` has no ``code`` query parameter.
    """
    query = url.partition('?')[2]
    for param in query.split('&'):
        key, _sep, value = param.partition('=')
        if key == 'code':
            return value
    raise ValueError("no 'code' query parameter in URL: %r" % (url,))

# Split the scraped [name, href] pairs into parallel lists of subject names
# and subject codes, then persist them as a two-column table.
subject = [entry[0] for entry in subjectareas]
code = [get_subject_code(entry[1]) for entry in subjectareas]

subjects = pd.DataFrame({ 'subject': subject, 'code': code })
subjects_csv_path = join(base_dir, "subjects.csv")
subjects.to_csv(subjects_csv_path)

In [43]:
# Reload the saved subject table; its first CSV column is used as the index.
subjects_csv = join(base_dir, "subjects.csv")
subjects = pd.read_csv(subjects_csv, index_col=0)
subjects.head()

Unnamed: 0,code,subject
0,ACCFIN,Accounting and Finance
1,ADED,Adult and Continuing Education
2,AMERST,American Studies
3,ARCH,Archaeology
4,ARTMED,Arts and Media Informatics


In [6]:
def get_course_code(url):
    """Return the raw value of the ``code`` query parameter of ``url``.

    Only the ``code`` parameter itself is returned, so trailing parameters
    (``...?code=X&other=y``) are not included in the result. The value is
    returned exactly as written in the URL (no percent-decoding).

    Args:
        url: URL or href string containing a ``?code=...`` query.

    Returns:
        The text of the ``code`` parameter.

    Raises:
        ValueError: if ``url`` has no ``code`` query parameter.
    """
    query = url.partition('?')[2]
    for param in query.split('&'):
        key, _sep, value = param.partition('=')
        if key == 'code':
            return value
    raise ValueError("no 'code' query parameter in URL: %r" % (url,))

courselist_url = "https://www.gla.ac.uk/coursecatalogue/courselist/?code=%s"
subject, code = subjects['subject'].values, subjects['code'].values

# Parallel lists with one entry per course link found:
#   subs         - name of the subject whose course list contained the link
#   codes        - code of that subject
#   course_codes - course code extracted from the link's href
subs, codes, course_codes = [], [], []

# Unpack each pair by name so the subject name and subject code cannot be
# swapped when they are appended below.
for subject_code, subject_name in zip(code, subject):
    url = courselist_url % subject_code
    # A timeout keeps a stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    r_soup = BeautifulSoup(r.text, "lxml")
    maincontent = r_soup.find('div', {'class':'maincontent'})
    # Every <li> of every <ul> inside the main content is inspected, in page order.
    for ul in maincontent.find_all('ul'):
        for li in ul.find_all('li'):
            a = li.find('a')
            # Skip list items without a usable link rather than crashing on them.
            if a is None or not a.get('href'):
                continue
            subs.append(subject_name)
            codes.append(subject_code)
            course_codes.append(get_course_code(a.get('href')))

In [12]:
# Patterns applied to the <h1> text of a course page. Raw strings keep the
# regex escapes from being read as (invalid) Python string escapes.
#   with_remark: title text, then "(<alphanumeric remark>)", then a trailing
#                uppercase/digit token (presumably the course code - confirm).
#   just_title:  title text followed directly by the trailing token.
with_remark = re.compile(r"([0-1\w\s'\.&:\-,\(\)]+)\s\(([0-9a-zA-Z]+)\)\s([A-Z0-9]+)")
just_title = re.compile(r"([0-1\w\s'\.&:\-,\(\)]+)\s([A-Z0-9]+)")

course_url = "https://www.gla.ac.uk/coursecatalogue/course/?code=%s"
details = []
# The loop variable is deliberately not named `code`, so it does not overwrite
# the subject-code array defined by an earlier cell.
for course_code in course_codes:
    url = course_url % course_code
    # A timeout keeps a stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    r_soup = BeautifulSoup(r.text, "lxml")
    maincontent = r_soup.find('div', {'class':'maincontent'})
    course_title = maincontent.find('h1').text

    remark = ''
    title = ''
    match_with_remark = with_remark.search(course_title)
    if match_with_remark:
        title = match_with_remark.group(1).strip()
        remark = match_with_remark.group(2).strip()
    else:
        match_just_title = just_title.search(course_title)
        if match_just_title:
            title = match_just_title.group(1).strip()
        else:
            # Unparseable heading: keep the record with an empty title and
            # print the URL so it can be inspected by hand.
            print(url)

    # Each <li> of the first <ul> is read as "Label: value". The text is split
    # on the first colon only, so a value that itself contains a colon is kept
    # whole; items without any colon are skipped instead of raising IndexError.
    detail = {}
    for li in maincontent.find('ul').find_all('li'):
        label, sep, value = li.text.partition(':')
        if not sep:
            continue
        detail[slugify(label.strip(), separator="_")] = value.strip()
    detail['remark'] = remark
    detail['title'] = title
    detail['code'] = course_code
    details.append(detail)

# Snapshot the raw scraped records as JSON.
with open(join(base_dir,'courses.json'), 'w') as outfile:
    json.dump(details, outfile, indent=4)

In [14]:
# Tabulate the scraped course records, snapshot them to CSV, and print a
# column summary.
courses_raw_path = join(base_dir,'courses_raw.csv')
courses_df = pd.DataFrame(details)
courses_df.to_csv(courses_raw_path)
courses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5244 entries, 0 to 5243
Data columns (total 11 columns):
academic_session                      5244 non-null object
available_to_erasmus_students         3650 non-null object
available_to_visiting_students        5244 non-null object
code                                  5244 non-null object
credits                               5244 non-null object
level                                 5244 non-null object
remark                                5244 non-null object
school                                5244 non-null object
taught_wholly_by_distance_learning    185 non-null object
title                                 5244 non-null object
typically_offered                     5244 non-null object
dtypes: object(11)
memory usage: 450.7+ KB


In [51]:
def yes_transform(x):
    """Return True only when ``x`` equals the string 'Yes'."""
    return x == 'Yes'

# Reload the raw snapshot; its first CSV column is used as the index.
courses_df = pd.read_csv(join(base_dir,'courses_raw.csv'), index_col = 0)

# Store the course level as a categorical column.
courses_df['level'] = pd.Categorical(values=courses_df['level'])

# Turn the two availability columns into booleans; any value other than
# 'Yes' (including a missing value) becomes False.
for flag_column in ('available_to_erasmus_students', 'available_to_visiting_students'):
    courses_df[flag_column] = courses_df[flag_column].apply(yes_transform)

courses_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5244 entries, 0 to 5243
Data columns (total 11 columns):
academic_session                      5244 non-null object
available_to_erasmus_students         5244 non-null bool
available_to_visiting_students        5244 non-null bool
code                                  5244 non-null object
credits                               5244 non-null int64
level                                 5244 non-null category
remark                                554 non-null object
school                                5244 non-null object
taught_wholly_by_distance_learning    185 non-null object
title                                 5244 non-null object
typically_offered                     5244 non-null object
dtypes: bool(2), category(1), int64(1), object(7)
memory usage: 384.5+ KB


In [54]:
# Matches a run of one or more uppercase ASCII letters.
reg = re.compile('([A-Z]+)')
def get_subject(value):
    """Return the first run of uppercase ASCII letters found in ``value``.

    Raises AttributeError when ``value`` contains no uppercase letter,
    because the failed search yields ``None``.
    """
    return reg.search(value).group(1)

# Derive a categorical ``subject`` column from each course code, save the
# enriched table, and print a column summary.
subject_labels = courses_df['code'].apply(get_subject)
courses_df['subject'] = pd.Categorical(values=subject_labels)
courses_1_path = join(base_dir,'courses_1.csv')
courses_df.to_csv(courses_1_path)
courses_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5244 entries, 0 to 5243
Data columns (total 12 columns):
academic_session                      5244 non-null object
available_to_erasmus_students         5244 non-null bool
available_to_visiting_students        5244 non-null bool
code                                  5244 non-null object
credits                               5244 non-null int64
level                                 5244 non-null category
remark                                554 non-null object
school                                5244 non-null object
taught_wholly_by_distance_learning    185 non-null object
title                                 5244 non-null object
typically_offered                     5244 non-null object
subject                               5244 non-null category
dtypes: bool(2), category(2), int64(1), object(7)
memory usage: 392.6+ KB


In [49]:
# Show the distinct subject values present in the course table.
courses_df['subject'].unique()

[ACCFIN, ADED, AMERST, ARCH, ARTMED, ..., TRS, UESTC, URBAN, VETSCI, VETMED]
Length: 64
Categories (64, object): [ACCFIN, ADED, AMERST, ARCH, ..., UESTC, URBAN, VETSCI, VETMED]

In [45]:
# Preview the first rows of the subject table.
subjects.head()

Unnamed: 0,code,subject
0,ACCFIN,Accounting and Finance
1,ADED,Adult and Continuing Education
2,AMERST,American Studies
3,ARCH,Archaeology
4,ARTMED,Arts and Media Informatics
