In [None]:
import os
import requests
import json
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString

# Directory that receives every CSV/JSON artefact written by this notebook.
base_dir = "uog-courses"
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs(base_dir, exist_ok=True)

In [None]:
# Catalogue page that is fetched to collect the subject-area links.
browsebysubjectarea_url = "https://www.gla.ac.uk/coursecatalogue/browsebysubjectarea/"

In [None]:
# Scrape the subject-area index page into a list of [name, href] pairs.
subjectareas = []
# A timeout stops the cell from hanging forever on a stalled connection.
r = requests.get(browsebysubjectarea_url, timeout=30)
# Fail loudly on an HTTP error rather than trying to parse an error page.
r.raise_for_status()
r_soup = BeautifulSoup(r.text, "lxml")

maincontent = r_soup.find('div', {'class': 'maincontent'})
subject_list = maincontent.find('ul') if maincontent is not None else None
if subject_list is None:
    raise RuntimeError("Unexpected page layout at %s" % browsebysubjectarea_url)
li_subjectareas = subject_list.find_all('li')

for li in li_subjectareas:
    a = li.find('a')
    # List items without a usable link carry no subject information; skip them.
    if a is None or not a.get('href'):
        continue
    subjectareas.append([a.text.strip(), a.get('href')])

In [None]:
def get_subject_code(url):
    """Extract the subject code from a subject-area link.

    The code is the text that starts ``len("?code=")`` characters after the
    first ``?`` and runs up to the next ``&``. When no ``&`` follows, the rest
    of the URL is returned instead of raising.

    Args:
        url: Link of the form ``...?code=<CODE>[&...]``.

    Returns:
        The extracted code as a string.

    Raises:
        ValueError: If ``url`` contains no ``?``.
    """
    q_id = url.index('?') + len("?code=")
    amp_id = url.find('&', q_id)
    if amp_id == -1:
        # The code is the last query parameter: take everything that remains.
        return url[q_id:]
    return url[q_id:amp_id]

# Split the scraped [name, href] pairs into two parallel columns: the display
# name as-is and the code extracted from the link.
subject = [entry[0] for entry in subjectareas]
code = [get_subject_code(entry[1]) for entry in subjectareas]

# Persist the table so it can be read back from disk.
subjects = pd.DataFrame({'subject': subject, 'code': code})
subjects_csv_path = join(base_dir, "subjects.csv")
subjects.to_csv(subjects_csv_path)

In [None]:
# Read the subject table back from its CSV; the first column is the saved index.
subjects_csv_path = join(base_dir, "subjects.csv")
subjects = pd.read_csv(subjects_csv_path, index_col=0)
subjects.head()

In [None]:
def get_course_code(url):
    """Return everything that follows ``?code=`` in a course link.

    The slice starts ``len("?code=")`` characters after the first ``?`` and
    runs to the end of the string. A ``ValueError`` propagates from
    ``str.index`` when ``url`` has no ``?``.
    """
    query_start = url.index('?')
    return url[query_start + len("?code="):]

courselist_url = "https://www.gla.ac.uk/coursecatalogue/courselist/?code=%s"
subject, code = subjects['subject'].values, subjects['code'].values

# Parallel lists with one entry per course link found:
#   subs         -> subject name
#   codes        -> subject code
#   course_codes -> course code taken from the link
subs, codes, course_codes = [], [], []

# zip(code, subject) yields (subject_code, subject_name). Unpacking by name
# keeps the two from being swapped when they are appended below.
for subject_code, subject_name in zip(code, subject):
    url = courselist_url % subject_code
    # A timeout stops the loop from hanging on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error rather than parsing an error page.
    r.raise_for_status()
    r_soup = BeautifulSoup(r.text, "lxml")
    maincontent = r_soup.find('div', {'class': 'maincontent'})
    if maincontent is None:
        # Unexpected layout: report the page and carry on with the next subject.
        print(url)
        continue
    for ul in maincontent.find_all('ul'):
        for li in ul.find_all('li'):
            a = li.find('a')
            # Items without a link do not describe a course.
            if a is None or not a.get('href'):
                continue
            subs.append(subject_name)
            codes.append(subject_code)
            course_codes.append(get_course_code(a.get('href')))

In [None]:
# Page headings are matched as "<title> (<remark>) <TOKEN>" first and as
# "<title> <TOKEN>" otherwise. TOKEN is a run of upper-case letters and digits,
# presumably the course code (confirm against a live page).
# Raw strings keep the regex escapes from being read as invalid string escapes.
with_remark = re.compile(r"([0-1\w\s'\.&:\-,\(\)]+)\s\(([0-9a-zA-Z]+)\)\s([A-Z0-9]+)")
just_title = re.compile(r"([0-1\w\s'\.&:\-,\(\)]+)\s([A-Z0-9]+)")

course_url = "https://www.gla.ac.uk/coursecatalogue/course/?code=%s"
details = []
# The loop variable is deliberately not called `code`, so it cannot clobber the
# array of subject codes that an earlier cell stores under that name.
for course_code in course_codes:
    url = course_url % course_code
    # A timeout stops the loop from hanging on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error rather than parsing an error page.
    r.raise_for_status()
    r_soup = BeautifulSoup(r.text, "lxml")
    maincontent = r_soup.find('div', {'class': 'maincontent'})
    heading = maincontent.find('h1') if maincontent is not None else None
    if heading is None:
        # Unexpected layout: report the page and move on to the next course.
        print(url)
        continue
    course_title = heading.text

    remark = ''
    title = ''
    match_with_remark = with_remark.search(course_title)
    if match_with_remark:
        title = match_with_remark.group(1).strip()
        remark = match_with_remark.group(2).strip()
    else:
        match_just_title = just_title.search(course_title)
        if match_just_title:
            title = match_just_title.group(1).strip()
        else:
            # Heading matched neither pattern; print the URL for manual review.
            print(url)

    # The first <ul> holds "Label: value" items. Splitting on the FIRST colon
    # only keeps values that themselves contain a colon intact.
    detail = {}
    attribute_list = maincontent.find('ul')
    for li in (attribute_list.find_all('li') if attribute_list is not None else []):
        label, colon, value = li.text.partition(':')
        if not colon:
            # No "label: value" structure, so there is nothing to record.
            continue
        detail[slugify(label.strip(), separator="_")] = value.strip()
    detail['remark'] = remark
    detail['title'] = title
    detail['code'] = course_code
    details.append(detail)

with open(join(base_dir, 'courses.json'), 'w') as outfile:
    json.dump(details, outfile, indent=4)

In [None]:
# Turn the list of per-course dicts into a table, write it to CSV and print a
# column summary.
raw_csv_path = join(base_dir, 'courses_raw.csv')
courses_df = pd.DataFrame(data=details)
courses_df.to_csv(raw_csv_path)
courses_df.info()

In [None]:
# Reload the raw course table; the first CSV column is the saved index.
courses_df = pd.read_csv(join(base_dir, 'courses_raw.csv'), index_col=0)


def yes_transform(x):
    """Return the result of comparing ``x`` with the literal string 'Yes'."""
    return x == 'Yes'


# Category order used for the ordered `level` column.
level_order = [
    "Level 1 (SCQF level 7)",
    "Level 2 (SCQF level 8)",
    "Level 3 (SCQF level 9)",
    "Level 4 (SCQF level 10)",
    "Level 4 (BDS, BVMS, MBChB)",
    "Level 5 (SCQF level 11)",
    "Level 5 (BDS, BVMS, MBChB)",
    "Level 6 (SCQF level 12)",
]

courses_df['level'] = pd.Categorical(values=courses_df['level'],
                                     categories=level_order,
                                     ordered=True)
courses_df['school'] = pd.Categorical(values=courses_df['school'])

# Convert the two availability columns from text to booleans.
for flag_column in ('available_to_erasmus_students',
                    'available_to_visiting_students'):
    courses_df[flag_column] = courses_df[flag_column].apply(yes_transform)

courses_df.info()

In [None]:
# Matches the first run of one or more upper-case ASCII letters.
reg = re.compile('([A-Z]+)')


def get_subject(value):
    """Return the first run of upper-case letters found in ``value``.

    No guard is applied: when ``value`` contains no upper-case letter the
    search yields ``None`` and the attribute access raises ``AttributeError``.
    """
    return reg.search(value).group(1)

# Derive a categorical `subject` column from each course code, write the
# enriched table to CSV and print a column summary.
subject_per_course = courses_df['code'].apply(get_subject)
courses_df['subject'] = pd.Categorical(values=subject_per_course)

courses_1_path = join(base_dir, 'courses_1.csv')
courses_df.to_csv(courses_1_path)
courses_df.info()

In [None]:
# Distinct values that occur in the `level` column.
unique_levels = courses_df['level'].unique()
print(unique_levels)

In [None]:
# Keep only the columns needed for counting, ordered by subject.
course_counts = courses_df[['subject', 'level', 'code']].sort_values(by='subject')
# Order the subject table by its code column.
subjects = subjects.sort_values(by='code')

# Print the first rows of both tables, subjects first.
for preview in (subjects, course_counts):
    print(preview.head())

In [None]:
# Count courses per (subject, level) pair. Both keys are categorical, so
# observed=False is passed explicitly: unobserved combinations stay in the grid
# (filled with 0) and the result does not depend on the pandas default, which
# is deprecated and scheduled to change.
pivoted = pd.pivot_table(course_counts,
                         index='subject',
                         columns='level',
                         values='code',
                         aggfunc='count',
                         fill_value=0,
                         observed=False)

# Tall figure: the heatmap has one row per subject.
fig, ax = plt.subplots(figsize=(10, 25))
sns.heatmap(pivoted, annot=True, fmt="g", cmap='viridis', ax=ax)
# Put the level labels on top and rotate them, acting on `ax` directly instead
# of on pyplot's implicit "current axes".
ax.xaxis.tick_top()
ax.tick_params(axis='x', labelrotation=90)
plt.show()