# UCAS Grabber

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import math

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}

In [3]:
# UCAS has the feature to list all universities by the initial letter
# This part of the code grabs each university's representitive string

univ_strings = []
univ_names = []

# Find all universities with the given initial letter
def get_univ_strings_by_letter(letter):
    time.sleep(0.5)
    url = 'https://www.ucas.com/explore/unis?studyYear=current&letter=' + letter
    res = requests.get(url, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")

    univ_links = soup.find_all('a', class_="link-container__link link-container__hidden-link")
    for univ_link in univ_links:
        courses_page = univ_link['href'].replace('/explore/uni-detail/', '/explore/related/')
        univ_name = univ_link['title']
        univ_strings.append(courses_page)
        univ_names.append(univ_name)

letter_list = 'abcdefghiklmnopqrstuwy'     # for real
# letter_list = 'i'                        # for testing

for letter in letter_list:
    get_univ_strings_by_letter(letter)

# arrange data into a table and export to Microsoft Excel
univ_data = pd.DataFrame({'University': univ_names, 
                         'UCAS Link': univ_strings})
univ_data.to_excel('university-strings.xlsx')
univ_data.head(10)

Unnamed: 0,University,UCAS Link
0,University of Aberdeen,/explore/related/3a023828-8d77-ecfa-dc21-2708f...
1,Abertay University,/explore/related/1558119a-fa4d-ca42-ddbd-0694a...
2,Aberystwyth University,/explore/related/168d2cd8-ad19-7681-d665-453cf...
3,ACM (The Academy of Contemporary Music),/explore/related/3d3a3bb5-b96a-e00c-2682-29151...
4,Activate Learning,/explore/related/1630c359-4a88-1162-6be9-75a9b...
5,Al-Maktoum College of Higher Education,/explore/related/968ce5a3-c2a0-466b-8d0a-c4d99...
6,Amity University [IN] London,/explore/related/57f3e439-84c0-4e07-8e3e-92673...
7,Anglia Ruskin University,/explore/related/d890f3a3-4c8e-f241-747b-fa2f1...
8,AECC University College,/explore/related/d920c4e2-d428-8cbf-e0ff-2ec19...
9,Arden University,/explore/related/8554563c-d2bb-4bc5-b0eb-e05c7...


In [4]:
print("Number of Universities: " + str(len(univ_strings)))

print(univ_names)

Number of Universities: 333
['University of Aberdeen', 'Abertay University', 'Aberystwyth University', 'ACM (The Academy of Contemporary Music)', 'Activate Learning', 'Al-Maktoum College of Higher Education', 'Amity University [IN] London', 'Anglia Ruskin University', 'AECC University College', 'Arden University', 'Arts University Bournemouth', 'University Centre Askham Bryan', 'Aston University, Birmingham', 'Backstage Academy', 'Bangor University', 'Barnet and Southgate College', 'Barnfield College, Luton', 'Barnsley College Higher Education', 'Bath Spa University', 'University of Bath', 'Bedford College Group', 'University of Bedfordshire', 'BIMM Institute', 'Birkbeck, University of London', 'Birmingham City University', 'BMet (Birmingham Metropolitan College)', 'University College Birmingham', 'University of Birmingham', 'Bishop Auckland College', 'University Centre Bishop Burton', 'Bishop Grosseteste University', 'Blackburn College', 'Blackpool and the Fylde College', 'University 

In [63]:
target_univs = [#'University of Aberdeen', 
                #'Bath Spa University', 
                'University of Bath',  
                'University of Birmingham', 
                'University of Bristol', 
                #'Brunel University London', 
                'University of Cambridge', 
                #'Cardiff University', 
                #'University of Chester', 
                #'City, University of London', 
                #'Coventry University', 
                #'University for the Creative Arts',  
                #'University of Dundee', 
                'Durham University', 
                #'University of East Anglia UEA',
                'The University of Edinburgh', 
                #'University of Essex', 
                #'University of Exeter', 
                'University of Glasgow',
                #'University of Greenwich', 
                #'Heriot-Watt University', 
                #'University of Hull', 
                'Imperial College London', 
                #'Keele University', 
                #'University of Kent', 
                "King's College London, University of London", 
                'Lancaster University', 
                'University of Leeds', 
                #'University of Leicester', 
                #'University of Lincoln', 
                #'University of Liverpool', 
                #'London Metropolitan University', 
                'London School of Economics and Political Science, University of London', 
                'London School of Hygiene & Tropical Medicine, University of London', 
                'Loughborough University', 
                'University of Manchester', 
                #'Newcastle University', 
                'University of Nottingham', 
                #'Oxford Brookes University', 
                'University of Oxford', 
                #'University of Portsmouth', 
                'Queen Mary University of London', 
                #"Queen's University Belfast", 
                #'University of Reading', 
                #'Royal Academy of Music, University of London', 
                #'Royal Holloway, University of London', 
                #'University of St Andrews', 
                'University of Sheffield', 
                'University of Southampton', 
                #'University of Strathclyde', 
                #'University of Surrey', 
                #'University of Sussex', 
                #'Swansea University', 
                'UCL (University College London)', 
                'University of the Arts London', 
                'University of Warwick', 
                'University of York']

target_univ_strings = []

for univ in target_univs:
    if univ in univ_names:
        target_univ_strings.append(univ_strings[univ_names.index(univ)])

print('Number of target universities: ' + str(len(target_univ_strings)))

Number of target universities: 24


In [64]:
# UCAS also has the feature to list all courses provided by a university
# This part of the code grabs the links to all the course pages

urls = []

def get_course_links_by_univ(univ_string):
    url = 'https://www.ucas.com' + univ_string
    res = requests.get(url, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    
    page_info = soup.find('div', class_="explore__page-count").get_text()
    course_count = int(page_info.split(' ')[-1])
    page_number = math.ceil( course_count / 48 )
    
    print('\nRetrieving course list from ' + target_univs[target_univ_strings.index(univ_string)])
    print('Number of courses available: ' + str(course_count))
    for i in range(page_number):
        url = 'https://www.ucas.com' + univ_string + '&page=' + str(i+1)
        res = requests.get(url, headers=headers).text
        soup = BeautifulSoup(res, "html.parser")
        
        course_links = soup.find_all('a', class_="link-container__link sr-only")
        for course_link in course_links:
            course_page = course_link['href']
            urls.append(course_page)
        time.sleep(5)
        print('  Page '+ str(i+1) + ' of ' + str(page_number) + ' completed' )

for univ_string in target_univ_strings:
    get_course_links_by_univ(univ_string)
    time.sleep(30)

Retrieving course list from University of Bath
Number of courses available: 264
  Page 1 of 6 completed
  Page 2 of 6 completed
  Page 3 of 6 completed
  Page 4 of 6 completed
  Page 5 of 6 completed
  Page 6 of 6 completed
Retrieving course list from University of Birmingham
Number of courses available: 271
  Page 1 of 6 completed
  Page 2 of 6 completed
  Page 3 of 6 completed
  Page 4 of 6 completed
  Page 5 of 6 completed
  Page 6 of 6 completed
Retrieving course list from University of Bristol
Number of courses available: 285
  Page 1 of 6 completed
  Page 2 of 6 completed
  Page 3 of 6 completed
  Page 4 of 6 completed
  Page 5 of 6 completed
  Page 6 of 6 completed
Retrieving course list from University of Cambridge
Number of courses available: 47
  Page 1 of 1 completed
Retrieving course list from Durham University
Number of courses available: 152
  Page 1 of 4 completed
  Page 2 of 4 completed
  Page 3 of 4 completed
  Page 4 of 4 completed
Retrieving course list from The Univ

In [96]:
print("Number of Course Pages: " + str(len(urls)))

Number of Course Pages: 7138


In [97]:
# saving all urls for good
file_for_urls = open('file_for_urls.txt','w')

for url in urls:
    file_for_urls.write(url)
    file_for_urls.write('\n')

file_for_urls.close()

In [121]:
# get_info() handler takes each url and grabs the key information for each course
# The results are arranged into a pandas DataFrame

def get_info(url): 
    try:
        res = requests.get(url, headers=headers).text
        soup = BeautifulSoup(res, "html.parser")
    except:
        res = ''
        soup = BeautifulSoup(res, "html.parser")
        time.sleep(60)

    try:
        university = soup.find('h3', id="provider-name").get_text()
    except:
        university = ''

    try:
        course_title = soup.find('h1', id="course-title").get_text()
    except:
        course_title = ''

    try:
        course_code = soup.find('dd',id='application-code').get_text()
        institution_code = soup.find('dd',id='institution-code').get_text()
    except:
        course_code = ''
        institution_code = ''

    try:
        qlf_tag = soup.find('label', {'for': 'qualification-type'})
        qualification = qlf_tag['data-options-bar-item-value']
    except:
        qualification = ''

    try:
        dur_tag = soup.find('label', {'for': "duration"})
        duration = dur_tag['data-options-bar-item-value']
    except:    
        duration = ''

    try:
        univ_link = soup.find('a', id='ProviderCourseUrl')['href']
    except:
        univ_link = ''

    tariff = ''
    a_level = ''
    a_level_detail = ' '

    try:
        entry_reqs = soup.find_all('h2', class_='accordion__label')
        details = soup.find_all('div', class_='accordion__inner-wrapper')
        for entry_req in entry_reqs:
            req = entry_req.get_text()
            if req.startswith('UCAS Tariff'):
                tariff = req.replace('UCAS Tariff - ', '')
            if req.startswith('A level'):
                a_level = req.replace('A level - ', '')
                idx = entry_reqs.index(entry_req)
                a_level_detail = details[idx].get_text()
    except:
        pass

    
    result = pd.DataFrame({'University': [university], 
                           'Course Title': [course_title], 
                           'Course Code': [course_code], 
                           'Institution Code': [institution_code], 
                           'Qualification': [qualification], 
                           'Duration': [duration],
                           'UCAS Tariff': [tariff],
                           'A-Level Requirement': [a_level], 
                           'A-Level Details': [a_level_detail[1:]],
                           'Course Page': [univ_link]
                           })
    time.sleep(2)
    return result

In [111]:
# reset ucas_data
# do not run this part unless a full update is required
ucas_data = pd.DataFrame()

In [None]:
# main task for the grabber
# each url linking to the UCAS course page is being called in turn
# information about the course is extracted and collected in a data table
for i in range(0, len(urls)):
    url = urls[i]
    result = get_info(url)
    if i % 12 == 0:
        percentage = i/len(urls)*100
        print('{:.2f}'.format(percentage) + '% completed.')
        time.sleep(40)
    ucas_data = ucas_data.append(result)

ucas_data.head(10)

0.00% completed.
0.17% completed.
0.34% completed.
0.50% completed.
0.67% completed.
0.84% completed.
1.01% completed.
1.18% completed.
1.34% completed.
1.51% completed.
1.68% completed.
1.85% completed.
2.02% completed.
2.19% completed.
2.35% completed.
2.52% completed.
2.69% completed.
2.86% completed.
3.03% completed.
3.19% completed.
3.36% completed.
3.53% completed.
3.70% completed.
3.87% completed.
4.03% completed.
4.20% completed.
4.37% completed.
4.54% completed.
4.71% completed.
4.88% completed.
5.04% completed.
5.21% completed.
5.38% completed.
5.55% completed.
5.72% completed.
5.88% completed.
6.05% completed.
6.22% completed.
6.39% completed.
6.56% completed.
6.72% completed.
6.89% completed.
7.06% completed.
7.23% completed.
7.40% completed.
7.57% completed.
7.73% completed.
7.90% completed.
8.07% completed.
8.24% completed.
8.41% completed.
8.57% completed.
8.74% completed.
8.91% completed.
9.08% completed.
9.25% completed.
9.41% completed.
9.58% completed.
9.75% complete

In [113]:
ucas_data.shape

(22, 10)

In [125]:
# Relabel the indices for the entries
# Export the data to an Microsoft Excel file

# ucas_data.reset_index(inplace=True)

ucas_data.to_excel('UCAS_data.xlsx')

ucas_data.head(5)
                                                 

Unnamed: 0,University,Course Title,Course Code,Institution Code,Qualification,Duration,UCAS Tariff,A-Level Requirement,A-Level Details,Course Page
0,University of Bath,Accounting and Finance,NN34,B16,Bachelor of Social Science (with Honours) - BS...,3 years,Not accepted,AAA - A*AB,Typical offer\nAAA or A*AB in three A levels i...,https://www.bath.ac.uk/courses/undergraduate-2...
0,University of Bath,Accounting and Finance (with professional plac...,NN43,B16,Bachelor of Science (with Honours) - BSc (Hons),4 years,Not accepted,AAA - A*AB,Typical offer\nAAA or A*AB in three A levels i...,https://www.bath.ac.uk/courses/undergraduate-2...
0,University of Bath,Advanced Quantitative Methods in Social Scienc...,,,Masters in Research (MRes),1 year,,,,https://www.bath.ac.uk/courses/postgraduate-20...
0,University of Bath,Aerospace Engineering,H400,B16,Master of Engineering (with Honours) - MEng (Hon),4 years,Not accepted,A*AA,Typical offer: A*AA in three A levels includin...,https://www.bath.ac.uk/courses/undergraduate-2...
0,University of Bath,Aerospace Engineering (with placement).,H423,B16,Master of Engineering (with Honours) - MEng (Hon),5 years,Not accepted,A*AA,Typical offer: A*AA in three A levels includin...,https://www.bath.ac.uk/courses/undergraduate-2...


In [119]:
# testing: check if the grabber can retrieve information from a single url
test = get_info(urls[23])
test.head()

Unnamed: 0,University,Course Title,Course Code,Institution Code,Qualification,Duration,UCAS Tariff,A-Level Requirement,A-Level Details,Course Page
0,University of Bath,Chemical Engineering (with placement),H814,B16,Bachelor of Engineering (with Honours) - BEng ...,4 years,Not accepted,A*AA,A*AA in three A-Levels including Chemistry and...,https://www.bath.ac.uk/courses/undergraduate-2...


In [94]:
urls[3552]

'https://digital.ucas.com/coursedisplay/courses/d7a3ff36-c292-36a5-099f-46a3ea19904f?academicYearId=2022&courseOptionId=5abc416e-84f7-4410-89bf-42858e827e21'