In [1]:
# import requests
import os
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup as bsoup
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# credentials are read from the environment; either value is None when its variable is unset
GASPAR_NAME, GASPAR_PASS = (
    os.environ.get(variable) for variable in ("GASPAR_NAME", "GASPAR_PASS")
)

# only the user name is echoed, never the password
print(f"Your gaspar id is {GASPAR_NAME} and your password is a secret!")

Your gaspar id is loftsson and your password is a secret!


In [2]:
# course name -> academic year -> {program label: number of registered students}
registrations = dict()

In [3]:
# base URL of the ISA site; the action names below are appended to it
isa_base = "https://isa.epfl.ch/imoniteur_ISAP/"

# actions used for logging in, listing the courses and fetching one course report
isa_login_action = "!logins.tryToConnect"
isa_reports_inscriptions_cours = "!GEDREPORTS.filter"
isa_course_report = "!GEDREPORTS.bhtml"

# report model id sent as 'ww_i_reportmodel'
report_model = "2212045167"

# form fields posted to the login action
login_form = dict(
    ww_x_urlAppelant='isacademia.htm',
    ww_x_username=GASPAR_NAME,
    ww_x_password=GASPAR_PASS,
)

# query fields for the registration report; empty strings are the defaults
report_data = dict(
    ww_b_list='1',
    ww_i_reportmodel=report_model,
    ww_i_reportModelXsl='2212045204',  # html format
    ww_x_MAT='',  # course ID
    zz_x_PERIODE_ACAD='',  # academic period (<select> text), default all
    ww_x_PERIODE_ACAD='',  # academic period (<select> value), default all
)

# Matches text that ends with two 4-digit years separated by one character
# (e.g. "..., 2019-2020"). Written as a raw string so that \d is passed to the
# regex engine untouched instead of being an invalid string escape.
regex_years_end = re.compile(r'.+\d{4}.\d{4}$')


def _split_program_line(text):
    """Split a report line into ``(program, year, semester)``.

    Lines that end with an academic year (per ``regex_years_end``) carry no
    semester, so ``semester`` is ``''`` for them. Returns ``None`` when the
    line does not have the expected number of ``', '``-separated parts.
    """
    # the EPFL prefix only occurs on some lines and would add an extra comma part
    text = text.replace('Ecole polytechnique fédérale de Lausanne, ', '')

    if regex_years_end.match(text):
        # edoc (phd), no semester info
        parts = text.rsplit(', ', maxsplit=1)
        parts.append('')
    else:
        parts = text.rsplit(', ', maxsplit=2)

    if len(parts) != 3:
        return None
    return tuple(parts)


def _merge_semester(existing, semester):
    """Merge the last token of ``semester`` into the accumulated label ``existing``.

    Numeric tokens are kept in ascending order relative to the last token of
    ``existing``; non-numeric tokens are appended.
    """
    new_tokens = semester.split()
    old_tokens = existing.split()
    if not new_tokens:
        return existing  # nothing to add
    if not old_tokens:
        return semester  # nothing accumulated yet

    sem, oldsem = new_tokens[-1], old_tokens[-1]
    try:
        goes_last = int(sem) > int(oldsem)
    except ValueError:
        goes_last = True  # not numbers: just append

    if goes_last:
        return existing + ', ' + sem
    # cut off the whole previous token (it may have several digits) and
    # re-append it after the smaller one
    return existing.rstrip()[:-len(oldsem)] + sem + ', ' + oldsem


def get_isa_course_report(x_MAT, course_name):
    """Download one course's registration report and record its student counts.

    The request is made through the module-level ``session`` with a copy of
    ``report_data`` whose ``ww_x_MAT`` field is set to ``x_MAT``. Every
    ``td[colspan="2"]`` cell of the response is read in document order:
    cells without ``'ét.'`` name a program, an academic year and optionally a
    semester; a cell with ``'ét.'`` holds the student count for the program
    lines collected since the previous count cell.

    Counts are stored in ``registrations[course_name][year]`` under a key made
    of the collected ``"program, semester"`` labels joined by newlines.

    Args:
        x_MAT: course id sent as the ``ww_x_MAT`` field.
        course_name: key under which the counts are stored in ``registrations``.

    Returns:
        None; ``registrations`` is updated in place.

    Raises:
        requests.exceptions.Timeout: propagated from ``session.get`` when the
            30 second timeout is hit.
    """
    # send the requested id explicitly rather than relying on the caller
    # having written it into the shared dict beforehand
    payload = dict(report_data, ww_x_MAT=x_MAT)
    response = session.get(isa_base + isa_course_report, data=payload, timeout=30)
    soup = bsoup(response.text, 'html.parser')

    # sometimes the course name is duplicated, so reuse an existing entry
    reg = registrations.setdefault(course_name, {})

    programs = []
    semesters = []
    year = None

    # cells with info about the number of students per program, year, semester
    for el in soup.select('tr > td[colspan="2"]'):
        text = el.text

        if 'ét.' not in text:  # program name, year and semester info
            parsed = _split_program_line(text)
            if parsed is None:
                continue  # unexpected layout, nothing usable on this line
            program, year, semester = parsed
            reg.setdefault(year, {})

            if program in programs:
                idx = programs.index(program)
                # same program listed again: only merge if the semester differs
                if semesters[idx] != semester:
                    semesters[idx] = _merge_semester(semesters[idx], semester)
            else:
                programs.append(program)
                semesters.append(semester)

        else:
            if year is None:
                continue  # count cell before any program line: no year to file it under

            # we can't use lists as keys in a dictionary/json, but string - yes!
            programs_key = '\n'.join(
                p + ', ' + s if s else p
                for p, s in zip(programs, semesters)
            )
            reg[year][programs_key] = int(re.search(r'\d+', text)[0])
            programs = []
            semesters = []

In [195]:
# a = list(zip(['3082333', '556618501', '352250335', '1597304757', '80887096', '1597935627'], [
#     'Design of experiments (a) - Fall semester',
#  'Design of experiments (c) - Spring semester',
#  'Optimization and simulation',
#  'Machine Learning for Engineers',
#  'Product lifecycle management - concepts methods and tools',
#  'Topics in Autonomous Robotics']))

In [207]:
# for k, v in a:
#     for yr, val in registrations[v].items():
#         if 'total' in val:
#             print(val)

In [209]:
# # count total for each year
# for k, v in a:
#     for yr, val in registrations[v].items():
#         total = 0
#         for programs, count in val.items():
#             total += count

#         registrations[v][yr]['total'] = total

In [214]:
# registrations['Design of experiments (c) - Spring semester']

In [197]:
# for c, v in registrations.items():
#     for y, v2 in v.items():
#         for m, v3 in v2.items():
#             if m.endswith(', '):
#                 v2.pop(m)
#                 v2[m[:-2]] = v3

In [4]:
with requests.session() as session:
    # Log in first, then fetch the course list with the same session.
    # Every request gets a timeout: without one, requests waits indefinitely.
    login_response = session.post(isa_base + isa_login_action, data=login_form, timeout=30)
    login_response.raise_for_status()

    response = session.get(isa_base + isa_reports_inscriptions_cours, data=report_data, timeout=30)
    response.raise_for_status()
    soup = bsoup(response.text, 'html.parser')

    # get all ids and course names below
    # "Cliquez sur une des matières pour avoir les inscriptions"
    # NOTE(review): the [32:-42] slice assumes a fixed onclick layout around the id — confirm
    ww_x_MAT = [[link.get('onclick')[32:-42], link.text.strip()] for link in soup.find_all(class_='ww_x_MAT')]

    # (course id, course name) pairs whose report request timed out
    ww_x_MAT_timeouts = []

    # loop over course ids and names
    for x_MAT, course_name in ww_x_MAT:
        print(f"scraping {x_MAT}: {course_name}")
        report_data['ww_x_MAT'] = x_MAT

        try:
            get_isa_course_report(x_MAT, course_name)
        except requests.exceptions.Timeout:
            ww_x_MAT_timeouts.append((x_MAT, course_name))
            

scraping 1497045288: Numerical approximation of PDE's II
scraping 2829582199: "The land of thousand dances"
scraping 1580216797: (Hi)stories of technologies : from innovation to use
scraping 2079163482: 1st Workshop on Advances in CFD and MD modelling of Interface Dynamics in Capillary Two-Phase Flows
scraping 2083170884: 2D Layered Materials: Synthesis, Properties and Applications
scraping 2155744280: 2nd PhDs in Transitions Conference 2017 "Sustainability Transitions: Theory and Practice"
scraping 2238685617: 2nd Workshop on Advances in Theoretical and Computational Modelling of Interface Dynamics in Capillary Two-Phase Flows
scraping 127775050: 3D Electron Microscopy and FIB-Nanotomography
scraping 2325959939: 3D Printing with light
scraping 2739876452: 6th Machine learning in High Energy Physics Summer School
scraping 1695580335: A History of Evolutionary Theory
scraping 1951252224: A Network Tour of Data Science
scraping 2385096462: A Political History of Urban Form
scraping 17576

KeyboardInterrupt: 

In [98]:
# names of the courses whose report request timed out
[course_name for _course_id, course_name in ww_x_MAT_timeouts]

['Analysis I',
 'Analysis II',
 'Analysis III',
 'Analysis IV',
 'Computer-aided engineering I',
 'General chemistry',
 'General chemistry I (For MEU)',
 'General physics I',
 'General physics II',
 'General physics III',
 'Geometry',
 'Information, Computation, Communication',
 'Linear Algebra',
 'Numerical analysis',
 'Probabilities and statistics']

In [99]:
# Some requests timed out, so we need to narrow our search down to each academic year
# for these particular courses

# value-text pairs for <select name='ww_x_PERIODE_ACAD'>
ww_x_PERIODE_ACAD = {
    '1866895046': '2020-2021',
    '1866894985': '2019-2020',
    '1866893861': '2018-2019',
    '762820622': '2017-2018',
    '355925344': '2016-2017',
    '213638028': '2015-2016',
    '213637922': '2014-2015',
    '213637754': '2013-2014',
    '123456101': '2012-2013'
}

# (course id, course name) pairs that still time out when queried one year at a time;
# kept separate from ww_x_MAT_timeouts, which is being iterated over below
timeouts = []

try:
    for x_MAT, course_name in ww_x_MAT_timeouts:
        print(f"scraping {x_MAT}: {course_name}")
        report_data['ww_x_MAT'] = x_MAT

        # loop over each academic year
        for key, value in ww_x_PERIODE_ACAD.items():
            report_data['zz_x_PERIODE_ACAD'] = value
            report_data['ww_x_PERIODE_ACAD'] = key

            try:
                get_isa_course_report(x_MAT, course_name)
            except requests.exceptions.Timeout:
                timeouts.append((x_MAT, course_name))
finally:
    # restore the "all periods" defaults so later requests are not silently
    # restricted to the last academic year tried here
    report_data['zz_x_PERIODE_ACAD'] = ''
    report_data['ww_x_PERIODE_ACAD'] = ''

scraping 1705590: Analysis I
scraping 1705597: Analysis II
scraping 1772230: Analysis III
scraping 1772235: Analysis IV
scraping 1705560: Computer-aided engineering I
scraping 1772517: General chemistry
scraping 1772537: General chemistry I (For MEU)
scraping 1775427: General physics I
scraping 1775437: General physics II
scraping 1775452: General physics III
scraping 1773777: Geometry
scraping 1657719952: Information, Computation, Communication
scraping 1772095: Linear Algebra
scraping 1179245: Numerical analysis
scraping 1775642: Probabilities and statistics


In [127]:
# count total for each year
for course_name, course in registrations.items():
    for year, counts in course.items():
        # leave out a 'total' stored by an earlier run of this cell, otherwise
        # re-running would add the old total on top of the program counts
        counts['total'] = sum(
            count for label, count in counts.items() if label != 'total'
        )

In [215]:
# write the registrations as a single line of JSON followed by a newline
with open("../data/raw/isa_registrations.json", "w") as out_file:
    out_file.write(json.dumps(registrations) + "\n")

In [216]:
# each key of `registrations` is one course name
print("Number of distinct courses:", len(registrations.keys()))

Number of distinct courses: 4553
