In [None]:
import json
import numpy as np
import pandas as pd

In [None]:
# Raw course catalogue, nested as level -> program -> course code -> course details.
# The encoding is pinned explicitly: without it open() falls back to the platform's
# locale encoding, which can garble non-ASCII text on systems whose default is not UTF-8.
with open("../data/raw/epfl.json", encoding="utf-8") as file:
    epfl = json.load(file)

# Specializations of the master's programs, keyed by program name.
with open("../data/raw/epfl_master_specs.json", encoding="utf-8") as file:
    epfl_masterspecs = json.load(file)

Snapshots from the json objects:

In [None]:
# Inspect one raw course record, addressed as level -> program -> course code.
epfl['master']['Data Science']['COM-480']

In [None]:
# Inspect the specialization entry of one master's program
# (later cells read the 'spec_key' and 'courses' fields of such entries).
epfl_masterspecs['Civil Engineering']

Let's start by noting that EPFL currently offers only 25 master's programs and only 13 bachelor programs, so our database contains some outdated programs. Let's remove them.

In [None]:
# Number of programs stored under each study level
dict(zip(epfl.keys(), map(len, epfl.values())))

In [None]:
# Programs present in the scraped data that EPFL no longer offers.

# Bachelor: only 13 programs are currently offered
# (plus the Humanities and Social Sciences Program and Design Together ENAC),
# see https://www.epfl.ch/education/bachelor/programs/
bachelor_not_current = [
    'Chemistry',
    'Chemical Engineering',
]

# Master: only 25 programs are currently offered
# (plus the Humanities and Social Sciences Program),
# see https://www.epfl.ch/education/master/programs/
master_not_current = [
    'Bioengineering',
    'Life Sciences and Technologies - master program',
    'Micro- and Nanotechnologies for Integrated Systems',
]

# Known gap: the following minors are missing, among possibly others:
#   Computational science and engineering
#   Mechanical engineering
# No action is taken for now.

In [None]:
# Drop the programs that are no longer offered from the main json object; a removed
# master's program is also dropped from the list of programs with specializations.
not_current_by_level = {'bachelor': bachelor_not_current, 'master': master_not_current}

# iterate over snapshots of the keys, since entries are deleted inside the loops
for level_name in list(epfl):
    outdated_programs = not_current_by_level.get(level_name, [])
    for program_name in list(epfl[level_name]):
        if program_name not in outdated_programs:
            continue
        del epfl[level_name][program_name]
        # specializations exist at master level only
        if level_name == 'master' and program_name in epfl_masterspecs:
            del epfl_masterspecs[program_name]

Let's now analyze master's specializations (there are no specializations at other levels) and evaluate data accuracy. We compare the data source (studyplan pages) with the studyplan brochures. Information about specializations is stored in a separate json object, `epfl_masterspecs` (loaded from `epfl_master_specs.json`).

In [None]:
# Programs whose specialization data is dropped, with the reason for each.
specs_to_remove = [
    # The studyplan page sometimes differs greatly from the up-to-date studyplan brochure,
    # which might indicate that the studyplan page hasn't been updated.
    # We skip these specializations for now (we would have to type them in manually):
    #
    #   Architecture
    #   https://www.epfl.ch/education/master/wp-content/uploads/2018/08/ENAC_ARCHI_MA-1.pdf
    #   vs.
    #   https://edu.epfl.ch/studyplan/en/master/architecture
    'Architecture',

    # The specializations legend on studyplan pages is sometimes obsolete (hasn't been removed).
    # We remove these programs from the list of specialization programs:
    #
    #   Materials Science and Engineering (only one specialization now)
    #   https://www.epfl.ch/education/master/wp-content/uploads/2018/08/STI_MX_MA-1.pdf
    #   vs.
    #   https://edu.epfl.ch/studyplan/en/master/materials-science-and-engineering
    'Materials Science and Engineering',

    #   Microengineering
    #   https://www.epfl.ch/education/master/wp-content/uploads/2018/08/STI_SMT_MA_RV-1.pdf
    #   vs.
    #   https://edu.epfl.ch/studyplan/en/master/microengineering
    'Microengineering',
]

# The specializations legend on studyplan pages is sometimes obsolete, but the studyplan itself
# contains references to specializations that correctly correspond to the studyplan brochure.
# In this case, we update the legend manually and fix the data:
#
#   Electrical and Electronics Engineering
#   https://www.epfl.ch/education/master/wp-content/uploads/2018/08/STI_EL_MA-1.pdf
#   vs.
#   https://edu.epfl.ch/studyplan/en/master/electrical-and-electronics-engineering
electrical_electronics_eng_specs = {
    "a": "Microelectronics circuits and systems",
    "b": "Electronic technologies and device-circuit interactions",
    "c": "Bioelectronics",
    "d": "Internet of Things (IoT)",
    "e": "Data Science and Systems",
    "f": "Signal, Image, Video and Communication",
    "g": "Wireless and Photonics Circuits and Systems",
}

We remove all programs in `specs_to_remove`:

In [None]:
# pop() with a default is used instead of `del` so that this cell can be re-run safely:
# a program that is already gone from `epfl_masterspecs` (removed by a previous run, or
# dropped earlier together with a program that is no longer offered) no longer raises KeyError.
for program in specs_to_remove:
    epfl_masterspecs.pop(program, None)

... and fix "Electrical and Electronics Engineering" specializations key:

In [None]:
# Overwrite the obsolete legend taken from the studyplan page with the manually
# curated `electrical_electronics_eng_specs` mapping (specialization code -> name).
epfl_masterspecs['Electrical and Electronics Engineering']['spec_key'] = electrical_electronics_eng_specs

We observe that some courses have listed specializations not present in the `spec_key` for the given program. We remove these references:

In [None]:
# For every course, keep only the specialization codes that exist in the program's
# `spec_key`; each course that had to be changed is reported before and after.
for program_name, program_spec in epfl_masterspecs.items():
    program_courses = program_spec['courses']
    for code, course_spec in program_courses.items():
        new_course_spec = [spec for spec in course_spec if spec in program_spec['spec_key']]
        if len(new_course_spec) == len(course_spec):
            # every referenced specialization is known, nothing to fix
            continue

        print(f"{program_name}, spec key: {list(program_spec['spec_key'].keys())}, {code}: {course_spec}")

        # overwriting the value of an existing key is safe while iterating over the dict
        program_courses[code] = new_course_spec

        print(f"Course spec changed from {course_spec} to {new_course_spec}")

Next, we add a `specs` dictionary to each program in `epfl_masterspecs` with keys and values switching roles compared to the `courses` property. This will make it easier to look up all courses with a given specialization.

In [None]:
# `courses` property: first ten (course code, specialization codes) pairs
list(epfl_masterspecs['Computer Science']['courses'].items())[:10]

In [None]:
# Invert `courses` (course code -> specialization codes) into
# `specs` (specialization code -> course codes) for every program.
for program_name, program_spec in epfl_masterspecs.items():
    # every specialization of the program starts with an empty course list
    courses_by_spec = {spec_code: [] for spec_code in program_spec['spec_key']}
    program_spec['specs'] = courses_by_spec

    for course_code, course_spec in program_spec['courses'].items():
        # register the course under each specialization it belongs to
        for spec_code in course_spec:
            courses_by_spec[spec_code].append(course_code)

In [None]:
# Course codes collected under specialization code 'g' of Computer Science
{'g': epfl_masterspecs['Computer Science']['specs']['g']}

Now, we have processed the specializations and can move on.

The next step is to create two new json objects by transforming the `epfl` object. The goal is to remove redundancies in the original object, since many courses occur in it repeatedly, and to enable easy access not only to programs but also to _individual courses_. The idea is to create one object, `epfl_courses`, with all course details and another object, `epfl_programs`, which only includes the course codes for each program.

In [None]:
# One pair per program, over all levels:
#   ([code1, code2, ...], [course1, course2, ...])
epfl_courses = [
    (list(program), list(program.values()))
    for level in epfl.values()
    for program in level.values()
]

In [None]:
# Merge the per-program pairs into two flat, parallel lists:
#   [[code1, code2, ...], [course1, course2, ...]]
# zip(*...) regroups the pairs into (all code lists, all course lists); each group is then
# flattened with a nested comprehension, which is linear in the number of courses
# (unlike sum(lists, []), which copies the growing result at every step).
# NOTE: this rebinds `epfl_courses`, so re-run the cell that builds the per-program
# pairs before re-running this one.
epfl_courses = [
    [item for program_items in column for item in program_items]
    for column in zip(*epfl_courses)
]

Let's analyze the list of courses for all programs. Let's use pandas to help us with that:

In [None]:
# One row per course occurrence, indexed by course code; a course listed by several
# programs therefore appears several times. Columns are inferred from the course
# records (presumably dicts of course fields -- verify against the raw json).
courses_df = pd.DataFrame(index=epfl_courses[0], data=epfl_courses[1])

In [None]:
# List the fields found across all course records
courses_df.columns

We observe there are a couple of fields we don't recognize, `coefficient` and `''`. Let's analyze those first.

In [None]:
# The '' column only ever holds NaN or the empty string,
# so it carries no information and can safely be deleted.
courses_df[''].unique()

In [None]:
# Rebind instead of dropping in place, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError.
courses_df = courses_df.drop(columns=[''], errors='ignore')

We observed that the _five_ rows having `''` field as the empty string are exactly the ones that don't have the number of credits listed, but we shall fix this soon (corresponding html element is empty).

The `coefficient` column is an alias for `credits`

In [None]:
# Total number of rows (course occurrences, repeated courses included)
len(courses_df)

In [None]:
# Number of rows without a `coefficient` value
courses_df.coefficient.isna().sum()

In [None]:
# Number of rows without a `credits` value
courses_df.credits.isna().sum()

In [None]:
# True if any row carries both a `credits` and a `coefficient` value.
# Testing presence directly is required here: comparing the two columns with `==`
# would miss rows in which both values are present but differ.
(courses_df.credits.notna() & courses_df.coefficient.notna()).any()

No rows have both `credits` and `coefficient`, and there are 5 rows which have neither. Let's first merge the columns and then manually fill in the 5 gaps if possible

In [None]:
# Fill the gaps in `credits` from `coefficient`, then discard the now-redundant column.
# The filled column is assigned back explicitly: an in-place fillna on the attribute
# `courses_df.credits` is a chained update, which pandas 2.x warns about and which has
# no effect on the DataFrame once Copy-on-Write is enabled.
courses_df['credits'] = courses_df['credits'].fillna(courses_df['coefficient'])
courses_df = courses_df.drop(columns='coefficient')

Let's now see the reason behind the 5 gaps:

In [None]:
# Course codes of the rows that still have no credits
credits_missing = courses_df['credits'].isna()
courses_df[credits_missing].index

* ENG-274 is without credits
* CH-361 is an optional course, pointing to one of the advanced general chemistry courses
* PENS-200 Ground control in Swiss law, credits are included in the ENAC week
* PHYS-300(a) is also without credits

We know that courses are repeated in the dataframe (row indices), but we need to check if the information is the same when course codes match. It is likely that they are the same, but we can't be sure because of the way the coursebooks were harvested (using program-specific query string parameters for fields in the right column of the page).

In [None]:
# Compare the number of distinct course codes with the total number of rows
f"{courses_df.index.nunique()} unique courses and {len(courses_df)} courses with duplicates"

Let's see if we have a unique row count of 1 for each course code:

In [None]:
# Columns whose cells are lists. Lists are unhashable, so nunique() fails on them with
#   TypeError: unhashable type: 'list'
# and these columns have to be converted to tuples first.
list_columns = [
    'lecturers',
    'required',
    'recommended',
    'concepts',
    'prerequisite_for',
    'in_the_programs',
]

In [None]:
# Make the list-valued columns hashable by turning every cell into a tuple.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; the per-cell result is the same.
courses_df[list_columns] = courses_df[list_columns].apply(lambda column: column.map(tuple))

# `lecturers` and `in_the_programs` hold lists of lists, so their inner lists
# have to be converted to tuples as well
for nested_column in ('lecturers', 'in_the_programs'):
    courses_df[nested_column] = courses_df[nested_column].map(
        lambda entries: tuple(map(tuple, entries))
    )

In [None]:
# `path` is left out of the comparison because it is always different;
# the remaining columns are grouped by course code (the index)
rows_without_path = courses_df.drop(columns='path')
group = rows_without_path.groupby(courses_df.index)

In [None]:
# Course codes whose repeated rows disagree in at least one field
distinct_counts = group.nunique()
has_conflict = (distinct_counts > 1).any(axis=1)
distinct_counts.loc[has_conflict]

There are very few exceptions that will require manual review, and we will handle them later. Let's get right into creating the processed `epfl_courses` dictionary and saving it to a json file.

In [None]:
# Keep only the first row encountered for each course code
is_repeat = courses_df.index.duplicated(keep='first')
courses_unique_df = courses_df[~is_repeat]

In [None]:
# Number of distinct courses that remain
len(courses_unique_df)

In [None]:
def _nan_to_none(value):
    """Return None for a float NaN and every other value unchanged."""
    # NaN is the only float that is not equal to itself
    return None if isinstance(value, float) and value != value else value


# course code -> {field: value}. Missing values (NaN) are converted to None so that
# they are later written to the json file as `null`: json.dump would otherwise emit
# the bare token `NaN`, which is not valid JSON and is rejected by strict parsers.
epfl_courses = {
    code: {field: _nan_to_none(value) for field, value in record.items()}
    for code, record in courses_unique_df.to_dict(orient='index').items()
}

In [None]:
# Spot-check one processed course record
epfl_courses["AR-126"]

In [None]:
# Persist the processed course details as pretty-printed json
with open('../data/processed/epfl_courses.json', 'w') as json_file:
    json_file.write(json.dumps(epfl_courses, indent=4))

Create `epfl_programs`

In [None]:
# `epfl_programs` mirrors the level -> program nesting of `epfl`,
# but keeps only the list of course codes for each program
epfl_programs = {
    level_name: {program_name: [*program] for program_name, program in level.items()}
    for level_name, level in epfl.items()
}

# persist the program -> course codes mapping as pretty-printed json
with open('../data/processed/epfl_programs.json', 'w') as json_file:
    json_file.write(json.dumps(epfl_programs, indent=4))

In [None]:
# Names of the bachelor programs that were kept
epfl_programs['bachelor'].keys()

In [None]:
# First ten course codes of one bachelor program
epfl_programs['bachelor']['Life Sciences Engineering'][:10]

Next step (goal for next milestone) is to process the registration data and integrate it with `epfl_courses`

In [None]:
# Raw registration data. `json` is already imported in the first cell of the notebook,
# so it is not imported again here. The encoding is pinned explicitly because open()
# otherwise falls back to the platform's locale encoding, which can garble non-ASCII text.
with open("../data/raw/isa-registrations.json", encoding="utf-8") as file:
    isa = json.load(file)

In [None]:
# Inspect the registration entry stored under one key
# (presumably a course name -- verify against the raw file)
isa['Data visualization']