In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the three pre-processed EPFL datasets into plain Python objects.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's default encoding, which makes the result of decoding any non-ASCII
# text machine-dependent. Assumes the files are UTF-8 (the usual JSON encoding).
with open("../data/processed/epfl_programs.json", encoding="utf-8") as file:
    epfl_programs = json.load(file)
with open("../data/processed/epfl_courses.json", encoding="utf-8") as file:
    epfl_courses = json.load(file)
with open("../data/processed/epfl_studyplans.json", encoding="utf-8") as file:
    epfl_studyplans = json.load(file)

In [None]:
# Turn each loaded JSON mapping into a DataFrame whose rows are the mapping's
# top-level keys (orient='index').
programs, courses, studyplans = (
    pd.DataFrame.from_dict(raw, orient='index')
    for raw in (epfl_programs, epfl_courses, epfl_studyplans)
)

## Programs

In [None]:
# Preview the first five rows of the transposed programs table
# (head() defaults to five rows).
programs.transpose().head()

#### Calculating the number of courses available per education level and program

In [None]:
# Replace every cell of `programs` by the size of its content: missing cells
# are first filled with 0 and stay 0, every other cell becomes len(cell).
# The result is transposed so that the index of `programs` becomes the columns.
program_counts = (
    programs
    .fillna(0)
    .applymap(lambda cell: len(cell) if cell != 0 else cell)
    .transpose()
)
program_counts.head(10)

In [None]:
# For each level (a row label of `programs`, hence a column of `program_counts`)
# keep only the entries whose count is non-zero.
levels = programs.index
level_dict = {
    level: program_counts[level].loc[lambda counts: counts != 0]
    for level in levels
}

#### Plotting the distribution of programs with a given number of courses, per education level

In [None]:
# One histogram panel per education level, placed on a fixed 3 x 2 grid.
rows, columns = 3, 2

fig = plt.figure(figsize=(20, 15))
fig.suptitle('Number of programs with a given number of courses, per education level')

for i, level in enumerate(levels):
    ax = fig.add_subplot(rows, columns, i+1)

    # Bin edges every 10 courses from 0 to 130; the same values label the x axis.
    ax.hist(level_dict[level], bins=range(0,140,10))
    ax.set(
        title=level,
        xticks=range(0,140,10),
        yticks=range(12),
        xlabel='# of courses',
        ylabel='# of programs',
    )

fig.tight_layout(pad=5.0)
plt.show()

There seem to be only two programs with >100 courses available - one at the bachelor level and one at the master level. Let's check which ones they are!

In [None]:
# Bachelor-level entries with more than 90 courses.
program_counts.loc[program_counts['bachelor'] > 90, 'bachelor']

In [None]:
# Master-level entries with more than 120 courses.
program_counts.loc[program_counts['master'] > 120, 'master']

## Courses

In [None]:
# Preview the first five rows of the courses table (head() defaults to five).
courses.head()

#### Distribution of exam forms:

In [None]:
# Number of courses per exam form, shown as a single row labelled 'course_count'.
# The column is named directly in to_frame() instead of renaming 'exam_form'
# afterwards: the name value_counts() gives its result differs between pandas
# versions ('exam_form' before 2.0, 'count' from 2.0 on), so the rename
# silently did nothing on recent pandas.
# After the transpose, the empty-string exam form is relabelled 'None'.
(
    courses['exam_form']
    .value_counts()
    .to_frame(name='course_count')
    .T
    .rename(columns={'':'None'})
)

#### Number of courses with a given number of credits:

In [None]:
# Number of courses per credit value, shown as a single row labelled 'count'.
# Missing credits are treated as 0 and values are cast to int before counting;
# sort_index() orders the columns by credit value.
(
    courses['credits']
    .fillna(0)
    .astype(int)
    .value_counts()
    .sort_index()
    .to_frame(name='count')
    .T
)

#### Distribution of the number of required courses per course 
(Computed before preprocessing the 'required' column. Preprocessing will mostly raise the low counts, because lists of courses that are currently not separated will be split, and lower the high counts, because descriptive requirements are currently treated as required courses.)

In [None]:
# Store the length of each course's 'required' entry in a new column
# (this adds 'required_count' to `courses` in place).
courses['required_count'] = courses['required'].map(len)

# How many courses have each 'required' length, ordered by length.
(
    courses['required_count']
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

The courses with more than 4 prerequisites:

In [None]:
# Show the 'required' column without truncation. None is the supported
# "no limit" value for display.max_colwidth; the previous -1 was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Courses whose 'required' entry has more than 4 items, largest first.
courses.loc[
    courses['required_count'] > 4,
    ['required_count', 'required'],
].sort_values(by=['required_count'], ascending=False)

As we can see, MICRO-424 has the only properly formatted list of required courses. The other entries contain descriptive requirements or a mix of both. This is to be cleaned up by us for Milestone 2.

## Studyplans

In [None]:
# Restore pandas' default column width so the following tables are truncated
# normally again.
pd.reset_option('display.max_colwidth')
# Display the studyplans table as the cell output.
studyplans

#### Distribution of programs available at a given level

In [None]:
# Count the non-null cells in each row of `studyplans` and show the counts as
# a single row labelled 'count'.
studyplans.notnull().sum(axis=1).to_frame(name='count').T