In [1]:
import pandas as pd
import json


In [2]:
def write_processed_dict(filename, dictionary):
    """Writes the @dictionary to a processed json data file @filename"""
    with open(f'../data/processed/{filename}.json', 'w') as json_file:
        json.dump(dictionary, json_file, indent=4)

In [3]:
df = pd.read_pickle('../data/pickle/dep.pickle')

In [4]:
prereq_for = df.prerequisite_for.apply(lambda x: " ".join(list(x)))

In [5]:
import re
regex = re.compile("([A-Za-z]{2,6}(?:-|/s)?\d{3}(?:\([A-Za-z0-9]+\))?)")

In [6]:
prereq_matches = prereq_for.apply(lambda x: regex.findall(x))
prereq = prereq_matches[prereq_matches.apply(lambda x: len(x) > 0)]
prereq

AR-219                                              [PENS-303]
BIO-203                [BIOENG-390, BIO-501, BIO-503, BIO-504]
BIO-204                [BIOENG-390, BIO-501, BIO-503, BIO-504]
BIOENG-315                                        [BIOENG-442]
CH-110                                        [CH-108, CH-109]
CIVIL-308                                          [CIVIL-404]
CIVIL-532                               [CIVIL-436, CIVIL-437]
COM-301                                      [COM-402, CS-523]
COM-401                                     [COM-401, COM-408]
COM-516                                              [COM-512]
CS-107                                                [CS-108]
CS-119(c)                                         [COM-112(a)]
CS-173                                        [CS-208, CS-209]
CS-207                        [CS-322, CS-206, CS-323, CS-305]
CS-212                                                [CS-323]
CS-308                                               [C

In [7]:
required = df.required.apply(lambda x: " ".join(list(x)))

req_matches = required.apply(lambda x: regex.findall(x))
req = req_matches[req_matches.apply(lambda x: len(x) > 0)]
req

AR-126                             [AR-125, PHYS-118, CIVIL-122]
AR-219                                      [MATH-124, MATH-126]
AR-366                                                  [AR-365]
AR-435                                                  [AR-484]
BIO-603(MS)                               [MICRO-561, MICRO-562]
                                     ...                        
MICRO-424      [MICRO-420, MICRO-421, MICRO-422, MICRO-522, M...
MICRO-504                                            [Micro-331]
MICRO-507                                    [Micro-454, ME-425]
MSE-424                              [MSE-203, MSE-205, MSE-310]
MSE-715                                                [MSE-637]
Name: required, Length: 111, dtype: object

In [8]:
recommended = df.recommended.apply(lambda x: " ".join(list(x)))

rec_matches = recommended.apply(lambda x: regex.findall(x))
rec_matches[rec_matches.apply(lambda x: len(x) > 0)]

AR-402(a)                                [AR-402]
AR-402(y)                             [AR-401(Y)]
AR-485                        [AR-487, AR-402(n)]
AR-487                                [AR-401(n)]
BIO-203                        [BIO-103, BIO-104]
                             ...                 
MSE-464      [MSE-230, MSE-205, MSE-310, MSE-304]
PENS-303                        [AR-219, ENV-140]
PHYS-313                                [HUM-315]
PHYS-314                                [HUM-315]
PHYS-750                               [PHYS-448]
Name: recommended, Length: 99, dtype: object

In [9]:
#Idea : concat together required and prereq_for
#first, inverse the required into prerequisite
inv_req = {}
for k, v in req.items():
    for e in v:
        _e = e.upper()
        inv_req[_e] = inv_req.get(_e, [])
        inv_req[_e].append(k.upper())

        
print(len(inv_req))
inv_req

147


{'AR-125': ['AR-126'],
 'PHYS-118': ['AR-126'],
 'CIVIL-122': ['AR-126'],
 'MATH-124': ['AR-219'],
 'MATH-126': ['AR-219'],
 'AR-365': ['AR-366'],
 'AR-484': ['AR-435'],
 'MICRO-561': ['BIO-603(MS)'],
 'MICRO-562': ['BIO-603(MS)'],
 'BIO-205': ['BIO-692', 'BIOENG-455'],
 'MATH-106': ['BIO-692', 'BIOENG-455'],
 'BIOENG-437': ['BIOENG-433'],
 'PHYS-101': ['BIOENG-455'],
 'BIOENG-448': ['BIOENG-457'],
 'CH-108(A)': ['CH-109(A)'],
 'CH-210': ['CH-411'],
 'CH-311': ['CH-411'],
 'CH-312': ['CH-411'],
 'CH-313': ['CH-411'],
 'CS-111': ['CIVIL-260'],
 'CS-119': ['CIVIL-261'],
 'CIVIL-225': ['CIVIL-306', 'CIVIL-423'],
 'CIVIL-203': ['CIVIL-306', 'CIVIL-308', 'CIVIL-448'],
 'CIVIL-304': ['CIVIL-306', 'CIVIL-448'],
 'CIVIL-103': ['CIVIL-306', 'CIVIL-308', 'CIVIL-448'],
 'CIVIL-223': ['CIVIL-306'],
 'CIVIL-308': ['CIVIL-404'],
 'CIVIL-234': ['CIVIL-414', 'CIVIL-449'],
 'CIVIL-321': ['CIVIL-423', 'CIVIL-449'],
 'MATH-251': ['CIVIL-423'],
 'CIVIL-420': ['CIVIL-449'],
 'CIVIL-522': ['CIVIL-449'],
 'C

In [10]:
for k,v in prereq.items():
    _k = k.upper()
    inv_req[_k] = inv_req.get(_k,[])
    for elem in v:
        inv_req[_k].append(elem.upper())

In [11]:
#merged version
print(len(inv_req))
inv_req

173


{'AR-125': ['AR-126'],
 'PHYS-118': ['AR-126'],
 'CIVIL-122': ['AR-126'],
 'MATH-124': ['AR-219'],
 'MATH-126': ['AR-219'],
 'AR-365': ['AR-366'],
 'AR-484': ['AR-435'],
 'MICRO-561': ['BIO-603(MS)'],
 'MICRO-562': ['BIO-603(MS)'],
 'BIO-205': ['BIO-692', 'BIOENG-455'],
 'MATH-106': ['BIO-692', 'BIOENG-455'],
 'BIOENG-437': ['BIOENG-433'],
 'PHYS-101': ['BIOENG-455'],
 'BIOENG-448': ['BIOENG-457'],
 'CH-108(A)': ['CH-109(A)'],
 'CH-210': ['CH-411'],
 'CH-311': ['CH-411'],
 'CH-312': ['CH-411'],
 'CH-313': ['CH-411'],
 'CS-111': ['CIVIL-260'],
 'CS-119': ['CIVIL-261'],
 'CIVIL-225': ['CIVIL-306', 'CIVIL-423'],
 'CIVIL-203': ['CIVIL-306', 'CIVIL-308', 'CIVIL-448'],
 'CIVIL-304': ['CIVIL-306', 'CIVIL-448'],
 'CIVIL-103': ['CIVIL-306', 'CIVIL-308', 'CIVIL-448'],
 'CIVIL-223': ['CIVIL-306'],
 'CIVIL-308': ['CIVIL-404', 'CIVIL-404'],
 'CIVIL-234': ['CIVIL-414', 'CIVIL-449'],
 'CIVIL-321': ['CIVIL-423', 'CIVIL-449'],
 'MATH-251': ['CIVIL-423'],
 'CIVIL-420': ['CIVIL-449'],
 'CIVIL-522': ['CIV

In [12]:
for key, value in inv_req.items():
     inv_req[key] = list(set(inv_req[key]))

In [13]:
print(len(inv_req))
inv_req

173


{'AR-125': ['AR-126'],
 'PHYS-118': ['AR-126'],
 'CIVIL-122': ['AR-126'],
 'MATH-124': ['AR-219'],
 'MATH-126': ['AR-219'],
 'AR-365': ['AR-366'],
 'AR-484': ['AR-435'],
 'MICRO-561': ['BIO-603(MS)'],
 'MICRO-562': ['BIO-603(MS)'],
 'BIO-205': ['BIOENG-455', 'BIO-692'],
 'MATH-106': ['BIOENG-455', 'BIO-692'],
 'BIOENG-437': ['BIOENG-433'],
 'PHYS-101': ['BIOENG-455'],
 'BIOENG-448': ['BIOENG-457'],
 'CH-108(A)': ['CH-109(A)'],
 'CH-210': ['CH-411'],
 'CH-311': ['CH-411'],
 'CH-312': ['CH-411'],
 'CH-313': ['CH-411'],
 'CS-111': ['CIVIL-260'],
 'CS-119': ['CIVIL-261'],
 'CIVIL-225': ['CIVIL-423', 'CIVIL-306'],
 'CIVIL-203': ['CIVIL-448', 'CIVIL-308', 'CIVIL-306'],
 'CIVIL-304': ['CIVIL-448', 'CIVIL-306'],
 'CIVIL-103': ['CIVIL-448', 'CIVIL-308', 'CIVIL-306'],
 'CIVIL-223': ['CIVIL-306'],
 'CIVIL-308': ['CIVIL-404'],
 'CIVIL-234': ['CIVIL-449', 'CIVIL-414'],
 'CIVIL-321': ['CIVIL-449', 'CIVIL-423'],
 'MATH-251': ['CIVIL-423'],
 'CIVIL-420': ['CIVIL-449'],
 'CIVIL-522': ['CIVIL-449'],
 'C

In [14]:
write_processed_dict('epfl_prereq',inv_req)