In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
def read_raw(filename, subdir='raw'):
    """Load ``../data/<subdir>/<filename>.json`` and return the parsed content.

    Parameters
    ----------
    filename : str
        File name without the ``.json`` extension.
    subdir : str, optional
        Directory below ``../data`` to read from (``'raw'`` by default).

    Returns
    -------
    The decoded JSON document, i.e. whatever ``json.load`` yields for the file.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(f'../data/{subdir}/{filename}.json', encoding='utf-8') as file:
        return json.load(file)

def write_processed_dict(filename, dictionary):
    """Serialise @dictionary as 4-space indented JSON into ../data/processed/@filename.json"""
    target_path = f'../data/processed/{filename}.json'
    with open(target_path, 'w') as out:
        json.dump(dictionary, out, indent=4)

In [26]:
# Load the pickled DataFrame; its index is kept separately because a later cell
# checks link endpoints against it.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
df = pd.read_pickle('../data/pickle/dep.pickle')
courseindex = df.index

In [None]:
# Flatten every `prerequisite_for` entry into one space-separated string for regex scanning
prereq_for = df["prerequisite_for"].apply(" ".join)

In [None]:
import re
# Course-code matcher: 2-6 letters, optional hyphens/whitespace, three digits and an
# optional parenthesised alphanumeric suffix, e.g. "COM-480", "cs 101" or "PHYS-106(en)".
# Raw string: "\s", "\d" and "\(" are invalid escape sequences in a normal string literal.
regex = re.compile(r"([A-Za-z]{2,6}[-\s]*\d{3}(?:\s*\([A-Za-z0-9]+\))?)")

In [None]:
# Extract every course code mentioned per row, then keep only the rows with at least one match
prereq_matches = prereq_for.apply(regex.findall)
prereq = prereq_matches[prereq_matches.apply(len) > 0]
prereq

In [None]:
# Same extraction for the `required` column: join, scan for course codes, drop empty rows
required = df["required"].apply(" ".join)

req_matches = required.apply(regex.findall)
req = req_matches[req_matches.apply(len) > 0]
req

In [None]:
# Same extraction for the `recommended` column; the non-empty rows are only displayed
recommended = df["recommended"].apply(" ".join)

rec_matches = recommended.apply(regex.findall)
rec_matches[rec_matches.apply(len) > 0]

In [None]:
# Goal: combine `required` and `prerequisite_for` into a single mapping.
# Step 1: invert `required`, so that each required course lists the courses requiring it.
inv_req = {}
for course, required_codes in req.items():
    dependant = course.upper()
    for required_code in required_codes:
        inv_req.setdefault(required_code.upper(), []).append(dependant)

print(len(inv_req))
inv_req

In [None]:
# Step 2: add the `prerequisite_for` matches, which already point in the right direction
for course, targets in prereq.items():
    inv_req.setdefault(course.upper(), []).extend(code.upper() for code in targets)

In [None]:
# Merged version: inv_req now also contains the `prerequisite_for` matches
print(len(inv_req))
inv_req

In [None]:
# Drop duplicate targets. Sorting keeps each list (and the JSON written from it)
# in a stable order between runs, which a bare list(set(...)) does not guarantee.
for key in inv_req:
    inv_req[key] = sorted(set(inv_req[key]))

In [None]:
# Number of keys and full contents once duplicates were removed from every list
print(len(inv_req))
inv_req

In [None]:
# Persist the combined mapping as ../data/processed/epfl_prereq.json
write_processed_dict('epfl_prereq',inv_req)

In [None]:
def fix_name(code):
    """Normalise a course code to the ``LETTERS-DIGITS[(SUFFIX)]`` form, upper-cased.

    Hyphens and whitespace of any kind (spaces, tabs, newlines) are removed, then a
    single hyphen is inserted in front of the first digit, so that ``'com 480(a)'``,
    ``'  com-480  (A)'`` and ``'com480 (A)'`` all become ``'COM-480(A)'``.

    Parameters
    ----------
    code : str
        Raw code as found in the data.

    Returns
    -------
    str
        The normalised code. Input without any digit is only upper-cased, since it
        does not correspond to a course code.
    """
    upper = code.upper()
    # Leave the entries that don't correspond to a course alone.
    if re.search(r"\d", upper) is None:
        return upper
    # The extraction regex accepts any whitespace between the parts, so strip all of
    # it here rather than only the plain space character.
    compact = re.sub(r"[-\s]", "", upper)
    # Split at the first digit; removing separators cannot remove digits, so the
    # search always succeeds at this point.
    index = re.search(r"\d", compact).start()
    return compact[:index] + '-' + compact[index:]

In [None]:
def dict_to_df(x):
    """Convert a ``{course_code: [codes...]}`` mapping into a two-column DataFrame.

    Parameters
    ----------
    x : dict
        Maps a course code to the list of course codes it is a prerequisite for.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``x`` (in the dict's order), default integer index, with
        the columns ``course_code`` and ``prerequisite_for`` (a list per row). Every
        code is passed through ``fix_name``; ``None`` entries are dropped.
    """
    # Build the rows straight from the dict items: going through a wide DataFrame
    # would pad ragged lists with a missing-value sentinel that then has to be
    # filtered out again.
    course_codes = []
    target_lists = []
    for code, targets in x.items():
        course_codes.append(fix_name(code))
        target_lists.append([fix_name(target) for target in targets if target is not None])
    return pd.DataFrame({'course_code': course_codes, 'prerequisite_for': target_lists})

In [None]:
# Turn the combined mapping into a (course_code, prerequisite_for) table and preview it
inv_df = dict_to_df(inv_req)
inv_df.head()

In [None]:
# Load the mapping stored in epfl_prereq_byhand.json and bring it into the same tabular shape
with open('../data/epfl_prereq_byhand.json') as handle:
    by_hand = dict_to_df(json.load(handle))
by_hand.head()

In [None]:
# Number of course codes in each of the two tables, for a quick comparison
for table in (inv_df, by_hand):
    print(len(table['course_code']))

In [None]:
def merge(df1, df2):
    """Outer-join two prerequisite tables and union their ``prerequisite_for`` lists.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with a ``course_code`` column and a ``prerequisite_for`` column
        holding a list of course codes per row.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``course_code`` with a single ``prerequisite_for`` column that
        holds the sorted, de-duplicated union of both inputs' lists. A course that
        is present in only one input keeps that input's list.
    """
    def _as_list(value):
        # A course missing on one side of the outer join has a missing value
        # (NaN/None) instead of a list; type-check rather than compare identity
        # with np.nan, and treat anything that is not a collection as "no targets".
        return list(value) if isinstance(value, (list, tuple, set)) else []

    merged = df1.merge(df2, on='course_code', how='outer')
    combined = [
        sorted(set(_as_list(left) + _as_list(right)))
        for left, right in zip(merged['prerequisite_for_x'], merged['prerequisite_for_y'])
    ]
    # Build a fresh frame instead of assigning into a column slice of `merged`.
    return pd.DataFrame(
        {'prerequisite_for': combined},
        index=pd.Index(merged['course_code'], name='course_code'),
    )

In [None]:
# Union the extracted table with the one loaded from epfl_prereq_byhand.json, then preview it
merged = merge(inv_df, by_hand)
merged.head()

In [None]:
def test_method():
    """Check that fix_name maps several spellings of one code to 'COM-480(A)'.

    Prints the first mismatch and returns False; returns True when every variant
    is normalised as expected.
    """
    good_code = 'COM-480(A)'
    wrong_codes = ['com 480(a)', 'COM  480(a)', 'com480 (A)', '  com-480  (A)', 'COM-480(a)']
    for fixed in map(fix_name, wrong_codes):
        if fixed == good_code:
            continue
        print("%s != %s"% (fixed, good_code))
        return False
    return True
test_method()

In [None]:
# Order the rows by their index (the course code) before previewing
merged = merged.sort_index()
merged.head()

In [None]:
# Collapse rows that share a course code into one sorted, duplicate-free list of targets
merged = (
    merged
    .groupby(['course_code'])['prerequisite_for']
    .apply(lambda lists: sorted({code for targets in lists for code in targets}))
)
merged

In [None]:
# Write into ../data/processed/, which is where the following cell reads
# merged_prerequisites.json back from (the file used to be written one level up).
merged.to_json(path_or_buf='../data/processed/merged_prerequisites.json', orient='index')

In [31]:
# Reload ../data/processed/merged_prerequisites.json as a plain dict through the shared JSON reader
merged = read_raw('merged_prerequisites', 'processed')



In [64]:
# One {"source", "target"} record per (course, course it is a prerequisite for) pair
links = [
    {"source": source, "target": target}
    for source, targets in merged.items()
    for target in targets
]

# `links` is a list of records, so use the DataFrame constructor (from_dict is meant
# for dicts); naming the columns keeps `source`/`target` present even with no links.
links_df = pd.DataFrame(links, columns=["source", "target"])

In [65]:
# Keep the links for which at least one endpoint is absent from courseindex.
# NOTE(review): this selects the dangling links, and links_df is what gets exported to
# links.json further down - confirm that dropping the fully-known links is intended.
source_known = links_df.source.isin(courseindex)
target_known = links_df.target.isin(courseindex)
links_df = links_df[~(source_known & target_known)]

Unnamed: 0,source,target
6,AR-126,AR-201(N)
14,AR-226,AR-423(A)
16,AR-241,AR-201(N)
21,AR-242,AR-201(N)
26,AR-401(B),AR-402(B)
...,...,...
1104,PHYS-106(EN),EE-456
1105,PHYS-106(EN),EE-548
1106,PHYS-106(EN),EE-580
1107,PHYS-106(EN),EE-585


In [53]:
# orient="table" stores a JSON Table Schema document: an object with a "schema" entry
# and the rows under "data", so readers must take the link rows from the "data" key.
links_df.to_json(path_or_buf='../data/processed/links.json', orient="table")

In [3]:
# Reload the master document and the exported links from ../data/processed/
master = read_raw('master', 'processed')
links = read_raw('links', 'processed')

In [13]:
# links.json may be a pandas "table" export ({"schema": ..., "data": [...]}), a plain
# {row_label: record} mapping, or a bare list of records. Only the link records belong
# in master; taking .values() of a table export would store [schema, rows] instead.
if isinstance(links, dict) and "schema" in links and "data" in links:
    link_records = [
        {"source": record["source"], "target": record["target"]}
        for record in links["data"]
    ]
elif isinstance(links, list):
    link_records = links
else:
    link_records = list(links.values())
master["links"] = link_records

In [15]:
# Overwrite ../data/processed/master.json with the copy that now carries the "links" entry
write_processed_dict("master", master)