In [None]:
import pandas as pd
import json
import numpy as np
from helpers import read_raw, write_processed_dict

In [None]:
# Load the pickled course table and keep a handle on its index.
# NOTE(review): presumably the index holds the course codes, since later cells
# compare extracted codes against it — confirm.
df = pd.read_pickle('../data/pickle/dep.pickle')
courseindex = df.index

In [None]:
# Collapse each course's `prerequisite_for` entries into one space-separated string.
prereq_for = df['prerequisite_for'].map(lambda entries: " ".join(entries))

In [None]:
import re

# Course-code pattern: 2-6 letters, optional dashes/whitespace, exactly three
# digits, and an optional parenthesised alphanumeric suffix such as "(a)".
# Written as a raw string so that \s, \d and \( reach the regex engine verbatim
# instead of being treated as (invalid) Python string escapes.
regex = re.compile(r"([A-Za-z]{2,6}[-\s]*\d{3}(?:\s*\([A-Za-z0-9]+\))?)")

In [None]:
# Extract every course code from the flattened strings, then keep only the
# courses for which at least one code was found.
prereq_matches = prereq_for.map(regex.findall)
prereq = prereq_matches[prereq_matches.map(len) > 0]
prereq

In [None]:
# Same extraction for the `required` column: flatten, find codes, drop empty results.
required = df['required'].map(lambda entries: " ".join(entries))

req_matches = required.map(regex.findall)
req = req_matches[req_matches.map(len) > 0]
req

In [None]:
# Same extraction for the `recommended` column; the non-empty matches are only displayed.
recommended = df['recommended'].map(lambda entries: " ".join(entries))

rec_matches = recommended.map(regex.findall)
rec_matches[rec_matches.map(len) > 0]

In [None]:
# Idea: combine `required` and `prerequisite_for` into a single mapping.
# First step: invert `req` (course -> codes it requires) into
# "required code -> courses that list it", with every code upper-cased.
inv_req = {}
for course, required_codes in req.items():
    for code in required_codes:
        inv_req.setdefault(code.upper(), []).append(course.upper())


print(len(inv_req))
inv_req

In [None]:
# Second step: add the codes found in `prerequisite_for` under each course's
# own (upper-cased) code, creating the entry when it does not exist yet.
for course, targets in prereq.items():
    inv_req.setdefault(course.upper(), []).extend(code.upper() for code in targets)

In [None]:
# Merged version: the mapping now also contains the links taken from
# `prerequisite_for`. Show its size, then its full contents.
print(len(inv_req))
inv_req

In [None]:
# Remove duplicate targets from every entry. The result is sorted so that the
# mapping (and anything written from it) has a stable order from run to run;
# a bare set would order the strings arbitrarily.
for key in inv_req:
    inv_req[key] = sorted(set(inv_req[key]))

In [None]:
# Size and contents of the mapping after de-duplication (the key count is
# unaffected by de-duplicating the values).
print(len(inv_req))
inv_req

In [None]:
# Persist the mapping through the project helper under the name 'epfl_prereq'.
# NOTE(review): destination path and file format are decided inside
# helpers.write_processed_dict — not visible here.
write_processed_dict('epfl_prereq',inv_req)

In [None]:
# Reload the course table and build an upper-cased copy of its index, so that
# codes can be matched case-insensitively and mapped back to the original label.
df = pd.read_pickle('../data/pickle/dep.pickle')
courseindex = df.index
print(courseindex[:20])
uppercourseindex = courseindex.map(str.upper)
print(uppercourseindex[:20])

In [None]:
def fix_name(code):
    """Normalise a course code to the form ``LETTERS-DIGITS...`` in upper case.

    The code is upper-cased; if it contains no decimal digit it is returned
    as-is (only upper-cased), because such strings do not denote a course.
    Otherwise all dashes and spaces are removed and a single dash is inserted
    right before the first digit, e.g. ``'com 480(a)'`` -> ``'COM-480(A)'``.

    Parameters
    ----------
    code : str
        Raw course code.

    Returns
    -------
    str
        The normalised code.
    """
    upper = code.upper()
    # Use the same digit definition for the guard and for the split: str.isdigit()
    # also accepts characters such as '²' that \d does not match, which would make
    # the later search return None.
    if re.search(r'\d', upper) is None:
        return upper
    compact = upper.replace("-", "").replace(" ", "")
    # Split at the first digit and rejoin with a single dash.
    split_at = re.search(r'\d', compact).start()
    return compact[:split_at] + '-' + compact[split_at:]

In [None]:
def dict_to_df(x):
    """Turn a ``{course_code: [target codes]}`` mapping into a two-column frame.

    Parameters
    ----------
    x : dict
        Maps a course code to an iterable of course codes.

    Returns
    -------
    pandas.DataFrame
        Columns ``course_code`` (normalised with ``fix_name``) and
        ``prerequisite_for`` (list of normalised codes, ``None`` entries
        dropped), one row per key of ``x``, with a default integer index.
    """
    # Build both columns directly from the mapping. Going through a wide,
    # ragged DataFrame would pad short rows with missing values, and whether
    # that padding is None or NaN is up to pandas.
    return pd.DataFrame({
        'course_code': [fix_name(code) for code in x],
        'prerequisite_for': [
            [fix_name(target) for target in targets if target is not None]
            for targets in x.values()
        ],
    })

In [None]:
def patch_name(code):
    """Map a course code onto the matching label of ``courseindex``, if any.

    Codes containing 'MASTER', 'MINOR', 'PROJECT' or 'LAB' are returned
    untouched. Otherwise the upper-cased code is looked up in
    ``uppercourseindex`` in this order: the code itself, the code without a
    trailing one- or two-character parenthesised modifier such as ``(A)``,
    and the code with ``(A)`` appended. The first hit is translated back to
    the original label through ``courseindex``.

    Parameters
    ----------
    code : str
        Course code to look up.

    Returns
    -------
    The matching ``courseindex`` label, or ``code`` unchanged when nothing matches.
    """
    if any(marker in code for marker in ('MASTER', 'MINOR', 'PROJECT', 'LAB')):
        return code
    res = code.upper()
    candidates = [res]
    # Only a modifier at the very end can be stripped; a missing or malformed
    # one simply yields no extra candidate.
    modifier = re.search(r'\([A-Za-z0-9]{1,2}\)$', res)
    if modifier is not None:
        candidates.append(res[:modifier.start()])
    # The index is upper-cased, so the appended modifier must be upper-case too.
    candidates.append(res + "(A)")
    for candidate in candidates:
        if candidate in uppercourseindex:
            # NOTE(review): get_loc returns a single position only when the
            # upper-cased label is unique — confirm there are no duplicates.
            return courseindex[uppercourseindex.get_loc(candidate)]
    return code

In [None]:
# Load the merged prerequisite mapping, normalise it into a frame indexed by
# course code, and fold rows that share a code into one sorted,
# duplicate-free list of targets.
with open('../data/merged_prerequisites.json') as file:
    merged = json.load(file)
merged = (
    dict_to_df(merged)
    .set_index('course_code')
    .sort_index()
    .groupby(['course_code'])['prerequisite_for']
    .apply(lambda lists: sorted({code for codes in lists for code in codes}))
)
merged.head()

In [None]:
# One {"source", "target"} record per (course, course it is a prerequisite for) pair.
links = [
    { "source": source, "target": target }
    for (source, targets) in merged.items()
    for target in targets
]

# Naming the columns explicitly keeps them present when `links` is empty,
# so the column look-ups below do not fail on an empty frame.
links_df = pd.DataFrame(links, columns=['source', 'target'])
# Align both endpoints with the labels actually used in the course index.
links_df['source'] = links_df['source'].apply(patch_name)
links_df['target'] = links_df['target'].apply(patch_name)
# Keep only links whose two endpoints are both known courses.
links_df = links_df[(links_df.source.isin(df.index) & links_df.target.isin(df.index))]
links_df.head()

In [None]:
# Total number of cells in the frame (rows x columns), not the number of rows.
# NOTE(review): if the goal is to count the surviving links, len(links_df) gives that — confirm.
links_df.size

In [None]:
# Write the filtered links to disk as JSON keyed by row label (orient='index').
# NOTE(review): to_json is documented to return None when a path is given, so
# `links_dict` is likely None — confirm whether the assignment is needed.
links_dict = links_df.to_json(path_or_buf='../data/links3004.json', orient='index')

In [None]:
#with open(f'../data/links.json', 'w') as json_file:
#        json.dump(links, json_file)

In [None]:
# Frame version of the extracted mapping; the size check and the merge further
# down use `inv_df`, so it has to be defined for the notebook to run top to bottom.
inv_df = dict_to_df(inv_req)
inv_df.head()

In [None]:
# Load the second prerequisite source (presumably written by hand, going by the
# file name) into the same frame layout; the size check and the merge further
# down use `by_hand`, so it has to be defined for the notebook to run top to bottom.
with open('../data/epfl_prereq_byhand.json') as file:
    by_hand = json.load(file)
by_hand = dict_to_df(by_hand)
by_hand.head()

In [None]:
# Compare how many course codes each of the two sources contributes.
# Needs `inv_df` and `by_hand` to exist in the kernel.
print(inv_df['course_code'].size)
print(by_hand['course_code'].size)

In [None]:
def merge(df1, df2):
    """Outer-join two prerequisite frames and union their target lists.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with a ``course_code`` column and a ``prerequisite_for``
        column holding lists of course codes.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``course_code`` with a single ``prerequisite_for`` column
        containing the sorted, duplicate-free union of both inputs' lists.
        A course present in only one input keeps that input's list.
    """
    def _as_list(value):
        # Anything that is not a collection of codes (NaN left by the outer
        # join, None, ...) counts as "no targets". Testing the type is more
        # reliable than an identity comparison against np.nan.
        return list(value) if isinstance(value, (list, tuple, set)) else []

    combined = df1.merge(df2, on='course_code', how='outer')
    # Only the two list columns are cleaned; `course_code` is left untouched.
    left = combined['prerequisite_for_x'].map(_as_list)
    right = combined['prerequisite_for_y'].map(_as_list)
    combined['prerequisite_for'] = (left + right).map(lambda codes: sorted(set(codes)))
    return combined[['course_code', 'prerequisite_for']].set_index('course_code')

In [None]:
# Combine the two prerequisite frames into one frame indexed by course code.
# Needs `inv_df` and `by_hand` to exist in the kernel.
merged = merge(inv_df, by_hand)
merged.head()

In [None]:
def test_method():
    """Check that ``fix_name`` normalises several spellings of one course code.

    Prints the first mismatch and returns False; returns True when every
    spelling is normalised to ``'COM-480(A)'``.
    """
    good_code = 'COM-480(A)'
    wrong_codes = ['com 480(a)', 'COM  480(a)', 'com480 (A)', '  com-480  (A)', 'COM-480(a)']
    for candidate in wrong_codes:
        normalised = fix_name(candidate)
        if normalised == good_code:
            continue
        # Report the first spelling that is not normalised correctly and stop.
        print("%s != %s"% (normalised, good_code))
        return False
    return True
test_method()

In [None]:
# Order by course code and fold rows that share a code into a single sorted,
# duplicate-free list of targets.
merged = (
    merged
    .sort_index()
    .groupby(['course_code'])['prerequisite_for']
    .apply(lambda lists: sorted({code for codes in lists for code in codes}))
)

In [None]:
# Write the merged prerequisites to disk as JSON keyed by course code (orient='index').
merged.to_json(path_or_buf='../data/merged_prerequisites2.json', orient='index')

In [None]:
# Load the processed prerequisite mapping; `merged` is rebound to the parsed JSON
# (presumably a dict, since the next cell calls .items() on it).
# NOTE(review): this path differs from the '../data/merged_prerequisites2.json'
# file written above — confirm which file is the intended input.
with open("../data/processed/merged_prerequisites.json") as file:
    merged = json.load(file)



In [None]:
# Build one {"source", "target"} record per (course, target) pair of the loaded
# mapping, then put the records in a frame.
links = []
for source, targets in merged.items():
    links.extend({ "source": source, "target": target } for target in targets)

links_df = pd.DataFrame.from_dict(links)

In [None]:
# Keep only links whose source AND target are both known course codes. The frame
# is exported as the final link list below, so links with an unknown endpoint
# have to be dropped, not selected.
both_known = links_df.source.isin(courseindex) & links_df.target.isin(courseindex)
links_df = links_df[both_known]

In [None]:
# Write the links to the processed folder. orient="table" stores a schema
# description next to the records rather than a bare list of records.
links_df.to_json(path_or_buf='../data/processed/links.json', orient="table")

In [None]:
# Read the 'master' and 'links' datasets back through the project helper.
# NOTE(review): what read_raw returns is not visible here; the next cell calls
# .values() on `links`, so a dict-like object is assumed — confirm.
master = read_raw('master', 'processed')
links = read_raw('links', 'processed')

In [None]:
# Store the values of `links` as a list under master["links"].
# NOTE(review): the links file is written with orient="table" earlier in this
# notebook; if read_raw returns the parsed JSON unchanged, .values() would yield
# the schema and data entries rather than the individual link records — confirm.
master["links"] = list(links.values())

In [None]:
# Persist the updated master structure through the project helper under the name "master".
write_processed_dict("master", master)