In [1]:
import re
import pandas as pd
from utils import read, write

## Links

In [2]:
# Labelled requirement links; later cells read its `source`, `target` and `id` columns.
links = pd.DataFrame(data=read.read_json('req-links', 'labelled'))

# Processed course listing; later cells read its `slug` column.
courses = pd.DataFrame(data=read.read_json_processed('courses'))

# Processed course texts, transposed after loading.
# NOTE(review): presumably the JSON is keyed by course slug, so each row is one course after transposing - confirm.
courses_text = pd.DataFrame(data=read.read_json_processed('courses-text')).transpose()

In [3]:
# Every course referenced by a labelled link, whether as source or as target.
links_concat = pd.concat([links['source'], links['target']])

# Keep the endpoints whose slug is absent from the course listing.
is_listed = links_concat.isin(courses['slug'])
unlisted_slugs = links_concat[~is_listed].unique()

print('Courses in labelled links data that are currently not posted on edu.epfl.ch:\n')
print(unlisted_slugs)

Courses in labelled links data that are currently not posted on edu.epfl.ch:

['ar-125' 'ar-126' 'ar-401-b' 'bio-376' 'bioeng-437' 'ch-310' 'ch-314'
 'civil-223' 'cs-211' 'cs-321' 'ee-100-b' 'ee-102-b' 'ee-432' 'env-241'
 'hum-442' 'hum-498-a' 'hum-498-b' 'hum-499-a' 'math-120' 'math-125'
 'math-260' 'math-428' 'math-460' 'me-325' 'mse-202' 'ar-526' 'ar-402-b'
 'ar-401-c' 'ar-402-c' 'bioeng-433' 'ee-433' 'ee-734' 'hum-449'
 'hum-498-c' 'hum-499-b' 'math-461' 'math-464' 'math-475' 'math-610'
 'math-465' 'eng-467' 'mgt-416' 'env-366' 'math-402' 'math-409' 'math-653'
 'math-637' 'me-432' 'me-473' 'micro-450' 'ee-461' 'bio-603-ms' 'ee-516'
 'ch-243' 'phys-328' 'phys-811' 'phys-738']


In [4]:
# Columns of courses_text that are scanned for course references further down.
cols_dependency_info = [
    'requiredCourses',
    'preparationFor',
    'recommendedCourses',
]

# Select only those columns; the drop below is disabled, so they also remain in courses_text.
dep_df = courses_text.loc[:, cols_dependency_info]
# courses_text.drop(cols_dependency_info, axis=1, inplace=True)

In [5]:
# Matches a course code embedded in free text: 2-6 letters, optional hyphens/whitespace,
# three digits, an optional alphanumeric suffix and an optional parenthesised alphanumeric
# tag, e.g. "CS-101", "MATH 101a", "HUM-498 (a)". The single capture group spans the whole code.
# Written as a raw string: "\s", "\d" and "\(" are invalid escape sequences in a normal
# string literal and emit a SyntaxWarning on Python 3.12+.
regex_course_code = re.compile(r"([A-Za-z]{2,6}[-\s]*\d{3}(?:[A-Za-z0-9]+)?(?:\s*\([A-Za-z0-9]+\))?)", flags=re.IGNORECASE)

In [6]:
# Helper patterns for construct_course_slug. Raw strings are used because "\(", "\s" and
# "\g" are invalid escape sequences in normal string literals (SyntaxWarning on Python 3.12+).
regex_code_parens = re.compile(r'\((.+)\)$')
regex_spaces = re.compile(r'\s+')
regex_letter_followedby_number = re.compile(r'([a-z])([0-9])', re.IGNORECASE)
regex_number_followedby_letter = re.compile(r'([0-9])([a-z])', re.IGNORECASE)
regex_hyphen_runs = re.compile(r'-{2,}')


def construct_course_slug(course_code):
    """Turn a course code as written in free text into a lowercase, hyphen-separated slug.

    Parameters
    ----------
    course_code : str
        A course code such as "CS-101", "MATH 101", "cs101", "HUM-498a" or "HUM-498 (a)".

    Returns
    -------
    str
        The slug, e.g. "cs-101", "math-101", "hum-498-a".
    """
    # A trailing "(x)" becomes a "-x" suffix.
    slug = regex_code_parens.sub(r'-\g<1>', course_code)
    # Whitespace runs become a single hyphen.
    slug = regex_spaces.sub('-', slug)
    # Insert a hyphen at every letter/digit boundary ("cs101a" -> "cs-101-a").
    slug = regex_letter_followedby_number.sub(r'\g<1>-\g<2>', slug)
    slug = regex_number_followedby_letter.sub(r'\g<1>-\g<2>', slug)
    # The steps above can stack separators ("HUM-498 (a)" -> "HUM-498--a",
    # "CS - 101" -> "CS---101"); collapse them and drop stray edge hyphens.
    slug = regex_hyphen_runs.sub('-', slug)
    return slug.strip('-').lower()

# Column labels of dep_df; iterated below when extracting course references per column.
cols = dep_df.columns

In [7]:
def find_course_refs(text):
    """Extract the courses referenced in a free-text dependency field.

    Parameters
    ----------
    text : str
        Raw field text. Anything that is not a string (e.g. a missing value)
        is treated as containing no references.

    Returns
    -------
    list of str
        Course slugs, as produced by ``construct_course_slug``, in order of
        first appearance and without duplicates.
    """
    # The previous non-str branch referenced an undefined name and then fell through
    # to findall(), which cannot search a non-string; return "no references" instead.
    if not isinstance(text, str) or text == '':
        return []
    matches = regex_course_code.findall(text)
    # Normalise raw codes ("CS-101", "math 101", ...) to slug form: later cells compare
    # these values against course slugs and link ids, which raw codes would not match.
    found = [construct_course_slug(m) for m in matches]
    # dict.fromkeys keeps first-seen order while dropping repeated mentions.
    return list(dict.fromkeys(found))
    
# One Series of extracted course references per dependency column, in column order.
dep_df_match = []
for col in cols:
    refs_in_column = dep_df[col].apply(find_course_refs)
    dep_df_match.append(refs_in_column)

In [8]:
# Stack the collected Series as rows, then transpose so each one becomes a column.
dep_df_match = pd.DataFrame(dep_df_match).transpose()

In [9]:
ids = links.id.values
slugs = courses.slug.values

col = 'requiredCourses'

# Set of already labelled link ids: membership tests on the NumPy array in `ids`
# scan it linearly for every candidate.
labelled_ids = set(ids)

# NOTE(review): ids are built here as 'source-->target', while the cell that appends
# new links builds them as 'source--target'. Confirm which separator the labelled data uses.

# Only keep unlabelled sources.
# A fresh Series is built instead of assigning lists through .loc on the column being
# iterated (dep_df_match itself is left untouched). Series.iteritems() was removed in
# pandas 2.0, hence .items().
ser = pd.Series(
    [
        [source for source in sources if f'{source}-->{target}' not in labelled_ids]
        for target, sources in dep_df_match[col].items()
    ],
    index=dep_df_match[col].index,
    name=col,
    dtype=object,
)

In [10]:
# Keep only the targets that still have at least one referenced source.
has_sources = ser.apply(len) > 0
candidates = ser.loc[has_sources]

In [11]:
def filter_candidates(v):
    """Return the entries of ``v`` that occur in ``slugs``, preserving their order."""
    kept = []
    for candidate in v:
        if candidate in slugs:
            kept.append(candidate)
    return kept


# Restrict every candidate list to known course slugs.
candidates = candidates.apply(filter_candidates)

In [12]:
# Collect the new rows first and concatenate once. Series.iteritems() and
# DataFrame.append() were removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
# NOTE(review): ids are built here as 'source--target', while the earlier "already
# labelled" check builds 'source-->target'. Confirm which separator is canonical.
existing_ids = set(links['id'])
new_links = []
for k, v in candidates.items():
    for slug in v:
        link_id = f'{slug}--{k}'
        # Skip self-references, and ids that already exist so that re-running
        # this cell does not add duplicate rows.
        if slug == k or link_id in existing_ids:
            continue
        existing_ids.add(link_id)
        new_links.append({
            'source': slug,
            'target': k,
            'id': link_id,
        })

if new_links:
    links = pd.concat([links, pd.DataFrame(new_links)], ignore_index=True)

In [76]:
# Write the extended links table to disk as a JSON array of row records.
links.to_json(
    path_or_buf='./data/labelled/req-links.json',
    orient='records',
)