In [1]:
import pandas as pd
import numpy as np
import json, requests
import time
import glob
import re
import copy

In [2]:
# Helper functions

# Due to a bug in the csv file, proper_name returns "China's International Relations" for "China���s International Relations"
def proper_name(string):
    # Return -1 if string does not contain "???"
    def get_first_qn_mark_index():
        for i in range(3, len(string)):
            if string[i-3: i] == "???":
                return i-3
        return -1
    first_qn_mark_index = get_first_qn_mark_index()
    if first_qn_mark_index == -1:
        return string
    else:
        return string[: first_qn_mark_index] + "'" + string[i + 3:]

In [3]:
# Initialize df
df = pd.read_csv("myedurec_retrieved_160721.csv", encoding="iso-8859-1")

In [4]:
# Drop invalid rows and rename invalid titles, then reset df indexing
to_drop = []
for i in range(len(df)):
    # Keep track of invalid rows to drop
    if (df['PU Module 1 Title'].iloc[i] is np.nan and df['PU Module 2 Title'].iloc[i] is np.nan) or (df['NUS Module 1 Title'].iloc[i] is np.nan and df['NUS Module 2 Title'].iloc[i] is np.nan):
        to_drop.append(i)
        
    # Rename invalid titles
    if df['PU Module 1 Title'].iloc[i] is not np.nan:
        df['PU Module 1 Title'].iloc[i] = proper_name(df['PU Module 1 Title'].iloc[i])
    if df['PU Module 2 Title'].iloc[i] is not np.nan:
        df['PU Module 2 Title'].iloc[i] = proper_name(df['PU Module 2 Title'].iloc[i])
    if df['NUS Module 1 Title'].iloc[i] is not np.nan:
        df['NUS Module 1 Title'].iloc[i] = proper_name(df['NUS Module 1 Title'].iloc[i])
    if df['NUS Module 2 Title'].iloc[i] is not np.nan:
        df['NUS Module 2 Title'].iloc[i] = proper_name(df['NUS Module 2 Title'].iloc[i])
    
df.drop(to_drop, inplace=True)
df.index = range(len(df))
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Faculty,Partner University,PU Module 1,PU Module 1 Title,PU Mod1 Credits,PU Module 2,PU Module 2 Title,PU Mod2 Credits,NUS Module 1,NUS Module 1 Title,NUS Mod1 Credits,NUS Module 2,NUS Module 2 Title,NUS Mod2 Credits,Pre Approved?
0,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CBS241,Elementary Chinese II (for Non-Chinese speakin...,1.0,,,,LAC2731,Department Exchange Module,3.0,,,,Y
1,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CC2C08,Mutual Impressions of China and the West,3.0,,,,PS2238,Int'l Politics of NE Asia,4.0,,,,Y
2,Faculty of Arts & Social Sci,Hong Kong University of Science & Technology,LANG1120,Chinese for Non-Chinese Language Background St...,1.0,,,,LAC1731,Department exchange module,3.0,,,,Y
3,Faculty of Arts & Social Sci,City University of Hong Kong,AIS3126,International Political Economy,3.0,,,,PS3238,Int'l Political Economy,4.0,,,,Y
4,Faculty of Arts & Social Sci,City University of Hong Kong,GE2210,China: A Socio-Political Transformation,3.0,,,,PS2248,Chinese Politics,4.0,,,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12204,NUS,NOC Shenzhen,INO006,Design Integration and Innovation,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y
12205,NUS,NOC Shenzhen,SS085,Innovation in Cultural & Creative Industries,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y
12206,NUS,NOC Shenzhen,SS085,Innovation in Cultural & Creative Industries,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y
12207,NUS,NOC Shenzhen,INO006,Design Integration and Innovation,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y


In [5]:
# url = "https://api.nusmods.com/v2/2020-2021/"
# moduleListJson = requests.get(url + "moduleList.json").json()

# start = time.time()
# for i, moduleJson in enumerate(moduleListJson):
#     if (i % 100 == 0):
#         print(str(i+1) + '/'+str(len(moduleListJson)))
#     moduleCode = moduleJson['moduleCode']
#     moduleDetails = requests.get(url + f"modules/{moduleCode}.json").json()
#     with open("module_details/"+f"{moduleCode}.json", "w") as f:
#         json.dump(moduleDetails, f)
    
# end = time.time()
# print("Time taken: " + str(round(end-start)) + " seconds")

In [6]:
def load_data():
    arr = []
    for fname in sorted(glob.glob('module_details/*.json')):
        with open(fname,'r') as f:
            data = json.load(f)
            arr.append(data)
    return arr
jsons = load_data()

In [7]:
def get_attributes(jsons):
    attrs = set()
    for data in jsons:
        for attribute in data:
            attrs.add(attribute)
    return attrs
attributes = get_attributes(jsons)

In [8]:
def clean_preclusions(jsons):
    preclusions_mapping = {}
    for data in jsons:
        if 'preclusion' in data:
            preclusions_mapping[data['moduleCode']] = data['preclusion']
        else:
            preclusions_mapping[data['moduleCode']] = ''
    def preclusions_string_to_set(preclusion_string):
        # Get all alphaNumeric substrings
        alpha_numerics = re.split('[^a-zA-Z0-9]', preclusion_string)

        def is_module_code(string):
            right_length = len(string) >= 4
            all_caps = string == string.upper()
            contains_digits = bool(re.search(r'\d', string))
            return right_length and all_caps and contains_digits

        return set(filter(is_module_code, alpha_numerics))
    for preclusion in preclusions_mapping:
        preclusions_mapping[preclusion] = preclusions_string_to_set(preclusions_mapping[preclusion])
    to_delete = []
    for preclusion in preclusions_mapping:
            preclusions_mapping[preclusion] = list(preclusions_mapping[preclusion])
    return preclusions_mapping
preclusions_mapping = clean_preclusions(jsons)

In [9]:
with open('preclusionMappings.json', 'w') as f:
    json.dump(preclusions_mapping, f)

In [14]:
def merge_preclusions(preclusions_mapping):
    merged_preclusions_mapping = copy.deepcopy(preclusions_mapping)

    iter_changed = True
    while iter_changed:
        iter_changed = False
        for mod in merged_preclusions_mapping:
            new_set = set(merged_preclusions_mapping[mod])
            for precluded in merged_preclusions_mapping[mod]:
                if precluded in merged_preclusions_mapping:
                    prev_length = len(new_set)
                    new_set = new_set.union(merged_preclusions_mapping[precluded])
                    if len(new_set) != prev_length:
                        iter_changed = True
            merged_preclusions_mapping[mod] = list(new_set)
    return merged_preclusions_mapping
merged_preclusions_mapping = merge_preclusions(preclusions_mapping)

In [17]:
def merge_preclusions_1_deg(preclusions_mapping):
    merged_preclusions_mapping = copy.deepcopy(preclusions_mapping)
    for mod in merged_preclusions_mapping:
        new_set = set(merged_preclusions_mapping[mod])
        for precluded in merged_preclusions_mapping[mod]:
            if precluded in merged_preclusions_mapping:
                new_set = new_set.union(merged_preclusions_mapping[precluded])
        merged_preclusions_mapping[mod] = list(new_set)
    return merged_preclusions_mapping
merged_preclusions_mapping_1_deg = merge_preclusions_1_deg(preclusions_mapping)

In [15]:
with open('mergedPreclusionMappingsComplete.json', 'w') as f:
    json.dump(merged_preclusions_mapping_complete, f)

In [13]:
merged_preclusions_mapping['PC1141']

['PC1431X', 'PC1433', 'PC1141', 'PC1431FC', 'PC1142', 'PC1431']

In [20]:
merged_preclusions_mapping_1_deg['PC1141']

['PC1433', 'PC1141', 'PC1142', 'PC1431', 'PC1431FC', 'PC1431X']