In [28]:
import pandas as pd
import numpy as np
import json, requests
import time
import glob
import re

In [7]:
# Helper functions

# Due to a bug in the csv file, proper_name returns "China's International Relations" for "China���s International Relations"
def proper_name(string):
    # Return -1 if string does not contain "???"
    def get_first_qn_mark_index():
        for i in range(3, len(string)):
            if string[i-3: i] == "???":
                return i-3
        return -1
    first_qn_mark_index = get_first_qn_mark_index()
    if first_qn_mark_index == -1:
        return string
    else:
        return string[: first_qn_mark_index] + "''" + string[i + 3:]

In [8]:
# Initialize df
df = pd.read_csv("myedurec_retrieved_160721.csv", encoding="iso-8859-1")

In [9]:
# Drop invalid rows and rename invalid titles, then reset df indexing
to_drop = []
for i in range(len(df)):
    # Keep track of invalid rows to drop
    if (df['PU Module 1 Title'].iloc[i] is np.nan and df['PU Module 2 Title'].iloc[i] is np.nan) or (df['NUS Module 1 Title'].iloc[i] is np.nan and df['NUS Module 2 Title'].iloc[i] is np.nan):
        to_drop.append(i)
        
    # Rename invalid titles
    if df['PU Module 1 Title'].iloc[i] is not np.nan:
        df['PU Module 1 Title'].iloc[i] = proper_name(df['PU Module 1 Title'].iloc[i])
    if df['PU Module 2 Title'].iloc[i] is not np.nan:
        df['PU Module 2 Title'].iloc[i] = proper_name(df['PU Module 2 Title'].iloc[i])
    if df['NUS Module 1 Title'].iloc[i] is not np.nan:
        df['NUS Module 1 Title'].iloc[i] = proper_name(df['NUS Module 1 Title'].iloc[i])
    if df['NUS Module 2 Title'].iloc[i] is not np.nan:
        df['NUS Module 2 Title'].iloc[i] = proper_name(df['NUS Module 2 Title'].iloc[i])
    
df.drop(to_drop, inplace=True)
df.index = range(len(df))
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Faculty,Partner University,PU Module 1,PU Module 1 Title,PU Mod1 Credits,PU Module 2,PU Module 2 Title,PU Mod2 Credits,NUS Module 1,NUS Module 1 Title,NUS Mod1 Credits,NUS Module 2,NUS Module 2 Title,NUS Mod2 Credits,Pre Approved?
0,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CBS241,Elementary Chinese II (for Non-Chinese speakin...,1.0,,,,LAC2731,Department Exchange Module,3.0,,,,Y
1,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CC2C08,Mutual Impressions of China and the West,3.0,,,,PS2238,Int'l Politics of NE Asia,4.0,,,,Y
2,Faculty of Arts & Social Sci,Hong Kong University of Science & Technology,LANG1120,Chinese for Non-Chinese Language Background St...,1.0,,,,LAC1731,Department exchange module,3.0,,,,Y
3,Faculty of Arts & Social Sci,City University of Hong Kong,AIS3126,International Political Economy,3.0,,,,PS3238,Int'l Political Economy,4.0,,,,Y
4,Faculty of Arts & Social Sci,City University of Hong Kong,GE2210,China: A Socio-Political Transformation,3.0,,,,PS2248,Chinese Politics,4.0,,,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12204,NUS,NOC Shenzhen,INO006,Design Integration and Innovation,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y
12205,NUS,NOC Shenzhen,SS085,Innovation in Cultural & Creative Industries,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y
12206,NUS,NOC Shenzhen,SS085,Innovation in Cultural & Creative Industries,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y
12207,NUS,NOC Shenzhen,INO006,Design Integration and Innovation,2.0,,,,TR3049,Top in Entrepreneurship (TIE),4.0,,,,Y


In [10]:
# url = "https://api.nusmods.com/v2/2020-2021/"
# moduleListJson = requests.get(url + "moduleList.json").json()

# start = time.time()
# for i, moduleJson in enumerate(moduleListJson):
#     if (i % 100 == 0):
#         print(str(i+1) + '/'+str(len(moduleListJson)))
#     moduleCode = moduleJson['moduleCode']
#     moduleDetails = requests.get(url + f"modules/{moduleCode}.json").json()
#     with open("module_details/"+f"{moduleCode}.json", "w") as f:
#         json.dump(moduleDetails, f)
    
# end = time.time()
# print("Time taken: " + str(round(end-start)) + " seconds")

In [19]:
jsons = []
for fname in sorted(glob.glob('module_details/*.json')):
    with open(fname,'r') as f:
        data = json.load(f)
        jsons.append(data)

In [22]:
attributes = set()
for data in jsons:
    for attribute in data:
        attributes.add(attribute)

In [35]:
preclusions_mapping = {}
for data in jsons:
    if 'preclusion' in data:
        preclusions_mapping[data['moduleCode']] = data['preclusion']
    else:
        preclusions_mapping[data['moduleCode']] = None

In [69]:
def preclusions_string_to_set(preclusion_string):
    if preclusion_string == None:
        return set()
    alpha_numerics = re.split('[^a-zA-Z0-9]', preclusion_string)
    
    def is_module_code(string):
        right_length = len(string) >= 4
        all_caps = string == string.upper()
        contains_digits = bool(re.search(r'\d', string))
        return right_length and all_caps and contains_digits
    
    return set(filter(is_module_code, alpha_numerics))

In [70]:
max_length, max_string = 0, ""
for preclusion in preclusions_mapping:
    for mod in preclusions_string_to_set(preclusions_mapping[preclusion]):
        if len(mod) > max_length:
            max_length = len(mod)
            max_string = mod
max_length, max_string

(9, 'CS1010XCP')

In [72]:
for preclusion in preclusions_mapping:
    for mod in preclusions_string_to_set(preclusions_mapping[preclusion]):
        if len(mod) >= 9:
            print(mod)

CS1010XCP
CS1010XCP
CS1010XCP
DMX1401AI
ESE1001FC
GEK1548FC
DMX1401AI


In [55]:
preclusions_string_to_set('CM2264, CM3262, CM3265, CM3266')

{'CM2264', 'CM3262', 'CM3265', 'CM3266'}

In [36]:
preclusions_mapping

{'AA1201': None,
 'AC5002': None,
 'AC5003': None,
 'AC5004': None,
 'AC5005': None,
 'AC5010': None,
 'ACC1002': 'Students who have passed FNA1002 are not allowed to take ACC1002.',
 'ACC1002X': 'Students who have passed CS1304 or EC3212 or BK1003 or BZ1002 or BH1002 or BZ1002E or BH1002E or FNA1002E or FNA1002X are not allowed to take ACC1002X.',
 'ACC1006': 'Students who have passed FNA1006 are not allowed to take ACC1006.',
 'ACC1701': 'ACC1002; ACC1002X; EC2204',
 'ACC1701X': 'ACC1002; ACC1002X',
 'ACC2002': 'BH2002 or BZ3102 or BK2001  or FNA2002 or IE4242',
 'ACC2706': 'ACC2002',
 'ACC2707': 'ACC3601',
 'ACC2708': 'ACC3601',
 'ACC2709': 'ACC1006',
 'ACC3603': 'Students who have passed FNA3121 are not allowed to take ACC3603.',
 'ACC3604': 'Students who have passed FNA3122 or LL4055 are not allowed to take ACC3604.',
 'ACC3605': 'Students who have passed FNA3127 or LL4056 are not allowed to take ACC3605.',
 'ACC3614': 'Students who have passed FNA3126 are not allowed to take ACC3

In [42]:
string_to_split = pd.DataFrame(module_detail_dict).iloc[-3]['prerequisite']
list(filter(lambda x: len(x) >= 4 and x == x.upper(), re.split('[^a-zA-Z0-9]', string_to_split)))

['CS1010S', 'LSM2253', 'LSM3241', 'CS2220']

In [33]:
pd.DataFrame(module_detail_dict).iloc[-3]

prereqTree             {'and': [{'or': ['CS1010S', 'LSM2253']}, {'or'...
fulfillRequirements                                                 None
corequisite                                                         None
aliases                                                             None
title                                  Advanced Topics in Bioinformatics
attributes                                               {'mpes1': True}
prerequisite           (CS1010S or equivalent or LSM2253) AND (LSM324...
moduleCode                                                        ZB4171
faculty                                                          Science
semesterData           [{'semester': 1, 'timetable': [{'classNo': '1'...
preclusion                                                      YSC4211C
acadYear                                                       2020/2021
description            This is a seminar-style module based on the li...
moduleCredit                                       