### README
This notebook is to be ran whenever there is an update to the module mappings on myedurec

Some manual work will need to be done to check if any new universities have been added, and we will have to update
the country/continent those universities are in, as well as cross-reference with existing universities to see if they
are the same entity

In [1]:
import pandas as pd
import numpy as np
import json, requests
import time
import glob
import re
import copy
import pycountry_convert as pc

In [2]:
# This fname is the csv file downloaded from myedurec
fname = "myedurec_retrieved_160721.csv"

In [3]:
# Initialize df
# This encoding is required because there are some non-ASCII characters
df = pd.read_csv(fname, encoding="iso-8859-1")

In [4]:
def proper_name(string):
    if type(string) != str:
        return string
    else:
        return string.replace('???',"'")

def clean_data(original_data):
    data = original_data
    not_nan = lambda x: x is not np.nan
    data = data[data['Partner University'].map(not_nan)]
    data = data[data['PU Module 1 Title'].map(not_nan) | data['PU Module 2 Title'].map(not_nan)]
    data = data[data['NUS Module 1 Title'].map(not_nan) | data['NUS Module 2 Title'].map(not_nan)]
    data = data[data['NUS Module 1'].map(not_nan) | data['NUS Module 2'].map(not_nan)]
    data['PU Module 1 Title'] = data['PU Module 1 Title'].map(proper_name)
    data['PU Module 2 Title'] = data['PU Module 2 Title'].map(proper_name)
    data['NUS Module 1 Title'] = data['NUS Module 1 Title'].map(proper_name)
    data['NUS Module 2 Title'] = data['NUS Module 2 Title'].map(proper_name)
    return data

In [5]:
cleaned_data = clean_data(df)
cleaned_data.to_csv("cleaned_mappings.csv",index=False)
cleaned_data.head()

Unnamed: 0,Faculty,Partner University,PU Module 1,PU Module 1 Title,PU Mod1 Credits,PU Module 2,PU Module 2 Title,PU Mod2 Credits,NUS Module 1,NUS Module 1 Title,NUS Mod1 Credits,NUS Module 2,NUS Module 2 Title,NUS Mod2 Credits,Pre Approved?
0,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CBS241,Elementary Chinese II (for Non-Chinese speakin...,1.0,,,,LAC2731,Department Exchange Module,3.0,,,,Y
1,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CC2C08,Mutual Impressions of China and the West,3.0,,,,PS2238,Int'l Politics of NE Asia,4.0,,,,Y
2,Faculty of Arts & Social Sci,Hong Kong University of Science & Technology,LANG1120,Chinese for Non-Chinese Language Background St...,1.0,,,,LAC1731,Department exchange module,3.0,,,,Y
3,Faculty of Arts & Social Sci,City University of Hong Kong,AIS3126,International Political Economy,3.0,,,,PS3238,Int'l Political Economy,4.0,,,,Y
4,Faculty of Arts & Social Sci,City University of Hong Kong,GE2210,China: A Socio-Political Transformation,3.0,,,,PS2248,Chinese Politics,4.0,,,,Y


### READ THIS BEFORE RUNNING THE FOLLOWING CHUNK OF CODE!
Rename the old 'school_country_continent_mapping.csv' to 'OLD.csv' before running!

This block of code has some work done manually since there is no easy way to check
which country each school is in. We will have to add the countries in manually if 
we miss out the first step!

In [6]:
# df1 = pd.read_csv('OLD.csv')

# countries = set(df1['Country'])
# full_continent_name = {
#     'AF': 'Africa',
#     'AS': 'Asia',
#     'EU': 'Europe',
#     'NA': 'North America',
#     'OC': 'Oceania',
#     'SA': 'South America'
# }

# for i in range(len(df1)):
#     if df1['Country'][i] != 'UNKNOWN':
#         c = pc.country_name_to_country_alpha2(df1['Country'][i], cn_name_format="default")
#         continent = pc.country_alpha2_to_continent_code(c)
#         df1['Continent'][i] = full_continent_name[continent]
#     else:
#         df1['Continent'][i] = 'Europe'

# df1.to_csv('school_country_continent_mapping.csv',index=False)
# del df1

### Note:
This list is created manually. I could not think of any fast/efficient way to determine if 2 schools
are the same entity

In [7]:
mappings = pd.read_csv("cleaned_mappings.csv")

modified_mappings = mappings.copy()

equivalent_schools_mapping = {
    'Aalto University': ['Aalto University, Helsinki'],
    'Aarhus School of Business': ['Aarhus University'],
    'Cornell University': ['Cornell Univ Coll of Agriculture & Life Sciences', 'Cornell Univ Coll of Human Ecology'],
    'Georgetown University': ['Georgetown University Law Center','Georgetown University, Washington D.C.'],
    'Humboldt University of Berlin': ['Humboldt-Universitaet zu Berlin'],
    'Imperial College London': ['Imperial College Business School'],
    'Leiden University': ['Leiden University Medical Center (LUMC)'],
    'University College London': ['University College London, University of London'],
    'Universite Catholique De Louvain': ['Universite Catholique de Louvain'],
}

for school in equivalent_schools_mapping:
    for equiv in equivalent_schools_mapping[school]:
        tmp_df = modified_mappings[modified_mappings['Partner University'] == equiv]
        modified_mappings['Partner University'].iloc[tmp_df.index] = [school] * len(tmp_df)

UCs_df = modified_mappings[modified_mappings['Partner University'] == 'University of California']
for school in modified_mappings['Partner University'].unique():
    if 'University of California' in school and school != 'University of California':
        to_append = UCs_df.copy()
        to_append['Partner University'] = [school] * len(to_append)
        modified_mappings = modified_mappings.append(to_append)
modified_mappings.index = range(len(modified_mappings))

school_country_continent_mapping = pd.read_csv('school_country_continent_mapping.csv')
country, continent = [], []
for i in range(len(modified_mappings)):
    school = modified_mappings['Partner University'][i]
    df = school_country_continent_mapping[school_country_continent_mapping['Partner University'] == school]
    try:
        country.append(df['Country'].iloc[0])
        continent.append(df['Continent'].iloc[0])
    except:
        print(school)
modified_mappings['Country'] = country
modified_mappings['Continent'] = continent

modified_mappings.to_csv('cleaned_mappings_with_locations.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


This following cell is to download the data from the NUSMods api

Only run this to update current data

In [8]:
# url = "https://api.nusmods.com/v2/2020-2021/"
# moduleListJson = requests.get(url + "moduleList.json").json()

# start = time.time()
# for i, moduleJson in enumerate(moduleListJson):
#     if (i % 100 == 0):
#         print(str(i+1) + '/'+str(len(moduleListJson)))
#     moduleCode = moduleJson['moduleCode']
#     moduleDetails = requests.get(url + f"modules/{moduleCode}.json").json()
#     with open("module_details/"+f"{moduleCode}.json", "w") as f:
#         json.dump(moduleDetails, f)
    
# end = time.time()
# print("Time taken: " + str(round(end-start)) + " seconds")

In [9]:
def load_data():
    arr = []
    for fname in sorted(glob.glob('module_details/*.json')):
        with open(fname,'r') as f:
            data = json.load(f)
            arr.append(data)
    return arr
jsons = load_data()

In [10]:
def get_attributes(jsons):
    attrs = set()
    for data in jsons:
        for attribute in data:
            attrs.add(attribute)
    return attrs
attributes = get_attributes(jsons)

In [11]:
def join_preclusions(jsons):
    preclusions_mapping = {}
    for data in jsons:
        if 'preclusion' in data:
            preclusions_mapping[data['moduleCode']] = data['preclusion']
        else:
            preclusions_mapping[data['moduleCode']] = ''
    def preclusions_string_to_set(preclusion_string):
        # Get all alphaNumeric substrings
        alpha_numerics = re.split('[^a-zA-Z0-9]', preclusion_string)

        def is_module_code(string):
            right_length = len(string) >= 4
            all_caps = string == string.upper()
            contains_digits = bool(re.search(r'\d', string))
            return right_length and all_caps and contains_digits

        return set(filter(is_module_code, alpha_numerics))
    for preclusion in preclusions_mapping:
        preclusions_mapping[preclusion] = preclusions_string_to_set(preclusions_mapping[preclusion])
    to_delete = []
    for preclusion in preclusions_mapping:
            preclusions_mapping[preclusion] = list(preclusions_mapping[preclusion])
    return preclusions_mapping
preclusions_mapping = join_preclusions(jsons)

In [12]:
with open('preclusionMappings.json', 'w') as f:
    json.dump(preclusions_mapping, f)

In [13]:
def get_equivalent_modules(jsons):
    equivalent_modules = join_preclusions(jsons)
    
    module_titles = {}
    for json in jsons:
        title = json['title'].upper()
        if title not in module_titles:
            module_titles[title] = []
        module_titles[title].append(json['moduleCode'])

    equivalent_modules = copy.deepcopy(preclusions_mapping)
    for module_title in module_titles:
        equivalents = module_titles[module_title]
        for module_code in equivalents:
            for equiv in equivalents:
                if equiv not in equivalent_modules[module_code]:
                    equivalent_modules[module_code].append(equiv)
    return equivalent_modules

In [14]:
equivalent_modules = get_equivalent_modules(jsons)
with open('equivalentModuleMappings.json', 'w') as f:
    json.dump(equivalent_modules, f)

In [16]:
modules = []
for j in jsons:
    modules.append(f"{j['moduleCode']} {j['title']}")
modules.sort()

with open('../src/data/modules.json', 'w') as f:
    json.dump(modules,f)

countries = sorted(modified_mappings['Country'].unique())
with open('../src/data/countries.json', 'w') as f:
    json.dump(countries, f)
continents = sorted(modified_mappings['Continent'].unique())
with open('../src/data/continents.json', 'w') as f:
    json.dump(continents, f)