In [1]:
import pandas as pd
import numpy as np
import json, requests

In [2]:
mappings = pd.read_csv("cleaned_mappings.csv")
mappings.head()

Unnamed: 0,Faculty,Partner University,PU Module 1,PU Module 1 Title,PU Mod1 Credits,PU Module 2,PU Module 2 Title,PU Mod2 Credits,NUS Module 1,NUS Module 1 Title,NUS Mod1 Credits,NUS Module 2,NUS Module 2 Title,NUS Mod2 Credits,Pre Approved?
0,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CBS241,Elementary Chinese II (for Non-Chinese speakin...,1.0,,,,LAC2731,Department Exchange Module,3.0,,,,Y
1,Faculty of Arts & Social Sci,The Hong Kong Polytechnic University,CC2C08,Mutual Impressions of China and the West,3.0,,,,PS2238,Int'l Politics of NE Asia,4.0,,,,Y
2,Faculty of Arts & Social Sci,Hong Kong University of Science & Technology,LANG1120,Chinese for Non-Chinese Language Background St...,1.0,,,,LAC1731,Department exchange module,3.0,,,,Y
3,Faculty of Arts & Social Sci,City University of Hong Kong,AIS3126,International Political Economy,3.0,,,,PS3238,Int'l Political Economy,4.0,,,,Y
4,Faculty of Arts & Social Sci,City University of Hong Kong,GE2210,China: A Socio-Political Transformation,3.0,,,,PS2248,Chinese Politics,4.0,,,,Y


In [3]:
with open('equivalentModuleMappings.json','r') as f:
    equivalent_module_mappings = json.load(f)

In [4]:
def get_equivalent_modules(modules, equivalent_module_mappings=equivalent_module_mappings):
    output = set()
    for module in modules:
        output.update(set(equivalent_module_mappings[module]))
    return output

### Note:
This list is created manually. I could not think of any fast/efficient way to determine if 2 schools
are the same entity

In [5]:
# equivalent_schools_mapping = {
#     'Aalto University': ['Aalto University, Helsinki'],
#     'Aarhus School of Business': ['Aarhus University'],
#     'Cornell University': ['Cornell Univ Coll of Agriculture & Life Sciences', 'Cornell Univ Coll of Human Ecology'],
#     'Georgetown University': ['Georgetown University Law Center','Georgetown University, Washington D.C.'],
#     'Humboldt University of Berlin': ['Humboldt-Universitaet zu Berlin'],
#     'Imperial College London': ['Imperial College Business School'],
#     'Leiden University': ['Leiden University Medical Center (LUMC)'],
#     'University College London': ['University College London, University of London'],
#     'Universite Catholique De Louvain': ['Universite Catholique de Louvain'],
# }

# schools = list(equivalent_schools_mapping.keys())
# for school in schools:
#     for equiv in equivalent_schools_mapping[school]:
#         equivalent_schools_mapping[equiv] = equivalent_schools_mapping[school]
#     equivalent_schools_mapping[school].append(school)

# def get_equivalent_schools(schools, equivalent_schools_mapping=equivalent_schools_mapping):
#     output = set()
#     for school in schools:
#         if school in equivalent_schools_mapping:
#             output.update(set(equivalent_schools_mapping[school]))
#         else:
#             output.update(set([school]))
#     return output

# UCs = {'University of California, Berkeley': ['University of California'],
#     'University of California, Davis': ['University of California'],
#     'University of California, Irvine': ['University of California'],
#     'University of California, Los Angeles': ['University of California'],
#     'University of California, Merced': ['University of California'],
#     'University of California, Riverside': ['University of California'],
#     'University of California, San Diego': ['University of California'],
#     'University of California, Santa Barbara': ['University of California'],
#     'University of California, Santa Cruz': ['University of California']
# }
# for UC in UCs:
#     UCs[UC].append(UC)
# for UC in UCs:
#     equivalent_schools_mapping[UC] = UCs[UC]

In [6]:
modified_mappings = mappings.copy()

equivalent_schools_mapping = {
    'Aalto University': ['Aalto University, Helsinki'],
    'Aarhus School of Business': ['Aarhus University'],
    'Cornell University': ['Cornell Univ Coll of Agriculture & Life Sciences', 'Cornell Univ Coll of Human Ecology'],
    'Georgetown University': ['Georgetown University Law Center','Georgetown University, Washington D.C.'],
    'Humboldt University of Berlin': ['Humboldt-Universitaet zu Berlin'],
    'Imperial College London': ['Imperial College Business School'],
    'Leiden University': ['Leiden University Medical Center (LUMC)'],
    'University College London': ['University College London, University of London'],
    'Universite Catholique De Louvain': ['Universite Catholique de Louvain'],
}

for school in equivalent_schools_mapping:
    for equiv in equivalent_schools_mapping[school]:
        tmp_df = modified_mappings[modified_mappings['Partner University'] == equiv]
        modified_mappings['Partner University'].iloc[tmp_df.index] = [school] * len(tmp_df)

UCs_df = modified_mappings[modified_mappings['Partner University'] == 'University of California']
for school in modified_mappings['Partner University'].unique():
    if 'University of California' in school and school != 'University of California':
        to_append = UCs_df.copy()
        to_append['Partner University'] = [school] * len(to_append)
        modified_mappings = modified_mappings.append(to_append)
modified_mappings.index = range(len(modified_mappings))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [28]:
def get_department(title):
    end_index = 0
    for i, char in enumerate(title):
        if not char.isalpha():
            end_index = i
            break
    return title[:end_index]
    
def department_filter(departments, mappings=modified_mappings):
    if len(departments) == 0:
        return mappings

    filter_function = lambda title : (type(title) == str) and (get_department(title) in departments)
    return mappings[mappings['NUS Module 1'].map(filter_function) | mappings['NUS Module 2'].map(filter_function)]


def module_filter(modules, mappings=modified_mappings):
    if len(modules) == 0:
        return mappings
    
    equivalent_modules = get_equivalent_modules(modules)
    filter_function = lambda title: (title is not np.nan) and ((title in equivalent_modules) or (title[:-1] in equivalent_modules))
    return mappings[mappings['NUS Module 1'].map(filter_function) | mappings['NUS Module 2'].map(filter_function)]

def school_filter(schools, mappings=modified_mappings):
    if len(schools) == 0:
        return mappings
    
    filter_function = lambda school: school in schools
    return mappings[mappings['Partner University'].map(filter_function)]

def essential_module_filter(modules, mappings=modified_mappings):
    schools = set(mappings['Partner University'])
    for module in modules:
        schools_with_mod = set(module_filter([module],mappings=mappings)['Partner University'])
        schools.intersection_update(schools_with_mod)
    output = module_filter(modules,school_filter(schools,mappings=mappings))
    output.sort_values('Partner University',inplace=True)
    return output

def optional_module_filter(modules, mappings=modified_mappings):
    schools = set()
    for module in modules:
        schools_with_mod = set(module_filter([module],mappings=mappings)['Partner University'])
        schools.update(schools_with_mod)
    output = module_filter(modules,school_filter(schools,mappings=mappings))
    output.sort_values('Partner University',inplace=True)
    return output

In [140]:
def algorithm(essential_modules,optional_modules=[],schools=[],mappings=modified_mappings):
    """
    Returns a dictionary with key-value pairs being universities and the mappings of NUS modules to
    partner university modules. 
    
    output = {
        uni1: {NUS_mod1: [NUS_mod1_title, [[PU_mod1, PU_mod1_title], [PU_mod1, PU_mod1_title], ...]]},
    }
    """
    if schools != []:
        restricted_by_school = school_filter(schools,mappings)
    else:
        restricted_by_school = mappings
    
    restricted_by_essential_modules = essential_module_filter(essential_modules, restricted_by_school)
    schools_with_essential_modules = restricted_by_essential_modules['Partner University'].unique()
    truth_series = [school in schools_with_essential_modules for school in mappings['Partner University']]
    valid_schools_with_optional_modules = mappings[truth_series]
    
    output = restricted_by_essential_modules
    if optional_modules != []:
        tmp_df = optional_module_filter(optional_modules, valid_schools_with_optional_modules)
        output = output.append(tmp_df)
    output.sort_values('Partner University',inplace=True)
    
    dict_output = {}
    for university in output['Partner University'].unique():
        tmp_df = output[output['Partner University'] == university]
        NUS_modules = set(tmp_df['NUS Module 1'].dropna()).union(tmp_df['NUS Module 2'].dropna())
        university_mappings = {}
        for mod in NUS_modules:
            module_mapped = tmp_df[(tmp_df['NUS Module 1'] == mod) | (tmp_df['NUS Module 2'] == mod)]
            if module_mapped['NUS Module 1 Title'].iloc[0] is not np.nan:
                module_name = module_mapped['NUS Module 1 Title'].iloc[0]
            else:
                module_name = module_mapped['NUS Module 2 Title'].iloc[0]
            
            equivalent_PU_modules = []
            for i in range(len(module_mapped)):
                row = module_mapped.iloc[i]
                if row['PU Module 1'] is not np.nan:
                    equivalent_PU_modules.append([row['PU Module 1'], row['PU Module 1 Title']])
                else:
                    equivalent_PU_modules.append([row['PU Module 2'], row['PU Module 2 Title']])
            university_mappings[mod] = [module_name, equivalent_PU_modules]
        dict_output[university] = university_mappings
    
    return dict_output

In [151]:
essential_modules = ['CS2030','CS2040','ST2131']
optional_modules = ['MA2104','MA2101','MA2108']

algorithm(essential_modules, optional_modules)

{'Korea University': {'CS2040C': ['Data Structures and Algorithms',
   [['COSE 213', 'Data Structures']]],
  'CS2030': ['Programming Methodology II',
   [['KECE443', 'Object-Oriented Programming and Laboratory']]],
  'ST2334': ['Probability and Statistics', [['IWC207', 'Statistics']]]},
 'University of California, Los Angeles': {'CS2040C': ['Data Structures and Algorithms',
   [['ECS036C', 'Data Structures, Algorithms, & Programming']]],
  'CS2030': ['Programming Methodology II',
   [['COM SCI 32', 'Introduction to Computer Science II']]],
  'ST2334': ['Probability and Statistics',
   [['STA131A', 'Probability Theory'],
    ['AMS5', 'Statistics'],
    ['MATH170A', 'Probability Theory']]],
  'CS2040': ['Data Structures and Algorithms',
   [['CS180', 'Introduction to Algorithms and Complexity'],
    ['ECS032B', 'Introduction to Data Structures']]],
  'MA2108': ['Mathematical Analysis I', [['MATH131A', 'Analysis']]],
  'MA2104': ['Multivariable Calculus',
   [['MAT021D', 'Vector Analysis'