### 1. Combine 15 parts into a large file

In [1]:
import os
import pandas as pd

# Base table plus three correction tables; every file is expected to carry a
# 'File' column, which is used as the alignment key below.
df_a = pd.read_csv('final_syllabus_summary2024.csv')
df_b = pd.read_csv('mismatches2024.csv')
df_c = pd.read_csv('dup2024.csv')
df_d = pd.read_csv('unk2024.csv')

# Index all four frames by 'File' so DataFrame.update can align rows on it.
for _frame in (df_a, df_b, df_c, df_d):
    _frame.set_index('File', inplace=True)

# Apply the corrections in order (mismatches, then dup, then unk). update()
# modifies df_a in place, overwriting cells with the non-NA values of the
# correction frame where index and column labels match; later frames win.
for _patch in (df_b, df_c, df_d):
    df_a.update(_patch)

# Turn 'File' back into an ordinary column.
df_a.reset_index(inplace=True)

In [2]:
# Department name -> course-code prefix(es). A department with a single
# prefix stores a bare string; one with several prefixes stores a list.
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': ['AMST', 'ETHN', 'NAIS', 'PHUM', 'STS'],
    'Anthropology': 'ANTH',
    'Applied Mathematics': 'APMA',
    'Archaeology and the Ancient World': 'ARCH',
    'Bio-Medical (PLME & MED)': 'MED',
    'Biology': 'BIOL',
    'Brown Arts Institute': 'ARTS',
    'Business, Entrepreneurship, Organizations': 'BEO',
    'Center for Language Studies': ['ARAB', 'EINT', 'HNDI', 'LANG', 'PRSN', 'SIGN', 'TKSH', 'YORU'],
    'Chemistry': 'CHEM',
    'Classics': ['CLAS', 'CREK', 'LATN', 'MGRK', 'SANS'],
    'Cognitive, Linguistic, and Psychological Sciences': ['CLPS', 'LING'],
    'Cognitive and Psychological Sciences': 'CPSY',
    'Cogut Institute for the Humanities': 'HMAN',
    'Comparative Literature': 'COLT',
    'Computer Science': 'CSCI',
    'Data Science Initiative': ['DATA', 'DSIO'],
    'Early Modern World': 'EMOW',
    'Earth, Environmental and Planetary Sciences': 'EEPS',
    'East Asian Studies': ['CHIN', 'EAST', 'JAPN', 'KREA', 'VIET'],
    'Economics': 'ECON',
    'Education': 'EDUC',
    'Egyptology and Assyriology': ['ASYR', 'EGYT'],
    'Engineering': 'ENGN',
    'English': 'ENGL',
    'Environmental Studies': 'ENVS',
    'French Studies': 'FREN',
    'German Studies': 'GRMN',
    'Hispanic Studies': 'HISP',
    'History': 'HIST',
    'History of Art and Architecture': 'HIAA',
    'Italian Studies': 'ITAL',
    'Judaic Studies': ['HEBR', 'JUDS'],
    'Linguistics': 'LING',
    'Literary Arts': 'LITR',
    'Mathematics': 'MATH',
    'Medieval Studies': 'MDVL',
    'Middle East Studies': 'MES',
    'Modern Culture and Media': 'MCM',
    'Music': 'MUSC',
    'Neuroscience': 'NEUR',
    'Pembroke Center Teach and Rearch': 'GNSS',
    'Philosophy': 'PHIL',
    'Physics': 'PHYS',
    'Political Science': 'POLS',
    'Portuguese and Brazilian Studies': 'POBS',
    'Public Health': ['BHDS', 'GPHP', 'HCL', 'PHP'],
    'Religious Studies': ['COST', 'RELS'],
    'Slavic Studies': ['CZCH', 'PLSH', 'RUSS', 'SLAV'],
    'Sociology': 'SOC',
    'Theatre Arts and Performance Studies': 'TAPS',
    'Urban Studies': 'URBN',
    'Visual Art': 'VISA',
    'Watson Institute': ['IAPA', 'MPA'],
}

# Inverse table: course-code prefix -> department name.
# Entries are visited in DEPARTMENT_MAP order, so when a prefix is listed
# under more than one department the later department wins.
# NOTE(review): 'LING' appears under both 'Cognitive, Linguistic, and
# Psychological Sciences' and 'Linguistics'; it therefore resolves to
# 'Linguistics'. Confirm that this is the intended owner.
CODE_TO_DEPT = {
    code: department
    for department, code_or_codes in DEPARTMENT_MAP.items()
    for code in (
        code_or_codes if isinstance(code_or_codes, list) else [code_or_codes]
    )
}
# Department name -> knowledge area.
# Keys are the same department names used as keys of DEPARTMENT_MAP; values
# are one of 'Social Sciences', 'Physical Sciences', 'Humanities',
# 'Life Sciences' or 'Arts'.
# NOTE(review): 'Arts' is used only for 'Brown Arts Institute', while 'Music',
# 'Visual Art' and 'Theatre Arts and Performance Studies' are filed under
# 'Humanities' -- confirm this split is intentional.

KNOWLEDGE_AREA_MAP = {
    'Africana Studies': 'Social Sciences',
    'American Studies': 'Social Sciences',
    'Anthropology': 'Social Sciences',
    'Applied Mathematics': 'Physical Sciences',
    'Archaeology and the Ancient World': 'Humanities',
    'Bio-Medical (PLME & MED)': 'Life Sciences',
    'Biology': 'Life Sciences',
    'Brown Arts Institute': 'Arts',
    'Business, Entrepreneurship, Organizations': 'Social Sciences',
    'Center for Language Studies': 'Humanities',
    'Chemistry': 'Physical Sciences',
    'Classics': 'Humanities',
    'Cognitive, Linguistic, and Psychological Sciences':'Social Sciences',
    'Cognitive and Psychological Sciences': 'Life Sciences',
    'Cogut Institute for the Humanities': 'Humanities',
    'Comparative Literature': 'Humanities',
    'Computer Science': 'Physical Sciences',
    'Data Science Initiative': 'Physical Sciences',
    'Early Modern World': 'Humanities',
    'Earth, Environmental and Planetary Sciences': 'Physical Sciences',
    'East Asian Studies': 'Humanities',
    'Economics': 'Social Sciences',
    'Education': 'Social Sciences',
    'Egyptology and Assyriology': 'Humanities',
    'Engineering': 'Physical Sciences',
    'English': 'Humanities',
    'Environmental Studies': 'Physical Sciences',
    'French Studies': 'Humanities',
    'German Studies': 'Humanities',
    'Hispanic Studies': 'Humanities',
    'History': 'Social Sciences',
    'History of Art and Architecture': 'Humanities',
    'Italian Studies': 'Humanities',
    'Judaic Studies': 'Humanities',
    'Linguistics': 'Social Sciences',
    'Literary Arts': 'Humanities',
    'Mathematics': 'Physical Sciences',
    'Medieval Studies': 'Humanities',
    'Middle East Studies': 'Humanities',
    'Modern Culture and Media': 'Humanities',
    'Music': 'Humanities',
    'Neuroscience': 'Life Sciences',
    'Pembroke Center Teach and Rearch': 'Humanities',
    'Philosophy': 'Humanities',
    'Physics': 'Physical Sciences',
    'Political Science': 'Social Sciences',
    'Portuguese and Brazilian Studies': 'Humanities',
    'Public Health': 'Life Sciences',
    'Religious Studies': 'Humanities',
    'Slavic Studies': 'Humanities',
    'Sociology': 'Social Sciences',
    'Theatre Arts and Performance Studies': 'Humanities',
    'Urban Studies': 'Social Sciences',
    'Visual Art': 'Humanities',
    'Watson Institute': 'Social Sciences',
}

# Subject name -> course-code prefix, exactly one prefix per subject (unlike
# DEPARTMENT_MAP, which groups several prefixes under one department).
# NOTE(review): this table is not referenced by the cells visible here --
# confirm whether it is still needed.
# NOTE(review): 'DSIO' is listed in DEPARTMENT_MAP but has no entry here.
COURSE_CODE_MAP = {
    'Africana Studies': 'AFRI',
    'American Sign Language': 'SIGN',
    'American Studies': 'AMST',
    'Anthropology': 'ANTH',
    'Applied Mathematics': 'APMA',
    'Arabic': 'ARAB',
    'Archaeology and the Ancient World': 'ARCH',
    'Assyriology': 'ASYR',
    'Behavioral and Social Health Sciences': 'BHDS',
    'Biology': 'BIOL',
    'Bio-Medical (PLME & MED)': 'MED',
    'Brown Arts Institute': 'ARTS',
    'Business, Entrepreneurship, Organizations': 'BEO',
    'Chemistry': 'CHEM',
    'Chinese': 'CHIN',
    'Classical Greek': 'CREK',
    'Classics': 'CLAS',
    'Cognitive and Psychological Sciences': 'CPSY',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS',
    'Cogut Institute for the Humanities': 'HMAN',
    'Comparative Literature': 'COLT',
    'Computer Science': 'CSCI',
    'Contemplative Studies': 'COST', 
    'Czech': 'CZCH',
    'Data Science': 'DATA',
    'Early Modern World': 'EMOW',
    'Earth, Environmental and Planetary Sciences': 'EEPS',
    'East Asian Studies': 'EAST',
    'Economics': 'ECON', 
    'Education': 'EDUC',
    'Egyptology': 'EGYT',
    'Engineering': 'ENGN',
    'English': 'ENGL',
    'English for International Students': 'EINT', 
    'Environmental Studies': 'ENVS',
    'Ethnic Studies': 'ETHN',
    'French': 'FREN',
    'Gender and Sexuality Studies': 'GNSS', 
    'German': 'GRMN',
    'Global Public Health': 'GPHP',
    'Health Care Leadership': 'HCL', 
    'Hebrew': 'HEBR',
    'Hindi': 'HNDI',
    'Hispanic Studies': 'HISP',
    'History': 'HIST',
    'History of Art and Architecture': 'HIAA',
    'International and Public Affairs': 'IAPA', 
    'Italian': 'ITAL',
    'Japanese': 'JAPN',
    'Judaic Studies': 'JUDS',
    'Korean': 'KREA',
    'Language Studies': 'LANG', 
    'Latin': 'LATN',
    'Linguistics': 'LING',
    'Literary Arts': 'LITR',
    'Master of Public Affairs': 'MPA', 
    'Mathematics': 'MATH',
    'Medieval Studies': 'MDVL',
    'Middle East Studies': 'MES',
    'Modern Culture and Media': 'MCM',
    'Modern Greek': 'MGRK',
    'Music': 'MUSC',
    'Native American and Indigenous Studies': 'NAIS',
    'Neuroscience': 'NEUR',
    'Persian': 'PRSN',
    'Philosophy': 'PHIL',
    'Physics': 'PHYS',
    'Polish': 'PLSH',
    'Political Science': 'POLS',
    'Portuguese and Brazilian Studies': 'POBS',
    'Public Health': 'PHP',
    'Public Humanities': 'PHUM',
    'Religious Studies': 'RELS',
    'Russian': 'RUSS',
    'Sanskrit': 'SANS',
    'Science, Technology, and Society': 'STS',
    'Slavic Studies': 'SLAV',
    'Sociology': 'SOC',
    'Theatre Arts and Performance Studies': 'TAPS',
    'Turkish': 'TKSH',
    'Urban Studies': 'URBN',
    'Vietnamese': 'VIET',
    'Visual Art': 'VISA',
    'Yoruba': 'YORU'
}

In [3]:
# `df` is another name for df_a (no copy is made), so the column changes
# below are visible through df_a as well.
df = df_a

# Temporary helper column: the part of 'Course Code' before the first space,
# and of that, the part before the first '/'.
df['Course Code Prefix'] = (
    df['Course Code']
    .str.split(' ').str[0]
    .str.split('/').str[0]
)

# prefix -> department -> knowledge area; a prefix or department missing
# from the lookup table yields NaN.
calculated_departments = df['Course Code Prefix'].map(CODE_TO_DEPT)
calculated_knowledge_areas = calculated_departments.map(KNOWLEDGE_AREA_MAP)

# Use the looked-up value where there is one; otherwise keep whatever the
# column already held.
df['Department'] = calculated_departments.mask(
    calculated_departments.isna(), df['Department']
)
df['Knowledge Area'] = calculated_knowledge_areas.mask(
    calculated_knowledge_areas.isna(), df['Knowledge Area']
)

# The helper column must not end up in the exported file.
df.drop(columns='Course Code Prefix', inplace=True)

df.to_csv('new_final_2024.csv', index=False)