In [1]:
import pandas as pd

In [2]:
file1 = pd.read_excel("data\Course_Section_Search_-_Central Term 1.xlsx", skiprows=1)
file2 = pd.read_excel("data\Course_Section_Search_-_Central Term 2 and Summer 2025.xlsx", skiprows=1)

In [3]:
file1_phys = file1.copy()
file2_phys = file2.copy()

In [4]:
phys_courses = pd.concat([file1_phys, file2_phys], ignore_index=True)
phys_courses.shape

(4091, 44)

In [5]:
phys_courses = phys_courses[phys_courses['Course Number'] < 500]

In [6]:
phys_courses.loc[:, 'Course Code'] = phys_courses['Course Subject'] + " " + phys_courses['Course Number'].astype(str)

In [7]:
phys_courses_final = phys_courses.drop_duplicates(subset=['Course Code'], keep='first')

In [8]:
phys_courses_final.shape

(658, 45)

### Extracting Requisites

In [9]:
import re

course_description = phys_courses_final[['Course Code', 'Description']].copy()
course_description.loc[:, 'reqs'] = course_description['Description'].str.extract(
    r'((prerequisite|corequisite)[\s\S]*)', 
    flags=re.IGNORECASE
)[0]
course_description.sample(5)

Unnamed: 0,Course Code,Description,reqs
2029,STAT_V 201,Classical and simulation-based techniques for ...,Prerequisite: DSCI 100.
342,BIOL_V 336,"Natural selection; population genetics, quanti...","Prerequisite: One of BIOL 233, BIOL 234."
1276,EOSC_V 325,Theories of storage and movement of water with...,Prerequisite: (a) 3rd year standing in Science...
730,CPSC_V 110,Fundamental program and computation structures...,
1669,MICB_V 405,Computational methods to analyze genome and pr...,"Prerequisite: One of MICB 301, MICB 325, BIOC ..."


In [10]:
pattern = r'\b[A-Z]{4}\s\d{3}\sis\srecommended\.'

# Remove the sentence if present
course_description['reqs'] = course_description['reqs'].str.replace(pattern, '', regex=True)
course_description.sample(3)

Unnamed: 0,Course Code,Description,reqs
3310,EOSC_V 240,Theory and practice of site investigation for ...,Prerequisite: EOSC 210. Corequisite: CIVL 210.
630,CHEM_V 318,Fundamental aspects of chemical catalysis: kin...,"Prerequisite: One of CHEM 218, CHEM 250."
3413,FSCT_V 490,Block registration for the expert witness test...,


In [11]:
def extract_reqs(text, keyword):
    if isinstance(text, str):
        match = re.search(fr'{keyword}:.*?[.\]]', text)
        return match.group(0) if match else ''
    return ''

# Create 'prereqs' and 'coreqs' columns
course_description['prereqs'] = course_description['reqs'].apply(lambda x: extract_reqs(x, 'Prerequisite'))
course_description['coreqs'] = course_description['reqs'].apply(lambda x: extract_reqs(x, 'Corequisite'))

In [12]:
course_description['prereq_courses'] = course_description['prereqs'].apply(
    lambda x: re.findall(r'[A-Z]{4}\s*\d{3}', str(x)) if isinstance(x, str) else []
)
course_description['coreq_courses'] = course_description['coreqs'].apply(
    lambda x: re.findall(r'[A-Z]{4}\s*\d{3}', str(x)) if isinstance(x, str) else []
)
course_description.sample(5)

Unnamed: 0,Course Code,Description,reqs,prereqs,coreqs,prereq_courses,coreq_courses
1100,DSCI_V 430,Ethical application of data science and machin...,"Prerequisite: One of CPSC 330, CPSC 340, STAT ...","Prerequisite: One of CPSC 330, CPSC 340, STAT ...",,"[CPSC 330, CPSC 340, STAT 301, STAT 406]",[]
1583,MATH_V 450,Asymptotic expansions. Asymptotic evaluation o...,Prerequisite: MATH 400.,Prerequisite: MATH 400.,,[MATH 400],[]
2472,BIOL_V 362,"The cytoskeleton, cell dynamics, and regulatio...","Prerequisite: BIOL 200 and one of APBI 312, AP...","Prerequisite: BIOL 200 and one of APBI 312, AP...",,"[BIOL 200, APBI 312, APBI 351, BIOL 260, BIOL ...",[]
2696,CHEM_V 403,Surfaces and phenomena occurring at surfaces a...,"Prerequisite: CHEM 304 and one of MATH 200, MA...","Prerequisite: CHEM 304 and one of MATH 200, MA...",,"[CHEM 304, MATH 200, MATH 217, MATH 226, MATH ...",[]
1573,MATH_V 425,"Smooth manifolds, smooth maps, immersions and ...","Prerequisite: One of MATH 221, MATH 223 and on...","Prerequisite: One of MATH 221, MATH 223 and on...",,"[MATH 221, MATH 223, MATH 217, MATH 227, MATH ...",[]


In [13]:
course_description.drop(columns=['prereqs', 'coreqs', 'reqs'], inplace=True)

In [14]:
course_description.sample()

Unnamed: 0,Course Code,Description,prereq_courses,coreq_courses
3310,EOSC_V 240,Theory and practice of site investigation for ...,[EOSC 210],[CIVL 210]


In [15]:
def standardize_courses(list):
    standardized_list = []
    for course in list:
        # Use regex to find courses with no space before the number
        standardized_course = re.sub(r'([A-Z]+)(\d+)', r'\1 \2', course)
        standardized_list.append(standardized_course)
    return standardized_list

# Apply the function to the prereq_courses column
course_description['prereq_courses'] = course_description['prereq_courses'].apply(standardize_courses)
course_description['coreq_courses'] = course_description['coreq_courses'].apply(standardize_courses)

In [16]:
course_description['Course Code'] = course_description['Course Code'].str.replace('_V', '', regex=False)

In [17]:
course_description.sample()

Unnamed: 0,Course Code,Description,prereq_courses,coreq_courses
3598,MATH 419,"Random walks, Markov chains, branching process...",[MATH 418],[]


In [18]:
themes = pd.read_csv('data/course_themes.csv')
themes.head(1)

Unnamed: 0,Department,Course Code,Human Health,Data and Computation,AI,Clean Energy and Materials,Climate,Science and Society,Faculty of Arts: Ways of Knowing,Notes
0,CAPS,ANAT_V 392,,,,,,,,


In [19]:
themes.drop(columns=['Department', 'Notes'], inplace=True)
themes.head(1)

Unnamed: 0,Course Code,Human Health,Data and Computation,AI,Clean Energy and Materials,Climate,Science and Society,Faculty of Arts: Ways of Knowing
0,ANAT_V 392,,,,,,,


In [20]:
themes.iloc[:, 1:] = themes.iloc[:, 1:].notna() 
themes.head(1)

  themes.iloc[:, 1:] = themes.iloc[:, 1:].notna()


Unnamed: 0,Course Code,Human Health,Data and Computation,AI,Clean Energy and Materials,Climate,Science and Society,Faculty of Arts: Ways of Knowing
0,ANAT_V 392,False,False,False,False,False,False,False


In [21]:
themes['Course Code'] = themes['Course Code'].str.replace('_V', '', regex=False)
themes.sample()

Unnamed: 0,Course Code,Human Health,Data and Computation,AI,Clean Energy and Materials,Climate,Science and Society,Faculty of Arts: Ways of Knowing
233,CPSC 404,False,False,False,False,False,False,False


In [22]:
themes['themes'] = themes.iloc[:, 1:].apply(lambda x: list(themes.columns[1:][x]), axis=1)

In [23]:
themes = themes[['Course Code', 'themes']]
themes.sample()

Unnamed: 0,Course Code,themes
389,MICB 211,[]


In [24]:
courses_with_themes = course_description.merge(themes, on='Course Code', how='left')

In [25]:
courses_with_themes.sample(10)

Unnamed: 0,Course Code,Description,prereq_courses,coreq_courses,themes
628,EOSC 473,"Methods of data acquisition, study and analysi...","[EOSC 372, EOSC 373]",[],[]
166,CPSC 298,Approved and supervised technical work experie...,[],[],[]
325,MATH 437,"Divisibility, congruences, Diophantine equatio...",[],"[MATH 320, MATH 319, MATH 322]",[]
387,PHYS 170,"Statics of particles, equilibrium or rigid bod...",[],[],[]
139,CHEM 398,Approved and supervised technical work experie...,[],[],[]
702,PHYS 310,Fundamental principles and applications of dat...,"[MATH 152, MATH 221, MATH 223, MATH 200, MATH ...",[],[]
438,STAT 399,Work experience in an industrial research sett...,[STAT 398],[],[]
54,BIOL 329,"How cells react to external signals, and how t...",[BIOL 200],[],[]
276,ISCI 360,Application of systems science encompassing ge...,[],[],[Science and Society]
483,BIOL 314,"The units of biodiversity, from genes to ecosy...","[APBI 260, BIOL 230, FRST 201, GEOB 207, GEOS ...",[],[]


In [26]:
# Use in case of extracting all prereqs

import json

phys_courses_json = []

for _, row in courses_with_themes.iterrows():
    course_entry = {
        "course_code": row['Course Code'],
        "description": row['Description'],
        "prerequisites": row['prereq_courses'],  # Directly using the list from the CSV
        "corequisites": row['coreq_courses'],  # You can include other columns as needed
        "themes": row['themes']
    }
    phys_courses_json.append(course_entry)

# Convert to JSON string (optional, for saving or viewing)
courses_json_str = json.dumps(phys_courses_json, indent=4)

# Save the JSON to a file
with open('data/all_courses.json', 'w') as json_file:
    json_file.write(courses_json_str)

In [27]:
# Load the JSON data
with open('data/all_courses.json', 'r') as file:
    data = json.load(file)

# Extract all valid course codes into a set for fast lookup
valid_course_codes = {course['course_code'] for course in data}

# Filter the prerequisites for each course
for course in data:
    # Keep only those prerequisites that are in the valid course codes
    course['prerequisites'] = [prereq for prereq in course['prerequisites'] if prereq in valid_course_codes]

for course in data:
    course['corequisites'] = [coreq for coreq in course['corequisites'] if coreq in valid_course_codes]

# Save the modified data back to the JSON file
with open('data/all_courses.json', 'w') as file:
    json.dump(data, file, indent=4)

print("Prerequisites filtered successfully!")

Prerequisites filtered successfully!
