In [2]:
import re

import pandas as pd

In [3]:
# Path of the CSV file that is loaded into `reqs` by the read_csv call below.
REQS_FILE = 'data/CU_SR_OPEN_DATA_CATALOG.csv'
# Text encoding handed to pandas when reading REQS_FILE.
REQS_FILE_ENCODING = 'utf-16le'

In [4]:
# Load the whole CSV into a DataFrame, decoding it with the configured encoding.
reqs: pd.DataFrame = pd.read_csv(REQS_FILE, encoding=REQS_FILE_ENCODING)
# Preview the first three rows to sanity-check the columns.
reqs.head(3)

Unnamed: 0,Course ID,Subject,Catalog,Long Title,Class Units,Component Code,Component Descr,Pre Requisite Description,Career,Equivalent Courses
0,26,ACCO,220,Financial and Managerial Accounting,3.0,LEC,Lecture,"Never Taken/Not Registered: ACCO213, ACCO21...",UGRD,
1,27,ACCO,230,Introduction to Financial Accounting,3.0,LEC,Lecture,"Never Taken/Not Registered: ACCO213, ACCO220, ...",UGRD,
2,28,ACCO,240,Introduction to Managerial Accounting,3.0,LEC,Lecture,"Never Taken/Not Registered: ACCO218, ACCO22...",UGRD,


In [20]:
# Narrow the catalogue down to undergraduate COMP courses; only the catalogue
# number and the pre-requisite description are needed for the clean-up below.
is_undergrad = reqs["Career"] == "UGRD"
is_comp_subject = reqs["Subject"] == "COMP"
ugrd_comp_reqs: pd.DataFrame = reqs.loc[
    is_undergrad & is_comp_subject,
    ["Catalog", "Pre Requisite Description"],
]
ugrd_comp_reqs.head(3)

Unnamed: 0,Catalog,Pre Requisite Description
1876,201,Course Prerequisite: MATH201
1877,201,Course Prerequisite: MATH201
1878,218,Never Taken/Not Registered: COMP248 You mus...


In [56]:
# TODO: Clean the pre-requisite description (if possible)
# e.g: Replace pre-requisite, requisite, done before, etc. with Prerequisite
# Work on a copy so that `ugrd_comp_reqs` keeps the raw descriptions and this
# cell can be re-run without compounding the clean-up.
cleaned_ugrad_comp_reqs: pd.DataFrame = ugrd_comp_reqs.copy()
def term_equivalences(entry: str) -> str:
    """Rewrite the spelling variants of requisite keywords to one canonical form.

    The substitutions are plain, case-sensitive substring replacements applied
    one after another in the order listed, so a later rule sees the output of
    the earlier ones (e.g. "Course prerequisite" first becomes
    "Course Prerequisite" and is then shortened to "Prerequisite").

    Args:
        entry: Pre-requisite description text.

    Returns:
        The text with every listed variant replaced by its canonical wording.
    """
    substitutions = (
        # Prerequisite variants
        ("prerequisite", "Prerequisite"),
        ("PREREQ", "Prerequisite"),
        ("Pre-requisite", "Prerequisite"),
        ("Course Prerequisite", "Prerequisite"),
        # Corequisite variants
        ("corequisite", "Corequisite"),
        ("Co-requisite", "Corequisite"),
        ("Course Corequisite", "Corequisite"),
        # Preferred wording for "one of" rule groups
        ("You must complete 1 of the following rules.", "At least one of"),
    )

    normalised = entry
    for variant, canonical in substitutions:
        normalised = normalised.replace(variant, canonical)
    return normalised

def add_new_lines(entry: str) -> str:
    """Put each requisite section of ``entry`` on its own line.

    Surrounding whitespace is trimmed first because the patterns below key on
    the whitespace that precedes a section keyword. Every run of whitespace
    directly before "Prerequisite", "Corequisite" or "At least one of" is then
    collapsed into a single newline.

    Args:
        entry: Pre-requisite description text with canonical keywords.

    Returns:
        The trimmed text with a newline in front of each section keyword that
        was preceded by whitespace.
    """
    section_breaks = (
        (r"[\s]+Prerequisite", r"\nPrerequisite"),
        (r"[\s]+Corequisite", r"\nCorequisite"),
        (r"[\s]+At least one of", r"\nAt least one of"),
    )

    text = entry.strip()
    for pattern, replacement in section_breaks:
        text = re.sub(pattern, replacement, text)
    return text

def clean_line_endings(entry: str) -> str:
    """Strip dangling separators from the end of every line in ``entry``.

    Removes any trailing run of commas, colons, semicolons, periods,
    whitespace and the standalone word "and" from each line.

    The previous pattern listed ``and`` inside a character class, which made
    it strip the individual letters ``a``, ``n`` and ``d`` (and ``|``) from
    line endings, e.g. turning "... is required" into "... is require".
    Here "and" is matched only as a whole word.

    Args:
        entry: Possibly multi-line pre-requisite description text.

    Returns:
        The text with trailing separators removed from each line.
    """
    # MULTILINE makes `$` match at the end of every line, not just at the end
    # of the string; `\b` keeps words that merely end in "and" intact.
    return re.sub(r"(?:[,:;.\s]|\band\b)+$", "", entry, flags=re.MULTILINE)

def remove_unnecessary_text(entry: str) -> str:
    """Drop boilerplate from ``entry`` that carries no course-specific information.

    Removes, in order:
      * the two phrasings of the "complete all 200-level courses before any
        400-level course" rule (matched case-insensitively), with or without
        their final period;
      * the literal label "Prerequisite/Corequisite: ";
      * bullet markers ("• ").

    The periods in the rule patterns are escaped and optional. Previously an
    unescaped ``.`` matched any character, which could swallow the character
    following the sentence and left a stray "s" behind when the text ended in
    "courses" without a period.

    Args:
        entry: Pre-requisite description text.

    Returns:
        The text with the boilerplate removed. Surrounding whitespace is left
        as-is.
    """
    # These are global rules, so they don't need to be explicitly written
    entry = re.sub(
        r"Must complete all 200 level courses before enrolling in\s*(?:any)?\s*400 level course(?:s\.?)?",
        "",
        entry,
        flags=re.IGNORECASE,
    )
    entry = re.sub(
        r"Students must complete all 200-level courses required for their program before registering for any 400-level courses\.?",
        "",
        entry,
        flags=re.IGNORECASE,
    )
    # This label doesn't add any new information...
    entry = entry.replace("Prerequisite/Corequisite: ", "")
    # Just preference: drop bullet markers
    entry = entry.replace("• ", "")
    return entry


# Run the description column through the clean-up steps, in order:
#   1. missing values become empty strings,
#   2. similar terms are rewritten to the same wording,
#   3. boilerplate that adds no information is removed,
#   4. sections are split onto separate lines for legibility
#      (e.g: "Must complete: abc Prerequisite: abc" -> "Must complete: abc\nPrerequisite: abc"),
#   5. dangling separators are stripped from the end of each line.
cleaned_ugrad_comp_reqs['Pre Requisite Description'] = (
    cleaned_ugrad_comp_reqs['Pre Requisite Description']
    .fillna("")
    .apply(term_equivalences)
    .apply(remove_unnecessary_text)
    .apply(add_new_lines)
    .apply(clean_line_endings)
)

cleaned_ugrad_comp_reqs.head(10)

Unnamed: 0,Catalog,Pre Requisite Description
1876,201,Prerequisite: MATH201
1877,201,Prerequisite: MATH201
1878,218,Never Taken/Not Registered: COMP248 You must c...
1879,218,Never Taken/Not Registered: COMP248 You must c...
1880,228,"Corequisite: MATH204, MATH203\nPrerequisite: C..."
1881,228,"Corequisite: MATH204, MATH203\nPrerequisite: C..."
1882,228,"Corequisite: MATH204, MATH203\nPrerequisite: C..."
1883,228,"Corequisite: MATH204, MATH203\nPrerequisite: C..."
1884,232,"Prerequisite: MATH204, MATH203\nNever Taken: C..."
1885,232,"Prerequisite: MATH204, MATH203\nNever Taken: C..."
