In [78]:
import requests
from bs4 import BeautifulSoup
import re
import sqlite3

In [45]:
def get_subjects(semester, timeout=30):
    """
    Fetch the HTML document of the Cornell class roster home page for the given
    semester. Parse the home page HTML and return a dictionary with subject
    codes and their names.

    The dictionary maps each subject code (the part of the subject link that
    follows "subject/") to the text of that link. List items that carry no
    link, or whose link does not contain "subject/", are skipped.

    Raises AssertionError if the precondition on semester is violated, and
    Exception if the roster page does not answer with HTTP status 200.

    Parameter semester: the semester to search
    Precondition: semester is a str in form of season + year. Season should be 2
    capital letters among "SP","SU","FA","WI"; and year should be a 2-digits int
    Example: "SP25", "FA26"

    Parameter timeout: seconds to wait for the server before giving up
    Precondition: timeout is a number (or any value accepted by requests.get)
    """
    assert isinstance(semester, str) and len(semester) == 4
    assert semester[:2] in ["SP", "SU", "FA", "WI"]
    # The year half of the precondition: exactly two ASCII digits.
    assert all(ch in "0123456789" for ch in semester[2:])

    home_url = f"https://classes.cornell.edu/browse/roster/{semester}"
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(home_url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(
            f"Failed to fetch the home page. "
            f"Status code: {response.status_code}"
        )

    result = {}

    home_page = BeautifulSoup(response.text, "html.parser")
    for li_tag in home_page.find_all("li", {"class": "browse-subjectdescr"}):
        a_tag = li_tag.find("a")  # Get the <a> tag
        subject_link = a_tag.get("href") if a_tag else None
        if not subject_link:
            continue
        # Everything after the first "subject/" is the subject code.
        _, marker, subject_code = subject_link.partition("subject/")
        if not marker or not subject_code:
            continue
        result[subject_code] = a_tag.text

    return result
# Demo: request the "SP25" roster home page and print the resulting
# subject-code -> subject-name dictionary.
print(get_subjects("SP25"))

{'AAS': 'Asian American Studies', 'AEM': 'Applied Economics & Management', 'AEP': 'Applied & Engineering Physics', 'AIIS': 'American Indian and Indigenous Studies', 'AIRS': 'Aerospace Studies', 'ALS': 'Agriculture & Life Sciences', 'AMST': 'American Studies', 'ANSC': 'Animal Science', 'ANTHR': 'Anthropology', 'ARAB': 'Arabic', 'ARCH': 'Architecture', 'ARKEO': 'Archaeology', 'ART': 'Art', 'ARTH': 'History of Art', 'AS': 'Arts & Sciences', 'ASIAN': 'Asian Studies', 'ASL': 'American Sign Language', 'ASRC': 'Africana Studies & Research Center', 'ASTRO': 'Astronomy', 'BCS': 'Bosnian, Croatian, Serbian', 'BEE': 'Biological & Environmental Engineering', 'BENGL': 'Bengali', 'BIOAP': 'Animal Physiology & Anatomy', 'BIOCB': 'Computational Biology', 'BIOEE': 'Ecology & Evolutionary Biology', 'BIOG': 'Biology: General Courses', 'BIOMG': 'Molecular Biology and Genetics', 'BIOMI': 'Microbiology', 'BIOMS': 'Biomedical Sciences', 'BIONB': 'Neurobiology & Behavior', 'BME': 'Biomedical Engineering', 'BS

In [65]:
def get_courses(semester, subject_code, timeout=30):
    """
    Fetch the class roster page of one subject for the given semester and
    return the list of course codes shown on it.

    Each course code is the text of a "title-subjectcode" element with all
    whitespace removed, e.g. "CS 1110" becomes "CS1110".

    Raises AssertionError if a precondition is violated, and Exception if the
    subject page does not answer with HTTP status 200.

    Parameter semester: the semester to search
    Precondition: semester is a str in form of season + year. Season should be 2
    capital letters among "SP","SU","FA","WI"; and year should be a 2-digits int
    Example: "SP25", "FA26"

    Parameter subject_code: the abbreviation of a subject at Cornell
    Precondition: subject_code is a str
    Example: "AEM", "CS", "PHYS"

    Parameter timeout: seconds to wait for the server before giving up
    Precondition: timeout is a number (or any value accepted by requests.get)
    """
    assert isinstance(semester, str) and len(semester) == 4
    assert semester[:2] in ["SP", "SU", "FA", "WI"]
    assert all(ch in "0123456789" for ch in semester[2:])
    assert isinstance(subject_code, str)

    subject_url = (
        f"https://classes.cornell.edu/browse/roster/{semester}/"
        f"subject/{subject_code}"
    )
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(subject_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Failed to fetch the {subject_code} page. "
            f"Status code: {response.status_code}"
        )
    subject_page = BeautifulSoup(response.text, "html.parser")
    title_tags = subject_page.find_all("div", class_="title-subjectcode")
    # split()/join drops every run of whitespace, so a title without a space
    # is returned unchanged instead of being mangled by a failed find(" ").
    return ["".join(tag.text.split()) for tag in title_tags]

In [68]:
def get_course_details(semester, course_code, timeout=30):
    """
    Return the prerequisite/corequisite text of a course as a str.

    Fetch the class roster page of the course and return the text of its
    "catalog-prereq" element, or "N/A" when the page has no such element.

    Raises ValueError if course_code is not letters followed by digits, and
    Exception if the course page does not answer with HTTP status 200.

    Parameter semester: the semester to search
    Precondition: semester is a str such as "SP25" or "FA26"

    Parameter course_code: subject abbreviation followed by the course number
    Precondition: course_code is a str; whitespace around or between the two
    parts is allowed
    Example: "CS1110", "MATH 2940"

    Parameter timeout: seconds to wait for the server before giving up
    Precondition: timeout is a number (or any value accepted by requests.get)
    """
    match = re.match(r"\s*([A-Za-z]+)\s*(\d+)", course_code)
    if match is None:
        # Previously this fell through and crashed on an unbound local.
        raise ValueError(
            f"Invalid course code {course_code!r}: expected a subject "
            f"abbreviation followed by a course number, e.g. 'CS1110'."
        )
    subject, code = match.groups()

    course_url = (f"https://classes.cornell.edu/browse/roster/{semester}"
                  f"/class/{subject}/{code}")
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(course_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Failed to fetch the {course_code} page. "
            f"Status code: {response.status_code}"
        )
    course_page = BeautifulSoup(response.text, "html.parser")

    # Only the prerequisite text is returned. Selectors previously looked up
    # here but never used: div.title-coursedescr, span.credits,
    # span.catalog-distr and span.tooltip-iws.
    prereq = course_page.find("span", class_="catalog-prereq")
    return prereq.text if prereq else "N/A"

In [71]:
# Collect the prerequisite text of every "CS" course listed for "SP25".
# get_course_details issues one HTTP request per course.
courses = get_courses("SP25","CS")
result = []  # prerequisite strings, in the same order as `courses`
for course in courses:
    prereq = get_course_details("SP25",course)
    print(f"{course}: {prereq}")
    result.append(prereq)
print("finish")

CS1110: N/A
CS1112: Prerequisites/Corequisites Prerequisite: MATH 1110 or equivalent.
CS1132: N/A
CS1133: N/A
CS1340: N/A
CS1700: N/A
CS1998: N/A
CS2043: Prerequisites/Corequisites Prerequisite: one programming course or equivalent programming experience.
CS2110: Prerequisites/Corequisites Prerequisite: CS 1110 or CS 1112 or equivalent course on programming in a procedural language.
CS2800: Prerequisites/Corequisites Prerequisite or corequisite: MATH 1110 or equivilent, one programming course or permission of instructor.
CS3110: Prerequisites/Corequisites Prerequisite: CS 2110 or equivalent programming experience. Prerequisite or corequisite: CS 2800.
CS3152: Prerequisites/Corequisites Prerequisite: CS 2110 for programmers, or permission of the instructor. Corequisite: ENGRC 3152.
CS3410: Prerequisites/Corequisites Prerequisite: CS 2110 or equivalent programming experience.
CS3420: Prerequisites/Corequisites Prerequisite: ECE 2300/ENGRD 2300.
CS3700: Prerequisites/Corequisites Prerequi

In [242]:
def parse_prerequisites(prereq_text, verbose=True):
    """
    Parse prerequisite text into a nested list, with the last element of the
    nested list indicating whether it needs further explanation.
    "A or B and C" should be converted to [[A,B],[C],note], where every inner
    list is a group of alternative courses, all groups are required, and note
    is a bool that is True when the text had to be simplified or could not be
    fully expressed as course codes. Course codes are returned without the
    space, e.g. "CS2110". Empty or None text returns [] (no note element).

    If the prereq text is in format "topic (e.g. course1, course2)", or
    "1) topic: ...; 2) topic ..." only keep the topic part and replace the topic
    into several specified courses. For example, "linear algebra" is replaced by
    "MATH 2210 or MATH 2230 or MATH 2310 or MATH 2940".

    For text in "a, b, c, or d" structures, replace each comma with "or";
    for text in "a, b, c or d" structures, replace each comma with "and";
    for "a/b", replace "/" with "or"; replace each ";" with "and".

    Split each text by "and", and then split each sub-text by "or". Only keep
    the course code part in the text and append them into a nested list. Remove
    any repeated course in the nested list.

    Parameter prereq_text: The raw prerequisite text.
    Precondition: a str from Cornell's class roster

    Parameter verbose: whether to print the text before and after rewriting
    Precondition: verbose is a bool
    """
    if not prereq_text:
        return []

    prereq_text = prereq_text.replace("\xa0", " ")
    note = False
    if verbose:
        print(prereq_text)

    # (pattern, replacement) pairs that strip or simplify free-form phrases:
    # parenthesised examples, "For ... majors: ...", "Note: ...;",
    # "... degree," / "... experience,", and "X 1111-Y 2222" (keep Y 2222).
    patterns = [
        (r"\(.*?\)", ""), (r"For\s.*?majors[:;].*?[.;]", ""),
        (r"Note:.*?;", ""), (r"[A-Za-z\s]+(?:degree|experience),", ""),
        (r"([A-Z]+\s\d{4})-([A-Z]+\s\d{4})",r"\2")
    ]
    for pattern, replace in patterns:
        prereq_text, count = re.subn(
            pattern, replace, prereq_text, flags=re.IGNORECASE
        )
        if count:
            note = True

    # check structure "1)...; 2)...; 3)...;" and keep only the topic names
    matches = re.findall(r"\d\)\s*(.*?):", prereq_text)
    if matches:
        topics = [match.strip() for match in matches]
        prereq_text = " and ".join(topics)
        note = True

    for phrase in (", or permission of the instructor.",
                   ", or permission of instructor."):
        if phrase in prereq_text:
            prereq_text = prereq_text.replace(phrase, "")
            note = True

    if "performance" in prereq_text or "excellent" in prereq_text:
        note = True

    if re.search(r',\s*or\s', prereq_text):
        # check structure A, B, or C
        prereq_text = re.sub(r',(?=\s*[^o])', ' or ', prereq_text)
        # replace "," with " or "

    prereq_text = re.sub(r"[;,]\s*$", "", prereq_text) # remove trailing semicolons

    prereq_text = prereq_text.replace(", ", " and ")
    prereq_text = prereq_text.replace(";", " and ")
    prereq_text = prereq_text.replace("/", " or ")

    # Topic -> courses. Order matters: a longer topic must come before any
    # shorter topic it contains ("single-variable calculus" before "calculus").
    replacements = {
    "linear algebra":"MATH 2210 or MATH 2230 or MATH 2310 or MATH 2940",
    "single-variable calculus": "MATH 1910 or MATH 1120",
    "calculus": "MATH 1920 or MATH 2220 or MATH 2240",
    "core statistics": "STSCI 2100 or MATH 1710",
    "probability theory":"BTRY 3080 or CS 2800 or ECON 3130 or ENGRD 2700 or MATH 4710",

    "one programming course": "CS 1110 or CS 1112 or CS 1132 or CS 1133",
    "knowledge of programming": "CS 1110 or CS 1112 or CS 1132 or CS 1133",
    "core programming": "CS 1110 or CS 1112",
    "Python":"CS 1110 or CS 1112 or CS 1133",
    "MATLAB": "CS 1132", "C++":"CS 2024",
    "programming proficiency": "CS 2110 or CS 2112",
    "data structures": "CS 2110 or CS 2112",
    "discrete mathematics": "CS 2800",
    "discrete math": "CS 2800",
    "introductory ML course": "CS 3780",
    }

    for topic, course in replacements.items():
        # Lookarounds instead of \b: \b never matches after the "+" of "C++"
        # when a space or punctuation follows, which made that entry dead.
        topic_pattern = rf"(?<!\w){re.escape(topic)}(?!\w)"
        prereq_text, count = re.subn(
            topic_pattern, course, prereq_text, flags=re.IGNORECASE
        )
        if count:
            note = True

    and_split = re.split(r"\s+and\s+", prereq_text)
    or_split = [re.split(r"\s+or\s+", item) for item in and_split]

    # Matches patterns like "CS 1110", "MATH 1920"
    course_code_pattern = r"[A-Z]{2,10}\s\d{4}"
    nested_list = []
    for group in or_split:
        course_list = [
            re.sub(r"\s", "", match.group()) # Remove spaces in matched course codes
            for item in group
            for match in re.finditer(course_code_pattern, item)
        ]
        nested_list.append(course_list)

    # A requirement with no recognisable course code is dropped; flag it.
    if any(not sublist for sublist in nested_list):
        note = True
    cleaned_list = [sublist for sublist in nested_list if sublist]
    result = remove_repeat(cleaned_list)
    result.append(note)
    if verbose:
        print(prereq_text)
    return result
# def parse_prerequisites(prereq_text):
#     """
#     Parse prerequisite text into a nested list, with the last element of the
#     nested list indicating whether it needs further explanation.
#     "A or B and C" should be converted to [[A,B],C]

#     If the prereq text is in format "topic (e.g. course1, course2)", or
#     "1) topic: ...; 2) topic ..." only keep the topic part and replace the topic
#     into several specified courses. For example, "linear algebra" is replaced by
#     "MATH 2210 or MATH 2310 or MATH 2940".

#     For text in "a, b, c, or d" structures, replace each comma with "or";
#     for text in "a, b, c or d" structures, replace each comma with "and";
#     for "a/b", replace "/" with "or"; replace each ";" with "and".

#     Split each text by "and", and then split each sub-text by "or". Only keep
#     the course code part in the text and append them into a nested list. Remove
#     any repeated course in the nested list.

#     Parameter prereq_text: The raw prerequisite text.
#     Precondition: a str from Cornell's class roster
#     """
#     if not prereq_text:
#         return []

#     prereq_text = prereq_text.replace("\xa0", " ")
#     note = False

#     if re.search(r"\(.*?\)", prereq_text): # check and remove (e.g. ...)
#         print("()")
#         prereq_text = re.sub(r"\(.*?\)", "", prereq_text)
#         note = True
#     if re.search(r"For\s.*?majors[:;].*?[.;]",prereq_text,flags=re.IGNORECASE):
#         print("for ... major")
#         # check and remove "for ... majors: ..."
#         prereq_text = re.sub(r"For\s.*?majors[:;].*?[.;]", "", prereq_text, flags=re.IGNORECASE)
#         note = True
#     if re.search(r"Note:.*?;",prereq_text,flags=re.IGNORECASE): # check and remove "Note: ... "
#         print("note")
#         prereq_text = re.sub(r"Note:.*?;", "", prereq_text, flags=re.IGNORECASE)
#         note = True
#     if re.search(r"[A-Za-z\s]+(?:degree|experience),",prereq_text,flags=re.IGNORECASE):
#         print("degree")
#         # check and remove "...degree" or "...experience"
#         prereq_text = re.sub(r"[A-Za-z\s]+(?:degree|experience),", "",prereq_text,flags=re.IGNORECASE)
#         note = True
#     if re.search(r"\d\)\s*.*?:", prereq_text):
#         print("1)")
#         # check structure "1)...; 2)...; 3)...;
#         matches = re.findall(r"\d\)\s*(.*?):", prereq_text)
#         topics = [match.strip() for match in matches]
#         prereq_text = " and ".join(topics)
#         note = True
#     if re.search(r"([A-Z]+\s\d{4})-([A-Z]+\s\d{4})",prereq_text):
#         print("-")
#         # check structure "MATH 1110-MATH 1120" and only keep the second course
#         prereq_text = re.sub(r"([A-Z]+\s\d{4})-([A-Z]+\s\d{4})",r"\2",prereq_text)
#         note = True
#     if prereq_text.find(", or permission of the instructor.")!= -1:
#         print("instructor")
#         prereq_text = prereq_text.replace(", or permission of the instructor.","")
#         note = True
#     if prereq_text.find("performance")!=-1 or prereq_text.find("excellent")!=-1:
#         print("performance")
#         note = True
#     if re.search(r',\s*or\s', prereq_text):
#         # check structure A, B, or C
#         prereq_text = re.sub(r',(?=\s*[^o])', ' or ', prereq_text)
#         # replace "," with " or "

#     prereq_text = re.sub(r"[;,]\s*$", "", prereq_text) # remove trailing semicolons

#     prereq_text = prereq_text.replace(", ", " and ")
#     prereq_text = prereq_text.replace(";", " and ")
#     prereq_text = prereq_text.replace("/", " or ")
#     prereq_text = prereq_text.replace("Python", "CS 1110 or CS 1112 or CS 1133") # for CS courses
#     prereq_text = prereq_text.replace("linear algebra", "MATH 2210 or MATH 2230 or MATH 2310 or MATH 2940") # for CS courses
#     prereq_text = prereq_text.replace("probability theory", "BTRY 3080 or CS 2800 or ECON 3130 or ENGRD 2700 or MATH 4710") # for CS courses
#     prereq_text = prereq_text.replace("MATLAB", "CS 1132") # for CS courses
#     prereq_text = prereq_text.replace("programming proficiency", "CS 2110 or CS 2112")
#     prereq_text = prereq_text.replace("single-variable calculus", "MATH 1910 or MATH 1120")
#     prereq_text = prereq_text.replace("calculus", "MATH 1920 or MATH 2220 or MATH 2240")
#     prereq_text = prereq_text.replace("one programming course", "CS 1110 or CS 1112 or CS 1132 or CS 1133")
#     prereq_text = prereq_text.replace("knowledge of programming", "CS 1110 or CS 1112 or CS 1132 or CS 1133")
#     prereq_text = prereq_text.replace("discrete math", "CS 2800")
#     prereq_text = prereq_text.replace("introductory ML course", "CS 3780")
#     prereq_text = prereq_text.replace("data structures", "CS 2110")

#     and_split = re.split(r"\s+and\s+", prereq_text)
#     or_split = [re.split(r"\s+or\s+", item) for item in and_split]

#     course_code_pattern = r"[A-Z]{2,10}\s\d{4}"  # Matches patterns like "CS 1110", "MATH 1920"
#     nested_list = []
#     for group in or_split:
#         course_list = [match.group() for item in group for match in re.finditer(course_code_pattern, item)]
#         nested_list.append(course_list)
#     for sublist in nested_list:
#         if len(sublist) == 0:
#             print("empty")
#             note = True
#     cleaned_list = [sublist for sublist in nested_list if sublist]
#     result = remove_repeat(cleaned_list)
#     result.append(note)
#     return result

# Example usage
# Sample prerequisite texts, each named after the course it is taken from.
# Only CS5433 is parsed below; the others are kept as alternative inputs.
CS3780 = (
"probability theory (e.g. BTRY 3080, CS 2800, ECON 3130, "
"ENGRD 2700, MATH 4710) and linear algebra (e.g. MATH 2210, "
"MATH 2310, MATH 2940), single-variable calculus (e.g. MATH 1110, "
"MATH 1920) and programming proficiency (e.g. CS 2110)."
)
MATH4260 = "MATH 2210 or MATH 2940 or equivalent, knowledge of programming, CS 3220 or CS 4210/MATH 4250, or permission of the instructor."
INFO4300 = "1) Linear algebra: strong performance in MATH 2940 or equivalent; 2) Discrete math: strong performance in CS 2800 or equivalent. Note: The linear algebra and discrete math requirements can also be fulfilled with a strong performance in INFO 2950; and 3) Programming proficiency: CS 2110 or equivalent with strong Python skills and familiarity with IPython Notebooks, or permission of instructor."
CS4300 = "1) linear algebra: strong performance in MATH 2940 or equivalent; 2) discrete math: strong performance in CS 2800 or equivalent. Note: the linear algebra and discrete math requirements can also be fulfilled with a strong performance in INFO 2950; and 3) programming proficiency: CS 2110 or equivalent with strong Python skills and familiarity with IPython Notebooks, or permission of instructor."
CS5433 =  ("a good level of programming experience--specifically, the "
        "ability to deal with challenging programming tasks--familiarity with "
        "common algorithms and data structures, and an understanding of basic "
        "concepts in discrete mathematics.")


# Parse a free-form text that contains no explicit course codes.
print(parse_prerequisites(CS5433))

a good level of programming experience--specifically, the ability to deal with challenging programming tasks--familiarity with common algorithms and data structures, and an understanding of basic concepts in discrete mathematics.
a good level of programming experience--specifically and the ability to deal with challenging programming tasks--familiarity with common algorithms and CS 2110 or CS 2112 and and an understanding of basic concepts in CS 2800.
[['CS2110', 'CS2112'], ['CS2800'], True]


In [213]:
def separate_prereq(text):
    """
    Split a requirement text into its prerequisite and corequisite parts.

    Return a three-element list
        [prerequisite, corequisite, prerequisite-or-corequisite]
    where each element is the stripped text that follows the heading
    "Prerequisite:", "Corequisite:" or "Prerequisite or corequisite:", or None
    when that heading does not occur in the text.

    Each heading is first prefixed with a unique tag (A_, B_, C_) so that the
    regular expressions below cannot confuse one heading with another.

    Parameter text: text is the string that would be separated
    Precondition: text is a str object
    """
    # Tag the headings; the order of these replacements must not change.
    tagged_headings = (
        ("Prerequisite: ", "A_"),
        ("Prerequisite or corequisite: ", "B_"),
        ("Corequisite: ", "C_"),
    )
    for heading, tag in tagged_headings:
        text = text.replace(heading, tag + heading)

    # One pattern per output slot, listed in output order.
    section_patterns = (
        r"a_prerequisite:(.*?)(b_prerequisite or corequisite|c_corequisite|$)",
        r"c_corequisite:(.*)",
        r"b_prerequisite or corequisite:(.*?)(c_corequisite|$)",
    )

    sections = []
    for section_pattern in section_patterns:
        found = re.search(section_pattern, text, re.IGNORECASE)
        if found:
            sections.append(found.group(1).strip())
        else:
            sections.append(None)
    return sections
# Quick check; the returned list is ordered
# [prerequisite, corequisite, prerequisite-or-corequisite].
separate_prereq("Prerequisite or corequisite: MATH 1110 or equivilent, one programming course or permission of instructor.")

[None,
 None,
 'MATH 1110 or equivilent, one programming course or permission of instructor.']

In [214]:
def remove_repeat(nested_list):
    """
    Return a nested list with repeated element removed

    It first removes repeated courses inside each sublist and repeated
    sublists, and then it removes repeated individual courses across the
    nested list: a course that occurs in several sublists is kept only in the
    shortest one (the earliest one on a tie). Sublists that end up empty are
    dropped.

    The order of the input is preserved: sublists appear in first-seen order
    and courses keep their order inside each sublist, so the result is the
    same on every run.

    Parameters nested_list: A nested list of courses.
    Precondition: nested_list is a 2D list of hashable items (e.g. str)
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the order depend on string hashing, which differs between runs.
    unique_groups = [
        list(group)
        for group in dict.fromkeys(
            tuple(dict.fromkeys(sublist)) for sublist in nested_list
        )
    ]

    # Assign every course to the shortest group that contains it. sorted() is
    # stable, so equally long groups keep their input order.
    owner = {}
    for group in sorted(unique_groups, key=len):
        for course in group:
            owner.setdefault(course, group)

    result = []
    for group in unique_groups:
        kept = [course for course in group if owner[course] is group]
        if kept:
            result.append(kept)

    return result

In [235]:
# Experiment: a course whose prerequisites differ per audience
# ("Prerequisite for programmers: ...", "Prerequisite for designers: ...").
text = """Prerequisites/Corequisites Prerequisite for programmers: INFO 3152, CS 3300/INFO 3300 or CS 4620 or CS 3700 or CS 5414, or permission of the instructor. Prerequisite for designers: INFO 3152 and INFO 3450, or permission of the instructor. Corequisite: ENGRC 4152."""

# Define the pattern: capture what follows "Prerequisite for <audience>:"
# up to (not including) the first "." or ";".
pattern = r"Prerequisite for\s.*?:\s*(.*?)(?:[.;])"

# Extract matches excluding the "Prerequisite for..." part
matches = re.findall(pattern, text)

# Join the results with "or": meeting either audience's requirement suffices
combined_prerequisites = " or ".join(matches).strip()

print(combined_prerequisites)

INFO 3152, CS 3300/INFO 3300 or CS 4620 or CS 3700 or CS 5414, or permission of the instructor or INFO 3152 and INFO 3450, or permission of the instructor
