# BU Hub Course Search

# Run the Code Box Below to Initialize the Course Search

## Recommended: Click the box below and then press "Shift" + "Enter/Return"

### This shortcut will run the box and take you to the next code box

#### NOTE: This will take about 1 minute to run. If the left of your code box looks like this: `In [*]`, your program is still loading

In [None]:
# Click here and then press "Shift" + "Enter" to run this code box

# Imports

import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import re

# Pandas display limits: how many rows / columns are printed before a frame is
# truncated, and how many characters wide the printed output may be.

pd.set_option(
    'display.max_rows', 40,
    'display.max_columns', 30,
    'display.width', 100,
)

# Functions

def _remove_fragment(text, fragment):
    """Delete every literal occurrence of ``fragment`` (plus the one character after it, if any) from ``text``."""
    if not fragment:
        # An empty fragment would turn the pattern into ".?" and erase the whole text.
        return text
    # re.escape: the fragment is scraped text, so "." or "(" in it must not act as regex syntax.
    return re.sub(re.escape(fragment) + r".?", "", text)


def _parse_course_description(raw_text):
    """Split a course's text into (credits, availability, prerequisites, description).

    Returns None when ``raw_text`` contains no word character at all.
    """
    first_word_char = re.search(r"\w", raw_text)
    if first_word_char is None:
        return None

    # Everything before the first word character is dropped.
    full_text = raw_text[first_word_char.start():]
    description = full_text

    # Credits: the last ". <n> cr" style fragment in the text.
    credit_matches = re.findall(r"\. (\d\.?\d? ?cr)", full_text)
    credit_text = credit_matches[-1] if credit_matches else ""
    description = _remove_fragment(description, credit_text)

    # Availability: the last sentence that mentions "semester" or ends in "sem".
    # Both alternatives sit inside ONE capture group so findall always returns the
    # matched sentence; with the group around only one alternative, findall returned
    # "" for the other one and the removal step then wiped the entire description.
    availability_matches = re.findall(r"\. ([^.]*[sS]emester[^.]*|[^.]* [sS]em[ .])", full_text)
    availability = availability_matches[-1] if availability_matches else ""
    description = _remove_fragment(description, availability)

    # Prerequisites: the first "Prereq:" / "Prerequisites:" fragment.
    prereq_matches = re.findall(
        r"Prereq:[^.;]*.|(?:Undergraduate |Graduate )?Prerequisites:[^\n]*.", full_text)
    prereqs = prereq_matches[0] if prereq_matches else ""
    if prereqs:
        description = description.replace(prereqs, "")

    # Drop the trailing "Effective Fall/Spring ..." notice (up to the end of that line).
    description = re.sub(r"[eE]ffective\s*(?:Fall|Spring).*", "", description)
    return credit_text, availability, prereqs, description.strip()


def scrape_course_df(colleges=('cgs', 'cas', 'com', 'wheelock', 'khc', 'eng',
                               'cfa', 'cds', 'grs', 'questrom', 'sar', 'sha')):
    """Scrape the bu.edu course listings of the given colleges.

    Parameters
    ----------
    colleges : iterable of str
        College slugs used in ``https://www.bu.edu/academics/<slug>/courses/``.

    Returns
    -------
    (course_df, hub_df) : tuple of pandas.DataFrame
        ``course_df`` has one row per course id with the columns Course_id, College,
        Department, Number, Title, Credits, Availability, Prerequisites, Description.
        ``hub_df`` has one row per (course id, hub) pair with the columns Course_id,
        Hub (full name) and Hub_abbr (three-letter abbreviation, or the full name
        when no abbreviation is known).

    Side effect: when at least one course was scraped, pandas'
    ``display.max_colwidth`` is raised to the longest description so descriptions
    are shown in full.
    """
    course_data = []
    hub_data = []

    for college in colleges:
        listing_url = f"https://www.bu.edu/academics/{college}/courses/"
        soup = BeautifulSoup(requests.get(listing_url).content, 'lxml')

        # The last string inside the pagination bar is read as the page count.
        # NOTE(review): presumably a listing that fits on one page has no pagination
        # bar; fall back to a single page instead of crashing on None.
        pagination = soup.find('div', class_='pagination')
        num_pages = int(list(pagination.strings)[-1]) if pagination is not None else 1

        for page in range(1, num_pages + 1):
            page_url = f"https://www.bu.edu/academics/{college}/courses/{page}/"
            soup = BeautifulSoup(requests.get(page_url).content, 'lxml')

            course_feed = soup.find('ul', class_='course-feed')
            if course_feed is None:  # page without a course list: nothing to collect
                continue

            for course in course_feed.find_all('li', class_=''):
                course_txts = list(course.strings)
                course_title_txt = course_txts[1]

                # The title is sliced by fixed positions: 10-character id, two skipped
                # characters, then the name; the id itself is "CCC DD NNN".
                course_id = course_title_txt[:10]
                course_college = course_id[:3]
                course_dept = course_id[4:6]
                course_number = course_id[7:10]
                course_name = course_title_txt[12:]

                parsed = _parse_course_description(" ".join(course_txts[2:]))
                if parsed is None:
                    course_credits = course_availability = course_prereqs = course_desc = ""
                else:
                    course_credits, course_availability, course_prereqs, course_desc = parsed

                    # Hubs are only collected for courses that have some text.
                    course_hubs = course.find('ul', class_='cf-hub-offerings')
                    if course_hubs:
                        for course_hub in course_hubs.find_all('li'):
                            course_hub_text = course_hub.get_text()
                            if course_hub_text:
                                hub_data.append([course_id, course_hub_text])

                course_data.append([course_id, course_college, course_dept, course_number, course_name,
                                    course_credits, course_availability, course_prereqs, course_desc])

    # Creating DataFrames
    course_df = pd.DataFrame(course_data, columns=["Course_id", "College", "Department", "Number", "Title",
                                                   "Credits", "Availability", "Prerequisites", "Description"])
    hub_df = pd.DataFrame(hub_data, columns=["Course_id", "Hub"])

    # Dropping duplicates (the same course can be listed by more than one college page)
    course_df = course_df.drop_duplicates(subset=['Course_id']).reset_index(drop=True)
    hub_df = hub_df.drop_duplicates().reset_index(drop=True)

    # Abbreviating hub names
    hub_nickname_dict = {
        "PLM": "Philosophical Inquiry and Life's Meanings",
        "AEX": "Aesthetic Exploration",
        "HCO": "Historical Consciousness",
        "SI1": "Scientific Inquiry I",
        "SI2": "Scientific Inquiry II",
        "SO1": "Social Inquiry I",
        "SO2": "Social Inquiry II",
        "QR1": "Quantitative Reasoning I",
        "QR2": "Quantitative Reasoning II",
        "IIC": "The Individual in Community",
        "GCI": "Global Citizenship and Intercultural Literacy",
        "ETR": "Ethical Reasoning",
        "FYW": "First-Year Writing Seminar",
        "WRI": "Writing, Research, and Inquiry",
        "WIN": "Writing-Intensive Course",
        "OSC": "Oral and/or Signed Communication",
        "DME": "Digital/Multimedia Expression",
        "CRT": "Critical Thinking",
        "RIL": "Research and Information Literacy",
        "TWC": "Teamwork/Collaboration",
        "CRI": "Creativity/Innovation",
    }
    abbr_by_name = {name: abbr for abbr, name in hub_nickname_dict.items()}

    # Unknown hub names are kept as-is so no row ends up without a label.
    hub_df['Hub_abbr'] = hub_df['Hub'].map(lambda hub_name: abbr_by_name.get(hub_name, hub_name))

    # Making descriptions appear in their entirety when looking at a dataframe.
    # Skipped when nothing was scraped: max() of an empty list would raise.
    description_lengths = [len(text) for text in course_df["Description"]]
    if description_lengths:
        pd.set_option('display.max_colwidth', max(description_lengths))

    return (course_df, hub_df)
    
def courses_with_hubs(how_many, needed_hubs, display_all=False):
    """Return the courses that fulfil at least ``how_many`` of the hubs in ``needed_hubs``.

    Reads the module-level ``course_df`` and ``hub_df`` tables.

    Parameters
    ----------
    how_many : int (a float is truncated to int)
        Minimum number of needed hubs a course must carry; must be at least 1.
    needed_hubs : list of str (a single str is treated as a one-item list)
        Hubs to look for. Each entry may be an abbreviation (matched against
        ``hub_df["Hub_abbr"]``) or a full hub name (matched against ``hub_df["Hub"]``).
    display_all : bool
        If True, pandas' ``display.max_rows`` is set to the number of matching
        courses; otherwise it is reset to 40.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``course_df`` sorted by Course_id, with an extra
        "Hubs" column listing every hub abbreviation of the course.

    Raises
    ------
    TypeError
        If ``how_many`` is not a number.
    ValueError
        If ``how_many`` is below 1, exceeds the largest hub count of any course,
        or no course matches.
    """
    # bool is a subclass of int but is not a meaningful count, so it stays rejected.
    if isinstance(how_many, bool) or not isinstance(how_many, (int, float, np.integer, np.floating)):
        raise TypeError("first parameter should be an integer")
    how_many = int(how_many)

    # Range checks run for every accepted input, including values that arrived as floats.
    if how_many < 1:
        raise ValueError("first parameter should be a positive number")
    most_hubs_on_a_course = hub_df["Course_id"].value_counts().max() if len(hub_df) else 0
    if how_many > most_hubs_on_a_course:
        raise ValueError("There are no classes with " + str(how_many) + " or more hubs")

    # A bare string would otherwise be iterated character by character.
    if isinstance(needed_hubs, str):
        needed_hubs = [needed_hubs]
    wanted = list(needed_hubs)

    # A hub row counts when either its abbreviation or its full name was requested.
    is_needed = hub_df["Hub_abbr"].isin(wanted) | hub_df["Hub"].isin(wanted)
    needed_per_course = is_needed.groupby(hub_df["Course_id"]).sum()
    eligible_ids = needed_per_course[needed_per_course >= how_many].index

    selected_df = course_df.loc[course_df["Course_id"].isin(eligible_ids)].copy()
    if selected_df.empty:  # In case there are no eligible courses
        raise ValueError("No classes matching this criteria")

    # Display all rows?
    pd.set_option('display.max_rows', len(selected_df) if display_all else 40)

    # All abbreviations of a course, each followed by one space (existing output format).
    hubs_by_course = hub_df.groupby("Course_id")["Hub_abbr"].agg(
        lambda abbrs: "".join(abbr + " " for abbr in abbrs))
    selected_df["Hubs"] = selected_df["Course_id"].map(hubs_by_course)

    selected_df = selected_df.sort_values(by=['Course_id'])
    selected_df = selected_df.reset_index(drop=True)
    return selected_df

def display_reg_info_from_list(Course_ids):
    """Display the registration table of every course id in ``Course_ids`` that has one.

    Each id is looked up with ``create_reg_df``; ids whose lookup comes back as
    an empty frame are skipped silently.
    """
    # Lazy generator: each lookup happens right before its table is shown.
    reg_tables = (create_reg_df(one_id) for one_id in Course_ids)
    for reg_table in reg_tables:
        if reg_table.empty:
            continue
        display(reg_table)
            
def create_reg_df(Course_id, section='', key_sem='20244', view_sem='Spring+2024'):
    """Scrape the class-schedule table of one course from the BU registration site.

    Parameters
    ----------
    Course_id : str
        Course id in the format "College Department Number", e.g. "CAS RN 100".
    section : str
        Section to start the listing from; '' starts at the beginning.
    key_sem, view_sem : str
        Values sent as the ``KeySem`` / ``ViewSem`` query parameters. The defaults
        are the values that used to be hard-coded ("20244" / "Spring+2024").

    Returns
    -------
    pandas.DataFrame
        One row per listed section of this course, or an empty frame when the
        page does not contain the expected table.

    Raises
    ------
    ValueError
        If ``Course_id`` does not consist of exactly three whitespace-separated parts.
    """
    id_parts = Course_id.split()
    if len(id_parts) != 3:
        raise ValueError('course id should be in the format "College Department Number", got '
                         + repr(Course_id))
    College, Dept, Number = id_parts
    Course_id = " ".join(id_parts)  # normalise stray whitespace for the comparisons below

    # Written over several lines for readability; all whitespace is stripped afterwards.
    URL = f"""https://www.bu.edu/link/bin/uiscgi_studentlink/1698859245?ModuleName=univschr.pl
              &SearchOptionDesc=Class+Number
              &SearchOptionCd=S
              &KeySem={key_sem}
              &ViewSem={view_sem}
              &College={College}
              &Dept={Dept}
              &Course={Number}
              &Section={section}"""
    URL = re.sub(r"\s", "", URL)

    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'lxml')

    # The schedule is read from the second matching table; without it there is nothing to parse.
    tables = soup.find_all('table', border='', cellpadding='')
    if len(tables) < 2:  # If page does not exist (or has an unexpected layout)
        return pd.DataFrame([])

    rows = tables[1].find_all('tr')
    if not rows:
        return pd.DataFrame([])

    col_names = [ele.get_text() for ele in rows[0].find_all('th')]
    if len(col_names) < 2:  # the course filter below needs the second column
        return pd.DataFrame([])

    data = []
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('td')]
        # Rows without any cell (e.g. the header row) carry no data; rows wider than
        # the header cannot be placed into the frame.
        if cols and len(cols) <= len(col_names):
            data.append(cols)

    reg_df = pd.DataFrame(data, columns=col_names)
    if '' in reg_df.columns:
        reg_df = reg_df.drop([''], axis=1)

    # NOTE(review): the page's text inputs are presumably pre-filled with the course
    # and section at which the listing continues -- confirm against the live page.
    next_page_Course = " ".join([ele.get('value', '') for ele in soup.find_all('input', type='text')])
    next_page_Course_id = next_page_Course[:10]
    next_page_section = next_page_Course[-2:]
    # Follow the continuation only while it is still this course, and never re-request
    # the section just fetched (that would recurse forever on an identical page).
    if next_page_Course_id == Course_id and next_page_section != section:
        reg_df = pd.concat([reg_df, create_reg_df(next_page_Course_id, next_page_section,
                                                  key_sem=key_sem, view_sem=view_sem)])

    # Keep only the rows whose second column starts with this course's id (whitespace ignored).
    reg_df = reg_df[reg_df[col_names[1]].str[:9].replace(r"\s", "", regex=True) == Course_id.replace(" ", "")]
    reg_df = reg_df.reset_index(drop=True)
    # Insert " w/ " wherever a word character is directly followed by a capital letter.
    reg_df = reg_df.replace({'Title/Instructor': r'(\w)([A-Z])'}, {'Title/Instructor': r'\1 w/ \2'}, regex=True)
    return reg_df

# Creating Tables

# College slugs as used in the https://www.bu.edu/academics/<slug>/courses/ URLs
colleges = [
    'cgs', 'cas', 'com', 'wheelock', 'khc', 'eng',
    'cfa', 'cds', 'grs', 'questrom', 'sar', 'sha',
]

# Scrape once up front; the search functions read these two module-level tables.
course_df, hub_df = scrape_course_df(colleges=colleges)

# Write the Full Name or Abbreviation of each Hub You Need Fulfilled Below

Replace the ____ with your Hubs in the format seen in the samples

Format of your_needed_hubs: `["Hub 1 Name or Abbreviation", "Hub 2 Name or Abbreviation"]` 

# Write the Minimum Number of Needed Hubs Per Class Below

Replace the ____ with a number 1-4

#### Optional: `display_all` is set to True in the code box, which lists every matching class. Replace True with False if you only want a preview of up to 10 classes

# When ready, click on the code box and press "Shift" + "Enter" or the "▶ Run" button near the top of the screen.

#### Recommendation: Use abbreviations rather than full names

#### NOTE: If the left of your code box looks like this `In [*]`, your program is loading

# Hub Abbreviations to Hub Names

`Abbr: Name
"PLM": "Philosophical Inquiry and Life's Meanings",
"AEX": "Aesthetic Exploration",
"HCO": "Historical Consciousness",
"SI1": "Scientific Inquiry I",
"SI2": "Scientific Inquiry II",
"SO1": "Social Inquiry I",
"SO2": "Social Inquiry II",
"QR1": "Quantitative Reasoning I",
"QR2": "Quantitative Reasoning II",
"IIC": "The Individual in Community",
"GCI": "Global Citizenship and Intercultural Literacy",
"ETR": "Ethical Reasoning",
"FYW": "First-Year Writing Seminar",
"WRI": "Writing, Research, and Inquiry",
"WIN": "Writing-Intensive Course",
"OSC": "Oral and/or Signed Communication",
"DME": "Digital/Multimedia Expression",
"CRT": "Critical Thinking",
"RIL": "Research and Information Literacy",
"TWC": "Teamwork/Collaboration",
"CRI": "Creativity/Innovation"
`

In [None]:
# My applications

# Replace the _____ spaces. Then, press "Ctrl" + "Enter"

your_needed_hubs = [______] # <-- !YOUR INPUT HERE!

min_number_of_hubs = ______ # <-- !YOUR INPUT HERE!

display_all = True # <-- !OPTIONAL!: Change to False if you only want a preview of up to 10 classes to appear

# Sample needed_hubs lists
zachs_needed_hubs = ["PLM", "AEX", "IIC", "GCI", "WRI", "WIN", "OSC", "RIL", "CRI"] # What Hubs Zach needs
ellas_needed_hubs = ["Quantitative Reasoning I", "SO2", "Scientific Inquiry II", "WIN"] # What Hubs Ella needs
graces_needed_hubs = ["IIC", "Writing-Intensive Course"] # What Hubs Grace needs

# Classes with needed Hubs
needed_df = courses_with_hubs(min_number_of_hubs, your_needed_hubs, display_all)

# Registration tables of those classes, each tagged with its course id
reg_df_list = []
for course_id in needed_df["Course_id"]:
    one_course_reg_df = create_reg_df(course_id)
    if not one_course_reg_df.empty:
        one_course_reg_df.insert(0, "Course_id", course_id)
        reg_df_list.append(one_course_reg_df)

# pd.concat raises on an empty list, so fall back to an empty table when none of
# the matching classes has a registration listing.
if reg_df_list:
    reg_df = pd.concat(reg_df_list, axis=0, ignore_index=True)
else:
    reg_df = pd.DataFrame(columns=["Course_id"])

# Classes currently running with needed Hubs
running_needed_df = needed_df[needed_df["Course_id"].isin(reg_df["Course_id"].unique())].reset_index(drop=True)
display(running_needed_df)

# Registration Checker

Replace the _____ with a course id

Format of course_id: `"College Department Number"`


In [None]:
# Fill in the course to look up, written as "College Department Number"
# (the sample ids below show the expected shape).
course_id = _____

# Sample course_ids
sample_course_id1 = "CAS RN 100"
sample_course_id2 = "CDS DS 110"
sample_course_id3 = "CAS AN 102"

# Kept as the final bare expression of the cell so its result is what the
# notebook shows as the cell's output.
create_reg_df(course_id)