In [498]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import re

# Notebook-wide pandas display limits: rows shown, columns shown, and line width.
for _option_name, _option_value in (
    ('display.max_rows', 40),
    ('display.max_columns', 30),
    ('display.width', 100),
):
    pd.set_option(_option_name, _option_value)

In [521]:
# Data scraping from the course registration site
College, Dept, Number = 'CDS', 'DS', 320
# The query string is written one parameter per line for readability; all
# whitespace is stripped right after to obtain the real URL.
URL = f"""https://www.bu.edu/link/bin/uiscgi_studentlink/1698859245?ModuleName=univschr.pl
          &SearchOptionDesc=Class+Number
          &SearchOptionCd=S
          &KeySem=20244
          &ViewSem=Spring+2024
          &College={College}
          &Dept={Dept}
          &Course={Number}
          &Section="""
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
URL = re.sub(r"\s", "", URL)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
r = requests.get(URL, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'lxml')

col_names = []  # NOTE(review): never filled or read in this cell
data = []
table = soup.find_all('table', border='', cellpadding='')

# The rows are read from the second matching table; fail with a clear message
# if the page layout is not what this cell expects.
if len(table) < 2:
    raise ValueError("Expected at least two matching tables in the response, found " + str(len(table)))

rows = table[1].find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele]) # Get rid of empty values

pd.DataFrame(data, columns=["Course","Title/Instructor","Credits","Type","Open Seats","Bld","Room","Day","Start",
                            "Stop","Notes"])

Unnamed: 0,Course,Title/Instructor,Credits,Type,Open Seats,Bld,Room,Day,Start,Stop,Notes
0,,,,,,,,,,,
1,"Students registering for CDS DS320 must register for two sections: a Lec section, and a Dis section.","Students registering for CDS DS320 must register for two sections: a Lec section, and a Dis section.",,,,,,,,,
2,"Students registering for CDS DS320 must register for two sections: a Lec section, and a Dis section.",,,,,,,,,,
3,CDS DS320 A1,Algs Data SciGoldner,4.0,Lecture,2.0,EPC,209,"Tue,Thu",2:00pm,3:15pm,Permission Required
4,CDS DS320 A2,Algs Data SciGoldner,0.0,Discussion,0.0,FLR,121,Wed,3:35pm,4:25pm,Permission RequiredClass Full
5,CDS DS320 A3,Algs Data SciGoldner,0.0,Discussion,0.0,FLR,121,Wed,4:40pm,5:30pm,Permission RequiredClass Full
6,"Students registering for CDS DS340 must register for two sections: a Lec section, and a Dis section.","Students registering for CDS DS340 must register for two sections: a Lec section, and a Dis section.",,,,,,,,,
7,"Students registering for CDS DS340 must register for two sections: a Lec section, and a Dis section.",,,,,,,,,,
8,CDS DS340 A1,Intro Ml & AiGold,4.0,Lecture,31.0,EPC,207,"Tue,Thu",2:00pm,3:15pm,Permission Required
9,CDS DS340 A2,Intro Ml & AiGold,0.0,Discussion,6.0,CDS,164,Thu,9:30am,10:45am,Permission Required


In [480]:
# Functions

def scrape_course_website(course_data, hub_data, url):
    """Scrape one course-listing page and append its contents to the given lists.

    Every ``div.cf-course-card`` on the page contributes one row to
    ``course_data`` and one row per listed hub to ``hub_data``.

    Parameters
    ----------
    course_data : list
        Extended in place with one 9-item list of strings per course card, in
        this order: course id, college, department, number, title, credits,
        availability, prerequisites, description.
    hub_data : list
        Extended in place with one ``[course id, hub name]`` pair for every
        ``<li>`` in the card's ``ul.cf-hub-offerings`` list.
    url : str
        Address of the page to download.

    Returns
    -------
    None
        Results are delivered through the two list arguments.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Bound the wait and refuse to parse an HTTP error page as course data.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    for course in soup.find_all('div', class_='cf-course-card'):

        # course_data
        course_id = course.find('span', class_='cf-course-id')
        # The id span is expected to wrap exactly three child spans.
        course_college, course_dept, course_number = course_id.find_all('span')
        course_name = course.find('h3', class_='bu_collapsible')
        course_details = course.find('p', class_='meta cf-course-info')
        course_credits, course_availability, course_prereqs = course_details.find_all('span')
        course_desc = course.find('p', class_='cf-course-description')
        course_info = [course_id, course_college, course_dept, course_number, course_name, course_credits,
                       course_availability, course_prereqs, course_desc]
        course_data.append([ele.get_text() for ele in course_info])

        # hub_data
        course_hubs = course.find('ul', class_='cf-hub-offerings')
        if course_hubs is None:
            # A card without a hub list simply contributes no hub rows.
            continue
        course_id_text = course_id.get_text()
        for course_hub in course_hubs.find_all('li'):
            hub_data.append([course_id_text, course_hub.get_text()])
            
def courses_with_hubs(how_many, needed_hubs, display_all=False):
    """Return the courses that cover at least ``how_many`` of ``needed_hubs``.

    Reads the module-level ``course_df`` (one row per course), ``hub_df``
    (one row per course/hub pair, with a ``Hub_abbr`` column) and
    ``hub_nickname_dict`` (full hub name -> abbreviation at call time).

    Parameters
    ----------
    how_many : int or float
        Minimum number of requested hubs a course must carry. Floats are
        truncated to an integer before the range checks are applied.
    needed_hubs : iterable of str
        Hub abbreviations and/or full hub names. Full names are translated by
        exact match; the caller's object is left untouched.
    display_all : bool, default False
        If True, pandas' ``display.max_rows`` is raised to the number of
        matching courses so the result is shown untruncated; otherwise it is
        reset to 40.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``course_df`` plus a ``Hubs`` column holding the
        space-separated abbreviations of every hub of that course, sorted by
        ``Course_id`` with a fresh index.

    Raises
    ------
    TypeError
        If ``how_many`` is neither an int nor a float.
    ValueError
        If ``how_many`` is below 1, exceeds the largest number of hubs any
        course has, or no course matches.
    """
    # Validate the threshold. The range checks run for floats too.
    if isinstance(how_many, bool) or not isinstance(how_many, (int, float)):
        raise TypeError("first parameter should be an integer")
    how_many = int(how_many)
    if how_many < 1:
        raise ValueError("first parameter should be a positive number")

    # Exact-match translation of full names to abbreviations. Substring
    # replacement would turn "Scientific Inquiry II" into "SI1I", because
    # "Scientific Inquiry I" is a prefix of it. Building a new set also keeps
    # the caller's list unmodified.
    wanted_hubs = {hub_nickname_dict.get(hub, hub) for hub in needed_hubs}

    # Group the hub abbreviations by course once instead of re-filtering
    # hub_df for every course.
    hubs_by_course = {}
    for course_id, hub_abbr in zip(hub_df["Course_id"], hub_df["Hub_abbr"]):
        hubs_by_course.setdefault(course_id, []).append(hub_abbr)

    most_hubs = max(map(len, hubs_by_course.values()), default=0)
    if how_many > most_hubs:
        raise ValueError("There are no classes with " + str(how_many) + " or more hubs")

    # One flag per course_df row, plus the hub label for each kept row, both in
    # course_df order so they line up positionally below.
    keep_row = []
    hub_labels = []
    for course_id in course_df["Course_id"]:
        course_hubs = hubs_by_course.get(course_id, [])
        matches = sum(1 for hub_abbr in course_hubs if hub_abbr in wanted_hubs)
        is_eligible = matches >= how_many
        keep_row.append(is_eligible)
        if is_eligible:
            hub_labels.append(" ".join(course_hubs))

    # Check for an empty result before touching the display options, so a
    # failed query never sets display.max_rows to 0.
    if not hub_labels:
        raise ValueError("No classes matching this criteria")

    pd.set_option('display.max_rows', len(hub_labels) if display_all else 40)

    selected_df = course_df.loc[keep_row].copy()
    selected_df["Hubs"] = hub_labels
    selected_df = selected_df.sort_values(by=['Course_id'])
    return selected_df.reset_index(drop=True)

# TODO: implement search_by_course_id(Course_id)

In [458]:
# Creating tables

# Accumulators that scrape_course_website extends in place, one call per page.
course_data, hub_data = [], []
URLs = [
    'https://www.bu.edu/hub/hub-courses/philosophical-aesthetic-and-historical-interpretation/',
    'https://www.bu.edu/hub/hub-courses/scientific-and-social-inquiry/',
    'https://www.bu.edu/hub/hub-courses/quantitative-reasoning/',
    'https://www.bu.edu/hub/hub-courses/diversity-civic-engagement-and-global-citizenship/',
    'https://www.bu.edu/hub/hub-courses/communication/',
    'https://www.bu.edu/hub/hub-courses/intellectual-toolkit/',
]

for url in URLs:
    scrape_course_website(course_data, hub_data, url)

# Let pandas show the longest description (item 8 of every row) without truncation.
max_description_len = max(len(description) for description in np.array(course_data)[:, 8])
pd.set_option('display.max_colwidth', max_description_len)

# The same course can be scraped from several pages, so exact duplicate rows are dropped.
course_df = pd.DataFrame(
    course_data,
    columns=["Course_id", "College", "Department", "Number", "Title", "Credits",
             "Availability", "Prerequisites", "Description"],
).drop_duplicates(ignore_index=True)
hub_df = pd.DataFrame(hub_data, columns=["Course_id", "Hub"]).drop_duplicates(ignore_index=True)

# Written as abbreviation -> full name for readability ...
hub_nickname_dict = {
    "PLM": "Philosophical Inquiry and Life's Meanings",
    "AEX": "Aesthetic Exploration",
    "HCO": "Historical Consciousness",
    "SI1": "Scientific Inquiry I",
    "SI2": "Scientific Inquiry II",
    "SO1": "Social Inquiry I",
    "SO2": "Social Inquiry II",
    "QR1": "Quantitative Reasoning I",
    "QR2": "Quantitative Reasoning II",
    "IIC": "The Individual in Community",
    "GCI": "Global Citizenship and Intercultural Literacy",
    "ETR": "Ethical Reasoning",
    "FYW": "First-Year Writing Seminar",
    "WRI": "Writing, Research, and Inquiry",
    "WIN": "Writing-Intensive Course",
    "OSC": "Oral and/or Signed Communication",
    "DME": "Digital/Multimedia Expression",
    "CRT": "Critical Thinking",
    "RIL": "Research and Information Literacy",
    "TWC": "Teamwork/Collaboration",
    "CRI": "Creativity/Innovation",
}

# ... then flipped in place to full name -> abbreviation, which is the
# direction every lookup after this point uses.
hub_nickname_dict = dict(zip(hub_nickname_dict.values(), hub_nickname_dict.keys()))

# Exact-value replacement on the Hub column; names missing from the dict stay as they are.
hub_df['Hub_abbr'] = hub_df.replace({'Hub': hub_nickname_dict})['Hub']

In [481]:
# My applications

# Hubs (by abbreviation) that I still need in order to graduate
zachs_needed_hubs_abbr = [
    "PLM", "AEX", "IIC",
    "GCI", "WRI", "WIN",
    "OSC", "RIL", "CRI",
]

# Courses carrying at least three of those hubs, shown without row truncation
courses_with_hubs(3, zachs_needed_hubs_abbr, display_all=True)

Unnamed: 0,Course_id,College,Department,Number,Title,Credits,Availability,Prerequisites,Description,Hubs
0,CAS AA 210,CAS,AA,210,American Minstrelsy,4 credits.,2nd sem.,"First Year Writing Seminar (e.g., WR 100 or WR 120)","An American entertainment historically rooted in commodified performance of ""blackness"", this course engages with the complicated history of minstrelsy as both a racist and progressive art form. Course material surveys the minstrel tradition and its influence on popular entertainment. Effective Spring 2022, this course fulfills a single unit in each of the following BU Hub areas: Writing-Intensive Course, Aesthetic Exploration, Research and Information Literacy.",AEX RIL WIN
1,CAS AA 404,CAS,AA,404,Seminar on Sociology of Families,4 credits.,Either sem.,junior or senior standing and at least two previous Sociology courses; or consent of instructor. First-Year Writing Seminar (WR 120 or equivalent),"Explores the rise of ""modern"" families and the plurality of contemporary family forms and processes in global contexts. Particular attention to intersections of race, class, and gender inequalities and their implications for family life. Effective Fall 2022, this course fulfills a single unit in each of the following BU Hub areas: Writing-Intensive Course, Global Citizenship and Intercultural Literacy, Research and Information Literacy.",GCI RIL WIN
2,CAS AA 411,CAS,AA,411,"Race, Memory, and Diaspora in US Popular Music",4 credits.,Either sem.,,"Examines selected popular and vernacular musical cultures in the U.S. within a broad historical, political, and economic context; how global musical practices brought by people to the U.S. have been shaped by the unique space of the nation; and how these styles are the product of interracial and intercultural dialogues, struggles, and negotiation processes that continue to produce new hybrid forms. Will develop ability to hear and appreciate entanglements that immerse music-making within competing interests and sensibilities, using key concepts on race, ethnicity, class, gender, and sexuality. Effective Spring 2022, this course fulfills a single unit in each of the following BU Hub areas: Global Citizenship and Intercultural Literacy, Aesthetic Exploration, Research and Information Literacy.",AEX GCI RIL
3,CAS AA 519,CAS,AA,519,Inequality and American Politics,4 credits.,Either sem.,"First Year Writing Seminar (e.g., WR 100 or WR 120)","This course examines the role of income inequality in shaping American politics and policy. Combining research from history, political science, economics, and public policy scholars, we will consider a range of important topics, including inequality in public voice, money and politics, and attitudes towards redistribution. We will apply this knowledge as part of a final paper project in metropolitan Boston. Effective Spring 2020, this course fulfills a single unit in each of the following BU Hub areas: The Individual in Community, Writing- Intensive Course, Research and Information Literacy.",IIC RIL WIN
4,CAS AH 220,CAS,AH,220,Islamic Art and Architecture,4 credits.,Either sem.,,"Examines key monuments of Islamic art and architecture within their historical and cultural context, and emphasizes the diversity within the visual cultures of the Islamic world. Carries humanities divisional credit in CAS. Effective Fall 2018, this course fulfills a single unit in each of the following BU Hub areas: Aesthetic Exploration, Global Citizenship and Intercultural Literacy, Research and Information Literacy.",AEX GCI RIL
5,CAS AH 313,CAS,AH,313,Imperial Reflections: Early Modern Islamic Art and Architecture,4 credits.,,,"Architecture, manuscripts, textiles, metalwork, and ceramics of the Mughal, Ottoman, and Safavid Empires. Focus on the formation of imperial styles, intersections between art and politics, and the importance of the arts in dynastic legitimization. Effective Fall 2019, this course fulfills a single unit in each of the following BU Hub areas: Aesthetic Exploration, Global Citizenship and Intercultural Literacy, Research and Information Literacy.",AEX GCI RIL
6,CAS AH 327,CAS,AH,327,Arts of China,4 credits.,Either sem.,,"Explores major works of Chinese art, from bronze vessels, Buddhist caves, ink painting, to contemporary performance. Addresses topics such as constructions of monumentality, cultural exchange, displays of power, literati identity, feminine space, and quests for modernization. Effective Fall 2020, this course fulfills a single unit in each of the following BU Hub areas: Global Citizenship and Intercultural Literacy, Aesthetic Exploration, Research and Information Literacy.",AEX GCI RIL
7,CAS AH 364,CAS,AH,364,"Art and Architecture in Madrid, 1561 - Today",4 credits.,,enrollment in the Madrid Spanish & European Studies Program.,"An introduction to Spanish art history (1561-Present) that examines the most representative works and movements. Students explore the visual representation (painting, sculpture, architecture and street art) and the associated social values (power, gender, ethnicity, and ""otherness"") in both historical and current global contexts. Effective Fall 2020, this course fulfills a single unit in each of the following BU Hub areas: Global Citizenship and Intercultural Literacy, Aesthetic Exploration, Creativity/Innovation.",AEX GCI CRI
8,CAS AH 399,CAS,AH,399,History and Theory of Landscape Architecture,4 credits.,,"First Year Writing Seminar (e.g., WR 100 or WR 120).","Explores man's relationship with nature by a study of selected built environments from antiquity to the present. Focus on both the private garden and the public park--here considered as works of art--and their changing forms, meaning, and interpretations. Effective Spring 2021, this course fulfills a single unit in each of the following BU Hub areas: Writing- Intensive Course, Global Citizenship and Intercultural Literacy, Aesthetic Exploration.",AEX GCI WIN
9,CAS AH 495,CAS,AH,495,Seminar: Twentieth Century Art,4 credits.,Either sem.,"CAS AH 111 and CAS AH 112; and two courses at the 200 level or higher, or consent of the instructor.","Examines major artists and artistic currents of the twentieth century. Topics vary each year. Some background in the history of modern art is recommended. Topic for Fall 2020: Picasso. Explores more than eight decades of incessant art making by Pablo Picasso. How his friends, his lovers, and his preoccupation with eroticism and death affected his imagery. Students master fundamental currents of European Modernism. Effective Fall 2020, this course fulfills a single unit in each of the following BU Hub areas: Oral and/or Signed Communication, Aesthetic Exploration, Research and Information Literacy.",AEX OSC RIL
