## Dataset 1: UCSD CAPE Reviews
- Contains the student evaluations for each class at UCSD, including percentage of students who recommend the instructor and the class, the average number of weeks spent on the class, and the average grade expected and received for the class
- To run the code below, you will need the following:
    - First log in to your UCSD account so that the cookies for the cape.ucsd.edu website contain the log-in information needed to access the CAPE reviews
    - Once you've logged in to your UCSD account, go to the cape.ucsd.edu page and download the cookies in JSON format. One way to do this is to use the Chrome extension "Export cookie JSON file for Puppeteer" (https://chromewebstore.google.com/detail/export-cookie-json-file-f/nmckokihipjgplolmcmjakknndddifde?pli=1)

Credit to u/MaxtheBat on Reddit for the code below. He posted the code on scraping UCSD CAPE data in the Reddit post below:

https://www.reddit.com/r/UCSD/comments/14uh5q5/since_capes_is_being_retired_i_scraped_all_its/

In [2]:
import requests
import json
from bs4 import BeautifulSoup

In [10]:
# Load in cookies
# The context manager closes the file handle as soon as the JSON is read
# (the bare open() call left it open until garbage collection).
with open('cookies/cape.ucsd.edu.cookies.json', 'r') as cookie_file:
    cookies_raw = json.load(cookie_file)
# Reduce each exported cookie record to a plain name -> value mapping
cookies = {cookie['name']: cookie['value'] for cookie in cookies_raw}

In [11]:
# HTTP headers sent along with the CAPE request
headers = {
    # Browser-like user agent string
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/98.0.4758.82 Safari/537.36'
    ),
    'Accept-Encoding': '*',
    'Connection': 'keep-alive',
}

In [12]:
# CAPE results page; the query string sets Name to an encoded comma (%2C)
# and leaves CourseNumber empty
url = 'https://cape.ucsd.edu/responses/Results.aspx?Name=%2C&CourseNumber='

In [13]:
# Initiate get request to CAPEs (with all entries)
# The cookies and headers prepared in the earlier cells are sent with the request
response = requests.get(url, cookies=cookies, headers=headers)

In [15]:
# Parse request and scrape table
soup = BeautifulSoup(response.text, 'html.parser')
# find() returns the first <table> element in the document
table = soup.find('table')
table_body = table.find('tbody')
# All <tr> elements found inside the table body
rows = table_body.find_all('tr')

In [16]:
# Parse each row for data and put in list
data = []
for row in rows:
    # Build the absolute evaluation link from the row's first anchor.
    # A dedicated name is used so the search-page `url` defined in the earlier
    # cell is not overwritten on every iteration, and only leading '.' / '/'
    # characters are removed (str.strip takes a set of characters and would
    # also trim them from the END of the href).
    evaluation_url = 'https://cape.ucsd.edu/' + row.find('a')['href'].lstrip('./')
    # Cell text with commas removed, so values can later be comma-joined
    cols = [ele.text.strip().replace(',', '') for ele in row.find_all('td')]
    cols.append(evaluation_url)
    # Empty cells are dropped from the stored row
    # NOTE(review): dropping empties shifts later values left if a middle cell
    # is blank — confirm the table never has blank data cells
    data.append([ele for ele in cols if ele])

In [37]:
# Dump the scraped rows to data/capes_data.csv
with open('data/capes_data.csv', 'w', encoding='utf-8') as file:
    # Header line comes first
    # NOTE(review): 'Evalulation' is misspelled in the header; left as-is here
    # because readers of the CSV may key on the existing column name
    file.write('Instructor,Course,Quarter,Total Enrolled in Course,Total CAPEs Given,Percentage Recommended Class,Percentage Recommended Professor,Study Hours per Week,Average Grade Expected,Average Grade Received,Evalulation URL\n')

    # Then one comma-joined, newline-terminated line per course
    file.writelines(','.join(course) + '\n' for course in data)

## Dataset 2: UCSD Course Catalog

In [32]:
# Catalog page whose links are scanned for the department course pages
catalog_url = 'https://catalog.ucsd.edu/front/courses.html'

In [33]:
# Fetch the content from the URL
response = requests.get(catalog_url)
# Raw response body (bytes), handed to the HTML parser
content = response.content

In [34]:
# Parse the content with BeautifulSoup (using the 'html.parser' backend)
soup = BeautifulSoup(content, 'html.parser')

In [35]:
# Find all links in the page
# href=True keeps only <a> tags that actually carry an href attribute
links = soup.find_all('a', href=True)

In [40]:
links

[<a class="sr-only skip-to-main" href="#main-content">Skip to main content</a>,
 <a class="title-header title-header-large" href="/">
             General Catalog
     </a>,
 <a class="title-header title-header-short" href="/">
             General Catalog
     </a>,
 <a class="title-logo" href="http://www.ucsd.edu">UC San Diego</a>,
 <a href="https://catalog.ucsd.edu/front/courses.html">Courses/Curricula/Faculty</a>,
 <a href="../about/index.html">About <span class="caret"></span> </a>,
 <a href="../about/about-uc-san-diego/index.html">About UC San Diego</a>,
 <a href="https://catalog.ucsd.edu/academic-integrity.html">Academic Integrity</a>,
 <a href="../about/policies/index.html">Regulations &amp; Policies</a>,
 <a href="../about/calendars/index.html">Calendars</a>,
 <a href="../about/additional-resources/index.html">Additional Resources</a>,
 <a href="../undergraduate/index.html">Undergraduate <span class="caret"></span> </a>,
 <a href="../undergraduate/overview/index.html">Undergra

In [50]:
# Get all links for the department pages that contain course information
base_url = "../courses/"
department_links = [link['href'] for link in links if link['href'].startswith(base_url)]
# Every kept href starts with '../courses/', so cut off exactly the leading
# '..' to get a site-absolute path. (str.strip('..') removes any run of '.'
# characters from BOTH ends, which only worked by accident.)
department_links = ["https://catalog.ucsd.edu" + href[len('..'):] for href in department_links]

In [197]:
import pandas as pd
import re

In [216]:
# Unit values for course headings that list no units, keyed by course code
missing_units = {
    'COMM 101A': '4',
    'HIEU 124': '4',
    'HILA 119': '4',
    'JAPN 180': '4',
    'JAPN 190': '4',
    'LIGN 9GS': '4',
    'USP 131': '4',
    'USP 141A': '6',
    'USP 141B': '6',
}

In [419]:
# Rows of [code, department, title, units, description, prerequisites]
course_info = []

for dept in department_links:
    print(dept)

    # Download and parse the department page (once per department; the
    # duplicated fetch/parse fragment that re-parsed every page was removed)
    response = requests.get(dept)
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')

    # Course headings are the <p class="course-name"> elements
    course_name_elements = soup.find_all('p', class_='course-name')

    # Extract the course code, department, title, units, description and prerequisites
    for tag in course_name_elements:
        # The description is read from the first <p> sibling after the heading
        course_description_element = tag.find_next_siblings('p', limit=1)[0]

        full_course_name = tag.get_text().strip()
        full_course_description = course_description_element.get_text().strip()

        # Special Case: headings starting with "Linguistics" are formatted
        # differently; the code is the text after the first '(' up to the next
        # period, with the ')' removed
        if full_course_name.startswith("Linguistics"):
            course_code = full_course_name.split('(')[1].split('.')[0].replace(')', '')
        elif '.' in full_course_name:
            # Usual layout: everything before the first period is the code
            course_code = full_course_name.split('.')[0]
        else:
            # No period in the heading: the first two words form the code
            course_code = ' '.join(full_course_name.split()[:2])

        course_dept = course_code.split()[0]

        # Special Case: Some courses don't have units listed
        if "(" in full_course_name:
            if "." in full_course_name:
                # Text before the last '(' , then the segment between the first
                # and second period.
                # NOTE(review): a title that itself contains a period would be
                # cut at that period — confirm against the scraped data
                course_title = full_course_name.rsplit('(', 1)[0].split('.')[1].strip()
            else:
                course_title = ' '.join(full_course_name.split(' ')[2:]).strip()
            # Units are the contents of the last parenthesised group
            course_units = re.findall(r'\((.*?)\)', full_course_name)[-1]
        else:
            course_title = ' '.join(full_course_name.split(' ')[2:]).strip()
            # Raises KeyError for a course that is not in the lookup table
            course_units = missing_units[course_code]

        # Split once on the 'Prerequisites:' marker. Checking the number of
        # parts (instead of testing for the word without the colon) avoids an
        # IndexError when "Prerequisites" appears without a trailing colon.
        description_parts = full_course_description.split('Prerequisites:')
        course_description = description_parts[0].strip()
        if len(description_parts) > 1:
            course_prerequisites = description_parts[1].strip()
        else:
            course_prerequisites = "none"

        # if LISP courses are formatted as, for example, LISP 5A, 5B, 5C, then make separate courses for each of them
        if 'LISP' in course_code and len(course_code.split()) != 2:
            for num in course_code.split()[1:]:
                num = num.strip(',')
                course_code = course_dept + " " + num
                course_info.append([course_code, course_dept, course_title, course_units, course_description, course_prerequisites])
        else:
            # TODO: course codes containing '-' or '/' are stored as-is and are
            # not yet split into separate courses
            course_info.append([course_code, course_dept, course_title, course_units, course_description, course_prerequisites])

https://catalog.ucsd.edu/courses/AIP.html
https://catalog.ucsd.edu/courses/AASM.html
https://catalog.ucsd.edu/courses/AWP.html
https://catalog.ucsd.edu/courses/ANTH.html
https://catalog.ucsd.edu/courses/AAPI.html
https://catalog.ucsd.edu/courses/AUDL.html
https://catalog.ucsd.edu/courses/BIOI.html
https://catalog.ucsd.edu/courses/BIOL.html
https://catalog.ucsd.edu/courses/BIOM.html
https://catalog.ucsd.edu/courses/CHEM.html
https://catalog.ucsd.edu/courses/CLS.html
https://catalog.ucsd.edu/courses/CHIN.html
https://catalog.ucsd.edu/courses/CLAS.html
https://catalog.ucsd.edu/courses/CCS.html
https://catalog.ucsd.edu/courses/CSP.html
https://catalog.ucsd.edu/courses/CLIN.html
https://catalog.ucsd.edu/courses/CLRE.html
https://catalog.ucsd.edu/courses/COGS.html
https://catalog.ucsd.edu/courses/COMM.html
https://catalog.ucsd.edu/courses/css.html
https://catalog.ucsd.edu/courses/CGS.html
https://catalog.ucsd.edu/courses/CAT.html
https://catalog.ucsd.edu/courses/DSC.html
https://catalog.ucsd

In [420]:
# One row per entry of course_info; the column names follow the order in which
# the values were appended to each entry
course_info_df = pd.DataFrame(course_info, columns=['Code', 'Department', 'Title', 'Units', 'Description', 'Prerequisites'])
# Bare expression so the notebook renders the frame
course_info_df

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...
...,...,...,...,...,...,...
7029,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...
7030,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.
7031,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.
7032,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none


In [421]:
def check_code(code):
    """Print ``code`` when it contains a '/' or a '-'; otherwise do nothing."""
    has_separator = any(separator in code for separator in ('/', '-'))
    if not has_separator:
        return
    print(code)

In [523]:
# Save the catalog table; with pandas' default settings the row index is
# written as an unnamed first column
course_info_df.to_csv('data/course_catalog.csv')

## Dataset 3: UCSD Schedule of Classes
- Scraped for Winter 2024 because Spring 2024 is not available yet

In [466]:
def get_subject_links(main_page_url):
    """Collect the subject course-list links found on a subject index page.

    Args:
        main_page_url: URL of the index page to download.

    Returns:
        A list of absolute URLs, one for every anchor inside the element with
        id ``subject_Panel`` whose href starts with ``courseList.aspx?name=``
        and does not end with ``dept=true``. The list is empty when the page
        has no such panel.
    """
    # Function-scope import keeps this cell self-contained
    from urllib.parse import urljoin

    response = requests.get(main_page_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the "Subjects" section
    # This depends on the structure of the main page
    # Adjust the selector as per the actual HTML structure
    subject_section = soup.find('div', {'id': 'subject_Panel'})
    if subject_section is None:
        return []

    subject_links = []
    for link in subject_section.find_all('a'):
        href = link.get('href')
        if href and href.startswith('courseList.aspx?name=') and not href.endswith('dept=true'):
            # Resolve the relative href against the page URL. Plain string
            # concatenation appended it to the page's own query string and
            # produced a malformed URL.
            subject_links.append(urljoin(main_page_url, href))

    return subject_links

In [513]:
def get_courses_from_subject(subject_url):
    """Return the distinct course codes listed on a subject's course-list page.

    Args:
        subject_url: URL of the subject page to download.

    Returns:
        A list of strings made of the first two whitespace-separated words of
        each link text (e.g. ``'AAS 10'``), taken from links whose href
        contains ``coursemain`` inside the ``<ul>`` that follows the
        "Select your Course:" heading. Duplicates are removed and the order of
        first appearance on the page is kept. The list is empty when the
        heading or the list is missing.
    """
    response = requests.get(subject_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the section under "Select your Course:" header.
    # Guard the heading lookup itself: calling find_next_sibling on a missing
    # heading raised AttributeError before the emptiness check could run.
    # (`string=` is the current name of the deprecated `text=` argument.)
    heading = soup.find('h3', string='Select your Course:')
    course_section = heading.find_next_sibling('ul') if heading is not None else None
    if course_section is None:
        return []

    course_list = []
    for link in course_section.find_all('a'):
        href = link.get('href')
        # Anchors without an href are skipped instead of raising TypeError
        if href and 'coursemain' in href:
            course_list.append(' '.join(link.text.split()[:2]))

    # De-duplicate while preserving page order, so repeated runs give the same
    # result (a set-based de-duplication returns an arbitrary order)
    return list(dict.fromkeys(course_list))

In [514]:
# Starting letters of the subject index pages to visit
# (K, Q, X, Y and Z are not in the list)
starting_letters = list('ABCDEFGHIJLMNOPRSTUVW')
all_courses = []

# Walk the index page of every letter
for letter in starting_letters:
    main_page_url = f"https://courses.ucsd.edu/?u_letter={letter}"

    # Progress indicator
    print(letter)

    # Subjects listed under this letter
    subject_links = get_subject_links(main_page_url)

    for subject_link in subject_links:
        # The subject name is whatever follows the last '=' in the link
        subject_name = subject_link.rsplit('=', 1)[-1]

        # Collect every course offered in that subject
        courses = get_courses_from_subject(f'https://courses.ucsd.edu/courseList.aspx?name={subject_name}')
        all_courses.extend(courses)

A
B
C
D
E
F
G
H
I
J
L
M
N
O
P
R
S
T
U
V
W


In [520]:
len(all_courses)

2297

In [521]:
all_courses

['AAS 10',
 'AAS 11',
 'AESE 279A',
 'AESE 278C',
 'AESE 241',
 'AESE 278B',
 'AIP 197P',
 'AIP 197',
 'AIP 197DC',
 'ANAR 135',
 'ANAR 121',
 'ANAR 117',
 'ANAR 100',
 'ANAR 156',
 'ANAR 167',
 'ANBI 100',
 'ANBI 134',
 'ANBI 112',
 'ANBI 136',
 'ANBI 159',
 'ANBI 120',
 'ANBI 116',
 'ANBI 143',
 'ANES 299',
 'ANES 402',
 'ANES 403',
 'ANES 401',
 'ANES 296',
 'ANES 410',
 'ANES 496',
 'ANSC 135',
 'ANSC 164',
 'ANSC 173',
 'ANSC 139',
 'ANSC 148',
 'ANSC 184',
 'ANSC 188',
 'ANSC 100',
 'ANSC 150',
 'ANSC 125',
 'ANSC 122',
 'ANTH 246',
 'ANTH 260',
 'ANTH 23',
 'ANTH 273',
 'ANTH 281B',
 'ANTH 202',
 'ANTH 269',
 'ANTH 200',
 'ANTH 5',
 'ANTH 500',
 'ANTH 296',
 'ANTH 196B',
 'ANTH 280C',
 'ANTH 45',
 'ANTH 110',
 'ANTH 280B',
 'ANTH 298',
 'ANTH 230',
 'ANTH 106',
 'ANTH 128A',
 'ANTH 299',
 'ANTH 279',
 'ANTH 295',
 'ANTH 107',
 'ANTH 21',
 'ANTH 87',
 'ANTH 102',
 'ANTH 111',
 'AUD 291',
 'AUD 284',
 'AUD 236',
 'AUD 276',
 'AUD 263',
 'AUD 278',
 'AUD 296',
 'AUD 298',
 'AUD 299

In [526]:
# Exporting Winter 2024 courses to a csv file
# Single 'Course' column; with pandas' default settings the row index is also written
pd.DataFrame(all_courses, columns=['Course']).to_csv('data/winter2024.csv')