In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import os
import numpy as np

# Base URL of the CAPE results page.
CAPEURL = 'https://cape.ucsd.edu/responses/Results.aspx'
# Text the browser title must contain before the results page is parsed.
CAPETITLE = 'Course And Professor Evaluations (CAPE)'
# Two-digit year used to build the term codes whose enrollment is summed
# (f'FA{YEAR-1}', f'WI{YEAR}', f'SP{YEAR}', f'S1{YEAR}', f'S2{YEAR}', f'S3{YEAR}').
YEAR = 22

def get_raw_cape_dataframe(course:list):
    """Scrape the CAPE results table for one course and return it unprocessed.

    Launches Chrome with a fixed local user profile, loads the CAPE results
    page for the requested course, waits until the page title contains
    ``CAPETITLE`` and parses the first HTML table found in the page source.

    Parameters
    ----------
    course : list
        Two-element sequence ``[dept, number]``, e.g. ``['ece', '15']``.
        The two parts are joined with ``+`` in the ``Name`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the loaded page.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the title does not contain ``CAPETITLE`` within 60 seconds.
    """
    # Local import so this function does not depend on the notebook's import cell.
    from io import StringIO

    options = webdriver.ChromeOptions()
    # NOTE(review): machine-specific Chrome profile, presumably reused so the
    # browser is already signed in -- confirm, and consider making configurable.
    options.add_argument("user-data-dir=C:/Users/kalki/AppData/Local/Google/Chrome/User Data")
    options.add_argument("profile-directory=Profile 7")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f'{CAPEURL}?Name={course[0]}+{course[1]}&CourseNumber=')
        # Block (up to 60 s) until the results page itself has loaded.
        WebDriverWait(driver, 60).until(expected_conditions.title_contains(CAPETITLE))
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        return pd.read_html(StringIO(driver.page_source))[0]
    finally:
        # Always close the browser, even when the wait times out or parsing
        # fails, so failed scrapes do not leave Chrome processes behind.
        driver.quit()

# Courses to process, each written as '<dept> <number>' separated by a single
# space (the entry is split on ' ' to get the department and course number).
# Entries of the same department are listed consecutively.
courses = [
    'ece 15',
    'ece 25',
    'ece 35',
    'ece 45',
    'ece 65',
    'ece 101',
    'ece 109',
    'nano 11',
    'nano 15',
    'nano 107',
    'nano 108',
    'cse 11',
    'cse 12',
    'cse 15L',
    'cse 20',
    'cse 21',
    'cse 30',
    'dsc 10',
    'dsc 20',
    'dsc 30',
    'dsc 40a',
    'dsc 40b',
    'dsc 80'
]

In [3]:
# course -> [total enrollment over the selected year's terms, mean GPA over all rows]
vals = {}

# course -> 1-based position of the course within its department's
# consecutive run in `courses`
numseq = {}
curr = 0
prev = None  # previously processed course; None before the first iteration

# Term codes whose enrollment is summed: fall of the previous year, then
# winter, spring and the three summer sessions of YEAR.
year_terms = [
    f'FA{YEAR-1}', f'WI{YEAR}', f'SP{YEAR}',
    f'S1{YEAR}', f'S2{YEAR}', f'S3{YEAR}',
]

# Make sure the cache directory exists before anything is written to it.
os.makedirs('courses', exist_ok=True)

for course in courses:
    cache_path = f'courses/{course}.csv'
    if os.path.exists(cache_path):
        df = pd.read_csv(cache_path)
    else:
        parsed = course.split(' ')
        df = get_raw_cape_dataframe([parsed[0], parsed[1]])
        df = (
            df.assign(
                # Extract the number between parentheses in 'Avg Grade Received';
                # non-string cells (e.g. NaN) become None.
                GPA=df.get('Avg Grade Received')
                .apply(lambda s : float(s[s.find('(')+1:s.find(')')]) if isinstance(s, str) else None)
            )
            .assign(dept=parsed[0])
            .assign(num=parsed[1])
        )
        # Write the cache only after a fresh scrape and without the index:
        # re-saving a frame loaded from CSV with the default index would add
        # another 'Unnamed: 0' column to the cached file on every run.
        df.to_csv(cache_path, index=False)

    yearly_total = df.loc[df['Term'].isin(year_terms), 'Enroll'].sum()
    gpa_avg = df['GPA'].mean()
    vals[course] = [yearly_total, gpa_avg]

    # Restart the counter whenever the department changes.
    if prev is not None and prev.split(' ')[0] == course.split(' ')[0]:
        curr += 1
    else:
        curr = 1
    numseq[course] = curr
    prev = course

In [4]:
# Build the summary table: one row per entry of `courses`, in list order.
course_names = np.array(courses)
df = pd.DataFrame().assign(course=course_names)

# Derived columns are looked up from the per-course results collected earlier:
# vals[course] is [enrollment total, mean GPA]; numseq[course] is the
# within-department sequence number.
course_column = df.get('course')
summary_columns = {
    'gpa': course_column.apply(lambda name: vals[name][1]),
    'num': course_column.apply(lambda name: vals[name][0]),
    'dept': [name.split(' ')[0] for name in courses],
    'id': course_column.apply(lambda name: numseq[name]),
}
df = df.assign(**summary_columns)

# Persist the table, then leave it as the cell's displayed value.
df.to_csv('data.csv')
df

Unnamed: 0,course,gpa,num,dept,id
0,ece 15,2.8564,580,ece,1
1,ece 25,3.007391,205,ece,2
2,ece 35,2.491132,647,ece,3
3,ece 45,2.89,490,ece,4
4,ece 65,2.767857,431,ece,5
5,ece 101,2.7425,365,ece,6
6,ece 109,2.617895,542,ece,7
7,nano 11,3.344857,289,nano,1
8,nano 15,3.424737,198,nano,2
9,nano 107,3.13375,29,nano,3
