In [3]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import os
import pickle

total_rows = 0

if (not os.path.exists("../depts")):
    os.mkdir("../depts")

YEAR = 23
#sp23 not here yet
terms = [f'FA{YEAR-1}', f'WI{YEAR}', F'SP{YEAR-1}']
last_two = terms + [f'FA{YEAR-2}', f'WI{YEAR-1}', f'SP{YEAR-2}']

CAPEURL = 'https://cape.ucsd.edu/responses/Results.aspx'
CAPETITLE = 'Course And Professor Evaluations (CAPE)'

with open('all_depts.pick', 'rb') as f:
    all_depts = pickle.load(f)

# taken from BetterCapes
# https://github.com/andportnoy/smartercapes.com/blob/master/tools.py
def get_raw_cape_dataframe(dept:str):

    options = webdriver.ChromeOptions()
    # update this with your local path, turning on "start where i left off" helps for sso
    options.add_argument("user-data-dir=/home/linux/.config/google-chrome/")
    options.add_argument("profile-directory=Default")
    driver = webdriver.Chrome(options=options)

    driver.get(f'https://cape.ucsd.edu/responses/Results.aspx?Name=&CourseNumber={dept}')
    wait = WebDriverWait(driver, 60)
    element = wait.until(expected_conditions.title_contains(CAPETITLE))
    # read in the dataset from the html file
    df = pd.read_html(driver.page_source)[0]
    driver.quit()

    return df

def get_yearly_students(yearly, course):
        if course not in yearly.index:
            return 0
        return yearly.get('Enroll').loc[course]

def clean_df(df, dept):
    df = (
        df[
            ['Instructor', 'Course', 'Term', 
            'Avg Grade Received', 'Enroll']
        ]
        .assign(Course = df.get('Course').str.split(' - ').apply(lambda x: x[0]))
    )

    yearly = df[df.Term.isin(terms)].groupby('Course').sum()
    df = df[df.Term.isin(last_two)]

    df = df.dropna()

    df = (df
        .assign(
            GPA=(df.get('Avg Grade Received')
                .str.split('(')
                .apply(lambda x : x[-1])
                .str.rstrip(")")
                .astype('float'))
        )
    )
    df = df.assign(total_grade_points = df.get('GPA')*df.get('Enroll'))
    df = df.groupby('Course').sum().reset_index()
    df = (df
        .assign(yearly_num=df.get('Course').apply(lambda x : get_yearly_students(yearly, x)))
        .assign(GPA = df.get('total_grade_points')/df.get('Enroll'))
        .drop(columns=['total_grade_points'])
        .assign(dept=df.get('Course').str.split(' ').apply(lambda x: x[0]))
        .assign(num=df.get('Course').str.split(' ').apply(lambda x: x[1]))
    )

    df = df[df.get('dept') == dept]
    df = df[df.get('yearly_num')>0]

    def remove_str(s:str):
        if s[-1].isalpha():
            return remove_str(s[:-1])
        return s
    df = df.assign(num = df.get('num').apply(remove_str).astype("int"))
    df = df.sort_values(by=['num', 'Course']).set_index('Course').reset_index()
    df = df.assign(index=df.index).set_index('Course')
    return df

In [4]:

dfs = []
for dept in all_depts:
    if os.path.exists(f'../depts/{dept}.csv'):
        clean = pd.read_csv(f'../depts/{dept}.csv')
    else:
        raw = get_raw_cape_dataframe(dept)
        total_rows += raw.shape[0]
        clean = clean_df(raw, dept)
        clean.to_csv(f'../depts/{dept}.csv')
        
    dfs.append(clean)

data = pd.concat(dfs)
data.to_csv('data.csv')

  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].gr

In [5]:
data

Unnamed: 0_level_0,Enroll,GPA,yearly_num,dept,num,index
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAS 10,310,3.700258,135,AAS,10,0
AAS 11,65,3.670000,65,AAS,11,1
AAS 170,31,3.680000,31,AAS,170,2
AAS 190,83,4.000000,44,AAS,190,3
ANAR 100,20,3.300000,20,ANAR,100,0
...,...,...,...,...,...,...
VIS 183B,66,3.775303,22,VIS,183,63
VIS 185,21,3.410000,19,VIS,185,64
WCWP 10A,1717,3.677536,931,WCWP,10,0
WCWP 10B,1774,3.724340,832,WCWP,10,1


In [9]:
data.sort_values(by='GPA').head(20)

Unnamed: 0_level_0,Enroll,GPA,yearly_num,dept,num,index
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MATH 2,460,2.089783,283,MATH,2,0
ANBI 111,25,2.23,19,ANBI,111,1
MATH 110,79,2.300253,41,MATH,110,26
SIO 131,33,2.31,33,SIO,131,33
COGS 152,38,2.32,19,COGS,152,39
SE 101A,348,2.350374,199,SE,101,3
PHYS 163,28,2.4,13,PHYS,163,40
ANBI 140,38,2.41,38,ANBI,140,5
SOCI 168G,26,2.48,6,SOCI,168,51
POLI 146A,208,2.52,106,POLI,146,53


In [8]:
temp = data.assign(total_points=data.get('GPA')*data.get('Enroll'))
temp = temp.groupby('dept').sum().dropna()
temp = temp.assign(GPA=temp.get('total_points') / temp.get('Enroll')).drop(columns='total_points')
temp.sort_values('GPA').head(15)

  temp = temp.groupby('dept').sum().dropna()


Unnamed: 0_level_0,Enroll,GPA,num,index
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AWP,3077,2.822597,7,1
ECON,25954,2.917813,4129,630
CHEM,43248,2.998185,4367,1081
BIPN,9172,3.029871,2907,190
MATH,63310,3.044209,8402,2415
SE,5459,3.057789,4104,528
ECE,13632,3.058859,6285,1275
HUM,8682,3.109922,15,10
PHYS,43627,3.136468,2477,861
PHIL,8786,3.189942,4213,820


In [7]:
total_rows

62775