In [17]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import os
import pickle

YEAR = 22
terms = [f'FA{YEAR-1}', f'WI{YEAR}', F'SP{YEAR}', F'S1{YEAR}', f'S2{YEAR}', f'S3{YEAR}']

CAPEURL = 'https://cape.ucsd.edu/responses/Results.aspx'
CAPETITLE = 'Course And Professor Evaluations (CAPE)'

with open('all_depts.pick', 'rb') as f:
    all_depts = pickle.load(f)

# taken from BetterCapes
# https://github.com/andportnoy/smartercapes.com/blob/master/tools.py
def get_raw_cape_dataframe(dept:str):

    options = webdriver.ChromeOptions()
    options.add_argument("user-data-dir=C:/Users/kalki/AppData/Local/Google/Chrome/User Data")
    options.add_argument("profile-directory=Profile 7")
    driver = webdriver.Chrome(options=options)
    
    # driver.get(f'https://cape.ucsd.edu/responses/Results.aspx?Name={course[0]}+{course[1]}&CourseNumber=')
    driver.get(f'https://cape.ucsd.edu/responses/Results.aspx?Name=&CourseNumber={dept}')
    wait = WebDriverWait(driver, 60)
    element = wait.until(expected_conditions.title_contains(CAPETITLE))
    # read in the dataset from the html file
    df = pd.read_html(driver.page_source)[0]
    driver.quit()

    return df

def clean_df(df, dept):
    df = (
        df[
            ['Instructor', 'Course', 'Term', 'Rcmnd Class',
            'Rcmnd Instr', 'Avg Grade Expected',
            'Avg Grade Received', 'Enroll']
        ]
        .assign(Course = df.get('Course').str.split(' - ').apply(lambda x: x[0]))
    )

    yearly = df[df.Term.isin(terms)].groupby('Course').sum()
    def get_yearly_students(course):
        if course not in yearly.index:
            return 0
        return yearly.get('Enroll').loc[course]

    df = df.dropna()

    df = (df
        .assign(
            GPA=(df.get('Avg Grade Received')
                .str.split('(')
                .apply(lambda x : x[-1])
                .str.rstrip(")")
                .astype('float'))
        )
    )
    df = df.assign(total_grade_points = df.get('GPA')*df.get('Enroll'))
    df = df.groupby('Course').sum().reset_index()
    df = (df
        .assign(yearly_num=df.get('Course').apply(get_yearly_students))
        .assign(GPA = df.get('total_grade_points')/df.get('Enroll'))
        .drop(columns=['total_grade_points', 'Enroll'])
        .assign(dept=df.get('Course').str.split(' ').apply(lambda x: x[0]))
        .assign(num=df.get('Course').str.split(' ').apply(lambda x: x[1]))
    )

    df = df[df.get('dept') == dept]
    df = df[df.get('yearly_num')>0]

    def remove_str(s:str):
        if s[-1].isalpha():
            return remove_str(s[:-1])
        return s
    df = df.assign(num = df.get('num').apply(remove_str).astype("int"))
    df = df.sort_values(by=['num', 'Course']).set_index('Course').reset_index()
    df = df.assign(index=df.index).set_index('Course')
    return df

In [26]:
dfs = []
for dept in all_depts:
    if os.path.exists(f'depts/{dept}.csv'):
        clean = pd.read_csv(f'depts/{dept}.csv')
    else:
        raw = get_raw_cape_dataframe(dept)
        clean = clean_df(raw, dept)
        clean.to_csv(f'depts/{dept}.csv')
        
    dfs.append(clean)

data = pd.concat(dfs)
data.to_csv('data.csv')

  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()
  yearly = df[df.Term.isin(terms)].gr

In [27]:
data

Unnamed: 0_level_0,GPA,yearly_num,dept,num,index
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAS 10,3.651734,199,AAS,10,0
AAS 190,4.000000,83,AAS,190,1
ANAR 116,3.350000,16,ANAR,116,0
ANAR 135,4.000000,22,ANAR,135,1
ANAR 143,3.420694,36,ANAR,143,2
...,...,...,...,...,...
VIS 183B,3.593932,23,VIS,183,71
VIS 185,3.558923,19,VIS,185,72
WCWP 10A,3.229711,1003,WCWP,10,0
WCWP 10B,3.329744,973,WCWP,10,1


In [30]:
data.sort_values(by='index')

Unnamed: 0_level_0,GPA,yearly_num,dept,num,index
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAS 10,3.651734,199,AAS,10,0
ETHN 1,3.725019,403,ETHN,1,0
FMPH 40,3.368392,452,FMPH,40,0
GLBH 20,3.534110,461,GLBH,20,0
GSS 20,3.720000,71,GSS,20,0
...,...,...,...,...,...
POLI 174,3.428298,22,POLI,174,85
POLI 176,3.530667,60,POLI,176,86
POLI 178,3.780000,24,POLI,178,87
POLI 191A,3.933810,34,POLI,191,88


In [29]:
data[data.get('dept')=='POLI']

Unnamed: 0_level_0,GPA,yearly_num,dept,num,index
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
POLI 5,3.549141,49,POLI,5,0
POLI 5D,3.110909,82,POLI,5,1
POLI 10,3.079661,96,POLI,10,2
POLI 10D,3.273114,259,POLI,10,3
POLI 11D,3.162922,249,POLI,11,4
...,...,...,...,...,...
POLI 174,3.428298,22,POLI,174,85
POLI 176,3.530667,60,POLI,176,86
POLI 178,3.780000,24,POLI,178,87
POLI 191A,3.933810,34,POLI,191,88
