In [40]:
import pandas as pd

courses = pd.read_csv('ucdavis_courses.csv')
registrar_table = pd.DataFrame(columns=['CourseId', 'ProfName', 'Quarter', 'CRN'])
registrar_dict = {}
start = 0

In [38]:
import requests
import requests_cache
from datetime import timedelta
from ratelimit import limits, sleep_and_retry
import warnings
import re

warnings.filterwarnings("ignore")

# Enable caching
requests_cache.install_cache("registrar")

headers = {
    'Accept': '*/*',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
}


term_codes = ['202401', '202403', '202410']
data = {
    'termCode': '',
    'course_number': '',
    'multiCourse': '',
    'course_title': '',
    'instructor': '',
    'subject': '-',
    'course_start_eval': '-',
    'course_start_time': '-',
    'course_end_eval': '-',
    'course_end_time': '-',
    'course_status': '-',
    'course_level': '-',
    'course_units': '-',
    'virtual': '-',
    'runMe': '1',
    'clearMe': '1',
    'reorder': '',
    'gettingResults': '0',
}

@sleep_and_retry
@limits(calls=10, period=timedelta(seconds=10).total_seconds())
def extract():
    global registrar_table, data, start
    course_list = list(courses['Course Code'])
    for i in range(start, len(course_list), 250):
        search_text = ','.join(course_list[i:i+250])
        data['multiCourse'] = search_text
        for term in term_codes:
            try:
                data['termCode'] = term
                with requests.post(
                        'https://registrar-apps.ucdavis.edu/courses/search/course_search_results.cfm',
                        headers=headers,
                        data=data,
                        stream=True) as response:
                    search_df = pd.read_html(response.text)[0]
                    search_df = search_df.iloc[4:]
                    search_df.columns = ['0', '1', '2', '3', '4', '5', '6']
                    search_df.drop(columns=['0', '2', '3', '6'], inplace=True)
                    search_df.rename(columns={'1': 'CourseId', '4': 'ProfName', '5': 'CRN'}, inplace=True)
                    search_df['Quarter'] = [term for _ in range(search_df.shape[0])]
                    search_df['CRN'] = [crn.replace('view ', '') for crn in search_df['CRN']]
                    courses_list = [re.findall(r'(\w+ \d+[A-Z]*)', cid) for cid in search_df['CourseId']]
                    search_df['CourseId'] = [cid[0] if len(cid) != 0 else '' for cid in courses_list]
                    
                    registrar_table = pd.concat([registrar_table, search_df], ignore_index=True)
                    for index, rows in search_df.iterrows():
                        if rows['CourseId'] in registrar_dict:
                            if (term not in registrar_dict[rows['CourseId']] 
                                    or registrar_dict[rows['CourseId']][term] == ''):
                                crn = re.findall(r'(\d+)', rows['CRN'])
                                registrar_dict[rows['CourseId']][term] = crn[0] if len(crn) != 0 else ''
                        else:
                            registrar_dict[rows['CourseId']] = {}
                            crn = re.findall(r'(\d+)', rows['CRN'])
                            registrar_dict[rows['CourseId']][term] = crn[0] if len(crn) != 0 else ''
                    
                    print(term, i)
                    start = i
            except Exception as e:
                print(data, e)

extract()

202401 9750
202403 9750
202410 9750


In [108]:
exclude_list = []

In [110]:
from tqdm.notebook import tqdm_notebook

@sleep_and_retry
@limits(calls=50, period=timedelta(seconds=10).total_seconds())
def extract_prof_names():
    crns = set()
    
    global exclude_list

    for course, term_dict in registrar_dict.items():
        for term, crn in term_dict.items():
            if crn != '':
                crns.add((crn, term, course))
    progress = tqdm_notebook(range(registrar_table.shape[0] + len(exclude_list), len(crns)), "Prof name extraction progress")
    for crn, term, course in crns:
        if (crn, term) not in zip(list(registrar_table['CRN']), list(registrar_table['Quarter'])) and crn not in exclude_list:
            try:
                # print({"crn": crn, "termCode": term})
                course_response = requests.post(
                    'https://registrar-apps.ucdavis.edu/courses/search/course.cfm',
                    headers=headers,
                    data={
                        "crn": crn,
                        "termCode": term
                    },
                )
            
                table = pd.read_html(course_response.text)
                prof_name = table[0].iloc[4, 0].replace('Instructor:', '').strip()
                if len(prof_name) != 0:
                    registrar_table.loc[len(registrar_table)] = [course, prof_name, term, crn]
                else:
                    exclude_list.append(crn)
                progress.update(1)
            except Exception as e:
                print({"crn": crn, "termCode": term})
                raise e
    registrar_table.drop_duplicates(inplace=True)

extract_prof_names()

Prof name extraction progress:   0%|          | 0/6 [00:00<?, ?it/s]

In [112]:
registrar_table.to_csv('registrar_data.csv', index=False)

In [63]:
crns1 = set()

            
print(len(crns1))

4723


In [39]:
import re
prof_dict = {}
quarter_dict = {}

for index, rows in registrar_table.iterrows():
    prof_name = rows['ProfName']
    if not '@ Denotes' in prof_name and not 'The Staff' in prof_name:
        try:
            prof_name = re.sub(r'\d+.\d+', '', prof_name)
            course_code = rows['CourseId']
            if course_code not in prof_dict:
                prof_dict[course_code] = prof_name
            if course_code not in quarter_dict:
                quarter_dict[course_code] = rows['Quarter']   
        except Exception as e:
            print(rows['CourseId'])
            # raise e

** TOPIC: TO BE ANNOUNCED **
** TOPIC: TO BE ANNOUNCED **
** TOPIC: TO BE ANNOUNCED **
** TOPIC: FINAL FANTASY VII: JRPGS, ECO-CRITICISMAND ADAPTION **
** TOPIC: FINAL FANTASY VII: JRPGS, ECO-CRITICISMAND ADAPTION **
** TOPIC: LATINA/O/X LITERATURE & MEDIA **
** TOPIC: LATINA/O/X LITERATURE & MEDIA **
** TOPIC: EMPIRE, RACIAL CAPITALISM, AND LITERATURESOF SPECULATION **


In [43]:
registrar_table = pd.DataFrame([
    [course_code, prof_name, quarter] for course_code, prof_name, quarter in 
    zip(list(prof_dict.keys()), list(prof_dict.values()), list(quarter_dict.values()))
], columns=['CourseId', 'ProfName', 'Quarter'])

In [95]:
registrar_table.to_csv('registrar_data.csv', index=False)

In [3]:
print('Number of unique professors - ', len(set(list(registrar_table['ProfName']))))

Number of unique professors -  1957
