In [41]:
import requests
import os,json
from os.path import join, isdir
from pprint import pprint
import itertools as it
import dask
import re

In [53]:
class RMPInterface:
    def __init__(self, outDir):
        if not isdir(outDir):# create the output directory if it does not exist
            os.mkdir(outDir)
        self.SoCPrefix = 'http://web-app.usc.edu/web/soc/api'
        self.RMPPrefix = 'https://www.ratemyprofessors.com'
        
    '''
    get the list of active instructors of the given semester. 
    Take ~10 secs to run [7.62 s ± 545 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)]
    parameter: 
        semester, str, for instance: '20193' (2019 Fall)
    return:
        prof name and home departments, list of tuples, for instance: [(aaron cote, CSCI, computer science)]
    '''
    def getAllInstructors(self, semester):
        url = '/'.join([self.SoCPrefix,'depts', semester]) # form a valid url
        schoolList = self.__make_request(url)['department']
        deptList = list(map(self.__dept_extractor, schoolList)) 
        deptList = list(it.chain.from_iterable(deptList)) # flatten the data structure, [{code:csci, name:comp science},{}]
        # now use get instructor info (multithreading)
        #multithreader = []
        #for dept in deptList:
        #    multithreader.append(dask.delayed(self.__get_dept_course)(dept,semester))
        multithreader = [ dask.delayed(self.__get_dept_course)(dept,semester) for dept in deptList]
        # flattened result
        return list(it.chain.from_iterable(dask.compute(*multithreader, scheduler = 'threads',num_workers=12))) 
         
    '''
    get RMP score of the given professor
    parameter: 
        professor information, tuple, generated by getAllInstructors, for instance (Aaron Cote, CSCI, comp science)
        
    '''
    #---------------------------
    # private helpers 
    '''
    retrieve information from the given url 
    parameter:
        url: a valid url 
    return:
        a json file 
    '''
    def __make_request(self, url):
        try:
            response = requests.post(url)
        except: # in case of time-out, reconnect
            response = requests.post(url)
        return response.json()
    '''
    extract department names
    parameter: 
        dictionary: a dictionary of school information
    return:
    '''
    def __dept_extractor(self, dictionary):
        try:
            if type(dictionary['department']) == list:
                return dictionary['department']
            else:
                return [dictionary['department']] # some school only has one department 
        except:
            return [dictionary] # some school might not have sub-department
    '''
    extract course information 
    parameter: 
        courseData: dictionary generated by __get_dept_course
        
    '''
    def __course_extractor(self, courseData):
        if courseData['canceled'] == 'Y' or courseData['canceled'] == 'y':
            return []# return an empty dict if the session is cancled
        if 'instructor' in courseData:
            if type(courseData['instructor']) == dict: # mutiple instructors
                return [courseData['instructor']['last_name']+','+courseData['instructor']['first_name'] ]
            else: # multiple prof for the same section
                return [prof['last_name']+','+prof['first_name'] for prof in courseData['instructor']]
        else:
            return []
        
    '''
    get all courses under the same department
    parameter: 
        deptInfo, dictionary generated by getAllInstructors
        semester, str, for instance: '20193' (2019 Fall)
    return: a list of professors under the given department
    '''
    def __get_dept_course(self,deptInfo, semester):
        url = '/'.join([self.SoCPrefix,'classes' ,deptInfo['code'], semester])
        courseList = self.__make_request(url)['OfferedCourses']['course']
        if type(courseList) == dict: # some dept has only one course
            courseList = [courseList]
        profList = []
        for course in courseList:
            if type(course['CourseData']['SectionData']) == dict:  # some course has only one session
                profList += [self.__course_extractor(course['CourseData']['SectionData'])]
            else:
                profList += list(map(self.__course_extractor, course['CourseData']['SectionData']))
        profList = set(list(it.chain.from_iterable(profList))) # flatten the list
        return  [(name, deptInfo['code'], deptInfo['name']) for name in profList]

In [54]:
gateway = RMPInterface('./tempStorage')
profs = gateway.getAllInstructors('20193')

In [55]:
len(profs)

5030

In [52]:
profs

[('Levin,Tracy', 'ALI', 'American Language Institute'),
 ('Murphy,Mary Ann', 'ALI', 'American Language Institute'),
 ('Aarsen,Lucienne', 'ALI', 'American Language Institute'),
 ('Clausen,Reka', 'ALI', 'American Language Institute'),
 ('Jones,Richard', 'ALI', 'American Language Institute'),
 ('Martinez,Olivia', 'ALI', 'American Language Institute'),
 ('Tzoytzoyrakos,Anastassia', 'ALI', 'American Language Institute'),
 ('Roth,Eric', 'ALI', 'American Language Institute'),
 ('Kang,Nina', 'ALI', 'American Language Institute'),
 ('Griner,Barry', 'ALI', 'American Language Institute'),
 ('Polk,James', 'ALI', 'American Language Institute'),
 ('Briesch Sumner,Kimberley', 'ALI', 'American Language Institute'),
 ('Kirkpatrick,Juli', 'ALI', 'American Language Institute'),
 ('De Leon,Adrian', 'AMST', 'American Studies and Ethnicity'),
 ('Shah,Nayan', 'AMST', 'American Studies and Ethnicity'),
 ('Kurashige,Lon', 'AMST', 'American Studies and Ethnicity'),
 ('Hill,Edwin', 'AMST', 'American Studies and 