In [3]:
import itertools as it
import os,json
import re
import string
from os.path import join, isdir
from pprint import pprint
from urllib.parse import quote_plus

import dask
import requests
from bs4 import BeautifulSoup

In [4]:
class SoCInterface:
    '''
    Small client for the USC Schedule of Classes (SoC) web API.
    It queries the `depts/<semester>` and `classes/<dept code>/<semester>` endpoints
    and collects the names of the instructors attached to non-canceled sections.
    '''
    def __init__(self, timeout=30):
        '''
        parameter:
            timeout, number of seconds to wait for each HTTP request (default 30)
        '''
        self.SoCPrefix = 'http://web-app.usc.edu/web/soc/api'
        self.timeout = timeout

    def get_all_instructors(self, semester):
        '''
        get the list of active instructors of the given semester.
        Take ~10 secs to run [7.62 s ± 545 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)]
        parameter:
            semester, str, for instance: '20193' (2019 Fall)
        return:
            de-duplicated prof names, list of tuple, for instance: [(first, last), (first, last)...]
            (the order is arbitrary because duplicates are removed through a set)
        '''
        url = '/'.join([self.SoCPrefix, 'depts', semester])  # form a valid url
        schoolList = self.__as_list(self.__make_request(url)['department'])
        # every school contributes a list of department dicts; flatten them into one list
        deptList = list(it.chain.from_iterable(map(self.__dept_extractor, schoolList)))
        # one delayed task per department, executed on a thread pool
        multithreader = [dask.delayed(self.__get_dept_course)(dept, semester) for dept in deptList]
        profPerDept = dask.compute(*multithreader, scheduler='threads', num_workers=12)
        return list(set(it.chain.from_iterable(profPerDept)))

    #---------------------------
    # private helpers
    @staticmethod
    def __as_list(value):
        '''
        the API collapses one-element collections into a bare item; undo that
        parameter:
            value: a list, or a single item
        return:
            value itself if it is a list, otherwise [value]
        '''
        return value if isinstance(value, list) else [value]

    def __make_request(self, url):
        '''
        retrieve information from the given url
        parameter:
            url: a valid url
        return:
            the decoded json payload of the response
        raise:
            requests.exceptions.RequestException if the request fails twice in a row
        '''
        try:
            response = requests.post(url, timeout=self.timeout)
        except requests.exceptions.RequestException:
            # network failure or time-out: retry once, a second failure propagates
            response = requests.post(url, timeout=self.timeout)
        return response.json()

    def __dept_extractor(self, dictionary):
        '''
        extract the departments of a school
        parameter:
            dictionary: a dictionary of school information
        return:
            a list of the department entries under the given school
        '''
        if not isinstance(dictionary, dict) or 'department' not in dictionary:
            return [dictionary]  # some school might not have sub-department
        # some school only has one department (a bare dict instead of a list)
        return self.__as_list(dictionary['department'])

    def __course_extractor(self, courseData):
        '''
        extract the instructors of one section
        parameter:
            courseData: one section dictionary, as handed over by __get_dept_course
        return:
            list of instructors of the given section, list of tuple (first name, last name);
            empty if the section is canceled or has no instructor
        '''
        # a missing 'canceled' flag is treated as "not canceled" instead of raising KeyError
        if str(courseData.get('canceled', '')).upper() == 'Y':
            return []
        instructors = courseData.get('instructor')
        if not instructors:
            return []
        # a single instructor arrives as a dict, several instructors as a list of dicts
        return [(prof['first_name'], prof['last_name']) for prof in self.__as_list(instructors)]

    def __get_dept_course(self, deptInfo, semester):
        '''
        get the instructors of all courses under the same department
        parameter:
            deptInfo, department dictionary produced in get_all_instructors (its 'code' is used)
            semester, str, for instance: '20193' (2019 Fall)
        return: a list of professors under the given department, a list of tuple
        '''
        url = '/'.join([self.SoCPrefix, 'classes', deptInfo['code'], semester])
        offered = self.__make_request(url).get('OfferedCourses')
        if not isinstance(offered, dict) or 'course' not in offered:
            # no course list in the payload (presumably a department without offerings):
            # contribute nothing instead of aborting the whole crawl
            return []
        profList = []
        for course in self.__as_list(offered['course']):  # some dept has only one course
            # some course has only one section
            for section in self.__as_list(course['CourseData']['SectionData']):
                profList.extend(self.__course_extractor(section))
        return profList

In [5]:
class RMPInterface:
    '''
    Scraper for RateMyProfessors (RMP) pages of University of Southern California instructors.
    Scores are collected per professor name and stored as profScoreDB.json inside outDir.
    '''
    def __init__(self, outDir, timeout=30):
        '''
        parameter:
            outDir, str, directory that receives profScoreDB.json (created when missing)
            timeout, number of seconds to wait for each HTTP request (default 30)
        '''
        # makedirs also creates missing parent directories, which os.mkdir cannot do
        os.makedirs(outDir, exist_ok=True)
        self.outDir = outDir
        self.timeout = timeout
        self.RMPPrefix = 'https://www.ratemyprofessors.com'
        self.queryBody = '/search.jsp?queryoption=HEADER&queryBy=teacherName&schoolName=University+of+Southern+California&schoolID=1381&query='

    def update_prof_DB(self, profList):
        '''
        look up every professor on RMP and write the result to <outDir>/profScoreDB.json
        parameter:
            profList, iterable of tuple, [(first name, last name), ...]
        return:
            dict, {"first last": {department name: (score, # of raters)}};
            professors without any usable RMP entry are left out
        '''
        multithreader = [dask.delayed(self.__get_prof_info)(prof) for prof in profList]
        profScoreList = dask.compute(*multithreader, scheduler='threads')
        # drop the empty tuples returned for professors that were not found
        profScoreDB = dict(profInfo for profInfo in profScoreList if profInfo)
        # write into the directory given to the constructor (self.outDir, not a global)
        with open(join(self.outDir, 'profScoreDB.json'), 'w') as outfile:
            json.dump(profScoreDB, outfile)
        return profScoreDB

    def __get_prof_info(self, profName):
        '''
        retrieve information about the given professor
        parameter:
            profName, tuple, (first name, last name)
        return:
            information about the prof, tuple, for instance:
            (name, {department name 1: (score, # of raters), department name 2: (score, # of raters)})
            a name may have several departments because people may have the same name;
            an empty tuple when nothing usable is found
        '''
        # quote_plus escapes spaces and special characters inside the name parts (e.g. 'M G')
        query = quote_plus(' '.join(profName))
        url = ''.join([self.RMPPrefix, self.queryBody, query])
        profListPage = BeautifulSoup(self.__make_request(url), 'html.parser')
        relatedProf = {}
        for listing in profListPage.find_all('li', {'class': 'listing PROFESSOR'}):
            link = listing.find('a', href=True)  # link to prof's individual page
            subtitle = listing.find('span', attrs={'class': 'sub'})
            if link is None or subtitle is None:
                continue  # malformed listing: skip it instead of aborting the whole crawl
            # the department is the text after the last comma (presumably "school, department")
            dept = subtitle.text.split(',')[-1].strip()
            profInfoPage = BeautifulSoup(self.__make_request(self.RMPPrefix + link['href']), 'html.parser')
            rateTuple = self.__extract_prof_info(profInfoPage)
            if rateTuple:  # empty tuple means no information found
                relatedProf[dept] = rateTuple

        if relatedProf:
            return (' '.join(profName), relatedProf)
        return ()  # no associate prof.

    def __make_request(self, url):
        '''
        make request and return text data
        parameter:
            url: a valid url pointing to RMP pages
        return:
            web content, str
        raise:
            requests.exceptions.RequestException if the request fails twice in a row
        '''
        try:
            response = requests.post(url, timeout=self.timeout)
        except requests.exceptions.RequestException:
            # network failure or time-out: retry once, a second failure propagates
            response = requests.post(url, timeout=self.timeout)
        return response.text

    def __extract_prof_info(self, profInfoPage):
        '''
        read the average score and the number of raters from a professor page
        parameter:
            profInfoPage: parsed page offering find(tag, attrs) -> element with .text
        return:
            (score, # of raters) as (float, int), or an empty tuple if either is missing/unparsable
        '''
        try:
            score = profInfoPage.find('div', {'class': 'grade'}).text  # get avg. RMP score
            # get num of rater
            raterText = profInfoPage.find('div', {'class': 'table-toggle rating-count active'}).text
            raterNum = re.findall(r'\b\d+\b', raterText)
            return (float(score), int(raterNum[0]))
        except (AttributeError, TypeError, ValueError, IndexError):
            # element not found (None has no .text), non-numeric score, or no digits in the count
            return ()
        
    

In [6]:
%%time
gateway = SoCInterface()
profs = gateway.get_all_instructors('20193')

gateway = RMPInterface('./tempStorage/')
profDict = gateway.update_prof_DB(profs)
#gateway.update_prof_DB([('aaron', 'cote'), ('Andrew', 'Goodney'), ('Olivera', 'Grujic'), ('Mahdi', 'Soltanolkotabi')])

working on:  ('Stephen', 'Child')
working on: working on:  ('Irene', 'Chiolo')
 ('Brijesh', 'Pinto')
working on:  ('Lance', 'Winkel')
working on:  ('Rebecca', 'Ehrhardt')
working on:  ('David', 'Hutchins')
working on:  ('David', 'Albertson')
working on:  ('Shana', 'Kraynak')
working on:  ('Dion', 'Jackson')
working on:  ('Nicole', 'Esparza')
working on:  ('Juan', 'Carrillo')
working on:  ('Anna', 'Arabyan')
working on:  ('Karra', 'Bikson')
working on:  ('Ram', 'Nevatia')
working on:  ('William', 'Celis')
working on:  ('M G', 'Lord')
working on:  ('Bill', 'Biersach')
working on:  ('Mansour', 'Rostami')
working on:  ('Anna', 'Farzindar')
working on:  ('Sung-Hwa', 'Park')
working on:  ('Stefan', 'Pollack')
working on:  ('Takahiro', 'Sakai')
working on:  ('Hector', 'Reyes')
working on:  ('Brent', 'Blair')
working on:  ('Judith', 'Hirsch')
working on:  ('Carol', 'Muske-Dukes')
working on:  ('Leah', 'Pate')
working on:  ('Peter', 'Holzhauer')
working on:  ('David', 'Bringhurst')
working on: 

working on:  ('Mike', 'Lee')
working on:  ('Teddy', 'Lance')
working on:  ('Tridib', 'Banerjee')
working on:  ('Scott', 'Darrell')
working on:  ('Antonio', 'Bento')
working on:  ('Aluizio', 'Prata')
working on:  ('Rex', 'Kovacevich')
working on:  ('Jack', 'Feinberg')
working on:  ('Lodovico', 'Pizzati')
working on:  ('Alexandra', 'Graddy-Reed')
working on:  ('Mark', 'Goldstein')
working on:  ('Yuan', 'Gao')
working on:  ('Clark', 'Kromenaker')
working on:  ('Myron', 'Goodman')
working on:  ('Julia', 'Chamberlin')
working on:  ('Jonathan', 'Mullins')
working on:  ('Chander', 'Burgos')
working on:  ('Kristy', 'Payne')
working on:  ('Atiyeh', 'Showrai')
working on:  ('Virgil', 'Adumitroaie')
working on:  ('Andrew', 'Goodney')
working on:  ('Susan', 'Arnold')
working on:  ('Lavonna', 'Lewis')
working on: working on:  ('Murat', 'Bayiz')
 ('Alberto', 'Vallejo')
working on:  ('Niels', 'Frenzen')
working on:  ('Canan', 'Ipek')
working on:  ('Lee', 'Olvera')
working on:  ('Candace', 'Smith')
wo

working on:  ('Nabil', 'Ziane')
working on:  ('Donald', 'Spivack')
working on:  ('Alice', 'Echols')
working on:  ('Arpi', 'Mardirossian')
working on:  ('Ricardo', 'Mancera')
working on:  ('Francesca', 'Italiano')
working on:  ('Tamara', 'Black')
working on:  ('Helaine', 'Lopes')
working on:  ('Susanna', 'Seierup')
working on:  ('Ashok', 'Srinivasan')
working on:  ('Ryan', 'Boyd')
working on:  ('Carolann', 'Peterson')
working on:  ('Vibhor', 'Trehan')
working on:  ('John', 'Wilson')
working on:  working on:  ('Robert', 'Waller')
working on:  ('Andrew', 'Bacon')
('Alisa', 'Sanchez')
working on:  ('Paul', 'Young')
working on:  ('Peter', 'Robinson')
working on:  ('Anna', 'Krylov')
working on:  ('Paul', 'Urcioli')
working on:  ('Leah', 'Hochman')
working on:  ('Donald', 'Yett')
working on:  ('Mary', 'McNamara')
working on:  ('Amanda', 'Pope')
working on:  ('Trudy', 'Green')
working on:  ('Gerald', 'Giaquinta')
working on:  ('Heidi', 'Khalil')
working on:  ('Alice', 'Fung')
working on:  ('Ma

working on:  ('Devon', 'Brooks')
working on:  ('Duke', 'Bristow')
working on:  ('Eve', 'Lee')
working on:  ('Sheldon', 'Ross')
working on:  ('Nathan', 'Greenfield')
working on:  ('Dennis', 'Schorr')
working on: working on:  ('Vinay', 'Goyal')
 ('Kadri', 'Vihvelin')
working on:  ('Azure', 'Darby')
working on:  ('Steven', 'Mednick')
working on:  ('Carol', 'Wise')
working on:  ('Douglas', 'Shook')
working on:  ('Davide', 'Proserpio')
working on:  ('Helen', 'Chung')
working on:  ('Eric', 'Briggs')
working on:  ('Sharon', 'Carnicke')
working on:  ('Brian', 'Bernards')
working on:  ('Marc', 'Aubertin')
working on:  ('Antonia', 'Szabari')
working on:  ('Carlos', 'Godoy')
working on:  ('Ken', 'Sereno')
working on:  ('Stephen', 'Moyer')
working on:  ('Kirk', 'Snyder')
working on: working on:  ('Ferol', 'Mennen')
 ('Paul', 'Lerner')
working on:  ('Sven', 'Koenig')
working on:  ('Fengzhu', 'Sun')
working on:  ('Reuven', 'Firestone')
working on:  ('David', 'Weber')
working on:  ('Caroline', 'Clerc

working on:  ('Felicia', 'Tabing')
working on:  ('Panayiota', 'Courelli')
working on:  ('Anthony', 'Kemp')
working on:  ('Howard', 'Croom')
working on:  ('Jane', 'Junn')
working on:  ('Lindsey', 'Bier')
working on:  ('Aaron', 'Winslow')
working on:  ('Mark', 'Weiser')
working on:  ('Paul', 'Orfalea')
working on:  ('Roksana', 'Karim')
working on:  ('Gene', 'Bickers')
working on:  ('Francille', 'Wilson')
working on:  ('Martin', 'Daniel')
working on:  ('James', 'Polk')
working on:  ('Kelsey', 'Rubin-Detlev')
working on:  ('Robert', 'Nashak')
working on:  ('Mohammad Reza', 'Rajati')
working on:  ('Olu', 'Orange')
working on:  ('James', 'Boedicker')
working on:  ('Paul', 'Lichterman')
working on:  ('Elsi', 'Kaiser')
working on:  ('Timothy', 'Burton')
working on:  ('Grace', 'Lu')
working on:  ('Maura', 'Crowley')
working on:  ('Graham', 'Robertson')
working on:  ('Leana', 'Golubchik')
working on: working on:  ('Elizabeth', 'Amini')
 ('Xochitl', 'Ruiz')
working on:  ('Ellen', 'Oliveira')
work

working on:  ('Gayle', 'Fiedler-Vierma')
working on:  ('Laurie', 'Fisher')
working on:  ('Ivan', 'Nikkhoo')
working on:  ('Brett', 'Sheehan')
working on:  ('Nina', 'Eliasoph')
working on:  ('Helen', 'Choi')
working on:  ('David', 'Tang')
working on:  ('Steven', 'Bush')
working on:  ('Konstantinos', 'Psounis')
working on:  ('Ron', 'Grover')
working on:  ('Deborah', 'Sims')
working on:  ('Yuka', 'Kumagai')
working on:  ('Mahmood', 'Sariolghalam')
working on:  ('Meiling', 'Cheng')
working on:  ('Vincent', 'Farenga')
working on:  ('Yi-Hsien', 'Liu')
working on:  ('Bill', 'Yahraus')
working on:  ('Ulrich', 'Neumann')
working on: working on:  ('Mary Joan', 'Negro')
 working on:  ('Trisha', 'Tucker')
('Merle', 'Hopkins')
working on: working on:  ('William', 'Resh')
 ('Audra', 'Bardsley')
working on:  ('Eric', 'Goldberg')
working on:  ('Gelya', 'Frank')
working on:  ('Jud', 'Fine')
working on:  ('Aniko', 'Imre')
working on:  ('Clifford', 'Neuman')
working on:  ('Masako', 'Tamanaha')
working on

In [8]:
# Save a copy of the score table next to the notebook.
# Guard first: dumping None would silently overwrite the file with the JSON literal `null`.
if profDict is None:
    raise ValueError('profDict is None - run the crawl cell first; refusing to write an empty profScoreDB.json')
with open(join('.','profScoreDB.json'), 'w') as outfile:
    json.dump(profDict, outfile)