In [73]:
import requests
import os,json
from os.path import join, isdir
from pprint import pprint
import itertools as it
import dask
import re
from bs4 import BeautifulSoup
import string 

In [148]:
class SoCInterface:
    def __init__(self):
        self.SoCPrefix = 'http://web-app.usc.edu/web/soc/api' 
    '''
    get the list of active instructors of the given semester. 
    Take ~10 secs to run [7.62 s ± 545 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)]
    parameter: 
        semester, str, for instance: '20193' (2019 Fall)
    return:
        prof name, set, for instance: {(first, last), (first, last)...}
    '''
    def get_all_instructors(self, semester):
        url = '/'.join([self.SoCPrefix,'depts', semester]) # form a valid url
        schoolList = self.__make_request(url)['department']
        deptList = list(map(self.__dept_extractor, schoolList)) 
        deptList = list(it.chain.from_iterable(deptList)) # flatten the data structure, [{code:csci, name:comp science},{}]
        # now use get instructor info (multithreading)
        multithreader = [ dask.delayed(self.__get_dept_course)(dept,semester) for dept in deptList]
        # flattened result
        return list(set(list(it.chain.from_iterable(dask.compute(*multithreader, scheduler = 'threads',num_workers=12))) ))

    #---------------------------
    # private helpers 
    '''
    retrieve information from the given url 
    parameter:
        url: a valid url 
    return:
        a json file 
    '''
    def __make_request(self, url):
        try:
            response = requests.post(url)
        except: # in case of time-out, reconnect
            response = requests.post(url)
        return response.json()
    '''
    extract department names
    parameter: 
        dictionary: a dictionary of school information
    return:
        a list of departments under the given school, list of str
    '''
    def __dept_extractor(self, dictionary):
        try:
            if type(dictionary['department']) == list:
                return dictionary['department']
            else:
                return [dictionary['department']] # some school only has one department 
        except:
            return [dictionary] # some school might not have sub-department
    '''
    extract course information 
    parameter: 
        courseData: dictionary generated by __get_dept_course
    return:
        list of intructors of an given class, list of tuple
    '''
    def __course_extractor(self, courseData):
        if courseData['canceled'] == 'Y' or courseData['canceled'] == 'y':
            return []# return an empty dict if the session is cancled
        if 'instructor' in courseData:
            if type(courseData['instructor']) == dict: # mutiple instructors
                return [(courseData['instructor']['first_name'], courseData['instructor']['last_name']) ]
            else: # multiple prof for the same section
                return [(prof['first_name'], prof['last_name']) for prof in courseData['instructor']]
        else:
            return []
        
    '''
    get all courses under the same department
    parameter: 
        deptInfo, dictionary generated by getAllInstructors
        semester, str, for instance: '20193' (2019 Fall)
    return: a list of professors under the given department, a list of tuple
    '''
    def __get_dept_course(self,deptInfo, semester):
        url = '/'.join([self.SoCPrefix,'classes' ,deptInfo['code'], semester])
        courseList = self.__make_request(url)['OfferedCourses']['course']
        if type(courseList) == dict: # some dept has only one course
            courseList = [courseList]
        profList = []
        for course in courseList: 
            if type(course['CourseData']['SectionData']) == dict:  # some course has only one session
                profList += [self.__course_extractor(course['CourseData']['SectionData'])]
            else:
                profList += list(map(self.__course_extractor, course['CourseData']['SectionData']))
        return list(it.chain.from_iterable(profList)) # flatten the list

In [150]:
class RMPInterface:
    def __init__(self, outDir):
        if not isdir(outDir):
            os.mkdir (outDir)
        self.outDir = outDir
        self.RMPPrefix = 'https://www.ratemyprofessors.com'
        self. queryBody = '/search.jsp?queryoption=HEADER&queryBy=teacherName&schoolName=University+of+Southern+California&schoolID=1381&query='
    def update_prof_DB(self, profList):
        multithreader = [ dask.delayed(self.__get_prof_info)(prof) for prof in profList]
        profScoreList = list(dask.compute(*multithreader, scheduler = 'threads'))
        profScoreList = list (filter (lambda profInfo: len(profInfo)>0, profScoreList))
        return dict(profScoreList)
    '''    
    retrieve information about the given professor
    parameter:
        profName, tuple, (first name, last name)
    return: 
        information about the prof, tuple, for instance:
        (name, {department name 1: (score, # of raters), department name 2: (score, # of raters)}) 
        a name may have several departments because people may have the same name
    '''
    def __get_prof_info(self, profName):
        url = ''.join([self.RMPPrefix,self.queryBody,'+'.join(profName)])
        profListPage = BeautifulSoup(self.__make_request(url), 'html.parser')
        relatedProf = {}
        for listings in  profListPage.find_all('li', {'class':'listing PROFESSOR'}):
            profUrl = self.RMPPrefix + listings.find('a', href=True)['href']# link to prof's individual page
            dept = listings.find('span', attrs = {'class':'sub'}).text.split(',')[-1]
            profInfoPage = BeautifulSoup(self.__make_request(profUrl), 'html.parser')
            rateTuple = self.__extract_prof_info(profInfoPage)
            if rateTuple != tuple(): # in case no information found 
                relatedProf[dept] = rateTuple
        
        if len(relatedProf) >0 :
            print ('working on: ', profName)
            return (' '.join(profName), relatedProf)
        else:
            return () # no associate prof. 
    '''
    make request and return text data
    parameter:
        url: a valid url pointing to RMP pages
    return:
        web content, str
    '''
    def __make_request(self, url):
        try:
            response = requests.post(url)
        except: # in case of time-out, reconnect
            response = requests.post(url)
        return response.text
    
    def __extract_prof_info(self, profInfoPage):
        try:
            score = profInfoPage.find('div', {'class': 'grade'}).text # get avg. RMP score
            # get num of rater
            raterNum = re.findall(r'\b\d+\b',profInfoPage.find('div', {'class': 'table-toggle rating-count active'}).text)
            return (float(score), int(raterNum[0]))
        except:
            return ()
        
    

In [155]:
%%time
gateway = SoCInterface()
profs = gateway.get_all_instructors('20193')

gateway = RMPInterface('./tempStorage/')
profDict = gateway.update_prof_DB(profs)
#gateway.update_prof_DB([('aaron', 'cote'), ('Andrew', 'Goodney'), ('Olivera', 'Grujic'), ('Mahdi', 'Soltanolkotabi')])

working on:  ('Christina', 'Paddock')
working on:  ('Lisa', 'Pecot-Hebert')
working on:  ('Oscar', 'Aparicio')
working on:  ('Daria', 'Roithmayr')
working on:  ('Amanda', 'Bloom')
working on:  ('Mark', 'Haddad')
working on:  ('Shanea', 'Thomas')
working on:  ('Gerald', 'Giaquinta')
working on:  ('Leah', 'Kemp')
working on:  ('Doug', 'Thomas')
working on:  ('Albert', 'Herrera')
working on:  ('Ricardo', 'Mancera')
working on:  ('Neelesh', 'Tiruviluamala')
working on:  ('Bruce', 'Yazejian')
working on:  ('Hilary', 'Schor')
working on:  ('Deanna', 'Rivera')
working on:  ('Robert', 'Sacker')
working on:  ('Andy', 'Abad')
working on:  ('John', 'De Mita')
working on:  ('Sheldon', 'Ross')
working on:  ('Peer', 'Fiss')
working on:  ('Jae', 'Deal')
working on:  working on: ('Olu', 'Orange')
 ('Joseph', 'Lim')
working on:  ('Lisa', 'Vest')
working on:  ('Pamela', 'Douglas')
working on:  ('Duncan', 'Mahoney')
working on:  ('Navdeep', 'Mundi')
working on:  ('Eva', 'Kanso')
working on:  ('Tom', 'Hol

working on:  ('Ann Marie', 'Yamada')
working on:  ('Toby', 'Mintz')
working on:  ('David', 'Tomkins')
working on:  ('Yanhao', 'Wei')
working on:  ('Greys', 'Sosic')
working on:  ('Ashutosh', 'Nayyar')
working on:  ('Jennifer', 'Warren')
working on:  ('Kim', 'Finney')
working on:  ('John', 'Ayoob')
working on:  ('Phil', 'Allen')
working on: working on:  ('Rhoda', 'Coleman')
 ('Maki', 'Irie')
working on:  ('Mark', 'Moore')
working on:  ('Francesca', 'Italiano')
working on:  ('Tridib', 'Banerjee')
working on:  ('Tok', 'Thompson')
working on:  ('Benita', 'Walton-Moss')
working on:  ('Marisa', 'Mandler')
working on:  ('Kenneth', 'Phillips')
working on:  ('Scott', 'Easley')
working on:  ('Matthew', 'Whiting')
working on:  ('Gordon', 'Stables')
working on:  ('Jed', 'Fuhrman')
working on:  ('Raffaella', 'Ghittoni')
working on:  ('E Moncell', 'Durden')
working on:  ('Carlos', 'Godoy')
working on:  ('Jackie', 'Kopcsak')
working on:  ('Kazuo', 'Takeda')
working on:  ('Brent', 'Melot')
working on:

working on:  ('Doug', 'Hammond')
working on:  ('Beatriz', 'Ilari')
working on:  ('Harry', 'Hunter')
working on:  ('Carol', 'Wise')
working on:  ('Trudy', 'Green')
working on:  ('Wensheng', 'Wu')
working on:  ('Wen', 'Sun')
working on:  ('Arpi', 'Mardirossian')
working on:  ('Christopher', 'Sampson')
working on:  ('Benjamin', 'Henwood')
working on:  ('Margaret', 'Rosenthal')
working on:  ('Dana', 'Johnson')
working on:  ('Drew', 'Casper')
working on:  ('James', 'Van Cleve')
working on:  ('Katherine', 'Shing')
working on:  ('Mansour', 'Rahimi')
working on:  ('Kristine', 'Garroway')
working on:  ('Eric', 'Goldberg')
working on:  working on:  ('Alberto', 'Vallejo')
('Arianna', 'Uhalde')
working on:  ('Tom', 'Chang')
working on:  ('Kenneth', 'Ahern')
working on:  ('Stephen', 'Moyer')
working on:  ('Wolf', 'Gruner')
working on:  ('Murat', 'Bayiz')
working on:  ('Steven', 'Mednick')
working on:  ('Elisa', 'Warford')
working on:  ('Jeff', 'Guh')
working on:  ('Massoud', 'Pirbazari')
working on

working on:  ('Rochelle', 'Gold')
working on:  ('Mike', 'Ananny')
working on:  ('Meridith', 'Kruse')
working on:  ('Ali', 'Abbas')
working on:  ('Helaine', 'Head')
working on:  ('Yao-Yi', 'Chiang')
working on:  ('Carolin', 'Fleischmann')working on:  ('Paul', 'Lerner')

working on:  ('Rajiv', 'Kalia')
working on:  ('Brenda', 'Goodman')
working on:  working on:  ('David', 'Schwartz')
('Dana', 'Milstein')
working on:  ('Susan', 'Gavigan')
working on:  ('Maria', 'Hu')
working on:  ('Aniko', 'Imre')
working on:  ('Rafael', 'Angulo')
working on:  ('Sozan', 'Wali')
working on:  ('Christina', 'Belcher')
working on:  ('Francis', 'Pereira')
working on:  ('Alexandra', 'Graddy-Reed')
working on:  ('Trond', 'Sigurdsen')
working on:  ('Debbie', 'Murad')
working on:  ('Ben', 'Reichardt')
working on:  ('Jane', 'Allgood')
working on:  ('Brad', 'Shipley')
working on:  ('Steven', 'Lopez')
working on:  ('Ben', 'Poston')
working on:  ('Darry', 'Sragow')
working on:  ('Consuelo', 'Siguenza-Ortiz')
working o

working on:  ('Michael', 'Kassner')
working on:  ('Kyung', 'Jung')
working on:  ('Sung-Hwa', 'Park')
working on:  ('Priya', 'Jaikumar')
working on:  ('Mark', 'Goldstein')
working on:  ('Elissa', 'Grossman')
working on:  ('Carolann', 'Peterson')
working on:  ('Bhargav', 'Oza')
working on:  ('Scott', 'Smith')
working on:  ('David', 'St John')
working on:  ('Karla', 'Heidelberg')
working on:  ('Roberto', 'Diaz')
working on:  ('Devin', 'Griffiths')
working on:  ('Jim', 'Staahl')
working on:  ('Kidogo', 'Kennedy')
working on:  ('Malgorzata', 'Switek')
working on:  ('Stephanie', 'Sabo')
working on:  ('Stanislav', 'Minsker')
working on:  ('Danielle', 'Brown')
working on:  ('Yongxiang', 'Wang')
working on:  ('Irving', 'Belateche')
working on:  ('Ali', 'Nowroozi')
working on:  working on: working on:  ('Kevin', 'Fields')
('Lorraine', 'Turcotte')
 ('George', 'Sanchez')
working on:  ('Divana', 'Olivas')
working on:  ('Robin', 'Jeshion')
working on: working on:  ('Frank', 'Corsetti')
 ('Diana', 'B

working on:  ('Hazel', 'Atuel')
working on:  ('Robert', 'Hernandez')
working on:  ('Jose', 'Zavala')
working on:  ('Isabelle', 'Brocas')
working on:  ('Barbara', 'Nance')
working on:  ('Caesar', 'Sedek')
working on:  ('Marcel', 'Valcarce')
working on:  ('Candice', 'Levy')
working on:  ('Tin-Yu', 'Tseng')
working on:  ('Nick', 'Strimple')
working on:  ('Jinchi', 'Lv')
working on:  ('Robert', 'Scheer')
working on:  ('Melinda', 'Finberg')
working on:  ('Kyung Moon', 'Hwang')
working on:  ('Leonard', 'Maltin')
working on:  ('Taj', 'Frazier')
working on:  ('Jennifer', 'Greenhill')
working on:  ('Mary', 'Traester')
working on:  ('Lee', 'Olvera')
working on:  ('Mark', 'Redekopp')
working on:  ('China', 'Adams')
working on:  ('Sarah', 'Bonner')
working on:  ('David', 'Stamper')
working on:  ('Graham', 'Robertson')
working on:  ('Sharon', 'Carnicke')
working on:  ('Atiyeh', 'Showrai')
working on:  ('Fokion', 'Egolfopoulos')
working on:  ('Richard', 'Lemarchand')
working on:  ('Carmen', 'Lee')
w

In [160]:
for key in profDict:
    if len( profDict[key]) >1:
        print (key, profDict[key])

Jorge De La Roca {' Policy Planning  Development': (4.0, 1), ' Policy Planning & Development': (1.3, 3)}
Iva Bozovic {' International Relations': (4.5, 2), ' International Studies': (4.0, 26)}
Jennifer Miller {' Policy Planning  Development': (4.0, 2), ' Political Science': (3.3, 3)}
Leah Pate {' English': (2.0, 1), ' Writing': (4.1, 22)}
John Walsh {' Gerontology': (4.5, 2), ' Biology': (4.2, 56)}
Parish Sedghizadeh {' Business': (5.0, 1), ' Dentistry': (3.5, 1)}
Maurice Rahimi {' Policy Planning & Development': (3.3, 12), ' Business': (3.5, 4)}
David Kang {' International Studies': (4.5, 23), ' Engineering': (5.0, 1)}
Bruce Brown {' Music': (3.3, 3), ' Economics': (1.6, 9)}
Jeffrey Fellenzer {' Journalism': (4.6, 14), ' Communication': (5.0, 1)}
John Wilson {' History': (3.1, 36), ' Geography': (2.5, 30)}
Daniel Tiffany {' Philosophy': (1.6, 35), ' Arts & Letters': (1.6, 6)}
Heidi Rummel {' Law': (1.5, 1), ' Criminal Justice': (1.0, 1)}
Michael Coombs {' Business': (1.5, 2), ' Manage

<function dict.items>