# Analyzing Computer Science versus Business Management Introductory Course Professor Reviews and Their Trends Over Time

William Ingold, Erik Kelemen, Ashish Manda

## Introduction

## Grabbing Introductory Course Professors From UMD.io

In [485]:
import requests
import pandas as pd
# UMD.io API endpoint that is queried (with a course_id parameter) for the professors of a course
professors_url = "https://api.umd.io/v1/professors"

#### Utilities for saving professor data to a file

In [486]:
import csv
from os import path

# CSV files that cache the professor names (and the courses they teach) per department
cmsc_professor_names_filepath = './data/cmsc_professor_names.csv'
bmgt_professor_names_filepath = './data/bmgt_professor_names.csv'

# Whether a cached CSV already exists on disk; used to decide if the UMD.io API must be queried
have_cmsc_professors = path.exists(cmsc_professor_names_filepath)
have_bmgt_professors = path.exists(bmgt_professor_names_filepath)

def read_professor_name_data(professor_filepath):
    """Reads the professor names and their courses from a CSV file.

    The file must start with a header row containing the columns ``name`` and
    ``courses``; the courses of one professor are separated by whitespace.

    Args:
        professor_filepath: String holding a filepath to the professor csv file.

    Returns:
        A dictionary of professor names to a set of courses they have taught.
        A professor with an empty ``courses`` field maps to an empty set.
    """

    # newline='' lets the csv module do its own line-ending handling
    with open(professor_filepath, mode='r', newline='') as csv_file:
        # DictReader consumes the header row itself, so every row it yields is
        # a data row -- skipping the first one would drop a professor.
        return {row['name']: set(row['courses'].split())
                for row in csv.DictReader(csv_file)}

def save_professor_data(professors, filepath):
    """Saves the professor names and their courses to a CSV file.

    The file gets a ``name,courses`` header followed by one row per professor,
    with the professor's courses sorted and separated by single spaces.

    Args:
        professors: A dictionary of professor name keys and a set of courses for values.
        filepath: String holding the filepath of the CSV file to (over)write.
    """

    columns = ['name', 'courses']
    try:
        # newline='' stops the csv module's '\r\n' row endings from being
        # translated a second time (which produces blank rows on Windows).
        with open(filepath, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()

            # Sets have no stable order, so sort to keep the file reproducible
            writer.writerows(
                {'name': name, 'courses': ' '.join(sorted(courses))}
                for name, courses in professors.items()
            )

    except IOError:
        print("Error in writing the CSV file")

#### Utility to actually grab professors based on a list of courses

In [487]:
def get_professors_for_courses(course_ids):
    """Gets all the professors for the given course_ids from the UMD.io API.

    Args:
        course_ids: A list of course ids (e.g. ['CMSC216', 'CMSC250']).

    Returns:
        A dictionary of professor names to the set of the given courses they
        teach. Courses whose request does not return status 200 are reported
        and contribute nothing to the result.
    """

    professors = {}

    for course_id in course_ids:
        # A timeout keeps a stalled connection from hanging the notebook forever
        response = requests.get(professors_url, params={'course_id': course_id}, timeout=30)

        if response.status_code != 200:
            print("UMD.io request for {course} failed with status {status}".format(
                course=course_id, status=response.status_code))
            continue

        for item in response.json():
            professors.setdefault(item['name'], set()).add(course_id)

    return professors

### Grab Computer Science Professors

In [488]:
cmsc_course_ids = ["CMSC131", "CMSC132", "CMSC216", "CMSC250"]

# TODO: Dr. Eastman has taught CMSC131, per RateMyProfessor, but wasn't given via UMD.IO

# Only query the UMD.io API if we don't have the data
if not have_cmsc_professors:
    cmsc_professors = get_professors_for_courses(cmsc_course_ids)

    if cmsc_professors:
        # Only cache a non-empty response, so a failed request is retried on the next run
        save_professor_data(cmsc_professors, cmsc_professor_names_filepath)
        have_cmsc_professors = True
    else:
        print("Error response from umd.io API")
else:
    cmsc_professors = read_professor_name_data(cmsc_professor_names_filepath)

    if not cmsc_professors:
        print("No professors were found in " + cmsc_professor_names_filepath)

# A typo of Jason Filippou from the database
cmsc_professors.pop('Iason Filippou', None)

print(cmsc_professors)

{'Fawzi Emad': {'CMSC250', 'CMSC131', 'CMSC132'}, 'Ilchul Yoon': {'CMSC216', 'CMSC131', 'CMSC132'}, 'Nelson Padua-Perez': {'CMSC216', 'CMSC131', 'CMSC132'}, 'Pedram Sadeghian': {'CMSC131', 'CMSC132'}, 'Anwar Mamat': {'CMSC132'}, 'Laurence Herman': {'CMSC216', 'CMSC132'}, 'A Shankar': {'CMSC216'}, 'Aditya Acharya': {'CMSC250'}, 'Alexander Brassel': {'CMSC250'}, 'Clyde Kruskal': {'CMSC250'}, 'David Sekora': {'CMSC250'}, 'Donald Perlis': {'CMSC250'}, 'Jason Filippou': {'CMSC250'}, 'Mohammad Nayeem Teli': {'CMSC250'}, 'Roger Eastman': {'CMSC250'}}


### Grab Business Management Professors

In [489]:
bmgt_course_ids = ["BMGT110", "BMGT220", "BMGT221", "BMGT230"]

# Only query the UMD.io API if we don't have the data
if not have_bmgt_professors:
    bmgt_professors = get_professors_for_courses(bmgt_course_ids)

    if bmgt_professors:
        # Only cache a non-empty response, so a failed request is retried on the next run
        save_professor_data(bmgt_professors, bmgt_professor_names_filepath)
        have_bmgt_professors = True
    else:
        print("Error response from umd.io API")
else:
    bmgt_professors = read_professor_name_data(bmgt_professor_names_filepath)

    if not bmgt_professors:
        print("No professors were found in " + bmgt_professor_names_filepath)

print(bmgt_professors)

{'Jeff Miller': {'BMGT110'}, 'Cody Hyman': {'BMGT220'}, 'Laurel Mazur': {'BMGT221', 'BMGT220'}, 'Progyan Basu': {'BMGT220'}, 'Viktoriya Zotova': {'BMGT220'}, 'Gary Bulmash': {'BMGT221'}, 'Gerald Ward': {'BMGT221'}, 'Ai Ren': {'BMGT230'}, 'Daehoon Noh': {'BMGT230'}, 'Erich Studer-Ellis': {'BMGT230'}, 'Huan Cao': {'BMGT230'}, 'Radu Lazar': {'BMGT230'}, 'Shubham Akshat': {'BMGT230'}, 'Ziwei Cao': {'BMGT230'}}


## Grabbing Reviews

### Setup Databases to Hold Review Data

In [490]:
import sqlite3
from sqlite3 import Error

# Setup database presets: one SQLite file per department inside db_filepath
db_filepath = './data/db/'
bmgt_rmp_db_filepath = db_filepath + 'bmgt_rmp.db'
cmsc_rmp_db_filepath = db_filepath + 'cmsc_rmp.db'

# Whether each database file already existed on disk when this cell ran
cmsc_rmp_db_exists = path.exists(cmsc_rmp_db_filepath)
bmgt_rmp_db_exists = path.exists(bmgt_rmp_db_filepath)


def create_connection(db_file):
    """Create a connection to the provided database file.

    The directory that should contain the database file is created when it
    does not exist yet, since SQLite can create the file but not its folder.

    Args:
        db_file: A string holding the filepath to a database.

    Returns:
        The sqlite3 connection object, or None if the connection could not be
        opened (the error is printed).
    """

    # Local import: only `os.path` (as `path`) is imported at file level
    import os

    print(db_file)
    try:
        parent_dir = os.path.dirname(db_file)

        # dirname is '' for bare names such as ':memory:', nothing to create then
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        return sqlite3.connect(db_file)
    except (Error, OSError) as e:
        print(e)

    return None


def execute_create_command(conn, sql_command, params=()):
    """Runs a SQL statement (e.g. CREATE TABLE) on the provided database.

    Database errors are printed instead of raised. Nothing is committed and
    nothing is returned.

    Args:
        conn: The connection object to the database.
        sql_command: A string containing the SQL command.
        params: A tuple of potential parameters.
    """

    try:
        conn.cursor().execute(sql_command, params)
    except Error as db_error:
        print(db_error)
    
    
def execute_insert_command(conn, sql_command, params=()):
    """Runs a SQL insert on the provided database and commits it.

    Args:
        conn: The connection object to the database.
        sql_command: A string containing the SQL command.
        params: A tuple of potential parameters.

    Returns:
        The cursor's lastrowid after the insert, or None if a database error
        occurred (the error is printed).
    """

    try:
        cursor = conn.cursor()
        cursor.execute(sql_command, params)
        conn.commit()
    except Error as db_error:
        print(db_error)
        return None

    return cursor.lastrowid
        
        
def execute_query_command(conn, sql_command, params=()):
    """Runs a SQL query on the provided database and fetches every row.

    Args:
        conn: The connection object to the database.
        sql_command: A string containing the SQL command.
        params: A tuple of potential parameters.

    Returns:
        A list with all result rows, or None if a database error occurred
        (the error is printed).
    """

    try:
        cursor = conn.cursor()
        cursor.execute(sql_command, params)
        rows = cursor.fetchall()
    except Error as db_error:
        print(db_error)
        return None

    return rows

#### RateMyProfessor Specific Database Functionality

In [491]:
def create_rmp_tables(rmp_conn):
    """Create the stats and review tables for RateMyProfessors data.

    Both tables are created with CREATE TABLE IF NOT EXISTS, so calling this on
    a database that already has them leaves the existing tables in place.

    Args:
        rmp_conn: Connection object to a RateMyProfessors database.
    """

    # One row per professor: overall numbers followed by one INTEGER column per tag.
    # full_name is UNIQUE ON CONFLICT IGNORE, so a second insert for the same
    # full_name is ignored by the database.
    stats_table = """ CREATE TABLE IF NOT EXISTS professor_stats (
                        id INTEGER PRIMARY KEY,
                        first_name TEXT NOT NULL,
                        last_name TEXT NOT NULL,
                        full_name TEXT NOT NULL UNIQUE ON CONFLICT IGNORE,
                        rating REAL,
                        take_again REAL,
                        difficulty REAL,
                        rating_count INTEGER NOT NULL,
                        gives_good_feedback INTEGER,
                        respected INTEGER,
                        lots_of_homework INTEGER,
                        accessible_outside_class INTEGER,
                        get_ready_to_read INTEGER,
                        participation_matters INTEGER,
                        skip_class_wont_pass INTEGER,
                        inspirational INTEGER,
                        graded_by_few_things INTEGER,
                        test_heavy INTEGER,
                        group_projects INTEGER,
                        clear_grading_criteria INTEGER,
                        hilarious INTEGER,
                        beware_of_pop_quizes INTEGER,
                        amazing_lectures INTEGER,
                        lecture_heavy INTEGER,
                        caring INTEGER,
                        extra_credit INTEGER,
                        so_many_papers INTEGER,
                        tough_grader INTEGER
                    ) """

    # TODO: Review id format? <professor last name>-<#> ?
    # One row per review: review fields followed by the same tag columns as above.
    # review_id is UNIQUE ON CONFLICT IGNORE, so a second insert for the same
    # review_id is ignored by the database.
    review_table = """ CREATE TABLE IF NOT EXISTS reviews (
                        id INTEGER PRIMARY KEY,
                        review_id TEXT NOT NULL UNIQUE ON CONFLICT IGNORE,
                        first_name TEXT NOT NULL,
                        last_name TEXT NOT NULL,
                        full_name TEXT NOT NULL,
                        course TEXT NOT NULL,
                        date INTEGER NOT NULL,
                        body TEXT NOT NULL,
                        thumb_up INTEGER,
                        thumb_down INTEGER,
                        quality REAL NOT NULL,
                        difficulty REAL NOT NULL,
                        would_take_again INTEGER NOT NULL,
                        for_credit INTEGER NOT NULL,
                        textbook INTEGER NOT NULL,
                        attendance INTEGER,
                        grade TEXT,
                        online_class INTEGER,
                        gives_good_feedback INTEGER,
                        respected INTEGER,
                        lots_of_homework INTEGER,
                        accessible_outside_class INTEGER,
                        get_ready_to_read INTEGER,
                        participation_matters INTEGER,
                        skip_class_wont_pass INTEGER,
                        inspirational INTEGER,
                        graded_by_few_things INTEGER,
                        test_heavy INTEGER,
                        group_projects INTEGER,
                        clear_grading_criteria INTEGER,
                        hilarious INTEGER,
                        beware_of_pop_quizes INTEGER,
                        amazing_lectures INTEGER,
                        lecture_heavy INTEGER,
                        caring INTEGER,
                        extra_credit INTEGER,
                        so_many_papers INTEGER,
                        tough_grader INTEGER
                   ) """

    execute_create_command(rmp_conn, stats_table)
    execute_create_command(rmp_conn, review_table)

# Create the CMSC and BMGT database with the two tables
# NOTE(review): create_connection returns None when the connection fails, in which
# case the calls below would fail -- confirm ./data/db/ exists before running.
cmsc_rmp_db = create_connection(cmsc_rmp_db_filepath)
bmgt_rmp_db = create_connection(bmgt_rmp_db_filepath)

create_rmp_tables(cmsc_rmp_db)
create_rmp_tables(bmgt_rmp_db)

# Close for now, will reopen when writing to them
cmsc_rmp_db.close()
bmgt_rmp_db.close()

./data/db/cmsc_rmp.db
./data/db/bmgt_rmp.db


In [492]:
def is_rmp_scraped(rmp_conn, professor_name):
    """Returns if the professor's RateMyProfessors page has been scraped already.

    A professor counts as scraped when professor_stats holds a row whose
    full_name contains professor_name (the name is wrapped in % wildcards).

    Args:
        rmp_conn: Connection object to the appropriate rmp database.
        professor_name: String holding the professor's name.

    Returns:
        True if a matching row exists. False if there is none, or if the query
        itself failed (execute_query_command prints the error and returns None).
    """

    sql_command = """SELECT
                        full_name
                    FROM
                        professor_stats ps
                    WHERE
                        full_name LIKE ?"""

    params = ('%' + professor_name + '%',)

    result = execute_query_command(rmp_conn, sql_command, params)

    # bool() covers both an empty row list and the None returned on a failed query
    return bool(result)


def get_rmp_stats_from_db(rmp_conn, professor):
    """Reads a professor's rows of the professor_stats table into a pandas dataframe.

    Args:
        rmp_conn: Connection object to a RateMyProfessors database.
        professor: String compared against full_name with SQL LIKE (no
            wildcards are added here).

    Returns:
        A dataframe with every column of the matching professor_stats rows.
    """

    return pd.read_sql_query(
        sql="""SELECT * FROM professor_stats WHERE full_name LIKE ?""",
        con=rmp_conn,
        params=[professor],
    )

def get_rmp_reviews_from_db(rmp_conn, professor):
    """Reads a professor's rows of the reviews table into a pandas dataframe.

    Args:
        rmp_conn: Connection object to a RateMyProfessors database.
        professor: String compared against full_name with SQL LIKE (no
            wildcards are added here).

    Returns:
        A dataframe with every column of the matching reviews rows.
    """

    return pd.read_sql_query(
        sql="""SELECT * FROM reviews WHERE full_name LIKE ?""",
        con=rmp_conn,
        params=[professor],
    )

##### Inserting dataframes into one shared stats table and one shared reviews table (instead of individual tables per professor)

In [493]:
def execute_rmp_dataframe_insert(rmp_conn, table_name, column_list, values):
    """Inserts one row into a table of the provided database.

    The INSERT statement is assembled from the table name and column names,
    while the values themselves are passed as bound parameters.

    Args:
        rmp_conn: Connection object to a RateMyProfessor database.
        table_name: String holding a table name to insert into ('reviews' or 'professor_stats')
        column_list: List of column names the values belong to.
        values: List of values corresponding to the column headers to insert.

    Returns:
        Whatever execute_insert_command returns for the statement.
    """

    # One placeholder per column, comma separated (always at least one placeholder)
    placeholder_count = max(len(column_list), 1)
    question_marks = ",".join(["?"] * placeholder_count)

    sql_template = """INSERT INTO {table_name} (
                                {column_names}
                           )
                           VALUES({question_marks})
                           """
    insert_rmp_sql = sql_template.format(
        table_name=table_name,
        column_names=",".join(column_list),
        question_marks=question_marks,
    )

    # TODO: Need to surround with % ?
    return execute_insert_command(rmp_conn, insert_rmp_sql, tuple(values))
    
    
def insert_rmp_professor_dataframe(rmp_conn, df, table_name):
    """Inserts a dataframe row by row into a table of the RMP database.

    The dataframe's column labels are used as the table's column names.

    Args:
        rmp_conn: Connection object to a RateMyProfessor database.
        df: Pandas DataFrame object containing data to insert.
        table_name: String holding a table name to insert into ('reviews' or 'professor_stats')
    """

    column_names = df.columns.tolist()

    for _, record in df.iterrows():
        execute_rmp_dataframe_insert(rmp_conn, table_name, column_names, record.array)
    

##### TODO: Alternative database structure, where each professor has its own tables  

In [494]:
def insert_rmp_professor_overall_table(rmp_conn, professor_name, overall_df):
    """Appends the overall stats dataframe to the professor's own '<name>_stats' table.

    Args:
        rmp_conn: Connection object to a RateMyProfessor database.
        professor_name: String holding the professor's name (table name prefix).
        overall_df: Pandas DataFrame with the professor's overall stats.
    """
    stats_table = "".join((professor_name, "_stats"))
    overall_df.to_sql(name=stats_table, con=rmp_conn, if_exists='append')
    
def insert_rmp_professor_reviews_table(rmp_conn, professor_name, review_df):
    """Appends the reviews dataframe to the professor's own '<name>_reviews' table.

    Args:
        rmp_conn: Connection object to a RateMyProfessor database.
        professor_name: String holding the professor's name (table name prefix).
        review_df: Pandas DataFrame with the professor's reviews.
    """
    reviews_table = "".join((professor_name, "_reviews"))
    review_df.to_sql(name=reviews_table, con=rmp_conn, if_exists='append')

#### PlanetTerp Database Functionality

## Scrape and Parse Data from RateMyProfessor

In [495]:
from bs4 import BeautifulSoup

# Search endpoint plus the query parameters and headers sent with every RateMyProfessor search
ratemyprofessor_url = "https://www.ratemyprofessors.com/search.jsp"
params = dict(
    queryoption='HEADER',
    schoolID='1270',
    queryBy='teacherName',
    schoolName='University+of+Maryland',
)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0",
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Allow-Methods": "GET"
}


# Code friendly names of the tags RateMyProfessor uses to describe professors.
# They double as database column names and dataframe headers.
tag_list = [
    'gives_good_feedback',
    'respected',
    'lots_of_homework',
    'accessible_outside_class',
    'get_ready_to_read',
    'participation_matters',
    'inspirational',
    'graded_by_few_things',
    'test_heavy',
    'group_projects',
    'clear_grading_criteria',
    'hilarious',
    'beware_of_pop_quizes',
    'amazing_lectures',
    'lecture_heavy',
    'caring',
    'extra_credit',
    'so_many_papers',
    'tough_grader',
    'skip_class_wont_pass',
]

# The tag text as it is looked up for a RateMyProfessor page, in the same order as tag_list.
# Only the "skip class" tag differs from its column name by more than underscores.
text_tag_list = [
    "skip class? you won't pass." if tag == 'skip_class_wont_pass' else tag.replace('_', ' ')
    for tag in tag_list
]

# Page text of a tag -> code friendly tag name
text_tag_dict = dict(zip(text_tag_list, tag_list))

# Column headers for a professor's overall statistics found at the top of the page
overall_header_list = ['first_name', 'last_name', 'full_name', 'rating', 'take_again',
                       'difficulty', 'rating_count']
overall_header_list.extend(tag_list)

# Review post column headers. The meta list is the row of top meta responses (like 'Grade: A-').
review_meta_list = ['would_take_again', 'grade', 'textbook', 'online_class', 'for_credit', 'attendance']
review_text_meta_list = [meta.replace('_', ' ') for meta in review_meta_list]
review_meta_dict = dict(zip(review_text_meta_list, review_meta_list))

review_header_list = ['review_id', 'course', 'date', 'quality', 'difficulty', 'body',
                      'thumb_up', 'thumb_down']
review_header_list.extend(review_meta_list)
review_header_list.extend(tag_list)

##### Utilities

In [496]:
def tags_to_dict(provided_tags):
    """Maps scraped tag texts (e.g. skip class? you won't pass.) to a presence flag
    for every known tag, keyed by the database friendly tag name.

    Tag texts are compared case-insensitively; unknown texts are ignored.

    Args:
        provided_tags: A list of space separated tags scraped from the RMP page.

    Returns:
        A dictionary of {tag: 1 or 0} on whether a tag was used to describe the professor.
    """

    # Database friendly names of every provided tag that is known
    present_tags = {
        text_tag_dict[tag_text.lower()]
        for tag_text in provided_tags
        if tag_text.lower() in text_tag_dict
    }

    return {tag_name: int(tag_name in present_tags) for tag_name in text_tag_dict.values()}

def meta_to_dict(provided_meta):
    """Maps a review's meta information (e.g. Would Take Again: No) to the database
    friendly meta names.

    Every known meta starts at 0. A response of "yes" or "mandatory" (any case)
    becomes 1, any other response becomes 0, and the grade keeps its response
    text unchanged. Unknown meta names are ignored.

    Args:
        provided_meta: A dictionary of meta information from a review.

    Returns:
        A dictionary of {meta: value}, where the value is 1 or 0 except for
        'grade', which holds the grade text when one was provided.
    """

    meta_dict = dict.fromkeys(review_meta_list, 0)

    for meta_name, response in provided_meta.items():
        meta_key = meta_name.lower()

        if meta_key not in review_meta_dict:
            continue

        response_text = response.lower()

        if meta_key == "grade":
            value = response
        elif response_text in ("yes", "mandatory"):
            value = 1
        else:
            value = 0

        meta_dict[review_meta_dict[meta_key]] = value

    return meta_dict

#### Querying RateMyProfessor and Getting the Professor's URL

In [497]:
def find_rmp_professor_url(html_doc):
    """Finds the professor's URL on the search page and returns it.

    Two page layouts are checked: a link whose class contains
    'TeacherCard__StyledTeacherCard', and the 'listing PROFESSOR' list item
    with a link inside it.

    Args:
        html_doc: A string containing an HTML document.

    Returns:
        The full URL for the professor's page, or None when the page reports
        no results or no professor link could be found.
    """

    main_url = "https://www.ratemyprofessors.com"
    soup = BeautifulSoup(html_doc, 'html.parser')

    # The professor may not be reviewed. select_one is required here: find()
    # treats its argument as a tag name and would never match a CSS selector.
    if soup.select_one('div[class*="NoResultsFoundArea__StyledNoResultsFound"]') is not None:
        return None

    # Sometimes RMP does the search differently, so the link will be on a teacher card
    teacher_card = soup.select_one('a[class*="TeacherCard__StyledTeacherCard"][href]')
    if teacher_card is not None:
        return main_url + teacher_card['href']

    listing = soup.find('li', class_='listing PROFESSOR')
    if listing is not None:
        listing_link = listing.find('a', href=True)

        if listing_link is not None:
            return main_url + listing_link['href']

    return None
    
    
def query_rmp_for_professor_url(professor_name, headers, params):
    """Queries RateMyProfessor for the professor, given the parameters and headers.

    Args:
        professor_name: The <first name> <last name> of the professor.
        headers: Dictionary of headers for the get request.
        params: Dictionary of parameters for the get request. It is not
            modified; the professor's name is added to a copy as 'query'.

    Returns:
        The full URL for the professor's page after searching for it, or None
        if the professor was not found or the search request failed.
    """

    # Copy so the (shared) params dictionary of the caller is left untouched
    query_params = dict(params, query=professor_name)

    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(ratemyprofessor_url, headers=headers, params=query_params, timeout=30)

    if response.status_code != 200:
        print("RateMyProfessor search for {name} failed with status {status}.".format(
            name=professor_name, status=response.status_code))
        return None

    url = find_rmp_professor_url(response.text)

    if url is None:
        print("Professor {name} has not been reviewed.".format(name=professor_name))

    return url

#### Parsing the Professor Overall Information (Stats and Tags)

In [498]:
def _rmp_text_to_float(text, scale=1.0):
    """Converts scraped text like '4.5' or '83%' to a float divided by scale, or None if it is not numeric."""
    try:
        return float(text.strip().rstrip('%')) / scale
    except ValueError:
        return None


def get_rmp_prof_stats(page_text):
    """Parses the professor's stats from their page and returns them. Namely their overall rating,
    how many would take again, overall difficulty and how many ratings they have on RateMyProfessor.

    Values that are missing from the page or are not numeric (e.g. 'N/A') are
    returned as None, except for the rating count which falls back to 0.

    Args:
        page_text: An HTML document of the professor's page.

    Returns:
        A dictionary containing their rating, take again percentage (as a
        fraction between 0 and 1), difficulty rating, and rating count.
    """

    soup = BeautifulSoup(page_text, 'html.parser')

    # select() returns a (possibly empty) list, never None
    rating_nodes = soup.select('div[class*="RatingValue__Numerator"]')

    if rating_nodes:
        rating_score = _rmp_text_to_float(rating_nodes[0].text)
    else:
        print('Rating score error: no rating element found on the page')
        rating_score = None

    feedback_containers = soup.select('div[class*="TeacherFeedback__StyledTeacherFeedback"]')
    feedback = []
    if feedback_containers:
        feedback = feedback_containers[0].select('div[class*="FeedbackItem__FeedbackNumber"]')

    # First number is the "would take again" percentage, second the difficulty
    take_again = _rmp_text_to_float(feedback[0].text, scale=100) if len(feedback) > 0 else None
    difficulty = _rmp_text_to_float(feedback[1].text) if len(feedback) > 1 else None

    count_containers = soup.select('div[class*="RatingValue__NumRatings"]')
    count_links = count_containers[0].select('a') if count_containers else []
    count_text = count_links[0].text if count_links else ''

    # Keep only the digits of the rating count text
    count_digits = ''.join(char for char in count_text if char.isdigit())
    rating_count = int(count_digits) if count_digits else 0

    return {'rating': rating_score, 'take_again': take_again, 'difficulty': difficulty, 'rating_count': rating_count}


def get_rmp_prof_top_tags(page_text):
    """Parses and returns the professor's top tags.

    Args:
        page_text: An HTML document of the professor's page.

    Returns:
        A dictionary of {tag: 1 or 0} on whether each known tag is among the
        professor's top tags. Every tag is 0 when the page has no tag section.
    """

    soup = BeautifulSoup(page_text, 'html.parser')

    tags = []
    tag_containers = soup.select('div[class*="TeacherTags__TagsContainer"]')

    # Guard against a missing tag section, as parse_review_tags does for reviews
    if tag_containers:
        tags = [tag.text for tag in tag_containers[0].select('span')]

    return tags_to_dict(tags)


def rmp_prof_overall_to_dataframe(professor_name, stats, tags):
    """Combines the professor's overall stats and tags into a pandas dataframe.

    Args:
        professor_name: String holding the professor's name. Everything before
            the first space is the first name, the rest the last name (which
            is empty for a name without a space).
        stats: A dictionary holding the overall stats (e.g. 'would_take_again': .83)
        tags: A dictionary holding the tags associated with a professor (e.g. {'caring': 1})

    Returns:
        A single-row dataframe containing the combination of professor name,
        stats, and tags. Its columns are overall_header_list, followed by any
        additional keys of stats or tags.
    """

    # partition never fails to unpack, unlike split(' ', 1) on a one-word name
    first_name, _, last_name = professor_name.partition(' ')
    overall_dict = {'first_name': first_name, 'last_name': last_name, 'full_name': professor_name}

    overall_dict.update(stats)
    overall_dict.update(tags)

    columns = list(overall_header_list)
    for key in overall_dict:
        if key not in columns:
            columns.append(key)

    # Built in one go: DataFrame.append was removed in pandas 2.0.
    # dtype=object keeps the values as the plain Python objects that were passed in.
    return pd.DataFrame([overall_dict], columns=columns, dtype=object)

#### Use Selenium to Load All Professor Reviews

In [499]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

In [500]:
def start_selenium(executable_path='./bin/geckodriver.exe'):
    """Starts up the Selenium browser.

    Args:
        executable_path: Filepath of the geckodriver executable used to start
            Firefox. Defaults to the Windows driver shipped in ./bin.

    Returns:
        The Firefox webdriver object.
    """
    driver = webdriver.Firefox(executable_path=executable_path)
    return driver
    
def stop_selenium(driver):
    """Shutdown the Selenium browser.

    Args:
        driver: The webdriver object to shut down.
    """
    # NOTE(review): quit() presumably already closes every window, which would
    # make the close() call redundant -- confirm against the Selenium docs.
    driver.close()
    driver.quit()
    
def load_all_rmp_reviews(page_url, driver):
    """Loads all the reviews for a given professor and returns the HTML of all of them.

    Args:
        page_url: The URL for the professor's page.
        driver: The Selenium webdriver object used to load the page.

    Returns:
        A string containing the HTML for all the reviews (the outer HTML of
        the element with the id 'ratingsList').
    """

    load_more_xpath = '//button[text()="Load More Ratings"]'

    driver.get(page_url)

    # RateMyProfessors has a cookies pop up that overlays the website, it needs to be closed first
    time.sleep(0.5)
    close_cookies = driver.find_elements(By.XPATH, '//button[text()="Close"]')

    if close_cookies:
        close_cookies[0].click()

    load_more = driver.find_elements(By.XPATH, load_more_xpath)

    # RateMyProfessors paginates the reviews via Javascript, so we must continually load more while the button is present
    while load_more:
        load_more[0].click()
        time.sleep(1)
        load_more = driver.find_elements(By.XPATH, load_more_xpath)

    # find_element(By.ID, ...) matches the By-based lookups above;
    # the find_element_by_id shortcut no longer exists in Selenium 4
    all_reviews = driver.find_element(By.ID, 'ratingsList').get_attribute('outerHTML')

    return all_reviews

#### Parsing Utilities for a Single Review

In [501]:
import datetime

def string_date_to_unix(date_str):
    """Turns the RateMyProfessor date format (e.g. Nov 23rd, 2020) into a
    UTC timestamp. Assumes the date is already in UTC.

    Args:
        date_str: A string containing the RateMyProfessor review date, made of
            an abbreviated month, a day with suffix and comma, and a year.

    Returns:
        A UTC timestamp (float, seconds since the epoch) for midnight of the
        date provided. The result does not depend on the machine's timezone.

    Raises:
        ValueError: If the date is not in the expected format.
    """

    # Split into month, day, year
    month, day, year = date_str.split()

    # Keep only the digits of the day, dropping the suffix and the comma
    day_digits = ''.join(char for char in day if char.isdigit())

    parsed_date = datetime.datetime.strptime(' '.join((month, day_digits, year)), '%b %d %Y')

    # timestamp() reads a naive datetime as local time, so attach UTC explicitly
    return parsed_date.replace(tzinfo=datetime.timezone.utc).timestamp()
    
def parse_rating_header(soup):
    """Parses and returns the rating header for a single review.

    Args:
        soup: An initialized BeautifulSoup object for the review.

    Returns:
        A dictionary containing the course and date (as a UTC timestamp) for
        the review. Both values are None when the review has no rating header.
    """

    rating_header = soup.select('div[class*="Rating__RatingInfo"]')

    if not rating_header:
        # Return empty values instead of failing on names that were never assigned
        print('Rating header error: no rating header found in the review')
        return {'course': None, 'date': None}

    header = rating_header[0]
    course = header.select('div[class*="RatingHeader__StyledClass"]')[0].text.strip()
    date = header.select('div[class*="TimeStamp__StyledTimeStamp"]')[0].text.strip()

    return {'course': course, 'date': string_date_to_unix(date)}

def parse_meta_data(soup):
    """Parses and returns the meta data for a single review.

    Each meta entry is expected to look like '<name>: <value>'. Entries
    without a colon are skipped, and a review without a meta section yields
    the defaults of meta_to_dict.

    Args:
        soup: An initialized BeautifulSoup object for the review.

    Returns:
        A dictionary containing the meta data (e.g. Would Take Again) for the review.
    """

    meta_containers = soup.select('div[class*="CourseMeta__StyledCourseMeta"]')
    review_meta_data = {}

    if meta_containers:
        for meta_div in meta_containers[0].select('div'):
            # Split on the first colon only, so a value may itself contain colons
            meta_name, separator, meta_value = meta_div.text.partition(':')

            if separator:
                review_meta_data[meta_name.strip()] = meta_value.strip()

    return meta_to_dict(review_meta_data)

def parse_rating_data(soup):
    """Parses and returns the quality and difficulty rating of a single review.

    Args:
        soup: An initialized BeautifulSoup object for the review.

    Returns:
        A dictionary containing the text of the quality and difficulty rating
        for the review (first and second rating value respectively).
    """

    values_container = soup.select('div[class*="RatingValues__StyledRatingValues"]')[0]
    value_nodes = values_container.select('div[class*="RatingValues__RatingValue"]')

    return {'quality': value_nodes[0].text, 'difficulty': value_nodes[1].text}

def parse_review_tags(soup):
    """Parses and returns the tags for a single review.

    Args:
        soup: An initialized BeautifulSoup object for the review.

    Returns:
        A dictionary of {tag: 1 or 0} on whether each known tag was used on
        the review, as produced by tags_to_dict.
    """

    tag_containers = soup.select('div[class*="RatingTags__StyledTags"]')

    # Not every review adds tags, in which case there is no container at all
    if not tag_containers:
        return tags_to_dict([])

    return tags_to_dict([span.text for span in tag_containers[0].select('span')])
    
def parse_thumb_scoring(soup):
    """Parses and returns the thumbs up / thumbs down counts of a single review.

    Args:
        soup: An initialized BeautifulSoup object for the review.

    Returns:
        A dictionary containing the thumb scoring data for the review: the
        first total as 'thumb_up' and the second as 'thumb_down'.
    """

    footer = soup.select('div[class*="RatingFooter__StyledRatingFooter"]')[0]
    totals = footer.select('div[class*="RatingFooter__HelpTotal"]')

    up_votes = int(totals[0].text.strip())
    down_votes = int(totals[1].text.strip())

    return {'thumb_up': up_votes, 'thumb_down': down_votes}

def parse_review_text(soup):
    """Parses and returns the review body text for a single review.

    Args:
        soup: An initialized BeautifulSoup object for the review.

    Returns:
        A dictionary holding the review text under the key 'body'.
    """

    comment_nodes = soup.select('div[class*="Comments__StyledComments"]')

    return {'body': comment_nodes[0].text}
    
def parse_single_rmp_review(review_item, courses):
    """Parses and returns all data for a single review.
    Namely it returns: Meta data, rating data, tags, thumb_scoring, review text and
    the rating header (course and date).

    Args:
        review_item: A single review list item containing all the appropriate HTML.
        courses: Collection of course ids; reviews of other courses are skipped.

    Returns:
        A dictionary containing the meta data, rating data, tags, thumb_scoring, review text
        and rating header for a single review, or None if the review's course
        is not one of the given courses.
    """

    soup = BeautifulSoup(review_item, 'html.parser')

    rating_header = parse_rating_header(soup)

    # TODO: Loses course reviews like 'CMSC131CMSC132' where students combined multiple courses they took
    if rating_header['course'] not in courses:
        return None

    return {
        'meta_data': parse_meta_data(soup),
        'rating_data': parse_rating_data(soup),
        'tags': parse_review_tags(soup),
        'thumb_scoring': parse_thumb_scoring(soup),
        'review_text': parse_review_text(soup),
        'rating_header': rating_header,
    }

#### Parsing Utilities for an Entire RateMyProfessor Page

In [502]:
def get_rmp_prof_reviews(rmp_prof_url, selenium_driver, prof_name, prof_courses):
    """Gets all the RateMyProfessor reviews for a given professor and places into a
    dataframe. Only grabs reviews for classes in the provided courses.
    
    Args:
        rmp_prof_url: A string containing the RateMyProfessor URL for the professor.
        selenium_driver: The Selenium driver handed to load_all_rmp_reviews to load
            the professor's reviews.
        prof_name: A string containing the professor's name.
        prof_courses: List of courses to look for in the reviews.
        
    Returns:
        A dataframe containing all the appropriate reviews. Its columns are
        review_header_list followed by any extra keys found in the parsed reviews;
        it has no rows when no review matched.
    """
    
    reviews_html = load_all_rmp_reviews(rmp_prof_url, selenium_driver)
    soup = BeautifulSoup(reviews_html, 'html.parser')
    
    # partition (unlike a two-target split) does not raise for single-word names;
    # last_name is simply '' in that case.
    first_name, _, last_name = prof_name.partition(' ')
    review_id_head = prof_name + '-'
    
    # Collect plain dicts and build the dataframe once at the end: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row-by-row is quadratic.
    records = []
    counter = 0
    
    for review in soup.find_all('li'):
        
        if not review.select('div[class*="Rating__StyledRating"]'): # Avoid advertisement list items
            continue
        
        # Every real review consumes an id, even when it is filtered out below,
        # so ids reflect the review's position on the page.
        counter += 1
        data = parse_single_rmp_review(str(review), prof_courses)
        
        if not data: # Since the review could be of an undesired course
            continue
        
        flattened_data = {'first_name': first_name, 'last_name': last_name, 'full_name': prof_name,
                          'review_id': review_id_head + str(counter)}
        
        # Merge each sub-dictionary (meta data, ratings, tags, ...) into one flat row.
        for data_dict in data.values():
            flattened_data.update(data_dict)
        
        records.append(flattened_data)
    
    review_df = pd.DataFrame(records)
    
    # Keep the expected header columns first (present even with zero rows) and
    # retain any unexpected keys after them, as appending dicts used to do.
    extra_columns = [col for col in review_df.columns if col not in review_header_list]
    
    return review_df.reindex(columns=list(review_header_list) + extra_columns)


def parse_rmp_page(rmp_prof_url, headers, rmp_conn, selenium_driver, professor_name, courses):
    """Parses an entire RateMyProfessor professor page for overall stats & tags, and all
    of their reviews. It will return two dataframes holding this information and insert
    them into a database.
    
    Args:
        rmp_prof_url: A string containing the RateMyProfessor URL for the professor.
        headers: Request headers to use.
        rmp_conn: Connection object to the RateMyProfessor database.
        selenium_driver: The Selenium driver passed on to get_rmp_prof_reviews.
        professor_name: A string containing the professor's name.
        courses: List of courses to look for in the reviews.
        
    Returns:
        A tuple of two dataframes, (overall statistics, all the reviews). If the page
        does not respond with status 200, an error is printed and (None, None) is
        returned so callers can always unpack the result.
    """
    
    rmp_prof_page = requests.get(rmp_prof_url, headers=headers)
    
    if rmp_prof_page.status_code != 200:
        print("Error opening the RateMyProfessor professor page")
        # Previously fell through and returned a bare None, which crashed callers
        # that unpack the documented two-tuple.
        return (None, None)
    
    soup = BeautifulSoup(rmp_prof_page.text, 'html.parser')
    
    # Professor stats
    stats_container = soup.select('div[class*="TeacherInfo__StyledTeacher"]')[0]
    
    prof_stats = get_rmp_prof_stats(str(stats_container))
    prof_tags = get_rmp_prof_top_tags(str(stats_container))
    
    overall_df = rmp_prof_overall_to_dataframe(professor_name, prof_stats, prof_tags)
    insert_rmp_professor_dataframe(rmp_conn, overall_df, 'professor_stats')
    
    # Professor reviews
    all_reviews_df = get_rmp_prof_reviews(rmp_prof_url, selenium_driver, professor_name, courses)
    insert_rmp_professor_dataframe(rmp_conn, all_reviews_df, 'reviews')
    
    return (overall_df, all_reviews_df)

#### Scrape and Parse All Professors Utility

In [503]:
def parse_rmp_all_professors(rmp_db_filepath, professors, force_scrape=False):
    """Scrapes and parses all professors, storing the data in a database and returning a
    list of dataframes for stats and reviews.
    
    Note: the `headers` and `params` used to query RateMyProfessor are read from
    module-level names defined elsewhere in the notebook.
    
    Args:
        rmp_db_filepath: String containing the filepath to the appropriate database.
        professors: Dictionary of professors to list of courses.
        force_scrape (optional, default=False): Forces a scrape of RateMyProfessors even if already done.
        
    Returns:
        The tuple (stats, reviews) where each is a list of dataframes. Each list has
        one entry per professor; an entry is None when no data could be obtained
        for that professor.
    """
    
    all_major_stats = []
    all_major_reviews = []
    
    rmp_db = create_connection(rmp_db_filepath)
    
    # try/finally guarantees the database connection and the Selenium driver are
    # released even if scraping a professor raises part-way through the loop.
    try:
        selenium_driver = start_selenium()
        
        try:
            for professor, courses in professors.items():
                overall_stats_df = None
                all_reviews_df = None
                
                # Read from database if the professor has already been scraped (only checks stats for confirmation)
                if not force_scrape and is_rmp_scraped(rmp_db, professor):
                    overall_stats_df = get_rmp_stats_from_db(rmp_db, professor)
                    all_reviews_df = get_rmp_reviews_from_db(rmp_db, professor)
                
                else:
                    # Get all the data from the professor's RateMyProfessor page
                    prof_rmp_url = query_rmp_for_professor_url(professor, headers, params)
                    
                    if prof_rmp_url is not None:
                        page_data = parse_rmp_page(prof_rmp_url, headers, rmp_db, selenium_driver, professor, courses)
                        
                        # parse_rmp_page may yield nothing when the page fails to load;
                        # only unpack an actual result so one bad page doesn't abort the run.
                        if page_data is not None:
                            overall_stats_df, all_reviews_df = page_data
                    
                    # So we don't query RateMyProfessor too much
                    time.sleep(1)
                
                # Keep track of the dataframes for each professor
                all_major_stats.append(overall_stats_df)
                all_major_reviews.append(all_reviews_df)
        finally:
            stop_selenium(selenium_driver)
    finally:
        rmp_db.close()
    
    return (all_major_stats, all_major_reviews)

### Scrape and Parse All Computer Science Professors from RateMyProfessor

In [504]:
# Build per-professor stats and review dataframes for every entry in cmsc_professors.
# With the default force_scrape=False, professors already stored in the database are
# read back from it instead of being scraped again. Entries in either list can be
# None for professors with no data.
all_cmsc_stats, all_cmsc_reviews = parse_rmp_all_professors(cmsc_rmp_db_filepath, cmsc_professors)

./data/db/cmsc_rmp.db
Professor Laurence Herman has not been reviewed.
Professor A Shankar has not been reviewed.
Professor Aditya Acharya has not been reviewed.
Professor Alexander Brassel has not been reviewed.


### Scrape and Parse All Business Management Professors from RateMyProfessor

In [None]:
# Build per-professor stats and review dataframes for every entry in bmgt_professors.
# With the default force_scrape=False, professors already stored in the database are
# read back from it instead of being scraped again. Entries in either list can be
# None for professors with no data.
all_bmgt_stats, all_bmgt_reviews = parse_rmp_all_professors(bmgt_rmp_db_filepath, bmgt_professors)

./data/db/bmgt_rmp.db
Professor Laurel Mazur has not been reviewed.
Professor Viktoriya Zotova has not been reviewed.
Professor Gerald Ward has not been reviewed.
Professor Daehoon Noh has not been reviewed.


## From PlanetTerp

### Parse All Computer Science Professors from PlanetTerp

### Parse All Business Management Professors from PlanetTerp