# Analyzing Computer Science versus Business Management Introductory Course Professors Reviews and Their Trends Over Time

William Ingold, Erik Kelemen, Ashish Manda

## Introduction

### About

### Motivation

## Setup

In [764]:
# Querying and requests of pages
import requests

# Parsing and handling HTML elements
from bs4 import BeautifulSoup

# Storage and manipulation of data
import pandas as pd

# Used to check for the existance of files
from os import path
from itertools import chain

# Database and data storage
import csv
import sqlite3
from sqlite3 import Error

# Selenium lets us load pages more natively, and can interact with the page
from selenium import webdriver
from selenium.webdriver.common.by import By

# For handling the time & dates for reviews
import time
import datetime

## Data Storage: Setup Databases to Hold Review Data

TODO: Explain section

### Data Storage Part 1: Generic Database Functionality

In [765]:
# Setup database presets
db_filepath = './data/db/'
bmgt_rmp_db_filepath = db_filepath + 'bmgt_rmp.db'
cmsc_rmp_db_filepath = db_filepath + 'cmsc_rmp.db'

bmgt_pt_db_filepath = db_filepath + 'bmgt_pt.db'
cmsc_pt_db_filepath = db_filepath + 'cmsc_pt.db'

In [765]:
def create_connection(db_file):
    """Create a connection to the provided database file.
    
    Args:
        db_file: A string holding the filepath to a database.
    """
    
    conn = None
    print(db_file)
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)

    return conn


def execute_create_command(conn, sql_command, params=()):
    """Executes the provided sql_command on the provided database.
    
    Args:
        conn: The connection object to the database.
        sql_command: A string containing the SQL command.
        params: A tuple of potential parameters.
    """
    
    try:
        c = conn.cursor()
        c.execute(sql_command, params)
        
    except Error as e:
        print(e)
    
    
def execute_insert_command(conn, table_name, column_list, params=()):
    """Executes the provided sql_command on the provided database.
    
    Args:
        conn: The connection object to the database.
        sql_command: A string containing the SQL command.
        params: A tuple of potential parameters.
    """
    
    # Question mark for each value to be filled, don't want a trailing comma
    question_marks = "?," * (len(column_list) - 1)
    question_marks = question_marks + "?"
    
    column_names = ",".join(column_list)
    
    insert_sql = """INSERT INTO {table_name} (
                                {column_names}
                           )
                           VALUES({question_marks})
                           """.format(table_name=table_name, 
                                      question_marks=question_marks,
                                      column_names=column_names)
    
    try:
        c = conn.cursor()
        c.execute(insert_sql, params)
        conn.commit()
        
        return c.lastrowid
    except Error as e:
        print(e)
        
        
def execute_query_command(conn, sql_command, params=()):
    """Executes the provided sql_command on the provided database.
    
    Args:
        conn: The connection object to the database.
        sql_command: A string containing the SQL command.
        params: A tuple of potential parameters.
    """
    
    try:
        c = conn.cursor()
        c.execute(sql_command, params)
        
        return c.fetchall()
    
    except Error as e:
        print(e)
        

def is_professor_scraped(db_conn, professor_name):
    """Returns if the professor's RateMyProfessors page has been scraped already.
    
    Args:
        db_conn: Connection object to the appropriate database.
        professor_name: String holding the professor's name.
    """
    
    sql_command = """SELECT
                        full_name
                    FROM
                        professor_stats ps
                    WHERE
                        full_name LIKE ?"""
    
    params=('%'+professor_name+'%',)
    
    result = execute_query_command(db_conn, sql_command, params)
    
    return len(result) != 0


def insert_dataframe_into_db(db_conn, df, table_name):
    """Inserts all rows of a given dataframe to the database's table.
    
    Args:
        db_conn: Connection object to a database.
        df: Pandas DataFrame object containing data to insert.
        table_name: String holding a table name to insert into ('reviews' or 'professor_stats')
    """
    
    column_list = list(df.columns)
    
    for idx, row in df.iterrows():
        execute_insert_command(db_conn, table_name, column_list, tuple(row.array))


def get_professor_stats_from_db(db_conn, professor):
    """Reads the professor_stats table into a pandas dataframe and returns it."""
    
    sql_query = """SELECT * FROM professor_stats WHERE full_name LIKE ?"""
    
    return pd.read_sql_query(sql_query, db_conn, params=[professor])

def get_professor_reviews_from_db(db_conn, professor):
    """Reads the reviews table into a pandas dataframe and returns it."""
    
    sql_query = """SELECT * FROM reviews WHERE full_name LIKE ?"""
    
    return pd.read_sql_query(sql_query, db_conn, params=[professor])

### Data Storage Part 2: RateMyProfessor Specific Database Functionality

In [766]:
def create_rmp_tables(rmp_conn):
    """Create the stats and review tables for RateMyProfessors data.
    
    Args:
        rmp_conn: Connection object to a RateMyProfessors database.
    """
    
    stats_table = """ CREATE TABLE IF NOT EXISTS professor_stats (
                        id INTEGER PRIMARY KEY,
                        first_name TEXT NOT NULL,
                        last_name TEXT NOT NULL,
                        full_name TEXT NOT NULL UNIQUE ON CONFLICT IGNORE,
                        page_exists INTEGER NOT NULL,
                        rating REAL,
                        take_again REAL,
                        difficulty REAL,
                        rating_count INTEGER NOT NULL,
                        gives_good_feedback INTEGER,
                        respected INTEGER,
                        lots_of_homework INTEGER,
                        accessible_outside_class INTEGER,
                        get_ready_to_read INTEGER,
                        participation_matters INTEGER,
                        skip_class_wont_pass INTEGER,
                        inspirational INTEGER,
                        graded_by_few_things INTEGER,
                        test_heavy INTEGER,
                        group_projects INTEGER,
                        clear_grading_criteria INTEGER,
                        hilarious INTEGER,
                        beware_of_pop_quizes INTEGER,
                        amazing_lectures INTEGER,
                        lecture_heavy INTEGER,
                        caring INTEGER,
                        extra_credit INTEGER,
                        so_many_papers INTEGER,
                        tough_grader INTEGER
                    ) """
    
    # Review id format <professor last name>-<#> 
    review_table = """ CREATE TABLE IF NOT EXISTS reviews (
                        id INTEGER PRIMARY KEY,
                        review_id TEXT NOT NULL UNIQUE ON CONFLICT IGNORE,
                        first_name TEXT NOT NULL,
                        last_name TEXT NOT NULL,
                        full_name TEXT NOT NULL,
                        course TEXT NOT NULL,
                        date INTEGER NOT NULL,
                        body TEXT NOT NULL,
                        thumb_up INTEGER,
                        thumb_down INTEGER,
                        quality REAL NOT NULL,
                        difficulty REAL NOT NULL,
                        would_take_again INTEGER NOT NULL,
                        for_credit INTEGER NOT NULL,
                        textbook INTEGER NOT NULL,
                        attendance INTEGER,
                        grade TEXT,
                        online_class INTEGER,
                        gives_good_feedback INTEGER,
                        respected INTEGER,
                        lots_of_homework INTEGER,
                        accessible_outside_class INTEGER,
                        get_ready_to_read INTEGER,
                        participation_matters INTEGER,
                        skip_class_wont_pass INTEGER,
                        inspirational INTEGER,
                        graded_by_few_things INTEGER,
                        test_heavy INTEGER,
                        group_projects INTEGER,
                        clear_grading_criteria INTEGER,
                        hilarious INTEGER,
                        beware_of_pop_quizes INTEGER,
                        amazing_lectures INTEGER,
                        lecture_heavy INTEGER,
                        caring INTEGER,
                        extra_credit INTEGER,
                        so_many_papers INTEGER,
                        tough_grader INTEGER
                   ) """
    
    execute_create_command(rmp_conn, stats_table)
    execute_create_command(rmp_conn, review_table)

# Create the CMSC and BMGT database with the two tables
cmsc_rmp_db = create_connection(cmsc_rmp_db_filepath)
bmgt_rmp_db = create_connection(bmgt_rmp_db_filepath)

create_rmp_tables(cmsc_rmp_db)
create_rmp_tables(bmgt_rmp_db)

# Close for now, will reopen when writing to them
cmsc_rmp_db.close()
bmgt_rmp_db.close()

./data/db/cmsc_rmp.db
./data/db/bmgt_rmp.db


### Data Storage Part 3: PlanetTerp Database Functionality

In [767]:
def create_pt_tables(pt_conn):
    """Create the stats and review tables for RateMyProfessors data.
    
    Args:
        pt_conn: Connection object to a PlanetTerp database.
    """
    
    # TODO: Keep track of grade distribution in this table?
    stats_table = """ CREATE TABLE IF NOT EXISTS professor_stats (
                        id INTEGER PRIMARY KEY,
                        first_name TEXT NOT NULL,
                        last_name TEXT NOT NULL,
                        full_name TEXT NOT NULL UNIQUE ON CONFLICT IGNORE,
                        page_exists INTEGER NOT NULL,
                        slug TEXT,
                        review_count INTEGER NOT NULL,
                        type TEXT
                    ) """
    
    # TODO: Review id format? <professor last name>-<#> ?
    review_table = """ CREATE TABLE IF NOT EXISTS reviews (
                        id INTEGER PRIMARY KEY,
                        review_id TEXT NOT NULL UNIQUE ON CONFLICT IGNORE,
                        full_name TEXT NOT NULL,
                        course TEXT NOT NULL,
                        date INTEGER NOT NULL,
                        body TEXT NOT NULL,
                        rating INTEGER NOT NULL,
                        expected_grade TEXT
                   ) """
    
    execute_create_command(pt_conn, stats_table)
    execute_create_command(pt_conn, review_table)

# Create the CMSC and BMGT database with the two tables
cmsc_pt_db = create_connection(cmsc_pt_db_filepath)
bmgt_pt_db = create_connection(bmgt_pt_db_filepath)

create_pt_tables(cmsc_pt_db)
create_pt_tables(bmgt_pt_db)

# Close for now, will reopen when writing to them
cmsc_pt_db.close()
bmgt_pt_db.close()


./data/db/cmsc_pt.db
./data/db/bmgt_pt.db


##### TODO: Alternative database structure, where each professor has its own tables  

In [768]:
def insert_rmp_professor_overall_table(rmp_conn, professor_name, overall_df):
    table_name = professor_name + "_stats"
    overall_df.to_sql(table_name, con=rmp_conn, if_exists='append')
    
def insert_rmp_professor_reviews_table(rmp_conn, professor_name, review_df):
    table_name = professor_name + "_reviews"
    review_df.to_sql(table_name, con=rmp_conn, if_exists='append')

### End Data Storage

TODO: Accomplished

## Data Collection Part 1: Grabbing Introductory Course Professors From UMD.io

TODO: Explain section

In [769]:
# Base API url for UMD.io
professors_url = "https://api.umd.io/v1/professors"

# The filepaths for the files to hold professor information
cmsc_professor_names_filepath = './data/cmsc_professor_names.csv'
bmgt_professor_names_filepath = './data/bmgt_professor_names.csv'

# Determines if we've created these already
have_cmsc_professors = path.exists(cmsc_professor_names_filepath)
have_bmgt_professors = path.exists(bmgt_professor_names_filepath)


# Courses we're interseted in look at
# TODO: What about honors?
# TODO: Dr. Eastman has taught CMSC131, per RateMyProfessor, but wasn't given via UMD.IO
cmsc_course_ids = ["CMSC131", "CMSC132", "CMSC216", "CMSC250"]
bmgt_course_ids = ["BMGT110", "BMGT220", "BMGT221", "BMGT230"]

#### Utilities for saving professor data to a file

In [770]:
def read_professor_name_data(professor_filepath):
    """Reads the professor names and their courses from a CSV file.
    
    Args:
        professor_filepath: String holding a filepath to the professor csv file.
        
    Returns:
        A dictionary of professor names to a set of courses they have taught.
    """
    
    with open(professor_filepath, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        line_count = 0

        professors = {}

        for row in csv_reader:
            if line_count != 0:
                professors[row['name']] = set([course for course in row['courses'].split(' ')])
            line_count += 1

        return professors

def save_professor_data(professors, filepath):
    """Saves the professor names and their courses to a CSV file.
    
    Args:
        professors: A dictionary of professor name keys and a set of courses for values.
    """
    
    columns = ['name', 'courses']
    try:
        with open(filepath, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()
            
            for name, courses in professors.items():
                writer.writerow({'name': name, 'courses': ' '.join(courses)})
                
    except IOError:
        print("Error in writing the CSV file")

#### Utility to actually grab professors based on a list of courses

In [771]:
def get_professors_for_courses(course_ids):
    """Gets all the professors for the given course_ids and returns a list of them.
    
    Args:
        course_ids: A list of course ids (e.g. ['CMSC216', CMSC250']).
        
    Returns:
        List of professors that teach the given courses.
    
    """
    
    professors = {}
    
    for course_id in course_ids:
        params = {'course_id': course_id}

        response = requests.get(professors_url, params)
        
        if response.status_code == 200:

            for item in response.json():
                name = item['name']

                if name in professors:
                    professors[name].add(course_id)
                else:
                    professors[name] = {course_id}

    return professors

### Grab Computer Science Professors

In [772]:
# Only query the UMD.io API if we don't have the data
if not have_cmsc_professors:
    cmsc_professors = get_professors_for_courses(cmsc_course_ids)
    save_professor_data(cmsc_professors, cmsc_professor_names_filepath)
    have_cmsc_professors = True
else: 
    cmsc_professors = read_professor_name_data(cmsc_professor_names_filepath)

    if not cmsc_professors:
        print("Error response from umd.io API")

if 'Iason Filippou' in cmsc_professors:
    cmsc_professors.pop('Iason Filippou') # A typo of Jason Filippou from the database
    
print(cmsc_professors)

{'Fawzi Emad': {'CMSC250', 'CMSC131', 'CMSC132'}, 'Ilchul Yoon': {'CMSC216', 'CMSC131', 'CMSC132'}, 'Nelson Padua-Perez': {'CMSC216', 'CMSC131', 'CMSC132'}, 'Pedram Sadeghian': {'CMSC131', 'CMSC132'}, 'Anwar Mamat': {'CMSC132'}, 'Laurence Herman': {'CMSC216', 'CMSC132'}, 'A Shankar': {'CMSC216'}, 'Aditya Acharya': {'CMSC250'}, 'Alexander Brassel': {'CMSC250'}, 'Clyde Kruskal': {'CMSC250'}, 'David Sekora': {'CMSC250'}, 'Donald Perlis': {'CMSC250'}, 'Jason Filippou': {'CMSC250'}, 'Mohammad Nayeem Teli': {'CMSC250'}, 'Roger Eastman': {'CMSC250'}}


### Grab Business Management Professors

In [773]:
# Only query the UMD.io API if we don't have the data
if not have_bmgt_professors:
    bmgt_professors = get_professors_for_courses(bmgt_course_ids)
    save_professor_data(bmgt_professors, bmgt_professor_names_filepath)
    have_bmgt_professors = True
else:
    bmgt_professors = read_professor_name_data(bmgt_professor_names_filepath)

    if not bmgt_professors:
        print("Error response from umd.io API")

print(bmgt_professors)

{'Jeff Miller': {'BMGT110'}, 'Cody Hyman': {'BMGT220'}, 'Laurel Mazur': {'BMGT221', 'BMGT220'}, 'Progyan Basu': {'BMGT220'}, 'Viktoriya Zotova': {'BMGT220'}, 'Gary Bulmash': {'BMGT221'}, 'Gerald Ward': {'BMGT221'}, 'Ai Ren': {'BMGT230'}, 'Daehoon Noh': {'BMGT230'}, 'Erich Studer-Ellis': {'BMGT230'}, 'Huan Cao': {'BMGT230'}, 'Radu Lazar': {'BMGT230'}, 'Shubham Akshat': {'BMGT230'}, 'Ziwei Cao': {'BMGT230'}}


## Data Collection Part 2: Grabbing Reviews From RateMyProfessors

TODO: Describe section

### Data Collection Part 2.1: Setup and Utilities to Scrape and Parse Data from RateMyProfessor

#### Data Collection Part 2.1.1: Setup Data

In [774]:
# Data needed for requesting data from RateMyProfessor
ratemyprofessor_url = "https://www.ratemyprofessors.com/search.jsp"
params = {'queryoption':'HEADER', 'schoolID':'1270', 'queryBy':'teacherName', 'schoolName':'University+of+Maryland'}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0",
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Allow-Methods": "GET",
}


# List of tags that RateMyProfessor uses to describe professors, which are used for the database and dataframes
tag_list = ['gives_good_feedback', 'respected', 'lots_of_homework', 'accessible_outside_class',
           'get_ready_to_read', 'participation_matters', 'inspirational',
           'graded_by_few_things', 'test_heavy', 'group_projects', 'clear_grading_criteria', 
           'hilarious', 'beware_of_pop_quizes', 'amazing_lectures', 'lecture_heavy', 'caring',
           'extra_credit', 'so_many_papers', 'tough_grader', 'skip_class_wont_pass']

# Want to tie the code friendly tag names to what is found on a RateMyProfessor page
text_tag_list = [' '.join(x.split('_')) for x in tag_list]
text_tag_list.remove('skip class wont pass')
text_tag_list.append("skip class? you won't pass.")

# both tag_list and text_tag_list in same order, and correspond to one another
text_tag_dict = {text_tag_list[i]: tag_list[i] for i in range(len(text_tag_list))}

# These are the column headers for a professor's overall statistics found at the top of the page
overall_header_list = ['first_name', 'last_name', 'full_name', 'page_exists', 'rating', 'take_again', 'difficulty',
                      'rating_count'] + tag_list

# Review post column headers. The meta list is the row of top meta responses (like 'Grade: A-').
review_meta_list = ['would_take_again', 'grade', 'textbook', 'online_class', 'for_credit', 'attendance']
review_text_meta_list = [' '.join(x.split('_')) for x in review_meta_list]
review_meta_dict = {review_text_meta_list[i]: review_meta_list[i] for i in range(len(review_meta_list))}
        
review_header_list = ['review_id', 'course', 'date', 'quality', 'difficulty', 'body',
                      'thumb_up', 'thumb_down'] + review_meta_list + tag_list 

#### Data Collection Part 2.1.2: Utility Functionality

In [775]:
def tags_to_dict(provided_tags):
    """Turns the list of text tags (e.g. skip class? you won't pass) into a dictionary
    of approriately named tags that work for database columns and if they were present.
    
    Args:
        provided_tags: A list of space separated tags scraped from the RMP page.
        
    Returns:
        A dictionary of {tag: 1 or 0} on whether a tag was used to describe the professor.
    """
    
    tag_dict = {val: 0 for val in text_tag_dict.values()}
    
    for tag in provided_tags:
        if tag.lower() in text_tag_dict.keys():
            tag_dict[text_tag_dict[tag.lower()]] = 1
            
    return tag_dict

def meta_to_dict(provided_meta):
    """Turns the dictionary of meta tags (e.g. Would Take Again: No) into a dictionary
    of appropriately named tags that work for database columns and values if they were present.
    
    Args:
        provided_meta: A dictionary of meta information from a review.
        
    Returns:
        A dictionary of {meta: 1 or 0} on whether a meta was used on the review.
    """
    
    meta_dict = {val: 0 for val in review_meta_list}
    
    for meta, response in provided_meta.items():
        value = 0
        
        if meta.lower() in review_meta_dict.keys():
            
            if response.lower() == "yes" or response.lower() == "mandatory":
                value = 1
                
            if meta.lower() == "grade":
                value = response
            
            meta_dict[review_meta_dict[meta.lower()]] = value
            
    return meta_dict

### Data Collection Part 2.2: Querying RateMyProfessor and Getting the Professor's URL

In [776]:
def find_rmp_professor_url(html_doc):
    """Finds the professor's URL on the search page and returns it.
    
    Args:
        html_doc: A string containing an HTML document.
        
    Returns:
        The full URL for the professor's page (if found).
    
    """
    
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    no_results = soup.find('div[class*="NoResultsFoundArea__StyledNoResultsFound"]')
    partial_url = soup.find('li', class_='listing PROFESSOR')
    
    # Sometimes RMP does the search differntly, so it'll be elsewhere
    diff_location = soup.find('a', attrs={'class': lambda x: 'TeacherCard__StyledTeacherCard' in x if x else False}, href=True)
    
    # The professor may not be reviewed
    if no_results is None and partial_url and len(partial_url) != 0:
        if diff_location:
            partial_url = diff_location['href']
        else:
            partial_url = partial_url.find('a', href=True)

        if partial_url:
            main_url = "https://www.ratemyprofessors.com"
            return main_url + partial_url['href']
    else:
        return None
    
    
def query_rmp_for_professor_url(professor_name, headers, params):
    """Queries RateMyProfessor for the professor, given the parameters and headers.
    
    Args:
        professor_name: The <first name> <last name> of the professor.
        headers: Dictionary of headers for the get request.
        params: Dictionary of parameters for the get request.
        
    Returns:
        The full URL for the professor's page after searching for it (if found).
        
    """
    
    params['query'] = professor_name
    
    response = requests.get(ratemyprofessor_url, headers=headers, params=params)
    
    if response.status_code == 200:
        url = find_rmp_professor_url(response.text)
        
        if url is not None:
            return url
        else:
            print("Professor {name} has not been reviewed.".format(name=professor_name))
            return None

### Data Collection Part 2.3: Parsing the Professor Overall Information (Stats and Tags)

In [777]:
def get_rmp_prof_stats(page_text):
    """Parses the professor's stats from their page and returns them. Namely their overall rating, 
    how many would take again, overall difficulty and how many ratings they have on RateMyProfessor.
    
    Args:
        page_text: An HTML document of the professor's page.
        
    Returns:
        A dictionary containing their rating, take again percentage, difficulty rating, and rating count.
    """
    
    soup = BeautifulSoup(page_text, 'html.parser')
    
    rating_score = soup.select('div[class*="RatingValue__Numerator"]')
    
    if rating_score is not None:
        rating_score = float(rating_score[0].text)
    else:
        print('Rating score error: \n')
        print(soup)
    
    feedback = soup.select('div[class*="TeacherFeedback__StyledTeacherFeedback"]')[0].select('div[class*="FeedbackItem__FeedbackNumber"]')
    
    take_again = float(feedback[0].text[:-1]) / 100
    difficulty = float(feedback[1].text)
    
    rating_count = soup.select('div[class*="RatingValue__NumRatings"]')[0].select('a')[0].text
    rating_count = ''.join([x for x in rating_count if x.isdigit()])
    rating_count = int(rating_count)
    
    return {'rating': rating_score, 'take_again': take_again, 'difficulty': difficulty, 'rating_count': rating_count}


def get_rmp_prof_top_tags(page_text):
    """Parses and returns the professor's top tags.
    
    Args:
        page_text: An HTML document of the professor's page.
        
    Returns:
        A list of tags describing the professor.
    """
    
    soup = BeautifulSoup(page_text, 'html.parser')
    
    tags = []
    unparsed_tags = soup.select('div[class*="TeacherTags__TagsContainer"]')
    
    if unparsed_tags and len(unparsed_tags) != 0:
        unparsed_tags = unparsed_tags[0].select('span')
    
        for tag in unparsed_tags:
            tags.append(tag.text)
        
    return tags_to_dict(tags)


def rmp_prof_overall_to_dataframe(professor_name, stats, tags, page_exists=1):
    """Combines the professor's overall stats and tags into a pandas dataframe.
    
    Args:
        professor_name: String holding the professor's name.
        stats: A dictionary holding the overall stats (e.g. 'would_take_again': .83).
        tags: A dictionary holding the tags associated with a professor (e.g. {'caring': 1}).
        page_exists (optional, default=1): Integer boolean determining if a professor has a RMP page.
        
    Returns:
        A dataframe containing the combination of professor name, stats, and tags.
    """
    
    overall_df = pd.DataFrame(columns=overall_header_list)
    
    first_name, last_name = professor_name.split(' ', 1)
    overall_dict = {'first_name': first_name, 'last_name': last_name, 'full_name': professor_name, 'page_exists': page_exists}
    
    overall_dict.update(stats)
    overall_dict.update(tags)
    
    overall_df = overall_df.append(overall_dict, ignore_index=True)
    
    return overall_df

### Data Collection Part 2.4: Use Selenium to Load All Professor Reviews

In [778]:
def start_selenium():
    """Starts up the Selenium browser."""
    driver = webdriver.Firefox(executable_path='./bin/geckodriver.exe')
    return driver
    
def stop_selenium(driver):
    """Shutdown the Selenium browser."""
    driver.close()
    driver.quit()
    
def load_all_rmp_reviews(page_url, driver):
    """Loads all the reviews for a given porfessor and returns the text of all of them.
    
    Args:
        page_url: The URL for the professor's page.
        
    Returns:
        A string containing the HTML for all the reviews.
    """

    driver.get(page_url)
    
    # RateMyProfessors has a cookies pop up that overlays the website, it needs to be closed first
    time.sleep(0.5)
    close_cookies = driver.find_elements(By.XPATH, '//button[text()="Close"]')
    
    if close_cookies:
        close_cookies[0].click()
        
    load_more = driver.find_elements(By.XPATH, '//button[text()="Load More Ratings"]')
    
    # RateMyProfessors paginates the reviews via Javascript, so we must continually load more while the button is present
    while load_more:
        load_more[0].click()
        time.sleep(1)
        load_more = driver.find_elements(By.XPATH, '//button[text()="Load More Ratings"]')
        
        
    all_reviews = driver.find_element_by_id('ratingsList').get_attribute('outerHTML')
    
    
    return all_reviews

### Data Collection Part 2.5: Parsing Utilities for a Single Review

In [779]:
def string_date_to_unix(date_str):
    """Turns the RateMyProfessor date format (e.g. Nov 23rd, 2020) into a
    UTC timestamp. Assumes the date is already in UTC.
    
    Args:
        date_str: A string containing the RateMyProfessor review date.
        
    Returns:
        A UTC timestamp corresponding to the date provided.
    """
    
    # Split into month, day, year
    date_split = date_str.split(' ')
    day = date_split[1]
    
    # Remove comma and suffix for day
    day = day[:-3]
    
    # Place the day back into the list and join everything back together
    date_split[1] = day
    remade_date_str = (' ').join(date_split)
    
    # Change into UTC time
    datetime_obj = datetime.datetime.strptime(remade_date_str, '%b %d %Y')
    utc_time = datetime_obj.timestamp()
    
    return utc_time
    
def parse_rating_header(soup):
    """Parses and returns the rating header for a single review.
    
    Args:
        soup: An initialized BeautifulSoup object for the professor's page.
        
    Returns:
        A dictionary containing the course and date for the review.
    """
    
    rating_header = soup.select('div[class*="Rating__RatingInfo"]')
    
    if len(rating_header) != 0:
        course = rating_header[0].select('div[class*="RatingHeader__StyledClass"]')[0].text.strip()
        date = rating_header[0].select('div[class*="TimeStamp__StyledTimeStamp"]')[0].text.strip()
        
        utc_time = string_date_to_unix(date)
    else:
        print(soup)
    
    return {'course': course, 'date': utc_time}

def parse_meta_data(soup):
    """Parses and returns the meta data for a single review.
    
    Args:
        soup: An initialized BeautifulSoup object for the professor's page.
        
    Returns:
        A dictionary containing the meta data (e.g. Would Take Again) for the review.
    """
    
    course_meta = soup.select('div[class*="CourseMeta__StyledCourseMeta"]')[0]
    review_meta_data = {}

    for meta_div in course_meta.select('div'):
        meta_data = meta_div.text.split(':')
        meta_name = meta_data[0].strip()
        meta_value = meta_data[1].strip()

        review_meta_data[meta_name] = meta_value

    return meta_to_dict(review_meta_data)

def parse_rating_data(soup):
    """Parses and returns the rating data for a single review.
    
    Args:
        soup: An initialized BeautifulSoup object for the professor's page.
        
    Returns:
        A dictionary containing the rating data for the quality and difficulty for the review.
    """
    
    rating_values_text = soup.select('div[class*="RatingValues__StyledRatingValues"]')[0].select('div[class*="RatingValues__RatingValue"]')
    quality = rating_values_text[0].text
    difficulty = rating_values_text[1].text

    rating_data = {'quality': quality, 'difficulty': difficulty}
    
    return rating_data

def parse_review_tags(soup):
    """Parses and returns the tags for a single review.
    
    Args:
        soup: An initialized BeautifulSoup object for the professor's page.
        
    Returns:
        A list containing the tags for the review.
    """
    
    tag_container = soup.select('div[class*="RatingTags__StyledTags"]')
    tags = []
    
    if tag_container: # Since not all reviews add tags
        unparsed_tags = tag_container[0].select('span')

        for tag in unparsed_tags:
            tags.append(tag.text)

    return tags_to_dict(tags)
    
def parse_thumb_scoring(soup):
    """Parses and returns the thumb scoring data for a single review.
    
    Args:
        soup: An initialized BeautifulSoup object for the professor's page.
        
    Returns:
        A dictionary containing the thumb scoring data for the review.
    """
    
    thumb_container = soup.select('div[class*="RatingFooter__StyledRatingFooter"]')[0].select('div[class*="RatingFooter__HelpTotal"]')

    thumb_up = int(thumb_container[0].text.strip())
    thumb_down = int(thumb_container[1].text.strip())
    thumb_data = {'thumb_up': thumb_up, 'thumb_down': thumb_down}

    return thumb_data

def parse_review_text(soup):
    """Parses and returns the review body text for a single review.
    
    Args:
        soup: An initialized BeautifulSoup object for the professor's page.
        
    Returns:
        A string containing the review text for the review.
    """
    
    review_text = soup.select('div[class*="Comments__StyledComments"]')[0].text
    
    return {'body': review_text}
    
def parse_single_rmp_review(review_item, courses):
    """Parses and returns all data for a single review.
    Namely it returns: Meta data, rating data, tags, thumb_scoring, and review text.
    
    Args:
        review_item: A single review list item containing all the appropraite HTML.
        
    Returns:
        A dictionary containing the meta data, rating data, tags, thumb_scoring, and review text
        for a single review.
    """
    
    soup = BeautifulSoup(review_item, 'html.parser')
    
    course_and_date = parse_rating_header(soup)
    
    # TODO: Loses course reviews like 'CMSC131CMSC132' where students combined multiple courses they took
    if course_and_date['course'] in courses:
        
        # Meta data
        meta_data = parse_meta_data(soup)
        
        # Rating data
        rating_data = parse_rating_data(soup)
        
        # Tags 
        tags = parse_review_tags(soup)
        
        # Thumb Scoring
        thumb_scoring = parse_thumb_scoring(soup)
        
        # Review body
        review_text = parse_review_text(soup)
        
        return {'meta_data': meta_data, 'rating_data': rating_data, 'tags': tags, 'thumb_scoring': thumb_scoring,
                'review_text': review_text, 'rating_header': course_and_date}

### Data Collection 2.6: Parsing Utilities for an Entire RateMyProfessor Page

In [780]:
def get_rmp_prof_reviews(rmp_prof_url, selenium_driver, prof_name, courses):
    """Gets all the RateMyProfessor reviews for a given professor and places into a
    dataframe. Only grabs reviews for classes in the provided courses.
    
    Args:
        rmp_prof_url: A string containing the RateMyProfessor URL for the professor.
        prof_name: A string containing the professor's name.
        prof_courses: List of courses to look for in the reviews.
        
    Returns:
        A dataframe containing all the appropriate reviews.
    """
    
    reviews_html = load_all_rmp_reviews(rmp_prof_url, selenium_driver)
    soup = BeautifulSoup(reviews_html, 'html.parser')
    
    first_name, last_name = prof_name.split(' ', 1)
    review_id_head = prof_name + '-'
    counter = 1
    
    review_df = pd.DataFrame(columns=review_header_list)
    
    for review in soup.find_all('li'):
        
        if len(review.select('div[class*="Rating__StyledRating"]')) != 0: # Avoid advertisement list items
            data = parse_single_rmp_review(str(review), courses)

            cur_review_id = review_id_head + str(counter)
            counter = counter + 1
            
            if data: # Since the review could be of an undesired course
                flattened_data = {'first_name': first_name, 'last_name': last_name, 'full_name': prof_name,
                                  'review_id': cur_review_id}

                for data_type, data_dict in data.items():
                    
                    for key, val in data_dict.items():
                        flattened_data[key] = val

                review_df = review_df.append(flattened_data, ignore_index=True)
    
    return review_df


def parse_rmp_page(rmp_prof_url, headers, rmp_conn, selenium_driver, professor_name, courses):
    """Parses an entire RateMyProfessor professor page for overall stats & tags, and all
    of their reviews. It will return two dataframes holding this information and insert
    them into a database.
    
    Args:
        rmp_prof_url: A string containing the RateMyProfessor URL for the professor.
        headers: Request headers to use.
        rmp_conn: Connection object to the RateMyProfessor database.
        prof_name: A string containing the professor's name.
        courses: List of courses to look for in the reviews.
        
    Returns:
        A tuple of two dataframes, (overall statistics, all the reviews).
    """
    
    rmp_prof_page = requests.get(rmp_prof_url, headers=headers)
    
    if rmp_prof_page.status_code == 200:
        soup = BeautifulSoup(rmp_prof_page.text, 'html.parser')
        
        # Professor stats
        stats_container = soup.select('div[class*="TeacherInfo__StyledTeacher"]')[0]
        
        prof_stats = get_rmp_prof_stats(str(stats_container))
        prof_tags = get_rmp_prof_top_tags(str(stats_container))
        
        overall_df = rmp_prof_overall_to_dataframe(professor_name, prof_stats, prof_tags)
        insert_dataframe_into_db(rmp_conn, overall_df, 'professor_stats')
        
        # Professor reviews
        all_reviews_df = get_rmp_prof_reviews(rmp_prof_url, selenium_driver, professor_name, courses)
        insert_dataframe_into_db(rmp_conn, all_reviews_df, 'reviews')
        
        return (overall_df, all_reviews_df)
    else:
        print("Error opening the RateMyProfessor professor page")

### Data Collection 2.7: Scrape and Parse All Professors Utilities

In [781]:
def fill_nonexistant_rmp_data(rmp_conn, professor):
    """Marks a professor as not having a page and fills professor's overall statistics dataframe
    with empty values so that it may be placed into the database and not re-queried for later.
    
    Args:
        rmp_db: Connection object to the RateMyProfessor database.
        professor: String containing the name of the professor.
    """
    
    empty_tags = tags_to_dict([])
    empty_stats = {'rating': 0, 'take_again': 0, 'difficulty': 0, 'rating_count': 0}
    
    # Professor stats
    overall_df = rmp_prof_overall_to_dataframe(professor, empty_stats, empty_tags, page_exists=0)
    insert_dataframe_into_db(rmp_conn, overall_df, 'professor_stats')
    
    return None
    
    
def parse_rmp_all_professors(rmp_db_filepath, professors, provided_courses, force_scrape=False):
    """Scrapes and parses all professors, storing the data in a database and returning a
    list of dataframes for stats and reviews.
    
    Args:
        rmp_db_filepath: String containing the filepath to the appropriate database.
        professors: Dictionary of professors to list of courses.
        force_scrape (optional, default=False): Forces a scrape of RateMyProfessors even if already done.
        
    Returns:
        The tuple (stats, reviews) where each is a list of dataframes.
    """
    
    all_major_stats = []
    all_major_reviews = []
    
    rmp_db = create_connection(rmp_db_filepath)
    selenium_driver = start_selenium()
    
    try:
        for professor, courses in professors.items():
            overall_stats_df = None
            all_reviews_df = None

            # Read from database if the professor has already been scraped (only checks stats for confirmation)
            if not force_scrape and is_professor_scraped(rmp_db, professor):
                overall_stats_df = get_professor_stats_from_db(rmp_db, professor)
                all_reviews_df = get_professor_reviews_from_db(rmp_db, professor)

                # Keep track of the dataframes for each professor
                all_major_stats.append(overall_stats_df)
                all_major_reviews.append(all_reviews_df)

            else:
                # Get all the data from the professor's RateMyProfessor page
                prof_rmp_url = query_rmp_for_professor_url(professor, headers, params)

                # If the professor has a RateMyProfessor page
                if prof_rmp_url is not None:
                    overall_stats_df, all_reviews_df = parse_rmp_page(prof_rmp_url, headers, rmp_db, selenium_driver, professor, provided_courses)

                    # Keep track of the dataframes for each professor
                    all_major_stats.append(overall_stats_df)
                    all_major_reviews.append(all_reviews_df)
                    
                    # So we don't query RateMyProfessor too much
                    time.sleep(1)

                else:
                    # Used to fill the stats table to show their page doesn't exist
                    fill_nonexistant_rmp_data(rmp_db, professor)

    except Exception as e:
        print("Type error: " + str(e))

    finally:
        rmp_db.close()
        stop_selenium(selenium_driver)
    
        return (all_major_stats, all_major_reviews)

### Data Collection Part 2.8: Scrape and Parse All Computer Science Professors from RateMyProfessor

In [782]:
all_rmp_cmsc_stats, all_rmp_cmsc_reviews = parse_rmp_all_professors(cmsc_rmp_db_filepath, cmsc_professors, cmsc_course_ids)

merged_rmp_cmsc_stats = pd.concat(all_rmp_cmsc_stats)
merged_rmp_cmsc_reviews = pd.concat(all_rmp_cmsc_reviews)

./data/db/cmsc_rmp.db


In [783]:
merged_rmp_cmsc_stats.head()

Unnamed: 0,id,first_name,last_name,full_name,page_exists,rating,take_again,difficulty,rating_count,gives_good_feedback,...,group_projects,clear_grading_criteria,hilarious,beware_of_pop_quizes,amazing_lectures,lecture_heavy,caring,extra_credit,so_many_papers,tough_grader
0,1,Fawzi,Emad,Fawzi Emad,1,4.4,0.83,3.1,114,0,...,0,0,1,0,1,0,0,0,0,0
0,2,Ilchul,Yoon,Ilchul Yoon,1,2.1,0.16,4.2,33,0,...,0,0,0,0,0,1,0,0,0,1
0,3,Nelson,Padua-Perez,Nelson Padua-Perez,1,4.0,0.8,3.1,94,0,...,0,0,1,0,1,0,1,0,0,0
0,4,Pedram,Sadeghian,Pedram Sadeghian,1,3.0,0.55,3.5,21,0,...,0,0,0,0,0,1,0,0,0,1
0,5,Anwar,Mamat,Anwar Mamat,1,3.1,0.69,3.3,20,0,...,1,1,1,0,0,0,0,0,0,1


In [784]:
merged_cmsc_reviews.head()

Unnamed: 0,id,review_id,first_name,last_name,full_name,course,date,body,thumb_up,thumb_down,...,group_projects,clear_grading_criteria,hilarious,beware_of_pop_quizes,amazing_lectures,lecture_heavy,caring,extra_credit,so_many_papers,tough_grader
0,1.0,Fawzi Emad-1,Fawzi,Emad,Fawzi Emad,CMSC131,1606972000.0,Fawzi is a legend,0,0,...,0,0,1,0,0,1,0,0,0,0
1,2.0,Fawzi Emad-2,Fawzi,Emad,Fawzi Emad,CMSC131,1606885000.0,Fawzi is great. Lectures are super easy to fol...,0,0,...,0,1,0,0,1,0,0,0,0,0
2,3.0,Fawzi Emad-3,Fawzi,Emad,Fawzi Emad,CMSC131,1603426000.0,Good guy and teacher. Just gives a lot of work...,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4.0,Fawzi Emad-4,Fawzi,Emad,Fawzi Emad,CMSC131,1599365000.0,He's a great lecturer but his tests are super ...,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5.0,Fawzi Emad-5,Fawzi,Emad,Fawzi Emad,CMSC131,1598760000.0,"No joke, he is the best professor at UMD hands...",0,0,...,0,0,1,0,1,0,1,0,0,0


### Data Collection Part 2.9: Scrape and Parse All Business Management Professors from RateMyProfessor

In [785]:
all_rmp_bmgt_stats, all_rmp_bmgt_reviews = parse_rmp_all_professors(bmgt_rmp_db_filepath, bmgt_professors, bmgt_course_ids)

merged_rmp_bmgt_stats = pd.concat(all_bmgt_stats)
merged_rmp_bmgt_reviews = pd.concat(all_bmgt_reviews)

./data/db/bmgt_rmp.db


In [786]:
merged_rmp_bmgt_stats.head()

Unnamed: 0,id,first_name,last_name,full_name,page_exists,rating,take_again,difficulty,rating_count,gives_good_feedback,...,group_projects,clear_grading_criteria,hilarious,beware_of_pop_quizes,amazing_lectures,lecture_heavy,caring,extra_credit,so_many_papers,tough_grader
0,1.0,Jeff,Miller,Jeff Miller,1,3.5,0.52,2.8,86,0,...,0,0,1,0,1,0,0,0,0,0
0,2.0,Cody,Hyman,Cody Hyman,1,5.0,1.0,2.0,1,1,...,0,1,0,0,0,0,0,0,0,0
0,,Progyan,Basu,Progyan Basu,1,3.9,0.85,3.9,39,0,...,0,1,0,0,0,0,0,0,0,0
0,,Gary,Bulmash,Gary Bulmash,1,4.2,0.86,3.0,48,0,...,0,1,0,0,0,0,1,0,0,0
0,,Ai,Ren,Ai Ren,1,3.3,0.5,4.6,4,1,...,0,1,0,0,0,0,0,1,0,1


In [787]:
merged_rmp_bmgt_reviews.head()

Unnamed: 0,id,review_id,first_name,last_name,full_name,course,date,body,thumb_up,thumb_down,...,group_projects,clear_grading_criteria,hilarious,beware_of_pop_quizes,amazing_lectures,lecture_heavy,caring,extra_credit,so_many_papers,tough_grader
0,1.0,Jeff Miller-2,Jeff,Miller,Jeff Miller,BMGT110,1606972000.0,"Prof. Miller is a really interesting man, at t...",0,0,...,0,0,1,0,0,0,1,0,0,0
1,2.0,Jeff Miller-4,Jeff,Miller,Jeff Miller,BMGT110,1599365000.0,Do not take this class. He reads the news to t...,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3.0,Jeff Miller-5,Jeff,Miller,Jeff Miller,BMGT110,1590898000.0,Often goes off on tangents and strays away fro...,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4.0,Jeff Miller-7,Jeff,Miller,Jeff Miller,BMGT110,1590379000.0,His franchising paper is hell. So much researc...,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5.0,Jeff Miller-8,Jeff,Miller,Jeff Miller,BMGT110,1589861000.0,"A great guy, but not a great professor. He doe...",0,0,...,0,0,0,0,0,1,0,0,0,0


## Data Collection Part 3: Query and Parse Data from PlanetTerp 

TODO: Describe section

### Data Collection Part 3.1: Setup and Utilities for PlanetTerp

In [797]:
# See https://api.planetterp.com/#get-a-professor
planetterp_api = "https://api.planetterp.com/v1/professor"
pt_header = {'Accept': 'application/json'}
params = {'reviews': 'true'}

stats_columns=['first_name', 'last_name', 'full_name', 'slug', 'review_count', 'type', 'page_exists']
review_columns=['review_id', 'full_name', 'course', 'date', 'body', 'rating', 'expected_grade']

base_grades = ['A', 'B', 'C', 'D']
grade_possibilities = list(chain(*([x + '+', x, x + '-'] for x in base_grades)))
grade_possibilities.append(['F', 'W', 'Other'])

In [797]:
def pt_date_to_unix(date_str):
    """Takes the PlanetTerp datetime string and converts to unix time. Assumes
    It is already in UTC timezone.
    
    Args:
        date_str: String containing a date time in the format "%Y-%m-%dT%H:%M:%S".
        
    Returns:
        A unix timestamp real representing the time passed into the function.
    """
    
    # Format: 2020-01-01T00:00:00
    date_time_obj = datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
    return date_time_obj.timestamp() 

### Data Collection Part 3.2: Parsing PlanetTerp Reviews

In [805]:
# TODO: Could simply use the review JSON provided, but may not be the format we want
def parse_pt_single_review(review, review_id, courses):
    """Parses a single PlanetTerp review and places it into a dictionary.
    
    Args:
        review: A dictionary or JSON object holding the review data.
        review_id: A string holding an unique id for this review.
        courses: List of course ids to determine if review wanted.
        
    Returns:
        Dictionary holding review information.
    """
    
    review_dict = {}
    course = review.get('course')
    
    if course and course in courses:
        review_dict = {'full_name': review.get('professor'), 'course': course,
                       'body': review.get('review'), 'expected_grade': review.get('expected_grade'),
                       'rating': review.get('rating')}

        unix_time = pt_date_to_unix(review.get('created'))
        review_dict['date'] = unix_time

        review_dict['review_id'] = review_id

    return review_dict
    
    
def parse_pt_reviews(reviews, courses):
    """Parses all reviews from PlanetTerp, placing those that are within the desired courses
    into a dataframe.
    
    Args:
        reviews: A list of dictionaries, each dictionary representing a single review.
        reviews_df: A dataframe to hold the reviews.
        courses: The desired courses for which to look for in the reviews.
        
    Returns:
        A dataframe containing all the reviews for a professor.
    """
    
    reviews_df = pd.DataFrame(columns=review_columns)
    idx = 1
    
    for review in reviews:
        review_id = review.get('professor') + '-' + str(idx)
        idx = idx + 1
        
        review_dict = parse_pt_single_review(review, review_id, courses)
        
        if bool(review_dict):
            reviews_df = reviews_df.append(review_dict, ignore_index=True)
            
    return reviews_df

### Data Collection Part 3.3: Querying PlanetTerp for Professors

In [806]:
def query_pt_for_professor(professor, courses):
    """Queries the PlanetTerp API for a given professor, gathering their stats
    and reviews. It then returns two dataframes (stats, reviews).
    
    Args:
        professor: String holding the name of the professor to query.
        courses: List of course ids to look for in the reviews.
        
    Returns:
        A tuple (stats, reviews) of dataframes holding the stats and reviews data.
    """
    
    stats_df = pd.DataFrame(columns=stats_columns)
    reviews_df = pd.DataFrame()
    
    params['name'] = professor
    
    
    response = requests.get(planetterp_api, headers=pt_header, params=params)
    
    first_name, last_name = professor.split(' ', 1)
    prof_stats = {'first_name': first_name, 'last_name': last_name, 
                  'full_name': professor}
    
    # The professor may not exist in the PlanetTerp database (though this shouldn't occur)
    if response.status_code == 200:
        json = response.json()
        
        review_count = 0
        reviews = json.get('reviews')
        
        # The professor may not have any reviews
        if reviews:
            review_count = len(reviews)
            reviews_df = parse_pt_reviews(reviews, courses)
            
        stats_cont = {'slug': json.get('slug'), 'type': json.get('type'),
                     'review_count': review_count, 'page_exists': 1}
        
    else:
        stats_cont = {'page_exists': 0, 'review_count': 0}
        
        
    prof_stats.update(stats_cont)
    stats_df = stats_df.append(prof_stats, ignore_index=True)
    
    return (stats_df, reviews_df)


def query_pt_for_all_professors(professors, courses, db_filepath, force_query=False):
    """Queries PlanetTerp for all the professors provided, taking reviews that
    correspond to the given courses, and places professor stats and 
    reviews into a database.
    
    Args:
        professors: A list of strings containing professor names.
        courses: A list of strings containing course ids.
        db_filepath: A string holding the filepath to a database.
        force_query (optional, default=False): Boolean to decied whether to force
            query the PlanetTerp API.
        
    Returns:
        A tuple (stats, reviews) of lists containing all dataframes for
        each professor stats and reviews respectively.
    """
    
    all_major_stats = []
    all_major_reviews = []
    
    try:
        pt_db = create_connection(db_filepath)
        for professor in professors:
            
            if not force_query and is_professor_scraped(pt_db, professor):
                
                stats_df = get_professor_stats_from_db(pt_db, professor)
                reviews_df = get_professor_reviews_from_db(pt_db, professor)

                # Keep track of the dataframes for each professor
                all_major_stats.append(stats_df)
                all_major_reviews.append(reviews_df)
                
            else:
                stats_df, reviews_df = query_pt_for_professor(professor, courses)

                if not stats_df.empty:
                    all_major_stats.append(stats_df)
                    insert_dataframe_into_db(pt_db, stats_df, 'professor_stats')

                if not reviews_df.empty:
                    all_major_reviews.append(reviews_df)
                    insert_dataframe_into_db(pt_db, reviews_df, 'reviews')

                time.sleep(1) # To give some time to the PlanetTerp API
            
        
    except Exception as e:
        print("Type error: " + str(e))
        
    finally:
        pt_db.close()
        return (all_major_stats, all_major_reviews)

### Data Collection Part 3.4: Parse All Computer Science Professors from PlanetTerp

In [807]:
all_pt_cmsc_stats, all_pt_cmsc_reviews = query_pt_for_all_professors(cmsc_professors, cmsc_course_ids, cmsc_pt_db_filepath)
merged_pt_cmsc_stats = pd.concat(all_pt_cmsc_stats)
merged_pt_cmsc_reviews = pd.concat(all_pt_cmsc_reviews)

./data/db/cmsc_pt.db


In [808]:
merged_pt_cmsc_stats.head()

Unnamed: 0,first_name,last_name,full_name,slug,review_count,type,page_exists
0,Fawzi,Emad,Fawzi Emad,emad_fawzi,76,professor,1
0,Ilchul,Yoon,Ilchul Yoon,yoon_ilchul,20,professor,1
0,Nelson,Padua-Perez,Nelson Padua-Perez,padua-perez,76,professor,1
0,Pedram,Sadeghian,Pedram Sadeghian,sadeghian,15,professor,1
0,Anwar,Mamat,Anwar Mamat,mamat,18,professor,1


In [809]:
merged_pt_cmsc_reviews.head()

Unnamed: 0,review_id,full_name,course,date,body,rating,expected_grade
0,Fawzi Emad-1,Fawzi Emad,CMSC131,1463070000.0,BEST PROFESSOR FOR CS! He is so funny and pass...,5,C
1,Fawzi Emad-2,Fawzi Emad,CMSC250,1432041000.0,Best professor in the Computer Science Departm...,5,B
2,Fawzi Emad-3,Fawzi Emad,CMSC250,1431706000.0,"Fawzi is the best teacher I have had at UMD, i...",5,A
3,Fawzi Emad-4,Fawzi Emad,CMSC250,1430921000.0,Fawzi was a really fun and engaging professor ...,5,A
4,Fawzi Emad-7,Fawzi Emad,CMSC131,1418341000.0,Fawzi was an excellent professor for 131. He w...,5,A


### Data Collection Part 3.5: Parse All Business Management Professors from PlanetTerp

In [810]:
all_pt_bmgt_stats, all_pt_bmgt_reviews = query_pt_for_all_professors(bmgt_professors, bmgt_course_ids, bmgt_pt_db_filepath)
merged_pt_bmgt_stats = pd.concat(all_pt_bmgt_stats)
merged_pt_bmgt_reviews = pd.concat(all_pt_bmgt_reviews)

./data/db/bmgt_pt.db


In [811]:
merged_pt_bmgt_stats.head()

Unnamed: 0,id,first_name,last_name,full_name,page_exists,slug,review_count,type
0,1,Jeff,Miller,Jeff Miller,1,miller_jeff,43,professor
0,2,Cody,Hyman,Cody Hyman,1,hyman_cody,0,professor
0,3,Laurel,Mazur,Laurel Mazur,1,mazur,0,professor
0,4,Progyan,Basu,Progyan Basu,1,basu,56,professor
0,5,Viktoriya,Zotova,Viktoriya Zotova,1,zotova,1,professor


In [812]:
merged_pt_bmgt_reviews.head()

Unnamed: 0,id,review_id,full_name,course,date,body,rating,expected_grade
0,1,Jeff Miller-1,Jeff Miller,BMGT110,1516998652,"Pretty funny guy, although he gets off tangent...",4,B+
1,2,Jeff Miller-3,Jeff Miller,BMGT110,1481522400,He is simultaneously the best and worst profes...,3,
2,3,Jeff Miller-4,Jeff Miller,BMGT110,1481195880,The class itself is very boring - the material...,3,
3,4,Jeff Miller-6,Jeff Miller,BMGT110,1463451540,I don't understand for the life of me how so m...,2,
4,5,Jeff Miller-7,Jeff Miller,BMGT110,1451315640,Professor Miller is a great guy and an interes...,4,A


## Data Collection Part 4: Putting It All Together

## Analysis

TODO: Describe section

### Analyzing Word Frequencies and Rudimentary Sentiment Analysis in Reviews

### Analyzing Review Trends

## Visualization

TODO: Describe section

### Visualizing Word Associations and Sentiments

### Visualizing Review Trends and Trends Over Time

## Insights and Conclusion

TODO: Fill-in

## Potential Future Work