In [205]:
"""
Scrape reviews from glassdoor using given links 
Saves individual company reviews to csv file
Method to combine all csvs into one dataframe included 
"""

'\nScrape reviews from glassdoor using given links \nSaves individual company reviews to csv file\nMethod to combine all csvs into one dataframe included \n'

In [165]:
import getpass
import glob
import itertools
import math
import os
import pickle
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
"""
Takes in a list of csv files using glob 
Output: dataframe combining all csvs vertically (axis=0)
Assumes all csvs have same column names 
"""
def files_to_df(glob_csv):
    """
    Combine several CSV files into one DataFrame, stacked vertically (axis=0).

    Args:
        glob_csv: iterable of CSV file paths (e.g. the result of glob.glob).
            All files are assumed to share the same column names.

    Returns:
        A DataFrame holding the rows of every file, in the order the paths
        were given. Each file keeps its own row index (no re-indexing).
        An empty DataFrame is returned when no paths are given.
    """
    frames = []

    for file in glob_csv:
        df = pd.read_csv(file)
        # The first column is discarded; presumably it is the index column
        # that DataFrame.to_csv writes by default -- confirm for new inputs.
        frames.append(df.drop(df.columns[0], axis=1))

    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all previously read rows on every iteration.
    return pd.concat(frames)

In [171]:
# glob returns paths in a filesystem-dependent order; sort them so the
# combined frame has the same row order on every run.
csvfiles = sorted(glob.glob('reviews/top_50_smb/' + '*.csv'))
top_50_smb_reviews = files_to_df(csvfiles)

top_50_smb_reviews

In [None]:
# Display the list of CSV paths currently bound to `csvfiles`.
csvfiles

In [None]:
# glob returns paths in a filesystem-dependent order; sort them so the
# combined frame has the same row order on every run.
csvfiles = sorted(glob.glob('reviews/top_50_large/' + '*.csv'))
top_50_large_reviews = files_to_df(csvfiles)
top_50_large_reviews

In [178]:
# Load the previously pickled link collections for the two company groups.
# (links_top_50_large is later sliced and iterated as a list of Glassdoor
# review-page URLs; links_top_50_smb presumably has the same shape -- confirm.)
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that you created yourself or otherwise trust.
with open('links_top_50_smb.pickle','rb') as f:
    links_top_50_smb = pickle.load(f)
    
with open('links_top_50_large.pickle','rb') as f:
    links_top_50_large = pickle.load(f)

In [200]:
# Show the company links from index 36 onward.
links_top_50_large[36:]

['https://www.glassdoor.com/Reviews/Lawrence-Livermore-National-Laboratory-Reviews-E35235.htm',
 'https://www.glassdoor.com/Reviews/Zscaler-Reviews-E359434.htm',
 'https://www.glassdoor.com/Reviews/In-N-Out-Burger-Reviews-E14276.htm',
 'https://www.glassdoor.com/Reviews/Tanium-Reviews-E952409.htm',
 'https://www.glassdoor.com/Reviews/Red-Hat-Reviews-E8868.htm',
 'https://www.glassdoor.com/Reviews/SAP-Reviews-E10471.htm',
 'https://www.glassdoor.com/Reviews/Zendesk-Reviews-E360923.htm',
 'https://www.glassdoor.com/Reviews/Silicon-Valley-Bank-Reviews-E107161.htm',
 'https://www.glassdoor.com/Reviews/AppFolio-Reviews-E225531.htm',
 'https://www.glassdoor.com/Reviews/Merck-Reviews-E438.htm',
 'https://www.glassdoor.com/Reviews/Meta-Reviews-E40772.htm',
 'https://www.glassdoor.com/Reviews/Houston-Methodist-Reviews-E4460.htm',
 'https://www.glassdoor.com/Reviews/St-Jude-Children-s-Research-Hospital-Reviews-E28315.htm',
 'https://www.glassdoor.com/Reviews/Southern-California-Edison-Reviews-E1

In [None]:
# TODO add a print for each company to keep track

In [202]:
# NOTE: this cell calls get_company_reviews, which is defined in a later cell
# of this notebook -- run the function-definition cells first.

# Read the Glassdoor login from the environment (falling back to an
# interactive prompt) so credentials are never stored in the notebook.
glassdoor_email = os.environ.get('GLASSDOOR_EMAIL') or input('Glassdoor email: ')
glassdoor_password = (
    os.environ.get('GLASSDOOR_PASSWORD') or getpass.getpass('Glassdoor password: ')
)

# Index of the first company link to scrape; earlier entries are skipped.
START_INDEX = 36

# add chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--headless")

# initiate driver
driver = webdriver.Chrome(options=chrome_options)

# One DataFrame per scraped company; combined once at the end.
company_frames = []

try:
    driver.get('https://www.glassdoor.com/index.htm')

    # NAVIGATE TO SIGN ON
    # page 1
    sign_in_button = driver.find_element(By.XPATH, '//button[text()="Sign In"]')
    sign_in_button.click()

    # login page
    username = driver.find_element(By.ID, 'modalUserEmail')
    username.send_keys(glassdoor_email)
    password = driver.find_element(By.ID, 'modalUserPassword')
    password.send_keys(glassdoor_password)
    log_in_button = driver.find_element(By.NAME, 'submit')
    log_in_button.click()

    # WAIT FOR MAIN PAGE TO LOAD AFTER LOGIN
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.ID, 'Discover'))
        )
    except TimeoutException:
        # Only a wait timeout is tolerated here; anything else (including a
        # keyboard interrupt) propagates instead of being swallowed.
        print('error loading main page.')

    for company_url in links_top_50_large[START_INDEX:]:
        # Pause between companies (presumably to avoid hammering the site).
        time.sleep(5)
        print(f'Getting reviews for {company_url}')
        company_frames.append(get_company_reviews(driver, company_url))
finally:
    # Always shut the headless browser down, even when scraping fails part
    # way through, so Chrome processes are not leaked.
    driver.quit()
    # Built here so that partial results are still available after a failure.
    reviews_companies = pd.concat(company_frames) if company_frames else pd.DataFrame()

reviews_companies

2014-12-16
2014-10-08
2014-12-15
2014-12-25
2014-12-31
2014-12-19
2014-12-09
2014-12-09
2014-12-22
2014-03-17
2014-12-29
2014-12-16
2014-12-29
2014-12-27
2014-12-30
2014-12-15
2014-12-11
2014-12-22
error loading company page.


AttributeError: 'NoneType' object has no attribute 'text'

In [203]:
""" 
For a single company
Get all reviews by searching through all pages 
"""
def get_company_reviews(driver, company_url):
    """
    Collect the reviews of a single company by walking its review pages.

    Pages are visited in order (page 1, then the `_P2`, `_P3`, ... URLs built
    by get_new_url). Paging stops early when a page contains a review dated
    before 2015, when a page cannot be opened, or when a page cannot be
    parsed. Whatever was collected up to that point is written to
    '<company name>.csv' in the working directory.

    Args:
        driver: Selenium WebDriver used to load the pages.
        company_url: URL of the company's first review page.

    Returns:
        DataFrame with one row per review. An empty DataFrame (and no CSV)
        when the first page could not be opened at all.

    Raises:
        AttributeError: if the first page lacks the elements the parsing
            helpers look for (e.g. the page did not load).
    """
    try:
        driver.get(company_url)
    except Exception as e:
        print('Unable to get new URL.')
        print(e)
        # The browser is still showing whatever it displayed before; scraping
        # it would attribute another page's reviews to this company.
        return pd.DataFrame()

    # WAIT FOR COMPANY PAGE TO LOAD
    wait_for_page(driver)

    # GET PAGE SOURCE
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # GET NUMBER OF PAGES OF REVIEWS
    num_pages = get_num_pages(soup)

    # Read the company name from the first page, so that a later page that
    # fails to load cannot break the CSV file name.
    company_name = get_company_name(soup)

    # get reviews from first page
    all_reviews = list(get_reviews_page(soup))

    # remaining pages, if any (the range is empty for a single page)
    for page_num in range(2, num_pages + 1):
        current_url = get_new_url(company_url, page_num)

        try:
            driver.get(current_url)
        except Exception as e:
            print('Unable to get new URL.')
            print(e)
            # Do not re-scrape the stale previous page; keep what we have.
            break

        wait_for_page(driver)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        try:
            current_reviews = get_reviews_page(soup)
        except AttributeError as e:
            # An expected element is missing (typically the page did not
            # load). Stop here rather than lose the reviews already collected.
            print(f'Could not parse page {page_num} of {company_url}: {e}')
            break

        all_reviews.extend(current_reviews)

        # if reviews are older than 2015 stop aggregating
        if not get_date(soup, '2015'):
            break

    # save to csv
    final_df = pd.DataFrame(all_reviews)
    final_df.to_csv(f'{company_name}.csv')
    return final_df

In [146]:
def flatten(lst):
    """Flatten one level of nesting: a list of iterables becomes one flat list."""
    return [item for inner in lst for item in inner]

In [204]:
def wait_for_page(driver, timeout=10):
    """
    Wait until the element with id 'EIReviews' is present on the current page.

    Args:
        driver: Selenium WebDriver whose current page is being waited on.
        timeout: maximum number of seconds to wait (default 10).

    Returns:
        True when the element appeared within the timeout; False when the
        wait timed out (a message is printed in that case).
    """
    # WAIT FOR COMPANY PAGE TO LOAD
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, 'EIReviews'))
        )
    except TimeoutException:
        # Only the timeout is tolerated; other errors (including a keyboard
        # interrupt) propagate instead of being silently swallowed.
        print('error loading company page.')
        return False
    return True

In [148]:
def get_new_url(company_url, page_num):
    """
    Build the URL of a given review page from a company's first-page URL.

    The page marker '_P<page_num>' is inserted directly before the final
    file extension, e.g. '.../SAP-Reviews-E10471.htm' with page 2 becomes
    '.../SAP-Reviews-E10471_P2.htm'.

    Args:
        company_url: URL of the company's first review page.
        page_num: page number to address.

    Returns:
        The URL string for the requested page. If the URL contains no '.',
        the marker is simply appended.
    """
    # Split at the LAST dot only, so dots in the host name or in the company
    # slug cannot shift where the page marker is inserted.
    stem, dot, extension = company_url.rpartition('.')
    page_suffix = '_P{}'.format(page_num)
    if not dot:
        return company_url + page_suffix
    return stem + page_suffix + dot + extension


In [189]:
def get_num_pages(soup, reviews_per_page=10):
    """
    Work out how many review pages a company has from the pagination footer.

    The total review count is taken from the text after 'of ' in the element
    with class 'paginationFooter' (thousands separators are removed).
    Presumably the footer reads like 'Viewing 1 - 10 of 1,234 Reviews' --
    confirm against the live page.

    Args:
        soup: parsed page (BeautifulSoup) of a company review page.
        reviews_per_page: number of reviews shown per page (default 10).

    Returns:
        Number of pages, rounded up so a final partial page is counted.

    Raises:
        AttributeError: if the page has no 'paginationFooter' element.
    """
    footer_string = soup.find(class_='paginationFooter').text
    num_results = int(footer_string.split('of ')[1].split(' ')[0].replace(',', ''))
    # Round up AFTER dividing: truncating first would drop the last partial
    # page (e.g. 25 reviews -> 2 pages instead of 3).
    return math.ceil(num_results / reviews_per_page)

In [150]:
def get_date(soup, year):
    """
    Check whether every review on a page is dated on or after a cutoff.

    Args:
        soup: parsed page (BeautifulSoup) of a company review page.
        year: cutoff, as anything pd.to_datetime accepts (e.g. '2015',
            which is interpreted as 2015-01-01).

    Returns:
        False as soon as a review dated before the cutoff is found (that
        review's date is printed); True otherwise, including when the page
        has no reviews.
    """
    # Parse the cutoff once rather than on every loop iteration.
    cutoff = pd.to_datetime(year)

    for review in soup.find_all(class_='empReview'):
        # The author line starts with the review date, followed by ' - '.
        author_info = review.find(class_='authorInfo').text.split(' - ')
        # Compare Timestamp with Timestamp: comparing a datetime.date with a
        # Timestamp raises TypeError on recent pandas versions.
        review_ts = pd.to_datetime(author_info[0])
        if review_ts < cutoff:
            print(review_ts.date())
            return False

    return True

In [151]:
def get_company_name(soup):
    """Return the text of the first element on the page with class 'employerName'."""
    name_tag = soup.find(class_='employerName')
    return name_tag.text

In [152]:
def get_reviews_page(soup):
    """
    Extract every review on one page as a list of dictionaries.

    Args:
        soup: parsed page (BeautifulSoup) of a company review page.

    Returns:
        A list with one dict per 'empReview' element, with the keys
        'company', 'headline', 'date' (datetime.date), 'overall_rating'
        (float), 'author_position', 'pros' and 'cons'. 'author_position' is
        None when the author line carries no position after the date.

    Raises:
        AttributeError: if an expected element (company name, headline,
            rating, author line) is missing from the page.
    """
    company_title = soup.find(class_='employerName').text
    reviews_page = []

    for review in soup.find_all(class_='empReview'):
        # Author line is '<date> - <position>'; split only at the first
        # ' - ' so positions that themselves contain ' - ' stay intact.
        author_info = review.find(class_='authorInfo').text.split(' - ', 1)
        if len(author_info) > 1:
            # Anything after a non-breaking space is dropped from the position.
            title = author_info[1].split('\xa0')[0]
        else:
            title = None

        # Look the detail sections up once; the first holds the pros and the
        # second the cons, each with its text in the second <p>.
        detail_sections = review.find_all(class_='v2__EIReviewDetailsV2__fullWidth')

        reviews_page.append({
            'company': company_title,
            'headline': review.find('h2').text,
            'date': pd.to_datetime(author_info[0]).date(),
            'overall_rating': float(review.find(class_='ratingNumber mr-xsm').text),
            'author_position': title,
            'pros': detail_sections[0].find_all('p')[1].text,
            'cons': detail_sections[1].find_all('p')[1].text,
        })

    return reviews_page