In [1]:
"""
Scrape reviews from glassdoor using given links 
Saves individual company reviews to csv file
Method to combine all csvs into one dataframe included 
"""

'\nScrape reviews from glassdoor using given links \nSaves individual company reviews to csv file\nMethod to combine all csvs into one dataframe included \n'

In [5]:
import pandas as pd
import numpy as np
import requests
import time
import math
import re
import pickle
import itertools
import glob

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [6]:
"""
Load links to ratings 
links_top_50_smb: links to top 50 SMB companies to work for in 2022 according to GD (based off of 2021 reviews)
links_top_large: links to top 100 large companies to work for in 2022 according to GD (based off of 2021 reviews)
links_low_ratings_finance: links to companies rated below 3 (451 total) in the NY area finance industry
# """
# with open('links_top_50_smb.pickle','rb') as f:
#     links_top_50_smb = pickle.load(f)
    
# with open('links_top_large.pickle','rb') as f:
#     links_top_large = pickle.load(f)

# with open('/glassdoor-review-scraper/links_low_ratings_finance.pickle','rb') as f:
#     links_low_ratings_finance = pickle.load(f)

# with open('links_internet.pickle','rb') as f:
#     links_internet = pickle.load(f)
    
with open('links_investment.pickle','rb') as f:
    links_investment = pickle.load(f)
    
# with open('links_finance.pickle','rb') as f:
#     links_finance = pickle.load(f)

In [6]:
#links_investment

In [1]:
internet_links = ['https://www.glassdoor.com/Reviews/Google-Reviews-E9079.htm']

In [2]:
new_links = []
for link in internet_links:
    new_links.append(link + '?sort.sortType=RD&sort.ascending=false')
new_links

['https://www.glassdoor.com/Reviews/Google-Reviews-E9079.htm?sort.sortType=RD&sort.ascending=false']

In [9]:
def get_name_from_url(company_url):
    return company_url.split('https://www.glassdoor.com/Reviews/')[1].split('-')[0]


""" 
For a single company
Get all reviews by searching through all pages 
"""
def get_company_reviews(driver, company_url):
    try:
        driver.get(company_url)
    except Exception as e:
        print('Unable to get new URL. Sleeping for 5 mins.')
        print(e)
        time.sleep(60*5)
        driver.get(company_url)

    # WAIT FOR COMPANY PAGE TO LOAD  # time.sleep(2)   
    if wait_for_page(driver) == False:
        time.sleep(60*5)
        driver.get(company_url)

    # GET PAGE SOURCE 
    htmlSource = driver.page_source
    soup = BeautifulSoup(htmlSource,"html.parser")

    # GET NUMBER OF PAGES OF REVIEWS & company name 
    num_pages = get_num_pages(soup)
    # If < 50 reviews on site in English, skip the company 
    if num_pages < 3: 
        return pd.DataFrame()
    
#     # if comapny is not HQ in US, skip 
#     if headquarters_check(soup) == False:
#         return pd.DataFrame()
    
    # ------
    # otherwise keep going 
    company_name = get_company_name(soup)
    if company_name == None: 
        print('No company name.')
        company_name = get_name_from_url(company_url)

    # get reviews from first page 
    all_reviews = []
    first_review = get_reviews_page(soup)
    all_reviews.append(first_review)

    # if more than one page
    if num_pages > 1:
        for i in range(2,num_pages+1,1): 
            print(f'Scraping page {i}.')
            # get the new url for another page 
            current_url = get_new_url(company_url, i) 
            
            # open page and wait for loading 
            try:
                driver.get(current_url)
            except Exception as e:
                print('Unable to get new URL.')
                print(e)
                
            if wait_for_page(driver) == False:
                print('Wait for page false.')
                time.sleep(60*5)
                driver.get(current_url)
                
            # download the source of page 
            htmlSource = driver.page_source
            soup = BeautifulSoup(htmlSource,"html.parser")
            # get all reviews from page and append to all_reviews list 
            try:
                current_reviews = get_reviews_page(soup)
            except: 
                final_df = pd.DataFrame(flatten(all_reviews))
                final_df.to_csv(f'reviews/top_large/{company_name}.csv')
                return final_df 
            
            all_reviews.append(current_reviews)
            # if reviews are older than 2012 stop aggregating 
            if get_date(soup, '2012') == False:
                break
    # save to csv
    final_df = pd.DataFrame(flatten(all_reviews))
    final_df.to_csv(f'reviews/internet_large/{company_name}.csv')
    return final_df


"""
append sorting criteria to reviews page so reviews are filtered by date, newest first 
"""
def add_sorting_url(links):
    new_links = []
    for link in links:
        new_links.append(link + '?sort.sortType=RD&sort.ascending=false')
    return new_links


"""
Wait for page to load, wait 10 seconds before throwing error 
returns false if there is an error 
"""
def wait_for_page(driver):
    # WAIT FOR COMPANY PAGE TO LOAD 
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'EIReviews'))
        )
    except:
        print('error loading company page.')
        return False
    
    return True 

"""
Used to move to new page under company reviews
"""
def get_new_url(company_url, page_num):
    url_split = company_url.split('.')
    url_split[2] = url_split[2] + f'_P{page_num}'
    new_url = '.'.join(url_split)
    return new_url
# https://www.glassdoor.com/Reviews/KlientBoost-Reviews-E1572205.htm?sort.sortType=RD&sort.ascending=false

"""
Get the number of pages of reviews for a particular company 
"""
def get_num_pages(soup):
    try:
        footer_string = soup.find(class_='paginationFooter').text
        num_results = int(footer_string.split('of ')[1].split(' ')[0].replace(',','')) 
        num_pages = math.ceil(int(num_results/10))
    
    except:
        return 1
    
    return num_pages


"""
check to see where the company is headquartered
"""
def headquarters_check(soup):
    
    try:
        state = soup.find('div',{'data-test':'employer-headquarters'}).text.split(', ')[1]

        state_list = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
               'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
               'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
               'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
               'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']

        if state in state_list:
            return True
        else:
            return False 
    except:
        print('Could not find HQ on page.')
        return False
    

"""
Get the date of the review 
If the date of the review is earlier than year, return False 
"""
def get_date(soup, year):
    reviews = soup.find_all(class_ ='empReview')
    
    for review in reviews:
        author_info = review.find(class_='authorInfo').text.split(' - ')
        date = pd.to_datetime(author_info[0]).date()
        if date < pd.to_datetime(year):
            print(date)
            return False
    
    return True

def get_company_name(soup):
    try:
        company_title = soup.find(class_='employerName').text
        company_title = company_title.replace('/','')

        if company_title == None:
            company_title = soup.find('h3',{'class':'m-0 css-be8uqy el6ke056'}).text
    except:
        company_title = soup.find('h3',{'class':'m-0 css-be8uqy el6ke056'}).text
        if company_title == None: 
            print('Could not find company title. Tried 2 methods.')
#         html = soup.prettify("utf-8")
#         with open('test.html','w',encoding='UTF-8') as file:
#             file.write(str(soup))

        return 
    
    return company_title

"""
Takes in a page's source (Soup)
Get the reviews on a single page and add to a dictionary 
A dictionary is created for each review 
returns the full list of dictionaries for that page 
"""
def get_reviews_page(soup):
    reviews = soup.find_all(class_ ='empReview')
    company_title = get_company_name(soup)
    reviews_page = []

    for review in reviews:
        review_dict = {}
        headline = review.find('h2').text
        overall_rating = float(review.find(class_='ratingNumber mr-xsm').text)

        author_info = review.find(class_='authorInfo').text.split(' - ')
        date = pd.to_datetime(author_info[0]).date()
        title = author_info[1].split('\xa0')[0]

        pros = review.find_all(class_='v2__EIReviewDetailsV2__fullWidth')[0].find_all('p')[1].text
        cons = review.find_all(class_='v2__EIReviewDetailsV2__fullWidth')[1].find_all('p')[1].text

        review_dict['company'] = company_title
        review_dict['headline'] = headline
        review_dict['date'] = date
        review_dict['overall_rating'] = overall_rating
        review_dict['author_position'] = title
        review_dict['pros'] = pros
        review_dict['cons'] = cons 

        reviews_page.append(review_dict)
        
    return reviews_page


def flatten(lst):
    return list(itertools.chain(*lst))



In [10]:
# add chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--headless")

# initiate driver 
driver =  webdriver.Chrome(options=chrome_options) 
driver.get('https://www.glassdoor.com/index.htm') 

# NAVIGATE TO SIGN ON 
#page 1
sign_in_button= driver.find_element(By.XPATH, '//button[text()="Sign In"]')
sign_in_button.click()

# login page
username = driver.find_element(By.ID,'modalUserEmail')
username.send_keys('nicholasjang0614@gmail.com')
password = driver.find_element(By.ID,'modalUserPassword')
password.send_keys('ngtee0614')
log_in_button = driver.find_element(By.NAME, 'submit')
log_in_button.click()

# time.sleep(1)
# WAIT FOR MAIN PAGE TO LOAD AFTER LOGIN  
try:
    element = WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.ID, 'Discover'))
    )
except:
    print('error loading main page.') 

reviews_companies = pd.DataFrame()

for company_url in new_links:
    time.sleep(5)
    
    print(f'Getting reviews for {company_url}')
    df = get_company_reviews(driver, company_url)
    # if the company doesn't have enough reviews skip 
    if len(df) > 1:
        reviews_companies = pd.concat([reviews_companies, df])
    else:
        print(f'Skipping company {company_url}')
    
reviews_companies 

Getting reviews for https://www.glassdoor.com/Reviews/Google-Reviews-E9079.htm?sort.sortType=RD&sort.ascending=false
Scraping page 2.


  if date < pd.to_datetime(year):


Scraping page 3.
Scraping page 4.
Scraping page 5.
Scraping page 6.
Scraping page 7.
Scraping page 8.
Scraping page 9.
Scraping page 10.
Scraping page 11.
Scraping page 12.
Scraping page 13.
Scraping page 14.
Scraping page 15.
Scraping page 16.
Scraping page 17.
Scraping page 18.
Scraping page 19.
Scraping page 20.
Scraping page 21.
Scraping page 22.
Scraping page 23.
Scraping page 24.
Scraping page 25.
Scraping page 26.
Scraping page 27.
Scraping page 28.
Scraping page 29.
Scraping page 30.
Scraping page 31.
Scraping page 32.
Scraping page 33.
Scraping page 34.
Scraping page 35.
Scraping page 36.
Scraping page 37.
Scraping page 38.
Scraping page 39.
Scraping page 40.
Scraping page 41.
Scraping page 42.
Scraping page 43.
Scraping page 44.
Scraping page 45.
Scraping page 46.
Scraping page 47.
Scraping page 48.
Scraping page 49.
Scraping page 50.
Scraping page 51.
Scraping page 52.
Scraping page 53.
Scraping page 54.
Scraping page 55.
Scraping page 56.
Scraping page 57.
Scraping page 58.

Unnamed: 0,company,headline,date,overall_rating,author_position,pros,cons
0,Google,"Great company, difficult to navigate",2022-06-21,5.0,Marketing Manager,Opportunity to work with and learn from awesom...,"Very self-service oriented, meaning you have t..."
1,Google,"Great overall company culture, but it comes do...",2022-06-21,4.0,Program Manager,Excellent employee benefits and competitive sa...,I'm sure you've read all the great things abou...
2,Google,Not the same company anymore,2022-06-21,3.0,Director,"Smart, innovative and dedicated employees",New leaders with no technology or RRK backgrou...
3,Google,Excellent experience,2022-06-21,5.0,Software Engineer(Internship),They cared about our internship experience (ev...,"Physical internship would be better, but it wa..."
4,Google,Great,2022-06-21,5.0,Advertising,Great work environment and people,Lots of work sometimes from high ups
...,...,...,...,...,...,...,...
12635,Google,The company of a dream,2020-08-21,5.0,Chief Technology Officer (CTO),Really great place to work at,Nothing found from my side
12636,Google,Software developer,2020-08-21,5.0,Backend Developer,"Best in industry , best ever",there are no as such cons
12637,Google,Good Company,2020-08-21,5.0,Research Scientist,I had a very good experience. Thank you for wh...,life is very expensive and the number of teams...
12638,Google,best,2020-08-21,5.0,Data Scientist,Best company to work with.,Nothing cons. to write about this.


In [54]:
for company_url in reversed(links_investment[:-24]):
    print(company_url)

https://www.glassdoor.com/Reviews/Robinhood-Reviews-E1167765.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/Apollo-Global-Management-Reviews-E2715.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/The-Carlyle-Group-Reviews-E3670.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/Geller-and-Company-Reviews-E366900.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/Moelis-and-Company-Reviews-E235071.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/Brookfield-Asset-Management-Reviews-E5824.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/KKR-Reviews-E2865.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/Ally-Financial-Reviews-E7222.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Reviews/Millennium-Reviews-E850344.htm?sort.sortType=RD&sort.ascending=false
https://www.glassdoor.com/Review