In [80]:
import time
import json
from collections import namedtuple
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

In [79]:
# Lightweight record holding the login name and password of one account.
Credentials = namedtuple('Credentials', 'username password')


def init_driver(driver_path):
    """Start a Chrome WebDriver and attach a reusable explicit wait to it.

    Args:
        driver_path: Value passed as the first positional argument to
            ``webdriver.Chrome`` (the notebook passes the chromedriver path).

    Returns:
        The Chrome driver, with an extra ``wait`` attribute holding a
        ``WebDriverWait`` bound to the driver with a 10 second timeout.
    """
    browser = webdriver.Chrome(driver_path)
    # Stored on the driver so other helpers can call browser.wait.until(...).
    browser.wait = WebDriverWait(browser, 10)
    return browser

def get_credentials(path):
    """Read a username and password from a two-line text file.

    The first line of the file is used as the username and the second line
    as the password. Line terminators are removed from both values:
    ``readlines()`` keeps the trailing newline, which would otherwise be
    typed into the login form together with the credentials.

    Args:
        path: Path of the credentials file.

    Returns:
        Credentials: named tuple with ``username`` and ``password``.

    Raises:
        IndexError: if the file contains fewer than two lines.
        OSError: if the file cannot be opened.
    """
    with open(path, "r") as f:
        lines = f.readlines()
    # Only strip line terminators; any other character is kept verbatim so
    # passwords containing spaces are not altered.
    username = lines[0].rstrip("\r\n")
    password = lines[1].rstrip("\r\n")
    return Credentials(username=username, password=password)

def login(driver, credentials):
    """Log in to Glassdoor through the sign-in form.

    Opens the login page, waits via ``driver.wait`` until the username field,
    the password field and the sign-in button are present, types the
    credentials and clicks the button. All three elements are located with
    an explicit wait and ``By`` locators, so a missing element surfaces as
    the ``TimeoutException`` handled below (the ``find_element_by_*``
    helpers used previously no longer exist in current Selenium releases).

    Args:
        driver: Selenium WebDriver carrying a ``wait`` attribute that holds
            a ``WebDriverWait`` (as attached by ``init_driver``).
        credentials: Object with ``username`` and ``password`` attributes.

    Returns:
        None. When an element does not show up before the wait times out, a
        message is printed and the function returns without raising.
    """
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "username")))
        pw_field = driver.wait.until(EC.presence_of_element_located(
            (By.CLASS_NAME, "signin-password")))
        login_button = driver.wait.until(EC.presence_of_element_located(
            (By.ID, "signInBtn")))
        user_field.send_keys(credentials.username)
        # TAB moves focus out of the username field before the password is typed.
        user_field.send_keys(Keys.TAB)
        time.sleep(1)
        pw_field.send_keys(credentials.password)
        time.sleep(1)
        login_button.click()
        # Pause after submitting before the caller navigates elsewhere.
        time.sleep(3)
    except TimeoutException:
        print("TimeoutException! Username/password field or login button not found on glassdoor.com")

def get_total_pages(driver, url):
    """Return how many review pages are needed for the company at ``url``.

    Loads ``url``, reads the review count shown in the active "reviews"
    cell and divides it by the module-level ``reviews_per_page``, rounding
    up so that a final, partially filled page is counted as well.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the company's reviews page.

    Returns:
        int: number of pages (0 when the count is 0).

    Raises:
        AttributeError: if the expected count element is not in the page.
        ValueError: if the count text is not an integer after removing
            thousands separators.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    reviews_tab = soup.find('a', {'class': 'eiCell cell reviews active'})
    total_rev_str = reviews_tab.find('span', {'class': 'num h2'}).text
    # Tolerate thousands separators such as "1,234".
    total_rev = int(total_rev_str.replace(',', ''))
    # Ceiling division: 25 reviews at 10 per page need 3 pages, not 2.
    total_pages = (total_rev + reviews_per_page - 1) // reviews_per_page
    return total_pages

def get_datetime(review):
    """Return the ``datetime`` attribute of the review's first ``time`` tag.

    Args:
        review: Parsed review element exposing ``find``.

    Returns:
        The attribute value as stored in the tag's ``attrs`` mapping.
    """
    time_tag = review.find("time")
    return time_tag.attrs['datetime']

def get_ratings(review):
    """Collect the per-theme ratings of a review.

    The rating spans found in the review are paired positionally with a
    fixed list of theme names; each span's ``title`` attribute is parsed as
    a float and truncated to an int. Pairing stops at the shorter of the
    two sequences.

    Args:
        review: Parsed review element exposing ``find_all``.

    Returns:
        dict: theme name -> integer rating.
    """
    theme_names = (
        'Work Life Balance',
        'Culture and Values',
        'Career Opportunities',
        'Compensation and Benefits',
        'Senior Management',
    )
    rating_spans = review.find_all('span', {'class': "gdBars gdRatings med "})
    scores = {}
    for theme_name, span in zip(theme_names, rating_spans):
        scores[theme_name] = int(float(span.attrs['title']))
    return scores

def get_title(review):
    """Return the text of the reviewer's job-title span.

    Args:
        review: Parsed review element exposing ``find``.

    Returns:
        The span's text as returned by ``getText()``.
    """
    title_span = review.find('span', {'class': 'authorJobTitle middle reviewer'})
    return title_span.getText()

def get_location(review):
    """Return the text of the reviewer's location span.

    Args:
        review: Parsed review element exposing ``find``.

    Returns:
        The span's text as returned by ``getText()``.
    """
    location_span = review.find('span', {'class': 'authorLocation middle'})
    return location_span.getText()

def get_recommendations(review):
    """Extract the recommendation labels of a review.

    Looks up the ``flex-grid recommends`` container and numbers the text of
    every ``span`` with class ``middle`` inside it, starting at 1.

    Args:
        review: Parsed review element exposing ``find``.

    Returns:
        dict: ``'Recommendation N'`` -> label text. Empty when the review
        has no recommendation container or the container has no labels.
    """
    container = review.find('div', {'class': 'flex-grid recommends'})
    if container is None:
        # find() returns None when nothing matches; treat that as "no labels"
        # instead of failing with AttributeError.
        return {}
    label_spans = container.find_all('span', class_='middle')
    return {'Recommendation {}'.format(position): span.getText()
            for position, span in enumerate(label_spans, start=1)}

In [3]:
# Scraper configuration.
# Path handed to init_driver (passed on to webdriver.Chrome).
chromium_path = "/usr/local/bin/chromedriver"
# Divisor used by get_total_pages to turn the review count into a page count.
reviews_per_page = 10
# Company label; not referenced by the other visible cells.
companyName = "sandvik"
# Reviews landing page; the scraping loop derives the per-page URLs from it.
companyURL = "https://www.glassdoor.com/Reviews/Sandvik-Reviews-E10375.htm"

In [4]:
# Start the browser, sign in, then read how many review pages exist.
driver = init_driver(chromium_path)
# Credentials come from a local file: username on line 1, password on line 2.
credentials = get_credentials("account.txt")
login(driver=driver, credentials=credentials)
# get_total_pages also navigates the driver to companyURL.
total_pages = get_total_pages(driver=driver, url=companyURL)

In [6]:
# Parse whatever page the driver currently shows and collect its review items.
HTML = driver.page_source
soup = BeautifulSoup(HTML, "html.parser")
# NOTE(review): a list of class values presumably matches <li> elements
# carrying either class — confirm against the Beautiful Soup docs.
reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })

In [None]:
# Navigate to the company's reviews landing page.
driver.get(companyURL)

In [None]:
# Recompute the page count (get_total_pages loads companyURL itself).
total_pages = get_total_pages(driver=driver, url=companyURL)

In [None]:
# Accumulates the values parsed from every review page.
d = []

# range() excludes its stop value, so use total_pages + 1 to visit the last
# page as well (pages are numbered from 1).
for page_no in range(1, total_pages + 1):
    # Turn ".../Sandvik-Reviews-E10375.htm" into ".../Sandvik-Reviews-E10375_P<n>.htm".
    currentURL = '{}_P{}.htm'.format(companyURL[:companyURL.rfind('.')], page_no)
    driver.get(currentURL)
    driver.execute_script("window.scrollTo(0, 100)")
    # Random 5-10 second pause between page loads.
    time.sleep(randint(5,10))
    HTML = driver.page_source
    soup = BeautifulSoup(HTML, "html.parser")
    reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })
    # NOTE(review): parse_reviews_HTML is not defined in the visible cells —
    # confirm it is defined before this cell runs.
    dates = parse_reviews_HTML(reviews)
    d.extend(dates)

In [59]:
# Pick the second review of the most recently parsed page for inspection.
review = reviews[1]

In [60]:
# Same lookup that get_datetime performs: the <time> tag's datetime attribute.
review.find("time").attrs['datetime']

'2017-11-26'

In [61]:
# Display the selected review element.
review

<li class=" empReview cf " id="empReview_18070220"><div class="hreview"><div class="cf"><div class="floatLt"><time class="date subtle small" datetime="2017-11-26"> Nov 26, 2017</time></div><p class="helpfulReviews small tightVert floatRt"></p></div><div class=" tbl fill reviewTop"><div class="row"><div class="cell sqLogoCell showDesk"><span class="sqLogo tighten smSqLogo logoOverlay"><img alt="Sandvik Logo" class="lazy lazy-loaded" data-original="https://media.glassdoor.com/sqls/10375/sandvik-squarelogo.png" data-original-2x="https://media.glassdoor.com/sqlm/10375/sandvik-squarelogo.png" data-retina-ok="true" src="https://media.glassdoor.com/sqls/10375/sandvik-squarelogo.png" style="opacity: 1;" title=""/></span></div><div class="cell"><h2 class=" h2 summary strong tightTop margBotXs"><a class="reviewLink" href="/Reviews/Employee-Review-Sandvik-RVW18070220.htm"><span class="summary ">"sales engineer"</span></a></h2><div class="tbl reviewMeta"><div class="gdStarsWrapper cell top"><span 

In [71]:
# Container that get_recommendations reads the recommendation labels from.
t = review.find('div', {'class': 'flex-grid recommends'})

In [73]:
# Text of the first label span inside the container.
t.find_all('span', class_='middle')[0].getText()

"Doesn't Recommend"

In [77]:
# Inline version of get_recommendations for the selected review: number the
# text of every label span inside the recommendation container from 1.
t = review.find('div', {'class': 'flex-grid recommends'})
recommendations_html = t.find_all('span', class_='middle')
recommendations = dict(
    ('Recommendation {}'.format(position), span.getText())
    for position, span in enumerate(recommendations_html, start=1)
)

In [75]:
# Display the parsed recommendation labels.
recommendations

{'Recommendation 1': "Doesn't Recommend",
 'Recommendation 2': 'Negative Outlook'}