In [4]:
import time
import json
from collections import namedtuple
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

from urllib import parse
from localcrawler import LocalCrawler
from webcrawler import WebCrawler

In [64]:
# Immutable (username, password) pair describing one login account.
Credentials = namedtuple('Credentials', 'username password')

def get_crawler(uri):
    """Build the crawler that matches the kind of location ``uri`` names.

    Args:
        uri: Location string. When its parsed scheme is ``http`` or
            ``https`` a ``WebCrawler`` is created; any other scheme
            (including none at all) yields a ``LocalCrawler``.

    Returns:
        A ``WebCrawler`` or ``LocalCrawler`` constructed with ``uri``.
    """
    scheme = parse.urlparse(uri).scheme
    crawler_cls = WebCrawler if scheme in ('http', 'https') else LocalCrawler
    return crawler_cls(uri)

def init_driver(driver_path):
    """Start a Chrome WebDriver session and attach an explicit-wait helper.

    Args:
        driver_path: Passed to ``webdriver.Chrome`` as its first positional
            argument (presumably the chromedriver executable location).

    Returns:
        The new driver. It carries a ``wait`` attribute (a
        ``WebDriverWait`` with a 10 second timeout) because ``login`` in
        this file calls ``driver.wait.until(...)``; without it that call
        fails with ``AttributeError``.
    """
    driver = webdriver.Chrome(driver_path)
    # Same timeout that get_total_pages uses for its own WebDriverWait.
    driver.wait = WebDriverWait(driver, 10)
    return driver

def get_credentials(path):
    """Read a username/password pair from a two-line text file.

    Args:
        path: File whose first line is the username and whose second line
            is the password. Any further lines are ignored.

    Returns:
        A ``Credentials`` tuple with line terminators removed from both
        fields.

    Raises:
        IndexError: If the file has fewer than two lines.
        OSError: If the file cannot be opened.
    """
    with open(path, "r") as f:
        # Drop only the line terminator: a trailing "\n" typed into a form
        # via send_keys acts as Enter, while other whitespace may be a
        # legitimate part of a password.
        lines = [line.rstrip("\r\n") for line in f]
    return Credentials(username=lines[0], password=lines[1])

def login(driver, credentials):
    """Open the Glassdoor sign-in page and submit the given credentials.

    A failure to locate the form is reported with ``print`` instead of
    being raised, so the caller is not interrupted.

    Args:
        driver: Selenium WebDriver used to load the page and find elements.
            If it has a ``wait`` attribute, that waiter is used; otherwise
            a 10 second ``WebDriverWait`` is created.
        credentials: Object exposing ``username`` and ``password``.

    Returns:
        None.
    """
    # Local import: the file-level imports only bring in TimeoutException.
    from selenium.common.exceptions import NoSuchElementException

    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    # A plain WebDriver has no ``wait`` attribute, so fall back to a fresh
    # waiter rather than failing with AttributeError.
    wait = getattr(driver, "wait", None) or WebDriverWait(driver, 10)
    try:
        user_field = wait.until(EC.presence_of_element_located(
            (By.NAME, "username")))
        # find_element(By, value) is available in Selenium 3 and 4 alike.
        pw_field = driver.find_element(By.CLASS_NAME, "signin-password")
        login_button = driver.find_element(By.ID, "signInBtn")
        user_field.send_keys(credentials.username)
        user_field.send_keys(Keys.TAB)
        # Fixed pauses between steps are kept from the original flow
        # (presumably to give the page time to react).
        time.sleep(1)
        pw_field.send_keys(credentials.password)
        time.sleep(1)
        login_button.click()
        time.sleep(3)
    except (TimeoutException, NoSuchElementException) as exc:
        # The wait raises TimeoutException; the direct lookups of the
        # password field and button raise NoSuchElementException. Both mean
        # the form was not found, which is what the message reports.
        print("{}! Username/password field or login button not found on glassdoor.com".format(
            type(exc).__name__))

def get_total_pages(driver, url, per_page=None):
    """Load ``url`` and work out how many review pages the company has.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of a page that shows the total review count.
        per_page: Number of reviews listed on one page. When omitted, the
            module-level ``reviews_per_page`` is used (it is not defined in
            the visible part of this file and must exist elsewhere).

    Returns:
        The page count as an int, rounded up so that a partially filled
        last page is included.

    Raises:
        TimeoutException: If the review-count cell does not appear within
            10 seconds.
        ValueError: If the displayed count is not a plain integer.
    """
    driver.get(url)
    cell_locator = (By.CSS_SELECTOR, '.eiCell.cell.reviews.active')
    count_cell = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(cell_locator))
    count_text = count_cell.find_element(By.CSS_SELECTOR, '.num.h2').text
    # Tolerate thousands separators such as "1,234".
    total_rev = int(count_text.replace(',', '').strip())
    if per_page is None:
        per_page = reviews_per_page
    # Ceiling division: 25 reviews at 10 per page is 3 pages, not 2.
    return -(-total_rev // per_page)

def get_datetime(review):
    """Return the ``datetime`` attribute of the review's ``<time>`` tag.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The attribute value, or None when the review has no ``<time>`` tag
        or the tag has no ``datetime`` attribute. Returning None for
        missing data matches the other ``get_*`` extractors in this file.
    """
    time_tag = review.find("time")
    if time_tag is None:
        return None
    return time_tag.attrs.get('datetime')

def get_ratings(review):
    """Collect the per-theme scores shown in a review.

    The rating spans found in the review are paired with the theme names
    purely by position, and pairing stops at the shorter of the two
    sequences.
    NOTE(review): if a review omits one of the sub-ratings the remaining
    ones would be attributed to the wrong themes - confirm against the
    page markup.

    Args:
        review: Parsed review element supporting ``find_all``.

    Returns:
        Dict mapping theme name to the span's ``title`` value converted
        through ``float`` and then truncated to ``int``.
    """
    theme_names = (
        'Work Life Balance',
        'Culture and Values',
        'Career Opportunities',
        'Compensation and Benefits',
        'Senior Management',
    )
    # The class string (including its trailing space) is kept exactly as is.
    rating_spans = review.find_all('span', {'class': "gdBars gdRatings med "})
    scores = {}
    for name, span in zip(theme_names, rating_spans):
        scores[name] = int(float(span.attrs['title']))
    return scores

def get_jobtitle(review):
    """Return the text of the reviewer's job-title span.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The span's text, or the falsy lookup result itself (None when the
        element is missing) if no span was found.
    """
    node = review.find('span', {'class': 'authorJobTitle middle reviewer'})
    if not node:
        return node
    return node.text

def get_location(review):
    """Return the text of the reviewer's location span.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The span's text, or the falsy lookup result itself (None when the
        element is missing) if no span was found.
    """
    node = review.find('span', {'class': 'authorLocation middle'})
    return node.text if node else node

def get_recommendations(review):
    """Collect the recommendation labels attached to a review.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        Dict with keys ``'Recommendation 1'``, ``'Recommendation 2'``, ...
        (in document order) mapped to each label's text, or None when the
        review has no recommendations container or the container holds no
        labels.
    """
    container = review.find('div', {'class': 'flex-grid recommends'})
    if container is None:
        # No recommendations block at all: report "nothing" the same way
        # as an empty block instead of failing on None.find_all.
        return None
    labels = container.find_all('span', class_='middle')
    if not labels:
        return None
    return {'Recommendation {}'.format(number): label.getText()
            for number, label in enumerate(labels, start=1)}

def get_maintext(review):
    """Return the review's main-text paragraph with NBSPs normalised.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The paragraph text with every non-breaking space (U+00A0) replaced
        by an ordinary space, or the falsy lookup result itself (None when
        the element is missing) if no paragraph was found.
    """
    paragraph = review.find('p', {'class': ' tightBot mainText'})
    if not paragraph:
        return paragraph
    return paragraph.text.replace(u'\xa0', u' ')

def get_pros(review):
    """Return the text of the review's "pros" paragraph.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The paragraph's text, or the falsy lookup result itself (None when
        the element is missing) if no paragraph was found.
    """
    paragraph = review.find('p', {'class': ' pros mainText truncateThis wrapToggleStr'})
    if not paragraph:
        return paragraph
    return paragraph.text

def get_cons(review):
    """Return the text of the review's "cons" paragraph.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The paragraph's text, or the falsy lookup result itself (None when
        the element is missing) if no paragraph was found.
    """
    paragraph = review.find('p', {'class': ' cons mainText truncateThis wrapToggleStr'})
    return paragraph.text if paragraph else paragraph

def get_advice(review):
    """Return the text of the review's "advice to management" paragraph.

    The previous lookup used the full class string
    ``'dviceMgmt mainText truncateThis wrapToggleStr truncatedThis pointer'``,
    whose first class name is missing its leading "a"; in the recorded
    session it matched nothing. The lookup now keys on the single
    ``adviceMgmt`` class, so it does not depend on which other classes the
    paragraph carries or on their order.
    NOTE(review): ``adviceMgmt`` is inferred from the truncated name and the
    sibling ``pros``/``cons`` selectors - confirm against the page markup.

    Args:
        review: Parsed review element supporting ``find``.

    Returns:
        The paragraph's text, or None when the review has no such paragraph.
    """
    advice = review.find('p', class_='adviceMgmt')
    if advice is None:
        return None
    return advice.text

In [5]:
# 'www' has no http/https scheme, so get_crawler() takes its non-web branch
# and builds a LocalCrawler for it.
# NOTE(review): 'www' looks like a placeholder for the real location - confirm.
uri = 'www'
crawler = get_crawler(uri)

In [13]:
# Take the next item produced by crawler.get_page() (presumably one page of
# HTML) and parse it with the "html.parser" backend.
HTML = next(crawler.get_page())
soup = BeautifulSoup(HTML, "html.parser")
# Collect the <li> elements that hold the reviews.
# NOTE(review): a list of class names is expected to match elements carrying
# ANY of the listed classes, not necessarily both - verify this is intended.
reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })

In [14]:
# Use the second matched element (index 1) as the sample for the cells below.
review = reviews[1]

In [65]:
# Spot-check: job title extracted from the sample review.
get_jobtitle(review)

'Former Employee - Sales Representative'

In [66]:
# Spot-check: reviewer location extracted from the sample review.
get_location(review)

'Cleveland, OH'

In [67]:
# Spot-check: numbered recommendation labels of the sample review.
get_recommendations(review)

{'Recommendation 1': "Doesn't Recommend",
 'Recommendation 2': 'Negative Outlook'}

In [68]:
# Spot-check: main-text paragraph of the sample review.
get_maintext(review)

'I worked at Sandvik full-time (More than 10 years)'

In [69]:
# Spot-check: "pros" paragraph of the sample review.
get_pros(review)

'Benefits - Some old timers left with a lot of industry knowledge to learn from.'

In [70]:
# Spot-check: "cons" paragraph of the sample review.
get_cons(review)

"Endless management reorganization. Intellectual capital and good managers are leaving. Many managers have insufficient knowledge or experience to manage sales teams and above. Ethical issues. Code of conduct isn't adhered to, problems just get swept under the carpet. Marketing seems lost."

In [71]:
# Spot-check: "advice" paragraph of the sample review. No output is recorded
# for this cell, i.e. the call produced nothing to display (a None result).
get_advice(review)

In [72]:
# Keep the raw lookup result (the same selector get_location uses) so the
# element itself can be inspected in the next cell.
location = review.find('span', {'class': 'authorLocation middle'})

In [77]:
# IPython help query (trailing "?") for the element's getText method; this is
# interactive-shell syntax, not plain Python.
location.getText?

In [78]:
# Direct repeat of the lookup performed inside get_advice.
# NOTE(review): the class string starts with 'dviceMgmt', which looks like
# 'adviceMgmt' with the first letter dropped - a likely reason the lookup
# finds nothing. Confirm against the page markup.
review.find('p', {'class': 'dviceMgmt mainText truncateThis wrapToggleStr truncatedThis pointer'})