In [2]:
# This program scrapes the list of companies located in California and their overview information
# referencing code from this github: https://github.com/MatthewChatham/glassdoor-review-scraper
import time
import pandas as pd
import logging
import logging.config
from selenium import webdriver as wd
from selenium.common.exceptions import TimeoutException
import selenium
import numpy as np
import json
import urllib
import datetime as dt

# grab login information from 'secret.json'
# secret.json should be a file that includes your username and password information so that it can typed into the glassdoor login page later
try:
    with open('secret.json') as f:
        d = json.loads(f.read())
        username = d['username']
        password = d['password']
except FileNotFoundError:
    msg = 'Please provide Glassdoor credentials.'
    raise Exception(msg)
    
# set up logger tracking and information 
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)
formatter = logging.Formatter(
    '%(asctime)s %(levelname)s %(lineno)d\
    :%(filename)s(%(process)d) - %(message)s')
ch.setFormatter(formatter)

logging.getLogger('selenium').setLevel(logging.CRITICAL)
logging.getLogger('selenium').setLevel(logging.CRITICAL)

In [149]:
# open up a browser to start testing
def get_browser():
    logger.info('Configuring browser')
    chrome_options = wd.ChromeOptions()
    chrome_options.add_argument('log-level=3')
    browser = wd.Chrome(options=chrome_options, executable_path='./chromedriver.exe')
    return browser

browser = get_browser()

2019-06-08 20:03:11,278 INFO 3    :<ipython-input-149-ab27556e724b>(477) - Configuring browser
2019-06-08 20:03:11,278 INFO 3    :<ipython-input-149-ab27556e724b>(477) - Configuring browser


In [150]:
# sign in to Glassdoor
def sign_in():
    logger.info(f'Signing in as {username}')

    url = 'https://www.glassdoor.com/profile/login_input.htm'
    browser.get(url)

    email_field = browser.find_element_by_name('username')
    password_field = browser.find_element_by_name('password')
    submit_btn = browser.find_element_by_xpath('//button[@type="submit"]')

    email_field.send_keys(username)
    password_field.send_keys(password)
    submit_btn.click()

    time.sleep(1)
    
sign_in()

2019-06-08 20:03:19,828 INFO 3    :<ipython-input-150-ccc1d71de424>(477) - Signing in as hanmorinn@gmail.com
2019-06-08 20:03:19,828 INFO 3    :<ipython-input-150-ccc1d71de424>(477) - Signing in as hanmorinn@gmail.com


In [41]:
# to navigate to the right page
def navigate_to_page(url):
    logger.info('Navigating to company list in California.')
    browser.get(url)
    time.sleep(1)

# navigate to the list of companies page under California
# add a number before .htm to reach a specific page
DEFAULT_URL = ('https://www.glassdoor.com/Reviews/california-reviews-SRCH_IL.0,10_IS2280.htm')
# page = '129'
# DEFAULT_URL = ('https://www.glassdoor.com/Reviews/california-reviews-SRCH_IL.0,10_IS2280_IP' + page + '.htm')
navigate_to_page(DEFAULT_URL)

2019-06-07 16:54:45,901 INFO 3    :<ipython-input-41-d21029be15ed>(477) - Navigating to company list in California.


In [42]:
# helper functions for collecting lists of companies

# grab company titles and the link urls of their page
def get_company_list():
    # get the web elements of the 10 companies on the page
    tcompanies = browser.find_elements_by_class_name('margBotXs')
    for company in tcompanies:
        name = company.text.strip()
        url = company.find_element_by_tag_name('a').get_attribute("href");
        company_list.append((name, url))
    time.sleep(1)
        
# go to the next page
def get_next_page():
    paging_control = browser.find_element_by_class_name('pagingControls')
    next_ = paging_control.find_element_by_class_name(
        'next').find_element_by_tag_name('a')
    browser.get(next_.get_attribute('href'))
    time.sleep(1)

In [19]:
# main function that calls the previous scraping functions

pagenum = 0 #129 # start at one
pagemax = 1 #690 # max pages we should try to scrape for now
company_list = []
error_list = []
json_name = "compaylistfirst.json" # file name to store company names and url into

# starts a timer to see how long the program takes
start = time.time()

for page in range(pagenum, pagemax):
    try:
        logger.info(f'Extracting company names from page {page}')
        # get the company list information and add to the dictionary
        get_company_list()
        get_next_page()
    except TimeoutException as e:
        print("Exception has been thrown. " + str(e))
        logger.info(f'Timed out on page {page}')
        error_list.append(page)
    
    if page % 100 == 0:
        with open(json_name, 'w') as json_file:
            logger.info(f'Updating json file now that we hit the {page}th page')
            json.dump(company_list, json_file)
               
logger.info(f'Finished collecting!')
    
# save as a json file
with open(json_name, 'w') as json_file:
  json.dump(company_list, json_file)

# end recording time
end = time.time()
logger.info(f'Finished in {end - start} seconds')

2019-06-07 13:16:34,691 INFO 14    :<ipython-input-19-c3aa0be64ff7>(477) - Extracting company names from page 0
2019-06-07 13:16:39,281 INFO 25    :<ipython-input-19-c3aa0be64ff7>(477) - Updating json file now that we hit the 0th page
2019-06-07 13:16:39,283 INFO 28    :<ipython-input-19-c3aa0be64ff7>(477) - Finished collecting!
2019-06-07 13:16:39,284 INFO 36    :<ipython-input-19-c3aa0be64ff7>(477) - Finished in 4.593361139297485 seconds


In [87]:
len(company_list)
print(error_list)

# check the pages where there was a timeout and run the code again to grab their information
for page in error_list:
    url = "https://www.glassdoor.com/Reviews/california-reviews-SRCH_IL.0,10_IS2280_IP" + str(page) + ".htm"
    try:
        logger.info(f'Extracting company names from page {page}')
        # get the company list information and add to the dictionary
        get_company_list()
    except TimeoutException as e:
        print("Exception has been thrown. " + str(e))
        logger.info(f'Timed out on page {page}')
        error_list.append(page)
        
logger.info(f'Finished collecting!')
        
# save as a json file again
with open(json_name, 'w') as json_file:
  json.dump(company_list, json_file)

2019-06-02 23:42:59,417 INFO 8    :<ipython-input-87-50159119542f>(126) - Extracting company names from page 383


[383]


2019-06-02 23:43:00,777 INFO 16    :<ipython-input-87-50159119542f>(126) - Finished collecting!


In [20]:
# check the length of the company list to see if the amount of companies scraped matches the pages visited
len(company_list)

10

In [119]:
# function that takes in a company dict and fills it overview info values
def get_company_overview(company):
    header = browser.find_element_by_id('EIProductHeaders')
    
    # find the salary url for jac
    salary = header.find_element_by_class_name('salaries').get_attribute('href')
    company.update({"Salary URL": salary})
    
    # find the benefits url for zoe
    benefits = header.find_element_by_class_name('benefits').get_attribute('href')
    company.update({"Benefits URL": benefits})
    
    # also find the amount of jobs available for the company
    jobs = header.find_element_by_class_name('jobs').text.strip("Jobs").rstrip()
    company.update({"Jobs Available": jobs})
    
    # also find the locations of the company
    try:
        loccon = browser.find_element_by_id('ZCol').find_element_by_class_name("undecorated").find_elements_by_xpath('./li/a')
        loclist = []
        for location in loccon:
            loclist.append(location.get_attribute("innerHTML"))
        company.update({"Locations": loclist})
    except:
        company.update({"Locations": "N/A"})
    

    # find overview information for tina (me)
    overview = browser.find_elements_by_class_name('infoEntity')
    for info in overview:
        label = info.find_element_by_tag_name('label').text
        value = info.find_element_by_class_name('value').text
        company.update({label: value})

In [None]:
# if i need to open up a company list
# this code is for when I take breaks and close jupyter notebook, but kept scraped data in a file
# list_json = 'company.json'
# with open(list_json) as f:
#     company_list = json.load(f)

# print(company_list)

In [139]:
# initiliaze the company info scrape
company_info = []
error_companies = []
company_json = "all_company_info.json"

[('Bank of America',
  'https://www.glassdoor.com/Overview/Working-at-Bank-of-America-EI_IE8874.11,26.htm'),
 ('US Navy',
  'https://www.glassdoor.com/Overview/Working-at-US-Navy-EI_IE41451.11,18.htm'),
 ("Macy's",
  'https://www.glassdoor.com/Overview/Working-at-Macy-s-EI_IE1079.11,17.htm'),
 ('AT&T',
  'https://www.glassdoor.com/Overview/Working-at-AT-and-T-EI_IE613.11,19.htm'),
 ('Intuit',
  'https://www.glassdoor.com/Overview/Working-at-Intuit-EI_IE2293.11,17.htm'),
 ('US Marine Corps',
  'https://www.glassdoor.com/Overview/Working-at-US-Marine-Corps-EI_IE41423.11,26.htm'),
 ('Walmart',
  'https://www.glassdoor.com/Overview/Working-at-Walmart-EI_IE715.11,18.htm'),
 ('The Home Depot',
  'https://www.glassdoor.com/Overview/Working-at-The-Home-Depot-EI_IE655.11,25.htm'),
 ('Yahoo',
  'https://www.glassdoor.com/Overview/Working-at-Yahoo-EI_IE5807.11,16.htm'),
 ('Salesforce',
  'https://www.glassdoor.com/Overview/Working-at-Salesforce-EI_IE11159.11,21.htm'),
 ('Facebook', 'https://www.g

In [151]:
# starts a timer to see how long the program takes
start = time.time()

# from the company list
for index, (name, url) in enumerate(company_list[850:]):
    try: 
        browser.get(url)
        time.sleep(1)
        company = {"Name": name, "Overview URL": url}
        get_company_overview(company)
        company_info.append(company)
        logger.info(f'Visited {name}')
    except TimeoutException as e:
        print("Exception has been thrown. " + str(e))
        logger.info(f'Timed out on {name}, the {index}th company')
        error_companies.append((name, url))
    
    if index % 50 == 0:
        with open(company_json, 'w') as json_file:
            logger.info(f'Updating json file now that we hit the {index}th company')
            json.dump(company_info, json_file)
    
logger.info(f'Finished collecting!')
    
# just so it's readable, also save as a json file
with open(company_json, 'w') as json_file:
  json.dump(company_info, json_file)

# end recording time
end = time.time()
logger.info(f'Finished in {end - start} seconds')

2019-06-08 20:03:34,564 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kahuna
2019-06-08 20:03:34,564 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kahuna
2019-06-08 20:03:34,565 INFO 20    :<ipython-input-151-5d3b57b34eeb>(477) - Updating json file now that we hit the 0th company
2019-06-08 20:03:34,565 INFO 20    :<ipython-input-151-5d3b57b34eeb>(477) - Updating json file now that we hit the 0th company
2019-06-08 20:03:39,794 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited MindTouch, Inc.
2019-06-08 20:03:39,794 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited MindTouch, Inc.
2019-06-08 20:03:43,747 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited inhouseIT
2019-06-08 20:03:43,747 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited inhouseIT
2019-06-08 20:03:47,572 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited 6sense
2019-06-08 20:03:47,572 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited 6

2019-06-08 20:06:28,730 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Fandango
2019-06-08 20:06:28,730 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Fandango
2019-06-08 20:06:32,688 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Onestop Internet
2019-06-08 20:06:32,688 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Onestop Internet
2019-06-08 20:06:36,898 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited OpenGov
2019-06-08 20:06:36,898 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited OpenGov
2019-06-08 20:06:40,826 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LivingSocial
2019-06-08 20:06:40,826 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LivingSocial
2019-06-08 20:06:44,854 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited PubMatic
2019-06-08 20:06:44,854 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited PubMatic
2019-06-08 20:06:48,782 INFO 12    :<ipython-inp

2019-06-08 20:09:25,064 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nextgen Technologies
2019-06-08 20:09:25,064 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nextgen Technologies
2019-06-08 20:09:29,369 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sonic (Internet and Phone)
2019-06-08 20:09:29,369 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sonic (Internet and Phone)
2019-06-08 20:09:33,715 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Webdam
2019-06-08 20:09:33,715 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Webdam
2019-06-08 20:09:37,757 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited AnchorFree
2019-06-08 20:09:37,757 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited AnchorFree
2019-06-08 20:09:41,775 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Adchemy
2019-06-08 20:09:41,775 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Adchemy
2019-06-08 2

2019-06-08 20:12:15,824 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Cryptic Studios
2019-06-08 20:12:19,689 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Zodiac Inflight Innovations
2019-06-08 20:12:19,689 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Zodiac Inflight Innovations
2019-06-08 20:12:23,725 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LTI
2019-06-08 20:12:23,725 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LTI
2019-06-08 20:12:27,970 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Finisar
2019-06-08 20:12:27,970 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Finisar
2019-06-08 20:12:32,442 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Talend
2019-06-08 20:12:32,442 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Talend
2019-06-08 20:12:36,870 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Alteryx
2019-06-08 20:12:36,870 INFO 12    :<ipyth

2019-06-08 20:15:07,190 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited NTN Buzztime
2019-06-08 20:15:07,190 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited NTN Buzztime
2019-06-08 20:15:11,022 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SOLUTE
2019-06-08 20:15:11,022 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SOLUTE
2019-06-08 20:15:15,013 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Radius Intelligence
2019-06-08 20:15:15,013 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Radius Intelligence
2019-06-08 20:15:19,197 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Bloomspot
2019-06-08 20:15:19,197 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Bloomspot
2019-06-08 20:15:23,606 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Red Hat
2019-06-08 20:15:23,606 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Red Hat
2019-06-08 20:15:27,654 INFO 12    :<ipython

2019-06-08 20:18:05,200 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited EdgeWave
2019-06-08 20:18:05,200 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited EdgeWave
2019-06-08 20:18:08,869 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Indiegogo
2019-06-08 20:18:08,869 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Indiegogo
2019-06-08 20:18:12,773 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Velodyne LiDAR
2019-06-08 20:18:12,773 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Velodyne LiDAR
2019-06-08 20:18:16,996 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited ThousandEyes
2019-06-08 20:18:16,996 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited ThousandEyes
2019-06-08 20:18:20,946 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sysco LABS
2019-06-08 20:18:20,946 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sysco LABS
2019-06-08 20:18:24,899 INFO 12    :<ipython

2019-06-08 20:20:57,752 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Advantech
2019-06-08 20:20:57,752 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Advantech
2019-06-08 20:21:02,025 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Elastic
2019-06-08 20:21:02,025 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Elastic
2019-06-08 20:21:05,837 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Allied Digital Services
2019-06-08 20:21:05,837 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Allied Digital Services
2019-06-08 20:21:09,983 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited AudaExplore - a Solera Company
2019-06-08 20:21:09,983 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited AudaExplore - a Solera Company
2019-06-08 20:21:13,884 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Triple Crown Consulting
2019-06-08 20:21:13,884 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) 

2019-06-08 20:23:48,483 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LucasArts
2019-06-08 20:23:52,452 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited VSCO
2019-06-08 20:23:52,452 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited VSCO
2019-06-08 20:23:56,527 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited DeltaTRAK
2019-06-08 20:23:56,527 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited DeltaTRAK
2019-06-08 20:24:00,410 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited iolo technologies
2019-06-08 20:24:00,410 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited iolo technologies
2019-06-08 20:24:04,423 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited ID Analytics
2019-06-08 20:24:04,423 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited ID Analytics
2019-06-08 20:24:08,466 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Eyefinity
2019-06-08 20:24:08,466 INFO 12    :<ipython-inp

2019-06-08 20:26:44,639 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Aryaka Networks
2019-06-08 20:26:44,639 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Aryaka Networks
2019-06-08 20:26:48,706 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Druva
2019-06-08 20:26:48,706 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Druva
2019-06-08 20:26:52,192 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Darktrace
2019-06-08 20:26:52,192 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Darktrace
2019-06-08 20:26:56,315 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Fusion-io
2019-06-08 20:26:56,315 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Fusion-io
2019-06-08 20:27:00,296 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited MarketShare
2019-06-08 20:27:00,296 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited MarketShare
2019-06-08 20:27:04,180 INFO 12    :<ipython-input-1

2019-06-08 20:29:32,405 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Intercom
2019-06-08 20:29:36,731 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Prezi
2019-06-08 20:29:36,731 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Prezi
2019-06-08 20:29:40,799 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited TPx Communications
2019-06-08 20:29:40,799 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited TPx Communications
2019-06-08 20:29:44,659 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Symphony
2019-06-08 20:29:44,659 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Symphony
2019-06-08 20:29:48,683 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CollabNet
2019-06-08 20:29:48,683 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CollabNet
2019-06-08 20:29:52,796 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Skyhigh Networks
2019-06-08 20:29:52,796 INFO 12    :<ipython-i

2019-06-08 20:32:24,581 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Clique
2019-06-08 20:32:24,581 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Clique
2019-06-08 20:32:28,559 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nexenta Systems
2019-06-08 20:32:28,559 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nexenta Systems
2019-06-08 20:32:32,528 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kontron
2019-06-08 20:32:32,528 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kontron
2019-06-08 20:32:37,121 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited betterworks
2019-06-08 20:32:37,121 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited betterworks
2019-06-08 20:32:41,235 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Guidebook
2019-06-08 20:32:41,235 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Guidebook
2019-06-08 20:32:45,241 INFO 12    :<ipython-input-151

2019-06-08 20:35:11,458 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Picarro
2019-06-08 20:35:15,286 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Aptina Imaging
2019-06-08 20:35:15,286 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Aptina Imaging
2019-06-08 20:35:19,186 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Syapse
2019-06-08 20:35:19,186 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Syapse
2019-06-08 20:35:23,216 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Fastly
2019-06-08 20:35:23,216 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Fastly
2019-06-08 20:35:27,298 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited UrbanSitter
2019-06-08 20:35:27,298 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited UrbanSitter
2019-06-08 20:35:31,461 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited EVGA
2019-06-08 20:35:31,461 INFO 12    :<ipython-input-151-5d3b57b34e

2019-06-08 20:38:00,599 INFO 20    :<ipython-input-151-5d3b57b34eeb>(477) - Updating json file now that we hit the 500th company
2019-06-08 20:38:04,857 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Pivotal Labs
2019-06-08 20:38:04,857 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Pivotal Labs
2019-06-08 20:38:08,734 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited TEGILE SYSTEMS
2019-06-08 20:38:08,734 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited TEGILE SYSTEMS
2019-06-08 20:38:13,701 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Actian
2019-06-08 20:38:13,701 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Actian
2019-06-08 20:38:17,933 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited InfoObjects
2019-06-08 20:38:17,933 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited InfoObjects
2019-06-08 20:38:21,757 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Humanity
2019-06-08 2

2019-06-08 20:40:55,361 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Yamaha Motor
2019-06-08 20:40:59,291 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited WRKSHP
2019-06-08 20:40:59,291 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited WRKSHP
2019-06-08 20:41:03,246 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Thanx
2019-06-08 20:41:03,246 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Thanx
2019-06-08 20:41:07,578 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Mercari
2019-06-08 20:41:07,578 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Mercari
2019-06-08 20:41:11,340 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited HighWire
2019-06-08 20:41:11,340 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited HighWire
2019-06-08 20:41:15,225 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SLI Systems
2019-06-08 20:41:15,225 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) -

2019-06-08 20:43:51,628 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Safran
2019-06-08 20:43:51,628 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Safran
2019-06-08 20:43:55,667 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sprinklr
2019-06-08 20:43:55,667 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sprinklr
2019-06-08 20:43:59,578 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Zebra Technologies
2019-06-08 20:43:59,578 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Zebra Technologies
2019-06-08 20:44:04,109 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited FindLaw
2019-06-08 20:44:04,109 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited FindLaw
2019-06-08 20:44:08,485 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Spark Networks
2019-06-08 20:44:08,485 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Spark Networks
2019-06-08 20:44:12,292 INFO 12    :<ipython

2019-06-08 20:46:42,957 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Climber.com
2019-06-08 20:46:42,957 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Climber.com
2019-06-08 20:46:46,924 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CliniComp
2019-06-08 20:46:46,924 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CliniComp
2019-06-08 20:46:50,830 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Echelon Corporation
2019-06-08 20:46:50,830 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Echelon Corporation
2019-06-08 20:46:54,499 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited ListReports
2019-06-08 20:46:54,499 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited ListReports
2019-06-08 20:46:58,699 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited KickFire
2019-06-08 20:46:58,699 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited KickFire
2019-06-08 20:47:02,573 INFO 12   

2019-06-08 20:49:28,236 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Certain
2019-06-08 20:49:28,236 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Certain
2019-06-08 20:49:32,114 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Power Integrations
2019-06-08 20:49:32,114 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Power Integrations
2019-06-08 20:49:36,199 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Originate
2019-06-08 20:49:36,199 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Originate
2019-06-08 20:49:40,009 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Arena Solutions
2019-06-08 20:49:40,009 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Arena Solutions
2019-06-08 20:49:44,435 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Redbooth
2019-06-08 20:49:44,435 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Redbooth
2019-06-08 20:49:48,555 INFO 12    :

2019-06-08 20:52:29,080 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Flipboard
2019-06-08 20:52:32,874 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Printronix
2019-06-08 20:52:32,874 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Printronix
2019-06-08 20:52:36,877 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Symmetricom
2019-06-08 20:52:36,877 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Symmetricom
2019-06-08 20:52:40,691 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Bluescape
2019-06-08 20:52:40,691 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Bluescape
2019-06-08 20:52:44,847 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Docker
2019-06-08 20:52:44,847 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Docker
2019-06-08 20:52:48,982 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nowcom
2019-06-08 20:52:48,982 INFO 12    :<ipython-input-151-5d3b57b3

2019-06-08 20:55:19,205 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Verizon Connect
2019-06-08 20:55:23,272 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SmartPlay
2019-06-08 20:55:23,272 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SmartPlay
2019-06-08 20:55:27,329 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Gateway
2019-06-08 20:55:27,329 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Gateway
2019-06-08 20:55:31,891 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Vlocity
2019-06-08 20:55:31,891 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Vlocity
2019-06-08 20:55:36,100 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Scality
2019-06-08 20:55:36,100 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Scality
2019-06-08 20:55:39,989 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited AMADA
2019-06-08 20:55:39,989 INFO 12    :<ipython-input-151-5d3b57b34eeb>(4

2019-06-08 20:58:18,481 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited RFI Comm. & Security Systems
2019-06-08 20:58:18,481 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited RFI Comm. & Security Systems
2019-06-08 20:58:22,556 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Life360
2019-06-08 20:58:22,556 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Life360
2019-06-08 20:58:26,962 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited WineDirect
2019-06-08 20:58:26,962 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited WineDirect
2019-06-08 20:58:31,044 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Verys
2019-06-08 20:58:31,044 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Verys
2019-06-08 20:58:35,216 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Remix
2019-06-08 20:58:35,216 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Remix
2019-06-08 20:58:35,218 INFO 20    :<ipy

2019-06-08 21:01:14,070 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Warner Bros. Interactive Entertainment
2019-06-08 21:01:14,070 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Warner Bros. Interactive Entertainment
2019-06-08 21:01:18,343 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Tom Sawyer Software
2019-06-08 21:01:18,343 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Tom Sawyer Software
2019-06-08 21:01:22,570 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SecureAuth
2019-06-08 21:01:22,570 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SecureAuth
2019-06-08 21:01:26,821 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nextlabs
2019-06-08 21:01:26,821 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Nextlabs
2019-06-08 21:01:30,948 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Edlio
2019-06-08 21:01:30,948 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visi

2019-06-08 21:04:03,590 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kamcord
2019-06-08 21:04:03,590 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kamcord
2019-06-08 21:04:07,495 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited GoPago
2019-06-08 21:04:07,495 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited GoPago
2019-06-08 21:04:11,302 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Business Communications Solutions
2019-06-08 21:04:11,302 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Business Communications Solutions
2019-06-08 21:04:15,245 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Totango
2019-06-08 21:04:15,245 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Totango
2019-06-08 21:04:19,231 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sendoso
2019-06-08 21:04:19,231 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Sendoso
2019-06-08 21:04:22,976 INFO 1

2019-06-08 21:06:55,620 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Partnerize
2019-06-08 21:06:59,503 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited RagingWire Data Centers
2019-06-08 21:06:59,503 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited RagingWire Data Centers
2019-06-08 21:07:03,476 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Quantros
2019-06-08 21:07:03,476 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Quantros
2019-06-08 21:07:07,827 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SIMCO Electronics
2019-06-08 21:07:07,827 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SIMCO Electronics
2019-06-08 21:07:11,471 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Certent
2019-06-08 21:07:11,471 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Certent
2019-06-08 21:07:15,434 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Dremio
2019-06-08 21:07:15,434 

2019-06-08 21:09:44,455 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Lattice
2019-06-08 21:09:44,455 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Lattice
2019-06-08 21:09:48,715 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Cognitive Medical Systems
2019-06-08 21:09:48,715 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Cognitive Medical Systems
2019-06-08 21:09:52,765 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited VeloCloud
2019-06-08 21:09:52,765 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited VeloCloud
2019-06-08 21:09:57,004 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited eatsa
2019-06-08 21:09:57,004 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited eatsa
2019-06-08 21:10:01,393 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited DecisionLogic
2019-06-08 21:10:01,393 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited DecisionLogic
2019-06-08 21:10:05,370 INFO 12 

2019-06-08 21:12:31,454 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Redbubble
2019-06-08 21:12:35,487 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kana
2019-06-08 21:12:35,487 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Kana
2019-06-08 21:12:39,361 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SAP Business Objects
2019-06-08 21:12:39,361 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SAP Business Objects
2019-06-08 21:12:43,238 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Cameo Global
2019-06-08 21:12:43,238 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Cameo Global
2019-06-08 21:12:47,904 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LiveVox
2019-06-08 21:12:47,904 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited LiveVox
2019-06-08 21:12:51,774 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited GuruCul
2019-06-08 21:12:51,774 INFO 12    :<ipython-inp

2019-06-08 21:15:25,629 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited way.com
2019-06-08 21:15:29,431 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited OUTSOURCE Consulting Services
2019-06-08 21:15:29,431 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited OUTSOURCE Consulting Services
2019-06-08 21:15:33,445 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Remind
2019-06-08 21:15:33,445 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Remind
2019-06-08 21:15:37,305 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited BDNA
2019-06-08 21:15:37,305 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited BDNA
2019-06-08 21:15:41,324 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SugarSync
2019-06-08 21:15:41,324 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited SugarSync
2019-06-08 21:15:45,237 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Rdio
2019-06-08 21:15:45,237 INFO 12    :<ipytho

2019-06-08 21:18:13,138 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited TechValidate Software
2019-06-08 21:18:17,011 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Supertex
2019-06-08 21:18:17,011 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Supertex
2019-06-08 21:18:20,994 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Wiredrive
2019-06-08 21:18:20,994 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Wiredrive
2019-06-08 21:18:25,204 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CDK Global
2019-06-08 21:18:25,204 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CDK Global
2019-06-08 21:18:29,644 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Blackbaud
2019-06-08 21:18:29,644 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited Blackbaud
2019-06-08 21:18:33,745 INFO 12    :<ipython-input-151-5d3b57b34eeb>(477) - Visited CSS Corporation
2019-06-08 21:18:33,745 INFO 12    :<ipyth

In [152]:
len(company_info)
# check if length matches company_list

1994

In [99]:
# check the pages where there was a timeout and run the code again to grab their information
print(len(error_companies))

for index, (name, url) in enumerate(error_companies):
    try: 
        browser.get(url)
        time.sleep(2)
        company = {"Name": name, "Overview URL": url}
        get_company_overview(company)
        company_info.append(company)
        logger.info(f'Visited {name}')
    except TimeoutException as e:
        print("Exception has been thrown. " + str(e))
        logger.info(f'Timed out on {name}, the {index}th company')
        error_companies.append((name, url))
        
logger.info(f'Finished collecting!')
        
# save as a json file again
with open(company_json, 'w') as json_file:
  json.dump(company_info, json_file)

[('Alta Resources', 'https://www.glassdoor.com/Overview/Working-at-Alta-Resources-EI_IE266763.11,25.htm'), ('Littler Mendelson', 'https://www.glassdoor.com/Overview/Working-at-Littler-Mendelson-EI_IE7151.11,28.htm'), ('PIRCH', 'https://www.glassdoor.com/Overview/Working-at-PIRCH-EI_IE784380.11,16.htm'), ('Empyrean Solar', 'https://www.glassdoor.com/Overview/Working-at-Empyrean-Solar-EI_IE753157.11,25.htm'), ('Princeton Review', 'https://www.glassdoor.com/Overview/Working-at-Princeton-Review-EI_IE3988.11,27.htm'), ('Opportunities for Learning', 'https://www.glassdoor.com/Overview/Working-at-Opportunities-for-Learning-EI_IE410326.11,37.htm'), ('7 for All Mankind', 'https://www.glassdoor.com/Overview/Working-at-7-for-All-Mankind-EI_IE295657.11,28.htm'), ('Precision Castparts', 'https://www.glassdoor.com/Overview/Working-at-Precision-Castparts-EI_IE1798.11,30.htm'), ('FCB Global', 'https://www.glassdoor.com/Overview/Working-at-FCB-Global-EI_IE16031.11,21.htm'), ('Samasource', 'https://www.

2019-06-03 13:50:10,545 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Alta Resources
2019-06-03 13:50:10,545 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Alta Resources
2019-06-03 13:50:16,220 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Littler Mendelson
2019-06-03 13:50:16,220 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Littler Mendelson
2019-06-03 13:50:20,288 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited PIRCH
2019-06-03 13:50:20,288 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited PIRCH
2019-06-03 13:50:24,394 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Empyrean Solar
2019-06-03 13:50:24,394 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Empyrean Solar
2019-06-03 13:50:28,548 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Princeton Review
2019-06-03 13:50:28,548 INFO 11    :<ipython-input-99-f8d20ea7a183>(126) - Visited Princeton Review
2019-06-03 13:50:34,286 INFO

In [4]:
# if i need to open up a company info list
# once again, this code is for when I take breaks and close jupyter notebook, but kept scraped data in a file
# list_json = 'company_info4-2.json'
# with open(list_json) as f:
#     company_info = json.load(f)

# len(company_info)

5641

In [121]:
# convert scraped data into a dataframe
company_df = pd.DataFrame(company_info)

# store dataframe information into a csv file
csv_file = "firstpage.csv"
company_df.to_csv(csv_file, index=False, encoding='utf-8')

company_df

Unnamed: 0,Benefits URL,Competitors,Founded,Headquarters,Industry,Jobs Available,Locations,Name,Overview URL,Revenue,Salary URL,Size,Type,Website
0,https://www.glassdoor.com/Benefits/Cisco-Syste...,,1984,"San Jose, CA",Computer Hardware & Software,2.2k,,Cisco Systems,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Cisco-Systems...,10000+ employees,Company - Private,www.cisco.com
1,https://www.glassdoor.com/Benefits/Kaiser-Perm...,,1945,"Oakland, CA",Health Care Services & Hospitals,3.0k,,Kaiser Permanente,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Kaiser-Perman...,10000+ employees,Nonprofit Organization,www.kaiserpermanente.org
2,https://www.glassdoor.com/Benefits/Apple-US-Be...,,1976,"Cupertino, CA",Computer Hardware & Software,4.8k,,Apple,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Apple-Salarie...,10000+ employees,Company - Public (AAPL),www.apple.com
3,https://www.glassdoor.com/Benefits/Oracle-US-B...,,1977,"Redwood City, CA",Enterprise Software & Network Solutions,11k,"[Austin, TX, Beijing, Beijing, Bellevue, WA, B...",Oracle,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Oracle-Salari...,10000+ employees,Company - Public (ORCL),www.oracle.com
4,https://www.glassdoor.com/Benefits/Wells-Fargo...,,1852,"San Francisco, CA",,5.2k,,Wells Fargo,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Wells-Fargo-S...,10000+ employees,Company - Public (WFC),www.wellsfargo.com
5,https://www.glassdoor.com/Benefits/Google-US-B...,"Microsoft, Apple, Facebook",1998,"Mountain View, CA",Internet,4.5k,"[Ann Arbor, MI, Austin, TX, Cambridge, MA, Chi...",Google,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Google-Salari...,10000+ employees,Company - Public (GOOG),www.google.com
6,https://www.glassdoor.com/Benefits/Target-US-B...,,1962,"Minneapolis, MN",General Merchandise & Superstores,13k,,Target,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Target-Salari...,10000+ employees,Company - Public (TGT),www.target.com
7,https://www.glassdoor.com/Benefits/Qualcomm-US...,"Intel Corporation, MediaTek, Broadcom",1985,"San Diego, CA",Computer Hardware & Software,1.6k,"[Atlanta, GA, Austin, TX, Boulder, CO, Boxboro...",Qualcomm,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Qualcomm-Sala...,10000+ employees,Company - Public (QCOM),www.qualcomm.com
8,https://www.glassdoor.com/Benefits/Starbucks-U...,"Dunkin' Brands, McDonald's",1971,"Seattle, WA",Fast-Food & Quick-Service Restaurants,43k,"[Amsterdam (Netherlands), Burbank, CA, Chicago...",Starbucks,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Starbucks-Sal...,10000+ employees,Company - Public (SBUX),www.starbucks.com
9,https://www.glassdoor.com/Benefits/Intel-Corpo...,,1968,"Santa Clara, CA",Computer Hardware & Software,2.0k,"[Bayan Lepas (Malaysia), Bengaluru (India), Gd...",Intel Corporation,https://www.glassdoor.com/Overview/Working-at-...,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Intel-Corpora...,10000+ employees,Company - Public (INTC),www.intel.com


In [6]:
# check the unique industries and see which one should count as a tech company
company_df.drop_duplicates(['Industry'])

Unnamed: 0,Benefits URL,Competitors,Founded,Headquarters,Industry,Name,Now known as,Overview URL,Part of,Revenue,Salary URL,Size,Type,Website
0,https://www.glassdoor.com/Benefits/Vallarta-Su...,Unknown,1999,"Sylmar, CA",Vehicle Dealers,Vallarta Supermarkets,,https://www.glassdoor.com/Overview/Working-at-...,,$500 million to $1 billion (USD) per year,https://www.glassdoor.com/Salary/Vallarta-Supe...,1001 to 5000 employees,Company - Private,www.vallartasupermarkets.com
1,https://www.glassdoor.com/Benefits/Booz-Allen-...,,1914,"Mc Lean, VA",Consulting,Booz Allen Hamilton,,https://www.glassdoor.com/Overview/Working-at-...,,$5 to $10 billion (USD) per year,https://www.glassdoor.com/Salary/Booz-Allen-Ha...,10000+ employees,Company - Public (BAH),www.boozallen.com
2,https://www.glassdoor.com/Benefits/Tech-Mahind...,"Wipro, Infosys, HCL Technologies",1986,Pune (India),IT Services,Tech Mahindra,,https://www.glassdoor.com/Overview/Working-at-...,,$2 to $5 billion (USD) per year,https://www.glassdoor.com/Salary/Tech-Mahindra...,10000+ employees,Company - Public (TECHM),www.techmahindra.com
3,https://www.glassdoor.com/Benefits/NCR-US-Bene...,Unknown,1884,"Atlanta, GA",Computer Hardware & Software,NCR,,https://www.glassdoor.com/Overview/Working-at-...,,$5 to $10 billion (USD) per year,https://www.glassdoor.com/Salary/NCR-Salaries-...,10000+ employees,Company - Public (NCR),www.ncr.com
4,https://www.glassdoor.com/Benefits/TJX-Compani...,,1976,"Framingham, MA","Department, Clothing, & Shoe Stores",TJX Companies,,https://www.glassdoor.com/Overview/Working-at-...,,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/TJX-Companies...,10000+ employees,Company - Public (TJX),www.jobs.tjx.com
5,https://www.glassdoor.com/Benefits/Covance-US-...,,1996,"Princeton, NJ",Biotech & Pharmaceuticals,Covance,,https://www.glassdoor.com/Overview/Working-at-...,LabCorp,$10+ billion (USD) per year,https://www.glassdoor.com/Salary/Covance-Salar...,10000+ employees,Company - Public,www.covance.com
6,https://www.glassdoor.com/Benefits/Stantec-US-...,Unknown,1954,"Edmonton, AB (Canada)",Architectural & Engineering Services,Stantec,,https://www.glassdoor.com/Overview/Working-at-...,,$1 to $2 billion (USD) per year,https://www.glassdoor.com/Salary/Stantec-Salar...,10000+ employees,Company - Public (STN),www.stantec.com
7,https://www.glassdoor.com/Benefits/Delaware-No...,,1915,"Buffalo, NY",Catering & Food Service Contractors,Delaware North,,https://www.glassdoor.com/Overview/Working-at-...,,$2 to $5 billion (USD) per year,https://www.glassdoor.com/Salary/Delaware-Nort...,10000+ employees,Company - Private,www.delawarenorth.com
8,https://www.glassdoor.com/Benefits/Towne-Park-...,Unknown,1988,"Annapolis, MD","Hotels, Motels, & Resorts",Towne Park,,https://www.glassdoor.com/Overview/Working-at-...,,$100 to $500 million (USD) per year,https://www.glassdoor.com/Salary/Towne-Park-Sa...,10000+ employees,Company - Private,www.townepark.com
10,https://www.glassdoor.com/Benefits/Compass-US-...,,2012,"New York, NY",Enterprise Software & Network Solutions,Compass,,https://www.glassdoor.com/Overview/Working-at-...,,$500 million to $1 billion (USD) per year,https://www.glassdoor.com/Salary/Compass-Salar...,1001 to 5000 employees,Company - Private,www.compass.com


In [11]:
# filter dataframe to only include tech companies
tech_company = ['Computer Hardware & Software',
                'Internet',
                'IT Services',
                'Video Games',
                'Enterprise Software & Network Solutions',
                'Transportation Equipment Manufacturing',
                'Electrical & Electronic Manufacturing']

tech_df = company_df[company_df['Industry'].isin(tech_company)]
tech_df

# store dataframe information into a csv file
csv_file = "1-690tech.csv"
tech_df.to_csv(csv_file, index=False, encoding='utf-8')

tech_df

Unnamed: 0,Benefits URL,Competitors,Founded,Headquarters,Industry,Name,Now known as,Overview URL,Part of,Revenue,Salary URL,Size,Type,Website
2,https://www.glassdoor.com/Benefits/Tech-Mahind...,"Wipro, Infosys, HCL Technologies",1986,Pune (India),IT Services,Tech Mahindra,,https://www.glassdoor.com/Overview/Working-at-...,,$2 to $5 billion (USD) per year,https://www.glassdoor.com/Salary/Tech-Mahindra...,10000+ employees,Company - Public (TECHM),www.techmahindra.com
3,https://www.glassdoor.com/Benefits/NCR-US-Bene...,Unknown,1884,"Atlanta, GA",Computer Hardware & Software,NCR,,https://www.glassdoor.com/Overview/Working-at-...,,$5 to $10 billion (USD) per year,https://www.glassdoor.com/Salary/NCR-Salaries-...,10000+ employees,Company - Public (NCR),www.ncr.com
10,https://www.glassdoor.com/Benefits/Compass-US-...,,2012,"New York, NY",Enterprise Software & Network Solutions,Compass,,https://www.glassdoor.com/Overview/Working-at-...,,$500 million to $1 billion (USD) per year,https://www.glassdoor.com/Salary/Compass-Salar...,1001 to 5000 employees,Company - Private,www.compass.com
13,https://www.glassdoor.com/Benefits/eTouch-Syst...,Unknown,1997,"Fremont, CA",IT Services,eTouch Systems,,https://www.glassdoor.com/Overview/Working-at-...,,$50 to $100 million (USD) per year,https://www.glassdoor.com/Salary/eTouch-System...,501 to 1000 employees,Company - Private,www.etouch.net
14,https://www.glassdoor.com/Benefits/Adaptive-In...,,2003,"Palo Alto, CA",Enterprise Software & Network Solutions,Adaptive Insights,,https://www.glassdoor.com/Overview/Working-at-...,Workday,Unknown / Non-Applicable,https://www.glassdoor.com/Salary/Adaptive-Insi...,501 to 1000 employees,Company - Public,www.adaptiveinsights.com
24,https://www.glassdoor.com/Benefits/MobiTV-US-B...,,1999,"Emeryville, CA",Computer Hardware & Software,MobiTV,,https://www.glassdoor.com/Overview/Working-at-...,,Unknown / Non-Applicable,https://www.glassdoor.com/Salary/MobiTV-Salari...,201 to 500 employees,Company - Private,www.mobitv.com
34,https://www.glassdoor.com/Benefits/Check-Point...,,1993,"San Carlos, CA",Computer Hardware & Software,Check Point Software Technologies,,https://www.glassdoor.com/Overview/Working-at-...,,$1 to $2 billion (USD) per year,https://www.glassdoor.com/Salary/Check-Point-S...,5001 to 10000 employees,Company - Public (CHKP),www.checkpoint.com
36,https://www.glassdoor.com/Benefits/Xactly-Corp...,,2005,"San Jose, CA",Computer Hardware & Software,Xactly Corp.,,https://www.glassdoor.com/Overview/Working-at-...,Vista Equity Partners,$100 to $500 million (USD) per year,https://www.glassdoor.com/Salary/Xactly-Corp-S...,501 to 1000 employees,Company - Private,www.xactlycorp.com
49,https://www.glassdoor.com/Benefits/Hightail-US...,Unknown,2004,"Campbell, CA",Internet,Hightail,,https://www.glassdoor.com/Overview/Working-at-...,,$10 to $25 million (USD) per year,https://www.glassdoor.com/Salary/Hightail-Sala...,51 to 200 employees,Company - Private,www.hightail.com
50,https://www.glassdoor.com/Benefits/Velocify-US...,"InsideSales.Com, SalesLoft, Outreach",2004,"El Segundo, CA",Computer Hardware & Software,Velocify,,https://www.glassdoor.com/Overview/Working-at-...,Ellie Mae,$50 to $100 million (USD) per year,https://www.glassdoor.com/Salary/Velocify-Sala...,201 to 500 employees,Subsidiary or Business Segment,www.velocify.com


In [12]:
tech_df_short = tech_df[['Name', 'Industry', 'Overview URL', 'Salary URL', 'Benefits URL']]
tech_df_short

# store dataframe information into a csv file
# this is to send to jac and zoe
csv_file = "1-690short.csv"
tech_df_short.to_csv(csv_file, index=False, encoding='utf-8')

In [13]:
# print the dataframe to check
tech_df_short

Unnamed: 0,Name,Industry,Overview URL,Salary URL,Benefits URL
2,Tech Mahindra,IT Services,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Tech-Mahindra...,https://www.glassdoor.com/Benefits/Tech-Mahind...
3,NCR,Computer Hardware & Software,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/NCR-Salaries-...,https://www.glassdoor.com/Benefits/NCR-US-Bene...
10,Compass,Enterprise Software & Network Solutions,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Compass-Salar...,https://www.glassdoor.com/Benefits/Compass-US-...
13,eTouch Systems,IT Services,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/eTouch-System...,https://www.glassdoor.com/Benefits/eTouch-Syst...
14,Adaptive Insights,Enterprise Software & Network Solutions,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Adaptive-Insi...,https://www.glassdoor.com/Benefits/Adaptive-In...
24,MobiTV,Computer Hardware & Software,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/MobiTV-Salari...,https://www.glassdoor.com/Benefits/MobiTV-US-B...
34,Check Point Software Technologies,Computer Hardware & Software,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Check-Point-S...,https://www.glassdoor.com/Benefits/Check-Point...
36,Xactly Corp.,Computer Hardware & Software,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Xactly-Corp-S...,https://www.glassdoor.com/Benefits/Xactly-Corp...
49,Hightail,Internet,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Hightail-Sala...,https://www.glassdoor.com/Benefits/Hightail-US...
50,Velocify,Computer Hardware & Software,https://www.glassdoor.com/Overview/Working-at-...,https://www.glassdoor.com/Salary/Velocify-Sala...,https://www.glassdoor.com/Benefits/Velocify-US...


In [14]:
# also check the length of the dataframe
len(tech_df_short)

1266

In [134]:
# combining some dataframes
# code that was used to combine dataframes back when I scraped data a little at a time and kept them in seperate files
# allpd = pd.concat([a_short, b_short, c_short, d_short])
# allpd

# csv_file = "allraw.csv"
# allpd.to_csv(csv_file, index=False, encoding='utf-8')