In [1]:
import json
import time
from collections import OrderedDict
from datetime import datetime

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [2]:
# --- Search configuration -------------------------------------------------
# Every (title, location) pair below is searched once per run.
JOB_TITLES = ["Data Engineer", "Business Intelligence Engineer", "Business Intelligence Developer", "BI Engineer"]
LOCATIONS = ["Mountain View, CA", "Remote"]

# --- Keyword filters ------------------------------------------------------
# All keywords are matched as upper-case substrings, so surrounding spaces
# (e.g. " QA ", "BI ") are significant.

# Keywords a desirable job description should mention.
# Fix: "TABLEAU" and "DATA ENGINEER" were previously separated only by
# whitespace, which Python silently concatenated into "TABLEAUDATA ENGINEER".
GOOD_KEYWORDS = [
    "SQL", "POWER BI", "POWERBI", "PYTHON", "TABLEAU", "DATA ENGINEER",
    "ETL", "ELT", "BI ", "BUSINESS INTELLIGENCE", "DATA ANALYSIS", "DATA VISUALIZATION",
    "DATA ANALYTICS",
]

# A job is rejected when its title contains any of these.
TITLE_BAD_KEYWORDS = [
    "SECURITY", " QA ", "JAVA", " OS ", "SYSTEMS DESIGN", "SYSTEM DESIGN", "NOSQL",
    "CPU", "PRINCIPAL", "GIS", "NODE", "REACT", "ANDROID", "TECH LEAD", "COGNOS",
]

# A job is rejected when its employer name contains any of these.
COMPANY_BAD_KEYWORDS = [
    "DIVERSE LYNX",
]

# A job is rejected when its description contains any of these.
# Exact duplicates ("CONTRACT", "NOT PROVIDE SPONSOR") were removed; matching
# behaviour is unchanged because a single occurrence is enough to match.
BAD_KEYWORDS = [
    "CLEARANCE", "SECURITY CLEAR", "NOT PROVIDE SPONSOR", "VISA SPONSORSHIP IS NOT",
    "WITHOUT FUTURE SPONSOR", "CLEARABLE", "ONLY US CITIZEN", "WITHOUT REQUIRING SPONSOR",
    "WITHOUT SPONSOR", "NOT SPONSOR", "NOT ELIGIBLE FOR VISA", "NOT ELIGIBLE FOR SPONSOR",
    "CITIZEN", "C++", "C#", "DISTRIBUTED SYSTEM", "DIRECTOR", "DOES NOT PROVIDE ANY TYPE OF SPONSORSHIP",
    "8+", "9+", "10+", "AT LEAST 8 YEARS", "12+", "15 OR MORE YEARS", "5+ YEARS OF PYTHON",
    "INTERN ", "INTERNSHIP", "EXPECTED GRADUATION DATE", "YEARS OF CORPORATE TREASURY", "CONTRACT",
    "YEARS IN SUPERVISORY ROLE", "FEDERAL AGENC", "WITHOUT A NEED FOR CURRENT OR FUTURE SPONSORSHIP",
    "NOT ABLE TO SPONSOR", "PERMANENT RESIDENT", "WITHOUT COMPANY SPONSOR", "FLUENCY IN SPANISH",
    "MUST BE A U.S. CITIZEN", "MUST BE A US CITIZEN", "DOES NOT SPONSOR", "NOT PROVIDE VISA",
    "PHD PREFERRED", "PRINCIPAL", "STAFF ENGINEER", "STAFF DATA ENGINEER",
    "WORKING SHIFTS", "WORK SHIFT", "ADVANCED PROFICIENCY WITH PYTHON", "WEB APPLICATION", "SPRING BOOT",
    "ANSIBLE", "TERRAFORM", "KINESIS", "JMP",
]

# --- Google Sheets target -------------------------------------------------
SPREADSHEET_NAME = "test"
SHEET_NAME = "List"

In [6]:
class GlassDoorScrapper:
    """Scrape Glassdoor job listings with Selenium and append matches to a Google Sheet.

    For every combination of ``JOB_TITLES`` and ``LOCATIONS`` the scraper runs a
    search, applies the configured filters, walks the result pages, and keeps
    the jobs that pass the keyword / industry / duplicate checks.  Accepted jobs
    are buffered in ``self.jobList`` and written to the worksheet when ``run``
    finishes (successfully or not).

    Args:
        numStars: Minimum company rating to filter by (1-5), or ``None`` to
            skip the rating filter.
        onlyRecentJobs: When true, apply the date-posted filter
            (see ``filterToRecent``).
    """

    def __init__(self, numStars=None, onlyRecentJobs=False):
        self.driver = self.startDriver()
        try:
            self.sheet = self.getGSheet()
        except Exception:
            # Do not leave an orphaned browser behind when the sheet cannot be opened.
            self.driver.quit()
            raise
        self.jobList = []                    # rows waiting to be appended to the sheet
        self.jobDetails = OrderedDict()      # details of the job currently being inspected
        self.runTime = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        self.numStars = numStars
        self.onlyRecentJobs = onlyRecentJobs
        self.sheetRecords = None             # populated by getGsheetRecords()
        self.startTime = time.time()

    def run(self):
        """Log in, run every title/location search, then flush results to the sheet.

        Any exception raised while scraping propagates to the caller, but the
        jobs collected so far are still written to the sheet first.  The browser
        is left open on purpose; the caller is responsible for calling
        ``self.driver.quit()``.
        """
        try:
            self.login()
            for title in JOB_TITLES:
                for location in LOCATIONS:
                    print(f"Searching for {title} in {location}")
                    self.newSearch(title, location)
                    self.closePopup()
                    self.setFilters(location)
                    self.searchJobs()
        finally:
            self.addToGSheet()
            print(f"Program ran for {round((time.time() - self.startTime)/60, 2)} minutes")

    def startDriver(self):
        """Start a Chrome WebDriver with a fixed 1920x1080 window and return it."""
        DRIVER_PATH = "chromedriver.exe"
        options = Options()
        # options.add_argument('--headless')  # enable to run without a visible window
        options.add_argument("--window-size=1920,1080")
        # Selenium 4 deprecates the `executable_path` keyword on webdriver.Chrome;
        # the driver location is passed through a Service object instead.
        return webdriver.Chrome(service=Service(executable_path=DRIVER_PATH), options=options)

    def login(self):
        """Open the Glassdoor member page and submit the username, then the password."""
        WEBSITE = 'https://www.glassdoor.com/member/home/index.htm'
        self.driver.get(WEBSITE)
        self.sendKeyFromJson("username")
        self.sendKeyFromJson("password")

        print('Logging In...')

    def sendKeyFromJson(self, field):
        """Type the value stored under ``field`` in glassdoor_login.json and press RETURN.

        Args:
            field: ``"username"`` targets the e-mail input; any other value
                targets the password input.

        Raises:
            KeyError: If ``field`` is missing from the JSON file.
        """
        idName = "inlineUserEmail" if field == "username" else "inlineUserPassword"
        # Read the file up front so it is not held open during the browser wait.
        with open("glassdoor_login.json") as jsonFile:
            credentials = json.load(jsonFile)

        value = credentials.get(field)
        if value is None:
            raise KeyError(f"'{field}' is missing from glassdoor_login.json")

        element = self.waitFind(By.ID, idName)
        element.send_keys(value)
        element.send_keys(Keys.RETURN)

    def waitFind(self, findBy, findText, waitForType="presence", waitTime=10):
        """Wait for an element (or elements) matching the locator and return the result.

        Args:
            findBy: Selenium ``By`` strategy.
            findText: Locator value for that strategy.
            waitForType: ``"presence"`` or ``"clickable"`` return a single
                element; ``"visibility"`` returns a list of visible elements.
            waitTime: Maximum seconds to wait.

        Returns:
            The element / list described above, or ``None`` when
            ``waitForType`` is not one of the supported values.
        """
        conditions = {
            "presence": EC.presence_of_element_located,
            "clickable": EC.element_to_be_clickable,
            "visibility": EC.visibility_of_any_elements_located,
        }
        condition = conditions.get(waitForType)
        if condition is None:
            return None
        return WebDriverWait(self.driver, waitTime).until(condition((findBy, findText)))

    def setFilters(self, location):
        """Open the filter panel and apply the rating, remote and date filters."""
        filtersButton = self.waitFind(By.CSS_SELECTOR, "div[data-test='more-filter']", waitForType="clickable")
        filtersButton.click()
        if self.numStars:
            self.filterNumStars()
        if location == "Remote":
            self.filterToRemote()

        # NOTE(review): the date dropdown is clicked even when onlyRecentJobs is
        # False -- confirm whether that click is needed to close/apply the panel.
        recentJobsBtn = self.waitFind(By.ID, "filter_fromAge", waitForType="clickable")
        recentJobsBtn.click()
        if self.onlyRecentJobs:
            self.filterToRecent()

    def filterToRecent(self):
        """Select the fifth entry of the date-posted dropdown, retrying up to 4 times.

        Raises:
            RuntimeError: If the option cannot be located in any attempt
                (previously this surfaced as an UnboundLocalError).
        """
        print("Filtering to Last Week of Jobs")
        lastError = None
        for _ in range(4):
            try:
                recentJobs = self.waitFind(By.CLASS_NAME, "css-x9a2o", waitForType="visibility")[4]
                break
            except Exception as err:
                lastError = err
        else:
            raise RuntimeError("Could not find the recent-jobs filter option after 4 attempts") from lastError
        recentJobs.click()
        time.sleep(1)

    def filterToRemote(self):
        """Click the second toggle in the filter panel to restrict results to remote jobs."""
        print("Filtering to Remote Jobs")
        wfhToggle = self.waitFind(By.CLASS_NAME, "css-163gce3", waitForType="visibility")[1]
        wfhToggle.click()
        time.sleep(1)

    def filterNumStars(self):
        """Click the star matching ``self.numStars`` in the rating filter.

        Raises:
            ValueError: If ``self.numStars`` is outside 1..5.
        """
        if not 1 <= self.numStars <= 5:
            print("Invalid num stars")
            raise ValueError(f"numStars must be between 1 and 5, got {self.numStars!r}")

        print(f"Filtering to {self.numStars} Stars")
        starIdx = self.numStars - 1
        numStarsButton = self.waitFind(By.CLASS_NAME, "gd-ui-star", waitForType="visibility")[starIdx]
        time.sleep(0.5)
        numStarsButton.click()

    def newSearch(self, searchKeyword, searchLocation):
        """Fill in the keyword and location search boxes and submit the search."""
        searchBar = self.waitFind(By.ID, "sc.keyword")
        self.clearText(searchBar)
        searchBar.send_keys(searchKeyword)

        location = self.waitFind(By.ID, "sc.location")
        self.clearText(location)
        location.send_keys(searchLocation)
        location.send_keys(Keys.ENTER)

        # Best effort: the "see all jobs" link is not always shown.
        try:
            seeAllJobs = self.waitFind(By.CSS_SELECTOR, "a[data-test='jobs-location-see-all-link']", waitForType="visibility")
            seeAllJobs[0].click()
        except Exception:
            return

    def clearText(self, element):
        """Clear an input by selecting all of its text and pressing backspace."""
        time.sleep(0.25)
        element.send_keys(Keys.LEFT_CONTROL, "a")
        time.sleep(0.25)
        element.send_keys(Keys.BACKSPACE)

    def nextPage(self):
        """Click the pagination "next" button and pause briefly for the page to load."""
        print("getting next page...")
        button = self.waitFind(By.CLASS_NAME, "nextButton", "clickable")
        button.click()
        time.sleep(1)

    def closePopup(self):
        """Wait two seconds, then send ESCAPE to dismiss any popup."""
        time.sleep(2)
        ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()

    def searchJobs(self):
        """Walk every result page, collecting the jobs that pass ``isValidJob``."""
        self.getGsheetRecords()
        while True:
            time.sleep(2)
            try:
                jobs = self.waitFind(By.CLASS_NAME, "react-job-listing", waitForType="visibility")
            except Exception:
                # No listings became visible: treat as the end of the results.
                break
            for job in jobs:
                try:
                    time.sleep(0.25)
                    job.click()

                    self.clickShowMore()
                    url = job.find_element(By.CSS_SELECTOR, "a[data-test='job-link']").get_attribute("href")
                    self.getJobDetails(url)

                    if self.isValidJob():
                        self.addJobToList()
                except Exception as err:
                    # Keep going, but say what went wrong instead of hiding it.
                    print(f"Failed to add job, moving on... ({type(err).__name__})")
                    continue

            if self.isLastPage():
                print("Finished Job Search!")
                break
            self.nextPage()

    def clickShowMore(self):
        """Expand the job description if a "show more" control appears within 3 seconds."""
        try:
            self.waitFind(By.CLASS_NAME, "css-t3xrds", "clickable", waitTime=3).click()
        except Exception:
            return

    def isLastPage(self):
        """Return True when the pagination footer reads "Page N of N"."""
        pageText = self.waitFind(By.CLASS_NAME, "paginationFooter").text.split(" ")
        # Example Text: Page 8 of 9
        return pageText[1] == pageText[-1]

    def isValidJob(self):
        """Return True when the current job passes every keyword, industry and duplicate check."""
        # NOTE: a GOOD_KEYWORDS check on the description is currently disabled.
        return (
            not self.jobDetailContains("jobDescr", BAD_KEYWORDS)
            and not self.isAlreadyAdded()
            and self.isValidIndustry()
            and not self.jobDetailContains("jobTitle", TITLE_BAD_KEYWORDS)
            and not self.jobDetailContains("employer", COMPANY_BAD_KEYWORDS)
        )

    def isValidIndustry(self):
        """Return False for staffing / HR-consulting employers, True otherwise."""
        return self.jobDetails["jobIndustry"] not in ["HR Consulting", "Staffing & Subcontracting"]

    def isAlreadyAdded(self):
        """Return True when the current employer+title is already in the sheet or in this run's list."""
        # str() on both parts: the sheet may hand back numeric cells as numbers.
        jobKey = str(self.jobDetails["employer"]) + str(self.jobDetails["jobTitle"])

        sheetJobs = {
            str(record["employer"]) + str(record["jobTitle"])
            for record in (self.sheetRecords or [])
            if record["employer"] != ""
        }
        # jobList rows omit jobDescr, so index 0 is employer and index 2 is jobTitle.
        currentJobs = {str(row[0]) + str(row[2]) for row in self.jobList}

        if jobKey in sheetJobs or jobKey in currentJobs:
            print("Job is already added...")
            return True

        return False

    def jobDetailContains(self, jobDetail, wordList):
        """Return True when the upper-cased ``jobDetail`` field contains any word in ``wordList``."""
        haystack = self.jobDetails[jobDetail].upper()
        return any(word in haystack for word in wordList)

    def addJobToList(self):
        """Buffer the current job (every field except the description) for the sheet."""
        self.jobList.append(list(self.jobDetails.values())[1:]) # Add all fields except jobDescr
        print('+ Added', self.jobDetails['jobTitle'],',' , self.jobDetails['employer'], ',', self.jobDetails['location'])

    def getJobDetails(self, url):
        """Read the currently opened job panel into ``self.jobDetails``.

        The record is built in a fresh dict and swapped in at the end so a
        failure part-way through cannot leave a mix of old and new fields.

        Args:
            url: Link of the job listing, stored alongside the scraped fields.
        """
        # Emp Info
        empInfoDict = self.getEmpInfo()

        # Rating and Employer Name
        empAndRating = self.getEmpNameAndRating()

        # Order matters here for column ingest
        details = OrderedDict()
        details["jobDescr"] = self.waitFind(By.CLASS_NAME, "jobDescriptionContent").text.upper()
        details["employer"] = empAndRating[0]
        details["rating"] = empAndRating[1] if len(empAndRating) > 1 else ""
        details["jobTitle"] = self.waitFind(By.CSS_SELECTOR, "div[data-test='jobTitle']").text
        details["location"] = self.waitFind(By.CSS_SELECTOR, "div[data-test='location']").text
        details["jobSize"] = empInfoDict.get("Size", "")
        details["jobIndustry"] = empInfoDict.get("Industry", "")
        details["url"] = url
        details["dayAdded"] = self.runTime
        self.jobDetails = details

    def getEmpInfo(self):
        """Return the employer info panel as a ``{label: value}`` dict, or ``{}`` if unavailable."""
        try:
            empInfo = self.waitFind(By.ID, "EmpBasicInfo", waitTime=5).text.split("\n")
            # The first line is skipped; the rest are read as label/value pairs.
            return {empInfo[i]: empInfo[i+1] for i in range(1, len(empInfo)-1, 2)}
        except Exception:
            return {}

    def getEmpNameAndRating(self):
        """Return the employer-name element's text split by line, or ``[""]`` on failure."""
        try:
            return self.waitFind(By.CSS_SELECTOR, "div[data-test='employerName']").text.split("\n")
        except Exception:
            print("Failed to get Employer name")
            return [""]

    def getGSheet(self, name=SPREADSHEET_NAME, worksheet=SHEET_NAME):
        """Authorise with the service account in credentials.json and open the worksheet.

        Args:
            name: Spreadsheet title.
            worksheet: Worksheet (tab) title inside that spreadsheet.

        Returns:
            The gspread worksheet object.
        """
        scope = ["https://spreadsheets.google.com/feeds"
                ,"https://www.googleapis.com/auth/drive"]

        creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
        client = gspread.authorize(creds)
        return client.open(name).worksheet(worksheet)

    def addToGSheet(self):
        """Append the buffered jobs to the worksheet and report how many were added."""
        # Skip the API round-trip when nothing was collected.
        if self.jobList:
            self.sheet.append_rows(self.jobList)
        print(f"Gsheet Updated with {len(self.jobList)} new jobs!")

    def getGsheetRecords(self):
        """Refresh ``self.sheetRecords`` with every record currently in the worksheet."""
        self.sheetRecords = self.sheet.get_all_records()


In [7]:
# Build the scraper (this launches Chrome and opens the Google Sheet) with a
# 3-star rating filter and the recent-jobs filter enabled, then run every
# configured title/location search. run() leaves the browser open afterwards.
scrap = GlassDoorScrapper(numStars=3, onlyRecentJobs=True)
scrap.run()

  driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)


Logging In...
Searching for Data Engineer in Mountain View, CA
Filtering to 3 Stars
Filtering to Last Week of Jobs
+ Added Data Engineer - USDS , TikTok , Mountain View, CA
+ Added Data Center Operations Engineer , Juniper Networks , Sunnyvale, CA
+ Added Senior Data Engineer , Zoom Video Communications, Inc. , San Jose, CA
+ Added Senior Software Engineer, TikTok Protected Data Infrastructure , TikTok , San Jose, CA
+ Added Systems Engineer - Modeling and Data Analysis , NEXTracker , Fremont, CA
+ Added Senior Cloud Support Engineer - Data Platform , Snowflake , Dublin, CA
+ Added Big Data Platform Engineer , Apple , Cupertino, CA
+ Added Data Pipelines Software Engineer, Battery Automation Software , Tesla , Fremont, CA
+ Added Senior CV deep learning engineer (Synthetic data) , Apple , Cupertino, CA
+ Added AIML - Sr ML Engineer, Data and ML Innovation , Apple , Cupertino, CA
+ Added Systems Engineer - Modeling and Data Analysis , Flex , Fremont, CA
+ Added Senior Data Engineer , Ta

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x002BA813+48355]
	(No symbol) [0x0024C4B1]
	(No symbol) [0x00155358]
	(No symbol) [0x0013D293]
	(No symbol) [0x0019E37B]
	(No symbol) [0x001AC473]
	(No symbol) [0x0019A536]
	(No symbol) [0x001782DC]
	(No symbol) [0x001793DD]
	GetHandleVerifier [0x0051AABD+2539405]
	GetHandleVerifier [0x0055A78F+2800735]
	GetHandleVerifier [0x0055456C+2775612]
	GetHandleVerifier [0x003451E0+616112]
	(No symbol) [0x00255F8C]
	(No symbol) [0x00252328]
	(No symbol) [0x0025240B]
	(No symbol) [0x00244FF7]
	BaseThreadInitThunk [0x76B87D59+25]
	RtlInitializeExceptionChain [0x77A4B74B+107]
	RtlClearBits [0x77A4B6CF+191]
	(No symbol) [0x00000000]


In [6]:
# run() does not quit the driver itself, so close the browser session here.
scrap.driver.quit()