In [1]:
from datetime import datetime
from collections import OrderedDict
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
import time
import json


In [2]:
JOB_TITLES = ["Data Engineer", "Business Intelligence Engineer", "Business Developer", "BI Engineer"]
LOCATIONS = ["Mountain View, CA", "Remote"]

GOOD_KEYWORDS = [
        "SQL","POWER BI", "POWERBI", "PYTHON", "TABLEAU" "DATA ENGINEER",
        "ETL", "ELT", "BI ", "BUSINESS INTELLIGENCE", "DATA ANALYSIS", "DATA VISUALIZATION",
        "DATA ANALYTICS"
        ]

BAD_KEYWORDS = [
        "CLEARANCE", "SECURITY CLEAR", "NOT PROVIDE SPONSOR", "VISA SPONSORSHIP IS NOT"
        , "WITHOUT FUTURE SPONSOR", "CLEARABLE", "ONLY US CITIZEN", "WITHOUT REQUIRING SPONSOR"
        ,"WITHOUT SPONSOR", "NOT SPONSOR", "NOT ELIGIBLE FOR VISA", "NOT ELIGIBLE FOR SPONSOR"
        , "CITIZEN", "C++", "C#", "DISTRIBUTED SYSTEM", "DIRECTOR"
        ,"8+", "9+", "10+", "AT LEAST 8 YEARS","12+", "15 OR MORE YEARS"
        ,"INTERN ", "INTERNSHIP", "EXPECTED GRADUATION DATE", "YEARS OF CORPORATE TREASURY"
        ,"YEARS IN SUPERVISORY ROLE", "FEDERAL AGENC", "WITHOUT A NEED FOR CURRENT OR FUTURE SPONSORSHIP"
        , "NOT ABLE TO SPONSOR", "PERMANENT RESIDENT", "WITHOUT COMPANY SPONSOR", "FLUENCY IN SPANISH"
        ,"MUST BE A U.S. CITIZEN", "MUST BE A US CITIZEN", "DOES NOT SPONSOR", "NOT PROVIDE VISA"
        ,"NOT PROVIDE SPONSOR", "CONTRACT", "PHD PREFERRED", "PRINCIPAL", "STAFF ENGINEER", "STAFF DATA ENGINEER"
        ]

SPREADSHEET_NAME = "test"
SHEET_NAME = "List"

In [3]:
class IndeedScrapper:
    def __init__(self, numStars=None):
        self.driver = self.startDriver()
        self.sheet = self.getGSheet()
        self.jobList = []
        self.jobDetails = OrderedDict()
        self.runTime = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        self.numStars = numStars
        self.sheetRecords = None
        self.startTime = time.time()
        
    def run(self):
        try:
            self.login()
            for title in JOB_TITLES[:1]: #TODO: remove limit
                for location in LOCATIONS[:1]: #TODO: remove limit
                    print(f"Searching for {title} in {location}")
                    self.newSearch(title, location)
                    # self.setFilters()
                    # self.openFilters()
                    # if self.numStars:
                    #     self.filterNumStars()
                    # if location == "Remote":
                    #     self.filterToRemote()
                    self.searchJobs()

        except Exception as err:
            raise err
        finally:
            # self.addToGSheet()
            # self.driver.quit()
            print(f"Program ran for {round((time.time() - self.startTime)/60, 2)} minutes")

    def startDriver(self):
        DRIVER_PATH = "chromedriver.exe"
        options = Options()
        # options.add_argument('--headless')
        options.add_argument("--window-size=1920,1080")
        driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
        return driver
    
    def login(self):
        WEBSITE = 'https://www.indeed.com/'
        self.driver.get(WEBSITE)
        # self.sendKeyFromJson("username")
        # self.sendKeyFromJson("password")
            
        # print('Logging In...')

    def sendKeyFromJson(self, field):
        idName = "inlineUserEmail" if field == "username" else "inlineUserPassword"
        with open("glassdoor_login.json") as jsonFile:
            element = self.waitFind(By.ID, idName)
            element.send_keys(json.load(jsonFile).get(field))
            element.send_keys(Keys.RETURN)

    def waitFind(self, findBy, findText, waitForType="presence", waitTime=10, driver=None):
        if driver is None:
            driver=self.driver
            
        if waitForType == "presence":
            element = WebDriverWait(driver, waitTime).until(
                EC.presence_of_element_located((findBy, findText))
            )
        elif waitForType == "clickable":
            element = WebDriverWait(driver, waitTime).until(
                EC.element_to_be_clickable((findBy, findText))
            )
        elif waitForType == "visibility":
                element = WebDriverWait(driver, waitTime).until(
                EC.visibility_of_any_elements_located((findBy, findText))
            )
        else:
            element = None

        return element

    def openFilters(self):
        filtersButton = self.waitFind(By.CSS_SELECTOR, "div[data-test='more-filter']", waitForType="clickable")
        filtersButton.click()

    def filterToRemote(self):
        print("Filtering to Remote Jobs")
        wfhToggle = self.waitFind(By.CLASS_NAME, "css-163gce3", waitForType="visibility")[1]
        wfhToggle.click()
        time.sleep(1)
        
    def filterNumStars(self):
        if self.numStars >= 1 and self.numStars <= 5:
            print(f"Filtering to {self.numStars} Stars")
            starIdx = self.numStars - 1
            numStarsButton = self.waitFind(By.CLASS_NAME, "gd-ui-star", waitForType="visibility")[starIdx]
            time.sleep(0.5)
            numStarsButton.click()
        else:
            print("Invalid num stars")
            raise Exception

    def newSearch(self, searchKeyword, searchLocation):
        searchBar = self.waitFind(By.ID, "text-input-what")
        self.clearText(searchBar)
        searchBar.send_keys(searchKeyword)
        
        location = self.waitFind(By.ID, "text-input-where")
        self.clearText(location)
        location.send_keys(searchLocation)
        location.send_keys(Keys.ENTER)

        # seeAllJobs = self.waitFind(By.CSS_SELECTOR, "a[data-test='jobs-location-see-all-link']", waitForType="visibility")
        # seeAllJobs[0].click()

    def clearText(self, element):
        time.sleep(0.25)
        element.send_keys(Keys.LEFT_CONTROL, "a")
        time.sleep(0.25)
        element.send_keys(Keys.BACKSPACE)

    def nextPage(self):
        print("getting next page...")
        button = self.waitFind(By.CLASS_NAME, "nextButton", "clickable")
        button.click()
        time.sleep(1)

    def searchJobs(self):
        self.getGsheetRecords()
        while True:
            time.sleep(3)
            jobs = self.waitFind(By.CLASS_NAME, "slider_item", waitForType="visibility")
            for job in jobs:
                try:
                    time.sleep(1)
                    job.click()
                    
                    self.getJobDetails(job)
                    # if self.isValidJob(): #TODO: fix this
                    self.addJobToList()
                        
                except Exception as err:
                    print("Failed to add job, moving on...")
                    raise err
                    continue
            # break
            if self.isLastPage():
                print("Finished Job Search!")
                break
            self.getNextPageButton().click()
            
    def isLastPage(self):
        try:
            self.getNextPageButton()
            return False
        except:
            return True

    def getNextPageButton(self):
        return self.waitFind(By.CSS_SELECTOR, "a[data-testid='pagination-page-next'", waitForType="clickable")

    def isValidJob(self):
        if self.jobDescrContains(GOOD_KEYWORDS) \
            and not self.jobDescrContains(BAD_KEYWORDS) \
            and not self.isAlreadyAdded():
                return True
        return False

    def isAlreadyAdded(self):
        sheetJobs = [field["employer"]+field["jobTitle"] for field in self.sheetRecords]
        currentJobs = [field[0]+field[2] for field in self.jobList]

        jobKey = self.jobDetails["employer"]+self.jobDetails["jobTitle"]
        if jobKey in sheetJobs or jobKey in currentJobs:
            print("Job is already added...")
            return True

        return False
        
    def jobDescrContains(self, wordList):
        return any(word in self.jobDetails["jobDescr"] for word in wordList)
        
    def addJobToList(self):
        self.jobList.append(list(self.jobDetails.values())[1:]) # Add all fields except jobDescr
        print('+ Added', self.jobDetails['jobTitle'],',' , self.jobDetails['employer'], ',', self.jobDetails['location'])
            
    def getJobDetails(self, job):
        # Order matters here for column ingest
        self.jobDetails["jobDescr"] = self.waitFind(By.ID, "jobDescriptionText").text.upper()
        self.jobDetails["employer"] = self.waitFind(By.CLASS_NAME, "companyName", driver=job).text
        self.jobDetails["rating"] = self.getRating(job)
        self.jobDetails["jobTitle"] = self.waitFind(By.CLASS_NAME, "jobTitle", driver=job).text
        self.jobDetails["location"] = self.waitFind(By.CLASS_NAME, "companyLocation", driver=job).text
        self.jobDetails["jobSize"] = ""
        self.jobDetails["jobIndustry"] = ""
        self.jobDetails["url"] = self.waitFind(By.CLASS_NAME, "jcs-JobTitle", driver=job).get_attribute("href")
        self.jobDetails["dayAdded"] = self.runTime

    def getRating(self, job):
        try:
            return self.waitFind(By.CLASS_NAME, "ratingNumber", waitTime=0.25, driver=job).text
        except:
            return ""


    def getGSheet(self, name=SPREADSHEET_NAME, worksheet=SHEET_NAME):
        scope = ["https://spreadsheets.google.com/feeds"
                ,"https://www.googleapis.com/auth/drive"]

        creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
        client = gspread.authorize(creds)
        gsheet = client.open(name).worksheet(worksheet)
        return gsheet

    def addToGSheet(self):
        self.sheet.append_rows(self.jobList)
        print(f"Gsheet Updated with {len(self.jobList)} new jobs!")

    def getGsheetRecords(self):
        self.sheetRecords = self.sheet.get_all_records()


In [4]:
scrap = IndeedScrapper(numStars=3)
scrap.run()

  driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)


Searching for Data Engineer in Mountain View, CA
+ Added Data Engineer , Mitra Chem , Mountain View, CA
+ Added Sr. Data Engineer , Supernal , Fremont, CA 94537
+ Added Data Engineer , FutureSoft IT , Sunnyvale, CA 94043
+ Added Data Engineer - USDS , TikTok , Mountain View, CA
+ Added Data Engineer , Tesla , Fremont, CA
+ Added Data Pipeline Engineer , Metis Technology Solutions Inc , Moffett Field, CA 94035
+ Added Senior Data Engineer , Humu , Mountain View, CA
+ Added Data Engineer, Product Analytics , Meta , Fremont, CA 94555 
(Northgate area)
+ Added Data engineer , Stacklogy , Fremont, CA
+ Added Senior Data Engineer , Arkose Labs , Remote in San Mateo, CA
Failed to add job, moving on...
Program ran for 0.44 minutes


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=114.0.5735.199)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x002BA813+48355]
	(No symbol) [0x0024C4B1]
	(No symbol) [0x00155358]
	(No symbol) [0x0013D293]
	(No symbol) [0x0019E37B]
	(No symbol) [0x001AC473]
	(No symbol) [0x0019A536]
	(No symbol) [0x001782DC]
	(No symbol) [0x001793DD]
	GetHandleVerifier [0x0051AABD+2539405]
	GetHandleVerifier [0x0055A78F+2800735]
	GetHandleVerifier [0x0055456C+2775612]
	GetHandleVerifier [0x003451E0+616112]
	(No symbol) [0x00255F8C]
	(No symbol) [0x00252328]
	(No symbol) [0x0025240B]
	(No symbol) [0x00244FF7]
	BaseThreadInitThunk [0x76B87D59+25]
	RtlInitializeExceptionChain [0x77A4B74B+107]
	RtlClearBits [0x77A4B6CF+191]
	(No symbol) [0x00000000]


In [5]:
scrap.driver.quit()