In [1]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException

In [8]:
# Helpers and entry point for scraping Glassdoor job postings
def _text_or_default(driver, xpath, default=''):
    """Return the text of the first element matching `xpath`, or `default` if no element matches."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default


def _scrape_company_overview(driver):
    """Open the company overview tab and return its size/industry/sector/revenue values.

    Returns a dict with the keys 'size', 'industry', 'sector' and 'revenue'.
    A value is '' when its label or value element is not present. If the
    overview tab itself (or any row label) cannot be found, all four values
    are ''.
    """
    overview = {'size': '', 'industry': '', 'sector': '', 'revenue': ''}
    label_to_key = (('Size', 'size'),
                    ('Industry', 'industry'),
                    ('Sector', 'sector'),
                    ('Revenue', 'revenue'))

    try:
        driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()
        time.sleep(.3)

        # Collect the label of every info row so each value can be looked up by position
        row_count = len(driver.find_elements_by_xpath('//div[@class="info row"]/div'))
        labels = [
            driver.find_element_by_xpath('//div[@class="info row"]/div[{}]/label'.format(i)).text
            for i in range(1, row_count + 1)
        ]
    except NoSuchElementException:
        return overview

    for label, key in label_to_key:
        if label in labels:
            # XPath positions are 1-based, list indices are 0-based
            position = labels.index(label) + 1
            overview[key] = _text_or_default(
                driver,
                '//div[@class="info row"]/div[{}]/span[@class="value"]'.format(position))

    return overview


def scraping_jobs(keyword, num_pages, driver_path=r"C:\Users\hannah\chromedriver.exe"):
    """Scrape Glassdoor job postings for `keyword` in every U.S. state.

    Parameters
    ----------
    keyword : str
        Text typed into the job keyword search box.
    num_pages : int
        Maximum number of result pages to scrape per state. Scraping of a
        state stops earlier if no "next" link is found.
    driver_path : str, optional
        Path to the chromedriver executable. Defaults to the path that was
        previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        One row per scraped posting with the columns company, title,
        location, salary, rating, size, industry, sector, revenue and
        job_description. Missing text fields are '', a missing rating is NaN.

    Notes
    -----
    The browser is always closed when the function exits, including when an
    exception propagates. Exceptions that are not handled below (for example
    a missing search box) are still raised to the caller.
    """
    # Create a list to contain the scraped job posting info
    jobs = []

    # scrape job postings nationwide
    state_names = ["Alaska", "Alabama", "Arkansas", "Arizona", "California",
                   "Colorado", "Connecticut", "Washington, DC", "Delaware", "Florida",
                   "Georgia", "Hawaii", "Iowa", "Idaho", "Illinois",
                   "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts",
                   "Maryland", "Maine", "Michigan", "Minnesota", "Missouri",
                   "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska",
                   "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York",
                   "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island",
                   "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah",
                   "Virginia", "Vermont", "Washington State", "Wisconsin", "West Virginia",
                   "Wyoming"]

    # Open Chrome using webdriver
    driver = webdriver.Chrome(driver_path)

    try:
        # Go to the job search page in the Glassdoor
        driver.get("https://www.glassdoor.com/Job/index.htm")

        # Enter the job keyword in the search box
        search_job = driver.find_element_by_xpath('//input[@class="keyword"]')
        search_job.send_keys(keyword)

        # `state` is kept separate from the scraped job location so the loop
        # variable is never overwritten inside the posting loop.
        for state in state_names:

            # Enter the state name in the location box
            search_location = driver.find_element_by_xpath('//input[@class="loc"]')
            search_location.clear()
            search_location.send_keys(state)

            # Click the search button
            driver.find_element_by_xpath('//button[@id="HeroSearchButton"]').click()

            # Scrape at most `num_pages` result pages for this state
            for _ in range(num_pages):

                # wait until the job postings are completely opened in the screen (4 seconds)
                time.sleep(4)

                # First click the selected job posting
                driver.find_element_by_class_name("selected").click()
                time.sleep(1)

                # Close the pop-up window that asks you to log in to the site
                try:
                    driver.find_element_by_xpath("//*[@id='JAModal']/div/div[2]/span").click()
                except NoSuchElementException:
                    pass  # no pop-up shown, nothing to close

                # Find the job posting elements on this page
                job_buttons = driver.find_elements_by_class_name("jl")

                # loop for each job posting
                for job_button in job_buttons:

                    # Click the job posting
                    try:
                        job_button.click()
                    except (ElementNotInteractableException,
                            StaleElementReferenceException,
                            ElementClickInterceptedException):
                        # The click did not go through, so the detail pane was not
                        # switched to this posting; skip it rather than record
                        # whatever posting is presumably still displayed.
                        continue

                    # Wait for the relevant job posting to be loaded in the screen
                    time.sleep(1)

                    # Name of the company: first line of the employer block
                    company = _text_or_default(
                        driver, './/div[@class="employerName"]').split('\n')[0]

                    # Job title
                    title = _text_or_default(driver, './/div[contains(@class, "title")]')

                    # Location of the job
                    job_location = _text_or_default(driver, './/div[@class="location"]')

                    # Salary: first whitespace-separated token, '' when the text is empty
                    salary_tokens = _text_or_default(
                        driver, './/div[@class="salary"]/span').split()
                    salary = salary_tokens[0] if salary_tokens else ''

                    # Rating of the company
                    rating = _text_or_default(driver, './/span[@class="rating"]', default=np.nan)

                    # Job description
                    job_description = _text_or_default(
                        driver, './/div[@class="jobDescriptionContent desc"]')

                    # Company info page (size, industry, sector, revenue)
                    overview = _scrape_company_overview(driver)

                    # Put the scraped info from the job posting into the list named 'jobs'
                    jobs.append({'company': company,
                                 'title': title,
                                 'location': job_location,
                                 'salary': salary,
                                 'rating': rating,
                                 'size': overview['size'],
                                 'industry': overview['industry'],
                                 'sector': overview['sector'],
                                 'revenue': overview['revenue'],
                                 'job_description': job_description})

                # Go to the next page; stop paging for this state when there is none
                try:
                    driver.find_element_by_xpath('.//li[@class="next"]//a').click()
                except NoSuchElementException:
                    break
    finally:
        # Always release the browser and the chromedriver process
        driver.quit()

    return pd.DataFrame(jobs)

In [10]:
# Scrape up to 3 result pages per state for the keyword 'data analyst'
df = scraping_jobs(keyword='data analyst', num_pages=3)

In [12]:
# Preview the first 50 scraped postings
df.head(n=50)

Unnamed: 0,company,title,location,salary,rating,size,industry,sector,revenue,job_description
0,CTG,DATA CENTER SERVICES ANALYST I,"Anchorage, AK",,3.4,1001 to 5000 Employees,IT Services,Information Technology,$100 to $500 million (USD),CTG is searching for a full time *Data Center ...
1,Manmade Creative Inc,DATA ANALYST and HEALTH,"Anchorage, AK",,,,,,,"Familiarity in analysis reporting, data entry,..."
2,DS Technologies Inc,Data Analyst,Alaska,,5.0,1 to 50 Employees,,,$1 to $5 million (USD),"Hi,Greetings from DSTechnologiesinc!\n\nIam re..."
3,Integrated Statistics,Data Analyst/Modeler,"Anchorage, AK",$48K-$86K,5.0,51 to 200 Employees,"Health, Beauty, & Fitness",Consumer Services,$5 to $10 million (USD),Data Analyst/Modeler Needed\n\nIntegrated Stat...
4,Southcentral Foundation,Data Analyst,"Anchorage, AK",$39K-$68K,3.5,1001 to 5000 Employees,Membership Organizations,Business Services,$100 to $500 million (USD),The Southcentral Foundation (SCF) Data Analyst...
5,National Renewable Energy Lab,Cold Climate Data Researcher / Analyst II (Ala...,"Fairbanks, AK",$67K-$148K,3.9,1001 to 5000 Employees,Energy,"Oil, Gas, Energy & Utilities",Unknown / Non-Applicable,Posting TitleCold Climate Data Researcher / An...
6,Alaska USA Federal Credit Union,"Data Center Services Analyst I, II, III, or Se...","Anchorage, AK",$26K-$48K,3.4,1001 to 5000 Employees,Banks & Credit Unions,Finance,$100 to $500 million (USD),Reports to: Varies by location\n\nFunctions Su...
7,General Communication Inc.,"Analyst, Network Statistical Data","Anchorage, AK",$43K-$101K,3.7,1001 to 5000 Employees,"Cable, Internet & Telephone Providers",Telecommunications,$500 million to $1 billion (USD),GCI's Network Statistical Data Analyst will pr...
8,Alaska USA Federal Credit Union,Data Center Services Analyst II,"Anchorage, AK",$36K-$69K,3.4,1001 to 5000 Employees,Banks & Credit Unions,Finance,$100 to $500 million (USD),Reports to: Varies by location\n\nFunctions Su...
9,Austal USA,HCM Data Analyst,"Mobile, AL",$55K-$94K,3.1,1001 to 5000 Employees,Industrial Manufacturing,Manufacturing,$1 to $2 billion (USD),Any qualified individual with a disability who...


In [15]:
# Persist the scraped postings to a CSV file, leaving out the row index
df.to_csv(path_or_buf=r'...\glassdoor_data_analyst.csv', index=False)