## Glassdoor Job Scraper


In [18]:
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager  
import time
import csv
import pandas as pd

In [19]:
# The webdriver initialization
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1120, 1000)
url = "https://www.glassdoor.com/Job/germany-data-scientist-jobs-SRCH_IL.0,7_IN96_KO8,22.htm"
driver.get(url)

In [20]:
def close_popup_window():
    # Close popup glassdoor window
    try:
        driver.find_element(By.XPATH,'.//div[@class="d-flex flex-column pl-sm css-3g3psg css-1of6cnp e1rrn5ka4"]').click()
        driver.find_element(By.XPATH, './/span[@alt="Close"]').click()
    except NoSuchElementException:
        pass   

In [21]:
def scrape_job_card(job_card):
    # Extract from single card job related data
    try:
        company_name = driver.find_element(By.XPATH,'.//div[@data-test="employerName"]').text
    except NoSuchElementException:
        company_name = "NI"
    try:
        position = driver.find_element(By.XPATH,'.//div[@data-test="jobTitle"]').text
    except NoSuchElementException:
        position = "NI"       
    try:
        location = driver.find_element(By.XPATH, './/div[@data-test="location"]').text
    except NoSuchElementException:
        location = "NI"
    try:
        salary = job_card.find_element(By.XPATH,'.//span[@data-test="detailSalary"]').text
    except NoSuchElementException:
        salary = "NI"    
    try:
        rating = driver.find_element(By.XPATH, './/span[@data-test="detailRating"]').text
    except NoSuchElementException:
        rating = "NI"
    try:
        description = driver.find_element(By.XPATH, './/div[@class="jobDescriptionContent desc"]').text
    except NoSuchElementException:
        description = "NI"
    try:
        company_overview = driver.find_element(By.XPATH, './/div[@class="d-flex flex-wrap"]').text
    except NoSuchElementException:
        company_overview = "NI"
    try:
        ratings_overview = driver.find_element(By.XPATH, './/ul[@class="css-38kpu8 erz4gkm0"]').text
    except NoSuchElementException:
        ratings_overview = "NI"
    try:
        pros_and_cons = driver.find_element(By.XPATH, './/div[@class="css-1xgpjcn e1vn3ovn5"]').text
    except NoSuchElementException:
        pros_and_cons = "NI"

    return position, company_name, location, rating, salary, description, company_overview, ratings_overview, pros_and_cons

In [22]:
def save_data_to_csv_file(records):
    # Save scraped data to csv file
    with open('jobs_info.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['position', 'company_name', 'location', 'rating', 'salary', 'description', 'company_overview', 'ratings_overview', 'pros_and_cons'])
        writer.writerows(records)

In [23]:
def glassdoor_scraper(num_job_cards, waiting_time):
    close_popup_window()
    jobs = []
    while len(jobs) < num_job_cards:
        job_cards_list = driver.find_elements(By.XPATH,'.//div[@class="d-flex flex-column pl-sm css-3g3psg css-1of6cnp e1rrn5ka4"]')
        # Going through each job-card
        for i in range(len(job_cards_list)):
            print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_job_cards)))
            if len(jobs) >= num_job_cards:
                break
            job_cards_list[i].click()
            time.sleep(waiting_time)
            jobs.append(scrape_job_card(job_cards_list[i]))
            if i == (len(job_cards_list)-1):
                # Clicking on the "next page" button
                driver.find_element(By.XPATH, './/span[@alt="next-icon"]').click()
                time.sleep(waiting_time)
    save_data_to_csv_file(jobs)

In [24]:
# Run a search function (it will open a new chrome window and start the scraping) 
glassdoor_scraper(num_job_cards=5, waiting_time=3)

Progress: 0/5
Progress: 1/5
Progress: 2/5
Progress: 3/5
Progress: 4/5
Progress: 5/5


In [25]:
# Open csv file with the results and displays the first five rows of the results 
df = pd.read_csv("./jobs_info.csv")

print(df.head())

                                            position  \
0                      Senior Data Scientist (m/f/d)   
1                                Data Scientist (gn)   
2                             Data Scientist (m/w/d)   
3                      Senior Data Scientist (m/w/d)   
4  IT System Engineer / Data Scientist (m/f/d) Ja...   

                              company_name  location rating salary  \
0                      Hoffmann Group\n3.6    Munich    3.6     NI   
1                       LichtBlick SE\n3.7   Hamburg    3.7     NI   
2  Endress+Hauser Conducta GmbH+Co.KG\n4.1  Waldheim    4.1     NI   
3                               EMPIT GmbH    Berlin     NI     NI   
4                      Hapag Lloyd AG\n3.9   Hamburg    3.9     NI   

                                         description  \
0  More than 4,000 highly motivated employees in ...   
1  LichtBlick sucht dich! Ab sofort und in Vollze...   
2  Aufgabe + Herausforderungen\nFestlegung von Da...   
3  Über uns:\nAls 