# INDEED JOB SCRAPER
#### _________ 2-3 jobs/sec __________

### Enter the keywords for your job role and your ScrapeOps API key below:

In [1]:
import os  # imported here because the notebook's shared import cell runs after this one

# Job role to search for
job_role = 'Data Analyst' # example

# Location to search in (already url-encoded: ',' -> %2C, ' ' -> +)
location = 'London%2C+Greater+London' # example

# Radius to scan from given location
radius = 100 # in miles

# Your scrapeops api key (Create an account at https://scrapeops.io/ to get a free apikey)
# Prefer exporting it as the SCRAPEOPS_API_KEY environment variable so the secret is
# never saved inside the notebook; the literal below is only used as a fallback.
API_KEY = os.environ.get('SCRAPEOPS_API_KEY', 'your-api-key')

#### _______________________________
#### Install required packages (uncomment)

In [2]:
# !pip install beautifulsoup4
# !pip install glom
# !pip install lxml
# !pip install requests pandas xlsxwriter

In [3]:
# Import packages for scraping
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlencode

# Import packages for data manipulation
import pandas as pd
import time

# Check current working directory
import os
# Show the working directory: the Excel export later in the notebook is
# written with a relative file name, so this is where it will end up.
print(os.getcwd())

# Record start time; it is subtracted from end_time at the end of the
# notebook to report the total runtime.
start_time = time.time()

/Users/guts/Documents/Projects/Indeed Jobs Webscraping


#### Access Job Website

In [5]:
# Root of the job portal (only Indeed UK is supported for now)
baseUrl = 'https://uk.indeed.com'

# Search keywords: the job role lower-cased, with every space turned into '+'
query = '+'.join(job_role.lower().split(' '))

# Create url generator
def get_url(query, location, radius):
    """Build the Indeed search-results url for one search.

    Args:
        query: Search keywords; may already be url-encoded (e.g. 'data+analyst').
        location: Place to search in; may already be url-encoded
            (e.g. 'London%2C+Greater+London').
        radius: Value placed in the `radius` url parameter.

    Returns:
        The search url, rooted at the module-level `baseUrl`.
    """
    from urllib.parse import quote  # local import: only `urlencode` is imported at file level

    # '+' and '%' are left untouched so values that are already encoded are not
    # double-encoded, while characters that would corrupt the query string
    # ('&', '#', spaces, ...) are percent-escaped.
    safeQuery = quote(str(query), safe='+%')
    safeLocation = quote(str(location), safe='+%')
    return f'{baseUrl}/jobs?q={safeQuery}&l={safeLocation}&radius={radius}'

# Create proxy url generator
def get_scrapeops_url(url, API_KEY):
    """Wrap a target url in a ScrapeOps proxy request url.

    Args:
        url: The page the proxy should fetch.
        API_KEY: ScrapeOps api key.

    Returns:
        The proxy endpoint with `api_key` and `url` url-encoded as query
        parameters, in that order.
    """
    query_string = urlencode([('api_key', API_KEY), ('url', url)])
    return f'https://proxy.scrapeops.io/v1/?{query_string}'

#### Extract Raw Html

In [6]:
# Build the Indeed search url for the configured role, location and radius
url = get_url(query, location, radius)

# Send an indirect get request to indeed uk via scrapeops to avoid bot detection.
# The timeout keeps a stalled proxy connection from hanging the kernel forever;
# it is generous because a proxied request can be slow.
response = requests.get(get_scrapeops_url(url, API_KEY), timeout=120)

# Display the status so a failed request is noticed before the page is parsed
response, response.reason

(<Response [200]>, 'OK')

In [7]:
# Body of the first results page as text
htmlContent = response.text

# Parse it with the lxml parser
soup = BeautifulSoup(htmlContent, features='lxml')

# Every job card is a <div> carrying the 'job_seen_beacon' class
cards = soup.find_all('div', class_='job_seen_beacon')

# Number of job cards found on this page
len(cards)

15

### Retrieve all jobs data

In [8]:
# Define a function that retrieves data from a job card
def get_record(card):
    """Extract one job posting from a job card element.

    Every lookup is guarded, so a card that lacks an expected element yields
    a placeholder for that field instead of raising and aborting the scrape.

    Args:
        card: Parsed job-card element (one item of
            `soup.find_all('div', 'job_seen_beacon')`).

    Returns:
        pd.Series indexed by 'Job Title', 'Url', 'Employer', 'Employer Rating',
        'Location', 'Pay', 'Shift', 'Posted' and 'Status'. Fields that cannot
        be found are 'NA'; 'Status' falls back to 'not visited'.
    """
    from urllib.parse import urljoin  # local import: not part of the file-level imports

    def _clean_text(element, default='NA'):
        """Return the stripped text of `element`, or `default` when it is absent."""
        return element.text.strip() if element is not None else default

    # Title and link are read from <h2><a><span>; tolerate any level missing
    heading = card.h2
    aTag = heading.a if heading is not None else None
    jobTitle = 'NA'
    jobUrl = 'NA'
    if aTag is not None:
        titleSpan = aTag.span
        jobTitle = titleSpan.text if titleSpan is not None else aTag.text.strip()
        href = aTag.get('href')
        if href:
            # urljoin copes with relative, slash-less and absolute hrefs alike
            jobUrl = urljoin(baseUrl, href)

    # Find all corresponding html elements
    companySpan = card.find('span', 'companyName')
    ratingSpan = card.find('span', 'ratingNumber')
    locationDiv = card.find('div', 'companyLocation')
    salaryDiv = card.find('div', 'salary-snippet-container')
    jobShiftSvg = card.find('svg', {'aria-label':'Job type'})
    dateSpan = card.find('span', 'date')
    jobStateSpan = card.find('span', 'myJobsState')

    # Optional fields: each one falls back to its placeholder when absent
    company = _clean_text(companySpan)
    jobLocation = _clean_text(locationDiv)
    salary = _clean_text(salaryDiv)
    myJobState = _clean_text(jobStateSpan, default='not visited')

    # The rating is read from a nested span; use the outer text if there is none
    companyRating = 'NA'
    if ratingSpan is not None:
        innerRating = ratingSpan.span
        companyRating = _clean_text(innerRating if innerRating is not None else ratingSpan)

    # The job type text sits next to its icon, i.e. in the svg's parent element
    jobShift = 'NA'
    if jobShiftSvg is not None and jobShiftSvg.parent is not None:
        jobShift = jobShiftSvg.parent.text.strip()

    # The date is the last child of the date span when that child is plain
    # text; otherwise (nested tag or no children) use the span's whole text.
    postDate = 'NA'
    if dateSpan is not None:
        lastNode = dateSpan.contents[-1] if dateSpan.contents else None
        if isinstance(lastNode, str):
            postDate = lastNode.strip()
        else:
            postDate = dateSpan.text.strip()
        postDate = postDate or 'NA'  # blank text counts as missing

    # Return an indexed pandas series of all data
    return pd.Series({'Job Title': jobTitle,
                      'Url': jobUrl,
                      'Employer': company,
                      'Employer Rating': companyRating,
                      'Location': jobLocation,
                      'Pay': salary,
                      'Shift': jobShift,
                      'Posted': postDate,
                      'Status': myJobState})

In [9]:
# Create a pandas data frame with all job records of the first page.
# The rows are collected first and the frame is built in one call, which avoids
# re-allocating the frame for every row as `.loc` enlargement in a loop does.
recordColumns = ['Job Title', 'Url', 'Employer',
                 'Employer Rating', 'Location',
                 'Pay', 'Shift', 'Posted', 'Status']
recordsDf = pd.DataFrame([get_record(card) for card in cards],
                         columns=recordColumns)

recordsDf.head(4)

Unnamed: 0,Job Title,Url,Employer,Employer Rating,Location,Pay,Shift,Posted,Status
0,Data and Performance Analyst,https://uk.indeed.com/rc/clk?jk=0e35c58e8c578c...,Great Ormond Street Hospital NHS Foundation Trust,4.0,London,"£49,036 - £55,049 a year",Permanent,Posted 11 days ago,not visited
1,Flight Data Analyst,https://uk.indeed.com/rc/clk?jk=315edac9b4075e...,British Airways,3.7,Heathrow,,,Posted 10 days ago,not visited
2,Graduate Data Analyst,https://uk.indeed.com/rc/clk?jk=f8af466bd089af...,Metropolitan Thames Valley,3.5,Remote in London EC1N,"£31,500 - £35,000 a year",Full-time,Posted 4 days ago,not visited
3,Data Analyst,https://uk.indeed.com/rc/clk?jk=7819da148cf5f2...,Maggie's,4.2,London,"£38,325 - £50,548 a year",Permanent,Posted 3 days ago,not visited


#### Repeating the same until last page
(This code cell might take a few minutes to finish if there are 100s of total jobs because there are only 15 jobs visible per page)

In [10]:
# Follow the "Next Page" link until no further page is offered
visitedUrls = set()  # stops the crawl if a page ever links back to one already fetched

while True:
    # Find the link to go to next page
    nextPageATag = soup.find('a', {'aria-label':"Next Page"})
    nextHref = nextPageATag.get('href') if nextPageATag else None
    if not nextHref:
        break  # no next link: the last page has been reached

    nextPageUrl = baseUrl + nextHref
    if nextPageUrl in visitedUrls:
        break
    visitedUrls.add(nextPageUrl)

    # Request the next page; on any failure stop and keep what was collected so far
    try:
        newResponse = requests.get(get_scrapeops_url(nextPageUrl, API_KEY), timeout=120)
    except requests.RequestException as error:
        print(f'Stopped early, request failed: {error}')
        break
    if not newResponse.ok:
        print(f'Stopped early, got {newResponse.status_code} {newResponse.reason} for {nextPageUrl}')
        break

    # Parse with lxml, the same parser that was used for the first page
    soup = BeautifulSoup(newResponse.text, 'lxml')
    newCards = soup.find_all('div', 'job_seen_beacon')

    # Append one page at a time: far fewer re-allocations than appending row by
    # row, while an interrupted crawl still keeps every page fetched so far.
    pageRecords = [get_record(card) for card in newCards]
    if pageRecords:
        recordsDf = pd.concat([recordsDf, pd.DataFrame(pageRecords)], ignore_index=True)

# Check final shape
recordsDf.shape

(992, 9)

#### Write data frame to excel file

In [11]:
# Drop the status column
newDf = recordsDf.drop(columns='Status') # columns: 9-1 = 8

# Excel sheet names may not exceed 31 characters nor contain any of []:*?/\
# so derive a safe name from the query instead of using it verbatim
sheetName = ''.join('_' if ch in '[]:*?/\\' else ch for ch in query)[:31]

# The context manager saves and closes the file on exit
# (ExcelWriter.save() no longer exists in pandas 2.x)
with pd.ExcelWriter(f'{job_role} Jobs.xlsx', engine='xlsxwriter') as writer:
    # write newDf to a worksheet named after the query
    newDf.to_excel(writer, sheet_name=sheetName, index=False)

    # get the workbook and the worksheet
    workbook = writer.book
    worksheet = writer.sheets[sheetName]

    # set the format for the column labels in the worksheet
    label_format = workbook.add_format({'text_wrap': True, 'align': 'center'})
    worksheet.set_row(0, None, label_format)

    # set column widths for all columns
    worksheet.set_column('A:A', 35)   # Job Title column
    worksheet.set_column('B:B', 5)    # Url column
    worksheet.set_column('C:C', 30)   # Employer column
    worksheet.set_column('D:D', 15)   # Employer Rating column
    worksheet.set_column('E:E', 25)   # Location column
    worksheet.set_column('F:F', 20)   # Pay column
    worksheet.set_column('G:H', 15)   # Shift and Posted columns

# Record end time
end_time = time.time()

#### Performance Statistics

In [17]:
# Calculate total notebook run time in seconds
runtime = end_time - start_time
# Calculate net speed
speed = newDf.shape[0]/runtime

# Round to whole seconds first and split afterwards, so the seconds part can
# never be displayed as 60 (e.g. 119.7 s -> '2 min 0 sec', not '1 min 60 sec')
rounded_minutes, rounded_seconds = divmod(round(runtime), 60)
rounded_speed = round(speed,2)

# Print final runtime and speed
print(f'\n{newDf.shape[0]} jobs found! in {rounded_minutes} min {rounded_seconds} sec @ {rounded_speed} jobs/sec')
print(f"Saved as '{job_role} Jobs.xlsx'")


992 jobs found! in 7 min 24 sec @ 2.23 jobs/sec
Saved as 'Data Analyst Jobs.xlsx'


#### END