## Create a general purpose job scraper for www.indeed.com

In [199]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time 

In [184]:
def get_job_search_url(position, postedDays):
    """Generate a url from position and posted days ago"""

    template = 'https://www.indeed.com/jobs?q={}&fromage={}'
    url=template.format(position, postedDays)
    
    return url

In [217]:
url = get_job_search_url('machine learning', 1)


##  Extract raw html


In [218]:
response = requests.get(url)

In [219]:
response

<Response [200]>

In [220]:
type(response)

requests.models.Response

In [221]:
#response.text

In [222]:
soup = BeautifulSoup(response.text, 'html.parser')

In [223]:
soup


<!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<script src="//d3fw5vlhllyvee.cloudfront.net/s/ffe72ff/en_US.js" type="text/javascript"></script>
<link href="//d3fw5vlhllyvee.cloudfront.net/s/105b986/jobsearch_all.css" rel="stylesheet" type="text/css"/>
<link href="https://rss.indeed.com/rss?q=machine+learning" rel="alternate" title="Machine Learning Jobs, Employment" type="application/rss+xml"/>
<link href="/m/jobs?q=machine+learning&amp;fromage=1" media="only screen and (max-width: 640px)" rel="alternate"/>
<script type="text/javascript">

if (typeof window['closureReadyCallbacks'] == 'undefined') {
window['closureReadyCallbacks'] = [];
}

function call_when_jsall_loaded(cb) {
if (window['closureReady']) {
cb();
} else {
window['closureReadyCallbacks'].push(cb);
}
}
</script>
<meta content="1" name="ppstriptst"/>
<script>
var _scriptDownloadCount = 0;
var _retryDownload = function() {
var script = document.createE

In [225]:
total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
total_jobs

'Page 1 of 198 jobs'

In [226]:
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [227]:
len(cards)

15

## Prototype the model with a single record

In [194]:
card = cards[0]

In [195]:
aTag = card.h2.a

In [196]:
job_title = aTag.get('title')
job_title

'Data Scientist'

In [197]:
job_id = card.get('data-jk')
job_id

'2a073aaa41c0c83f'

In [200]:
time.sleep(60)
job_url = 'https://www.indeed.com' + aTag.get('href')
job_url

post_response = requests.get(job_url)
post_soup = BeautifulSoup(post_response.text, 'html.parser')
job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()
job_description

try:
    job_detail = post_soup.find('div','jobsearch-JobDescriptionSection-section').text.strip()
except:
    job_detail = ''
job_detail

''

In [201]:
job_description

'Tradeweb is looking to add a Data Scientist to its Data & Analytics team. Our Data Science team is responsible for managing the development and optimization of advanced analytics on a global scale across the company.\nIn this role, you will enjoy working with one of the richest financial data sets in the world, cutting edge technology, and the ability to see your insights turned into real products on a regular basis. The ideal candidate will have experience doing advanced data analysis, will have worked with large data stores, and will have experience building machine learning models. Candidates should be curious, focused on results, a self-starter, and have demonstrated success in using analytics to drive value in an organization.\nDesign, build, test and deliver new data science services and advanced analytics to Tradeweb globally\nDevelop scalable tools leveraging machine learning and/or deep learning models to solve real-world problems in areas such as time series predictions\nSug

In [111]:
job_company = card.find('span','company').text.strip()
job_company

'Oxford Global Resources'

In [112]:
try:
    company_rating = card.find('span','ratingsContent').text.strip()
except AttributeError:
    company_rating = ''
company_rating

'3.7'

In [113]:
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
job_location

'Remote'

In [114]:
job_summary = card.find('div', 'summary').text.strip()
job_summary

'Scientists will participate in data model development and petabyte-level data process optimization.\nSome experience in big data processing.'

In [115]:
job_post_date = card.find('span','date').text.strip()

In [116]:
today = datetime.today().strftime('%Y-%m-%d') # To do: need to add time 

In [117]:
datetime.today().strftime('%Y_%m_%d_%H_%M_%S')

'2021_02_14_20_13_19'

In [118]:
datetime.today()

datetime.datetime(2021, 2, 14, 20, 13, 19, 452396)

In [119]:
try:
    job_salary = card.find('span','salaryText').text.strip()
except AttributeError:
    job_salary = ''


In [120]:
try:
    job_remote = card.find('span','remote').text.strip()
except AttributeError:
    job_remote = ''

## Generalize the model with a function

In [228]:
def get_record(card):
    """Extract individual job post data from a single record """
    # required variables
    job_id = card.get('data-jk')
    aTag = card.h2.a
    job_title = aTag.get('title')
    
    job_url = 'https://www.indeed.com' + aTag.get('href')
    #job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
    post_response = requests.get(job_url)
    post_soup = BeautifulSoup(post_response.text, 'html.parser')
    try:
        job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()
    except AttributeError:
        job_description = ''
        
    
    try:
        job_detail = post_soup.find('div','jobsearch-JobDescriptionSection-section').text.strip()
    except AttributeError:
        job_detail = ''
    
    job_company = card.find('span','company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    job_post_date = card.find('span','date').text.strip()
    
    # optional variables
    try:
        company_rating = card.find('span','ratingsContent').text.strip()
    except AttributeError:
        company_rating = ''
    
    try:
        job_salary = card.find('span','salaryText').text.strip()
    except AttributeError:
        job_salary = ''
    
    try:
        job_remote = card.find('span','remote').text.strip()
    except AttributeError:
        job_remote = ''
        
    today = datetime.today().strftime('%Y-%m-%d')
    
    record = (job_id, job_title,job_company,job_location,company_rating, job_post_date,today,job_summary,job_salary, job_remote, job_url, job_description, job_detail)
    
    return record


In [161]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [163]:
records[11]

('1260f89ca0cbf7d8',
 'Data Scientist',
 'LoadSpring Solutions, Inc',
 'Remote',
 '2.3',
 '5 days ago',
 '2021-02-14',
 'Participate in data strategy and infrastructure discussions, help define requirements for data structures and data retention.',
 '',
 '',
 'https://www.indeed.com/rc/clk?jk=1260f89ca0cbf7d8&fccid=a39f5e8cf0ebec8c&vjs=3',
 'Are you a Data Scientist who prospers when they can evaluate and improve customers’ products and help them drive their business goals with data? At LoadSpring, you’ll work with large, complex data sets to solve difficult, non-routine analysis problems, and apply advanced analytical methods. You will build and deploy ML models for predictive insights and collaborate extensively with product managers, other data engineers, and UI/UX designers to bring your creations to life! If this excites you then we want to meet you!\nThe Data Scientist Position\nArticulate and translate business questions and using statistical techniques to arrive at answers usin

## Getting the next page


In [233]:
soup

<html>
<head>
<title>hCaptcha solve page</title>
<script async="" defer="" src="https://www.hcaptcha.com/1/api.js"></script>
</head>
<body>
<form action="/jobs?q=machine+learning&amp;fromage=1&amp;start=120" method="POST">
<div class="h-captcha" data-sitekey="eb27f525-f936-43b4-91e2-95a426d4a8bd"></div>
<br/>
<input type="submit" value="Submit"/>
</form>
</body>
</html>

In [239]:
records =[]
url = get_job_search_url('machine learning', 1)
print( url )
while True:
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')   
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    print(len(cards))
 
    '''for card in cards:
        record = get_record(card)
        records.append(record)
        time.sleep(3)

print(len(records))'''
    if len(cards)> 0:
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
            print(url)
            print(total_jobs)
            break
        time.sleep(1)
    else: break

https://www.indeed.com/jobs?q=machine learning&fromage=1
https://www.indeed.com/jobs?q=machine learning&fromage=1
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=10
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=20
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=30
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=40
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=50
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=60
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=70
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=80
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=90
15
https://www.indeed.com/jobs?q=machine+learning&fromage=1&start=100
0


In [237]:
soup.find('a', {'aria-label': 'Next'}).get('href')

AttributeError: 'NoneType' object has no attribute 'get'

In [56]:
len(records)

1027

In [55]:
url
cards

[<div class="jobsearch-SerpJobCard unifiedRow row result" data-ci="105245054" data-empn="8690912762161442" data-jk="e1ca911d35f33873" data-tu="https://jsv3.recruitics.com/partner/a51b8de1-f7bf-11e7-9edd-d951492604d9.gif?client=521&amp;rx_c=&amp;rx_campaign=indeed16&amp;rx_group=105274&amp;rx_source=Indeed&amp;job=ET3-16186674&amp;rx_r=none&amp;rx_ts=20210213T174822Z&amp;rx_pre=1&amp;indeed=sp" id="pj_e1ca911d35f33873">
 <style>
 .jobcard_logo{margin:6px 0}.jobcard_logo img{width:auto;max-width:80px;max-height:30px}.jasxrefreshcombotst .jobcard_logo img{max-height:2rem;max-width:100%}
 </style>
 <h2 class="title">
 <a class="jobtitle turnstileLink" data-tn-element="jobTitle" href="/pagead/clk?mo=r&amp;ad=-6NYlbfkN0CpFJQzrgRR8WqXWK1qKKEqALWJw739KlKqr2H-MSI4eh4ZOxqVaUrhNSyjVEAq5t6UFhwwe4_dkhWx5YkiA2P1bl_msm64C9caZLjh6fNk_zKO8S7Pwh5f00ipWjKaQpAa-uA3IKXqOmXxeQfGyxEYr2G08Ia5zpcPDcYAqaBTzAhWYNmA_4W3joTVn3Zik4KSbOrFHb8IDmcDVmBApkcinpXDdEJmpZVUKmlfVaUAnoEzniGzu7rz5rfZD7UbErhcjCl0shyAdk5h02-_cQy

## Putting it all together

In [274]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time

def get_job_search_url(position, postedDays):
    """Generate a url from position and posted days ago"""

    template = 'https://www.indeed.com/jobs?q={}&fromage={}&limit=50'
    url=template.format(position, postedDays)
    
    return url

def get_record(card):
    """Extract individual job post data from a single record """
    # required variables
    job_id = card.get('data-jk')
    aTag = card.h2.a
    job_title = aTag.get('title')
    
    #job_url = 'https://www.indeed.com' + aTag.get('href')
    job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
    post_response = requests.get(job_url)
    post_soup = BeautifulSoup(post_response.text, 'html.parser')
    try:
        job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()
    except AttributeError:
        job_description =''
        
    try:
        job_detail = post_soup.find('div','jobsearch-JobDescriptionSection-section').text.strip()
    except AttributeError:
        job_detail = ''
    
    job_company = card.find('span','company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    job_post_date = card.find('span','date').text.strip()
    
    # optional variables
    try:
        company_rating = card.find('span','ratingsContent').text.strip()
    except AttributeError:
        company_rating = ''
        
    try:
        job_salary = card.find('span','salaryText').text.strip()
    except AttributeError:
        job_salary = ''
    
    try:
        job_remote = card.find('span','remote').text.strip()
    except AttributeError:
        job_remote = ''
        
    today = datetime.today().strftime('%Y-%m-%d')
    
    record = (job_id, job_title,job_company,job_location,company_rating, job_post_date,today,job_summary,job_salary, job_remote, job_url, job_description, job_detail)
    
    return record

def main(position, postedDay, fileName):
    """Run the main program routine"""
    records = []
    url = get_job_search_url(position, postedDay)

    # extract the job data
    while True:        
        print(url)
       
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        if soup.title.text.find('Captcha') != -1: 
            print('blocked by Captcha!')
            break
            
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')        
        total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
        print(len(cards), total_jobs)
        
        for card in cards:
            record = get_record(card)
            records.append(record)
            time.sleep(3)
   
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href') 
        except AttributeError:     
            print('done!')
            break
            
        #time.sleep(60)
        
    print(position + ': '+ str(len(records)))
    # save the job data
    if (len(records) > 0):
        with open('../data/raw/'+fileName+datetime.today().strftime('%Y_%m_%d_%H_%M_%S')+ '.csv', 'w', newline ='', encoding ='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['JobID', 'JobTitle', 'Company','Location', 'CompanyRating', 'PostDate', 'ExtractDate','Summary', 'Salary', 'Remote','JobUrl','JobDescription', 'JobDetail'])
            writer.writerows(records)

Notes:

1: Indeed will not return exactly amount of jobs shown on the pagination based on some internal filtering they run, unless you add query string &filter = 0 in the url. 
https://www.indeed.com/jobs?q=machine+learning&fromage=1&filter=0&start=200

2: Added some sleep time to avoid being blocked by CAPTCHA

3: blocked by Captcha after scraping around 1k , 1073/3463  with sleep time(3) (5)


In [277]:
# run the main program
#main('data scientist', 7 ,'ds_last7d_')
#main('machine learning', 7, 'ml_last7d_')
#main('data analyst', 7 , 'dat_last7d_')
#main('data analytics', 7, 'das_last7d_')

main('data science, data scientist',1 ,'ds_last1d_')      # < 2,000 result per day
#main('data analytics, data analyst',1 ,'ds_last1d_')     # < 2,000 result per day
#main('machine learning', 3 ,'ml_last1d_')     # <2,000 result  per 3 days



https://www.indeed.com/jobs?q=data science, data scientist&fromage=1&limit=50
blocked by Captcha!
data science, data scientist: 0


In [262]:
soup.find('div','h-captcha')
soup.title.text.find('Captcha')

1

In [270]:
if soup.title.text.find('Captcha') != -1: print('blocked by Captcha!')
len(records)

blocked by Captcha!


0