## Create a general purpose job scraper for www.indeed.com

In [370]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time 
import numpy as np

In [404]:
def get_job_search_url(position, postedDays):
    """Build the Indeed search URL for a query and a posting-age window.

    Args:
        position: Free-text search query, e.g. ``'machine learning'``.
        postedDays: Value for the ``fromage`` query parameter (how many days
            back postings may have been published).

    Returns:
        The search URL with the query percent-encoded, ``limit=50`` and
        ``filter=0``.
    """
    # Local import keeps this cell runnable regardless of cell order.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&fromage={}&limit=50&filter=0'
    # Encode the query: a raw '&', '#', '+' or space in `position` would
    # otherwise truncate or corrupt the query string.
    return template.format(quote_plus(str(position)), postedDays)

In [386]:
url = get_job_search_url('machine learning', 1)


##  Extract raw html


In [292]:
response = requests.get(url)

In [293]:
response

<Response [200]>

In [294]:
type(response)

requests.models.Response

In [295]:
#response.text

In [296]:
soup = BeautifulSoup(response.text, 'html.parser')

In [297]:
total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
total_jobs

'Page 1 of 546 jobs'

In [298]:
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [299]:
len(cards)

15

## Prototype the model with a single record

In [194]:
card = cards[0]

In [195]:
aTag = card.h2.a

In [196]:
job_title = aTag.get('title')
job_title

'Data Scientist'

In [197]:
job_id = card.get('data-jk')
job_id

'2a073aaa41c0c83f'

In [375]:
# Check whether a given job id (here '517da30c1f7ab5f0') is already stored in `records`.
# NOTE: `records` is built by a cell further down, so this cell only works
# after that cell has been run.
type(records)
records_ar = np.array(records)
# Testing against the whole array (`'517da30c1f7ab5f0' in np.array(records)`)
# would compare the id with every field; column 0 holds only the job ids.
# Assumes all records have the same length so the array is 2-D -- TODO confirm.
'517da30c1f7ab5f0' in np.array(records)[:,0]

True

In [200]:
# Prototype: download the posting's own page and pull out its description.
# Pause first; the notes at the end of this notebook report CAPTCHA blocks
# when detail pages are requested without a delay.
time.sleep(60)
job_url = 'https://www.indeed.com' + aTag.get('href')

post_response = requests.get(job_url)
post_soup = BeautifulSoup(post_response.text, 'html.parser')
job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()

# The detail section is optional: find() returns None when it is absent, and
# reading .text on None raises AttributeError. Catch only that error so that
# unrelated failures are not silently hidden.
try:
    job_detail = post_soup.find('div','jobsearch-JobDescriptionSection-section').text.strip()
except AttributeError:
    job_detail = ''
job_detail

''

In [201]:
job_description

'Tradeweb is looking to add a Data Scientist to its Data & Analytics team. Our Data Science team is responsible for managing the development and optimization of advanced analytics on a global scale across the company.\nIn this role, you will enjoy working with one of the richest financial data sets in the world, cutting edge technology, and the ability to see your insights turned into real products on a regular basis. The ideal candidate will have experience doing advanced data analysis, will have worked with large data stores, and will have experience building machine learning models. Candidates should be curious, focused on results, a self-starter, and have demonstrated success in using analytics to drive value in an organization.\nDesign, build, test and deliver new data science services and advanced analytics to Tradeweb globally\nDevelop scalable tools leveraging machine learning and/or deep learning models to solve real-world problems in areas such as time series predictions\nSug

In [111]:
job_company = card.find('span','company').text.strip()
job_company

'Oxford Global Resources'

In [112]:
# The rating is optional: when the card has no 'ratingsContent' span, find()
# returns None and reading .text raises AttributeError, so fall back to ''.
try:
    company_rating = card.find('span','ratingsContent').text.strip()
except AttributeError:
    company_rating = ''
company_rating

'3.7'

In [113]:
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
job_location

'Remote'

In [114]:
job_summary = card.find('div', 'summary').text.strip()
job_summary

'Scientists will participate in data model development and petabyte-level data process optimization.\nSome experience in big data processing.'

In [115]:
job_post_date = card.find('span','date').text.strip()

In [116]:
today = datetime.today().strftime('%Y-%m-%d') # To do: need to add time 

In [117]:
datetime.today().strftime('%Y_%m_%d_%H_%M_%S')

'2021_02_14_20_13_19'

In [118]:
datetime.today()

datetime.datetime(2021, 2, 14, 20, 13, 19, 452396)

In [119]:
# The salary is optional: a card without a 'salaryText' span yields ''.
try:
    job_salary = card.find('span','salaryText').text.strip()
except AttributeError:
    job_salary = ''


In [120]:
# The remote marker is optional: a card without a 'remote' span yields ''.
try:
    job_remote = card.find('span','remote').text.strip()
except AttributeError:
    job_remote = ''

## Generalize the model with a function

In [387]:
def _text_or_empty(node):
    """Return the stripped text of a parsed tag, or '' when the tag is absent."""
    return node.text.strip() if node is not None else ''


def get_record(card, withDetail):
    """Extract the fields of one job posting from a search-result card.

    Args:
        card: Parsed result card, as returned by
            ``soup.find_all('div', 'jobsearch-SerpJobCard')``.
        withDetail: When true, also download the posting's own page and
            append its detail section and full description.

    Returns:
        A tuple ``(job_id, job_title, job_company, job_location,
        company_rating, job_post_date, today, job_summary, job_salary,
        job_remote, job_url)``, followed by ``(job_detail, job_description)``
        when ``withDetail`` is true. Optional fields that are missing are
        returned as ``''``.

    Raises:
        AttributeError: If a required element (title link, company, location,
            summary or post date) is missing from the card.
    """
    # Required fields: a missing element is allowed to raise.
    job_id = card.get('data-jk')
    job_title = card.h2.a.get('title')
    job_company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    job_post_date = card.find('span', 'date').text.strip()

    # Optional fields: absent elements become ''.
    company_rating = _text_or_empty(card.find('span', 'ratingsContent'))
    job_salary = _text_or_empty(card.find('span', 'salaryText'))
    job_remote = _text_or_empty(card.find('span', 'remote'))

    # The posting URL is rebuilt from the job id rather than taken from the
    # card's own link.
    job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
    today = datetime.today().strftime('%Y-%m-%d')

    record = (job_id, job_title, job_company, job_location, company_rating,
              job_post_date, today, job_summary, job_salary, job_remote,
              job_url)
    if not withDetail:
        return record

    # Fetch the detail page only after the card itself parsed cleanly, and
    # with a timeout so a stalled connection cannot hang the whole scrape.
    post_response = requests.get(job_url, timeout=30)
    post_soup = BeautifulSoup(post_response.text, 'html.parser')
    job_description = _text_or_empty(
        post_soup.find('div', 'jobsearch-jobDescriptionText'))
    job_detail = _text_or_empty(
        post_soup.find('div', 'jobsearch-JobDescriptionSection'))

    return record + (job_detail, job_description)


In [400]:
%time
records = []

for card in cards:
    if records == []:
        record = get_record(card, withDetail = True)
        records.append(record)
    else:
        #To check if the card is already stored in the records
        id_array = np.array([r[0] for r in records])
        if not card.get('data-jk') in id_array:   
            record = get_record(card, withDetail = True)
            records.append(record)

Wall time: 0 ns


In [401]:
np.array([])

array([], dtype=float64)

In [383]:
id_array, len(records)

(array(['51d2ea04d15fbf1f', '571e360548a8d0c1', '2f143848e64c0573',
        '32e4ffa91d91f8ed', '517da30c1f7ab5f0', 'eb71688408d695be',
        'd2a2d1dba9456a6d', 'd080318aa423d790', '558379216e05a524',
        'd9c74d0d57cd11f2', '9bfea2a8b8774194', 'e874bf89e3161b17',
        'a078eabd7f5b3f38', '1145d029b0d268ae'], dtype='<U10786'),
 15)

In [402]:
type(records[11])

tuple

## Getting the next page


In [233]:
soup

<html>
<head>
<title>hCaptcha solve page</title>
<script async="" defer="" src="https://www.hcaptcha.com/1/api.js"></script>
</head>
<body>
<form action="/jobs?q=machine+learning&amp;fromage=1&amp;start=120" method="POST">
<div class="h-captcha" data-sitekey="eb27f525-f936-43b4-91e2-95a426d4a8bd"></div>
<br/>
<input type="submit" value="Submit"/>
</form>
</body>
</html>

In [406]:
# Walk through the result pages by following the "Next" link and collect one
# record per distinct job id.
records = []
seen_ids = set()  # job ids already collected, for O(1) duplicate checks
url = get_job_search_url('machine learning', 1)
print(url)
while True:
    print(url)
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    print(len(cards))

    for card in cards:
        job_id = card.get('data-jk')
        if job_id in seen_ids:
            continue
        # Same withDetail value for every card so all records have the same
        # number of fields.
        records.append(get_record(card, withDetail = False))
        seen_ids.add(job_id)

    print(len(records))
    if not cards:
        # A page without cards (e.g. a CAPTCHA page) ends the crawl.
        break

    next_link = soup.find('a', {'aria-label': 'Next'})
    if next_link is None:
        # Last page: report where the crawl stopped and the count banner,
        # which may itself be missing.
        count_tag = soup.find('div', {'id': 'searchCountPages'})
        total_jobs = count_tag.text.strip() if count_tag is not None else ''
        print(url)
        print(total_jobs)
        break

    url = 'https://www.indeed.com' + next_link.get('href')
    time.sleep(1)

print(len(records))

https://www.indeed.com/jobs?q=machine learning&fromage=1&limit=50&filter=0
https://www.indeed.com/jobs?q=machine learning&fromage=1&limit=50&filter=0
55
55
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=50
55
110
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=100
55
165
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=150
55
220
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=200
55
275
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=250
55
330
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=300
54
384
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=350
55
439
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=400
55
494
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=450
54
548
https://www.indee

In [399]:
np.array([r[0] for r in records])


array(['589dfaeb201f6431', '8e330e086b957051'], dtype='<U16')

In [237]:
soup.find('a', {'aria-label': 'Next'}).get('href')

AttributeError: 'NoneType' object has no attribute 'get'

In [56]:
len(records)

1027

## Putting it all together

In [407]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time 
import numpy as np

def get_job_search_url(position, postedDays):
    """Build the Indeed search URL for a query and a posting-age window.

    The URL asks for 50 job posts per page (``limit=50``) with ``filter=0``.

    Args:
        position: Free-text search query, e.g. ``'machine learning'``.
        postedDays: Value for the ``fromage`` query parameter (how many days
            back postings may have been published).

    Returns:
        The search URL with the query percent-encoded.
    """
    # Local import keeps the function self-contained.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&fromage={}&limit=50&filter=0'
    # Encode the query: a raw '&', '#', '+' or space in `position` would
    # otherwise truncate or corrupt the query string.
    return template.format(quote_plus(str(position)), postedDays)

def _text_or_empty(node):
    """Return the stripped text of a parsed tag, or '' when the tag is absent."""
    return node.text.strip() if node is not None else ''


def get_record(card, withDetail=False):
    """Extract one job posting from a search-result card as a tuple.

    Args:
        card: Parsed result card, as returned by
            ``soup.find_all('div', 'jobsearch-SerpJobCard')``.
        withDetail: When true, also download the posting's own page and
            append its detail section and full description.

    Returns:
        A tuple ``(job_id, job_title, job_company, job_location,
        company_rating, job_post_date, today, job_summary, job_salary,
        job_remote, job_url)``, followed by ``(job_detail, job_description)``
        when ``withDetail`` is true. Optional fields that are missing are
        returned as ``''``.

    Raises:
        AttributeError: If a required element (title link, company, location,
            summary or post date) is missing from the card.
    """
    # Required fields: a missing element is allowed to raise.
    job_id = card.get('data-jk')
    job_title = card.h2.a.get('title')
    job_company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    job_post_date = card.find('span', 'date').text.strip()

    # Optional fields: absent elements become ''.
    company_rating = _text_or_empty(card.find('span', 'ratingsContent'))
    job_salary = _text_or_empty(card.find('span', 'salaryText'))
    job_remote = _text_or_empty(card.find('span', 'remote'))

    # The posting URL is rebuilt from the job id rather than taken from the
    # card's own link.
    job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
    today = datetime.today().strftime('%Y-%m-%d')

    # Create the tuple for this job posting.
    record = (job_id, job_title, job_company, job_location, company_rating,
              job_post_date, today, job_summary, job_salary, job_remote,
              job_url)
    if not withDetail:
        return record

    # Fetch the detail page only after the card itself parsed cleanly, and
    # with a timeout so a stalled connection cannot hang the whole scrape.
    post_response = requests.get(job_url, timeout=30)
    post_soup = BeautifulSoup(post_response.text, 'html.parser')
    job_description = _text_or_empty(
        post_soup.find('div', 'jobsearch-jobDescriptionText'))
    job_detail = _text_or_empty(
        post_soup.find('div', 'jobsearch-JobDescriptionSection'))

    return record + (job_detail, job_description)

def main(position, postedDay, fileName, withDetail):
    """Scrape every result page of one search and save the postings as CSV.

    Pages are followed through their "Next" link until there is none or a
    CAPTCHA page is served. Postings whose job id was already collected are
    skipped. Nothing is written when no record was collected.

    Args:
        position: Search query, passed to ``get_job_search_url``.
        postedDay: Posting-age window in days, passed to
            ``get_job_search_url``.
        fileName: Prefix of the output file. The file is written to
            ``../data/raw/`` as ``<fileName><YYYY_MM_DD>.csv``, with a ``_W``
            suffix before the extension when ``withDetail`` is true.
        withDetail: Passed to ``get_record``; when true each record also
            carries the job detail and description columns.

    The CSV starts with a one-cell summary row, followed by the column
    header row and one row per record.
    """
    import os  # local import: only needed to create the output folder

    records = []
    seen_ids = set()  # job ids already collected, for O(1) duplicate checks
    total_jobs = ''   # last result-count banner seen; stays '' if none was found
    url = get_job_search_url(position, postedDay)

    # extract the job data
    while True:
        print(url)

        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        # A page without <title> must not crash the CAPTCHA check.
        if soup.title is not None and 'Captcha' in soup.title.text:
            print('blocked by Captcha!')
            break

        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        # The count banner can be missing; keep the previous value then.
        count_tag = soup.find('div', {'id': 'searchCountPages'})
        if count_tag is not None:
            total_jobs = count_tag.text.strip()
        print(len(cards), total_jobs)

        for card in cards:
            job_id = card.get('data-jk')
            if job_id in seen_ids:
                continue
            records.append(get_record(card, withDetail))
            seen_ids.add(job_id)

        next_link = soup.find('a', {'aria-label': 'Next'})
        if next_link is None:
            print('done!')
            break
        url = 'https://www.indeed.com' + next_link.get('href')
        # NOTE: no delay between pages; the notebook's notes report CAPTCHA
        # blocks when detail pages are fetched without sleeping.

    # Summary line: number of collected records plus the "of N jobs" part of
    # the banner, when one was seen.
    of_pos = total_jobs.find('of')
    total_suffix = total_jobs[of_pos:] if of_pos != -1 else ''
    firstRow = '{0}: {1} {2}'.format('test', str(len(records)), total_suffix)
    print(firstRow)

    if not records:
        return

    # save the job data
    if withDetail:
        fn = '../data/raw/{0}{1}_W.csv'.format(fileName, datetime.today().strftime('%Y_%m_%d'))
    else:
        fn = '../data/raw/{0}{1}.csv'.format(fileName, datetime.today().strftime('%Y_%m_%d'))
    # Create the folder if needed so a finished scrape is not lost on open().
    os.makedirs(os.path.dirname(fn), exist_ok=True)

    header = ['JobID', 'JobTitle', 'Company', 'Location', 'CompanyRating',
              'PostDate', 'ExtractDate', 'Summary', 'Salary', 'Remote',
              'JobUrl']
    if withDetail:
        header += ['JobDetail', 'JobDescription']

    with open(fn, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # writerow() iterates its argument: a bare string would be written
        # one character per column, so wrap the summary in a list.
        writer.writerow([firstRow])
        writer.writerow(header)
        writer.writerows(records)

Notes:

1: Indeed does not return exactly the number of jobs shown in the pagination, because of internal filtering of similar postings, unless you add the query string &filter=0 to the URL. Even with &filter=0, only 20 * 50 = 1000 records can be retrieved.
https://www.indeed.com/jobs?q=machine+learning&fromage=1&filter=0&start=200
"We have removed 2,463 job postings very similar to those already shown. To see these additional results, you may repeat your search with the omitted job postings included." 

2: Every midnight, run the program for 3 search criteria, without the detailed job description, for postings from the last 1 day, with Indeed's filtering turned off (&filter=0).
Observed: the maximum number of records Indeed allows you to retrieve is 50 * 20 = 1000.

3: When scraping the detailed job description, some sleep time needs to be added to avoid being blocked by CAPTCHA.
Also, an algorithm is needed to randomly sample which detailed job descriptions to scrape; consider scraping them only for remote jobs.


4: Sometimes a request may fail with a broken connection.

To-do list:
1: Add a scheduler
2: Add an algorithm for randomly sampling detailed job descriptions




In [331]:
# Run the main program.
#
# Earlier 7-day runs, written for an older three-argument main():
#   main('data scientist', 7, 'ds_last7d_')
#   main('machine learning', 7, 'ml_last7d_')
#   main('data analyst', 7, 'dat_last7d_')
#   main('data analytics', 7, 'das_last7d_')
#
# Current runs: last-1-day postings, no detailed job description,
# filter=0 in the search URL, no sleep between pages.

# Wait 10000 seconds before starting (presumably to postpone the run until
# the scheduled time -- TODO replace with a real scheduler).
time.sleep(10000)

daily_searches = (
    ('data science, data scientist', 'ds_last1d_'),
    ('data analytics, data analyst', 'da_last1d_'),
    ('machine learning', 'ml_last1d_'),
)
for query, prefix in daily_searches:
    main(query, 1, prefix, withDetail=False)

# TODO: add description


https://www.indeed.com/jobs?q=data science, data scientist&fromage=1&limit=50&filter=0
57 Page 1 of 4,704 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=50
56 Page 2 of 4,704 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=100
56 Page 3 of 4,562 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=150
55 Page 4 of 4,562 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=200
56 Page 5 of 4,562 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=250
55 Page 6 of 4,562 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=300
55 Page 7 of 4,562 jobs
https://www.indeed.com/jobs?q=data+science%2C+data+scientist&limit=50&fromage=1&filter=0&start=350
55 Page 8 of 4,562 jobs
https://www.indeed.com/jobs?q

In [262]:
# Two ways to recognise the CAPTCHA page held in `soup`:
# look for the 'h-captcha' div (result not shown, only the last expression is echoed) ...
soup.find('div','h-captcha')
# ... or search the page title; str.find returns -1 when 'Captcha' is absent.
soup.title.text.find('Captcha')

1

In [270]:
# Report when the page in `soup` is the CAPTCHA challenge, then show how
# many records have been collected.
if 'Captcha' in soup.title.text:
    print('blocked by Captcha!')
len(records)

blocked by Captcha!


0