# Data collection from www.indeed.com

***
Data collection steps: 
* scrape search result on 4 key words ('data science' 'data analyst' 'machine learning' 'artifical intelligence') from Indeed website daily.
* extract all related features using BeautifulSoup
* save into 4 seperate files for each key word
* save company related information into Companies.csv

Data items collected in each job posting files: 
* r[0] JobID
* r[1] JobTitle
* r[2] Company
* r[3] Location
* r[4] CompanyRating
* r[5] CompanyUrl
* r[6] CompanyIndustry
* r[7] CompanySize
* r[8] PostDate
* r[9] ExtractDate
* r[10] Summary
* r[11] Salary
* r[12] Remote
* r[13] JobUrl
* r[14] JobDetail
* r[15] JobDescription

Data items collected in Companies.csv:
* CompanyUrl (on Indeed website)
* Size
* CompanyRevenue
* Industry
* CompanyCEO
* Founded
* Link
* Detail
* AddDate (Insertion Date)

***

In [6]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time 
import numpy as np
from random import randint

In [43]:
def get_job_search_url(position, postedDays):
    """Generate a url from position and posted days ago"""

    template = 'https://www.indeed.com/jobs?q={}&fromage={}&limit=50&filter=0'
    url=template.format(position, postedDays)
    
    return url

In [44]:
url = get_job_search_url('machine learning', 1)
print(url)

https://www.indeed.com/jobs?q=machine learning&fromage=1&limit=50&filter=0


##  Extract raw html


In [45]:
response = requests.get(url)

In [46]:
response

<Response [200]>

In [47]:
type(response)

requests.models.Response

In [48]:
#response.text

In [49]:
soup = BeautifulSoup(response.text, 'html.parser')

In [50]:
total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
total_jobs

'Page 1 of 910 jobs'

In [55]:
#soup

In [56]:
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [57]:
len(cards)

54

## Prototype the model with a single record

In [58]:
card = cards[3]

In [59]:
aTag = card.h2.a

In [60]:
job_title = aTag.get('title')
job_title

'Machine Learning Engineer'

In [61]:
job_id = card.get('data-jk')
job_id

'd44ebfb52bfc20d1'

In [62]:
# To check if the card already stored in the records  # 517da30c1f7ab5f0
type(records)
records_ar = np.array(records)
#'517da30c1f7ab5f0' in np.array(records)
#'517da30c1f7ab5f0' in np.array(records)[:,0]

NameError: name 'records' is not defined

In [63]:
#time.sleep(60)
#job_url = 'https://www.indeed.com' + aTag.get('href')
job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
job_url

post_response = requests.get(job_url)
post_soup = BeautifulSoup(post_response.text, 'html.parser')
job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()
job_description

try:
    job_detail = post_soup.find('div','jobsearch-JobDescriptionSection-section').text.strip()
except:
    job_detail = ''

'''items = post_soup.find_all('div','jobsearch-JobDescriptionSection-sectionItem')
for i in items:
    job_detail += '{};'.format(i.text.strip())'''
    

print(job_detail)

req_quals=post_soup.find_all('li','jobsearch-ReqAndQualSection-item')
#for r in req_quals:
#    job_skills +='{};'.format(r.text.strip())
#job_skills




In [64]:
req_quals

[]

In [65]:
job_detail =''

items = post_soup.find_all('div','jobsearch-JobDescriptionSection-sectionItem')


In [66]:
for i in items:
   
    job_detail += '{};'.format(i.text.strip())
job_detail

''

In [67]:
job_skills = ''
req_quals=post_soup.find_all('li','jobsearch-ReqAndQualSection-item')
for r in req_quals:
    job_skills +='{};'.format(r.text.strip())
job_skills

''

In [68]:
req_quals

[]

In [69]:

job_url

'https://www.indeed.com/viewjob?jk=d44ebfb52bfc20d1'

In [70]:
job_description

'Are you excited by the prospect of creating state-of-art algorithms to solve real world problems?\nDo you like to own end-to-end business problems/metrics that directly impact the profitability of the company? Areas we are working on are diverse and include NLP, Time-Series Analysis, Deep Learning, Reinforcement learning, Recommender Systems and more. Come join a team of scientists who use machine learning and AI to help WeWork scale.\nResponsibilities\nYou will work with complex data sets to develop advanced analytical methods, mathematical modeling, and large-scale implementation strategies\nYou can experiment and develop machine learning algorithms for tasks including demand forecasting, elasticity calculations, and inventory optimization\nYou will take ownership of whole end-to-end predictive modeling projects - from data processing, training, optimization to real-time monitoring and maintenance\nRequirements\nM.Sc / PhD in Statistics, Machine Learning, Computer Science, Operation

In [71]:
job_company = card.find('span','company').text.strip()
job_company

'Wework'

In [72]:
#card = cards[28]
try: 
    company_url = 'https://www.indeed.com{}'.format(card.find('span','company').a.get('href'))
except AttributeError:
    company_url=''
print(company_url)

https://www.indeed.com/cmp/Wework


In [73]:
company_url = 'https://www.indeed.com/cmp/Divihn-Integration'#
if len(company_url)>0:
    
    company_response = requests.get(company_url)
    company_soup = BeautifulSoup(company_response.text, 'html.parser')
    try:
        company_industry = company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()
    except AttributeError:
        company_industry = ''
    try:
        company_size =company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()
    except AttributeError:
        company_size = ''
else:
    company_industry = ''
    company_size = ''

print(company_industry, company_size)

Information Technology 51 to 200


In [74]:
company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()

'Information Technology'

In [75]:
company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()


'51 to 200'

In [76]:
try:
    company_rating = card.find('span','ratingsContent').text.strip()
except AttributeError:
    company_rating = ''
company_rating

'3.3'

In [77]:
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
job_location

'New York, NY'

In [78]:
job_summary = card.find('div', 'summary').text.strip()
job_summary

'Are you excited by the prospect of creating state-of-art algorithms to solve real world problems? Do you like to own end-to-end business problems/metrics…'

In [79]:
job_post_date = card.find('span','date').text.strip()

In [80]:
today = datetime.today().strftime('%Y-%m-%d') # To do: need to add time 

In [81]:
datetime.today().strftime('%Y_%m_%d_%H_%M_%S')

'2021_05_13_10_24_16'

In [82]:
datetime.today()

datetime.datetime(2021, 5, 13, 10, 24, 18, 520655)

In [83]:
try:
    job_salary = card.find('span','salaryText').text.strip()
except AttributeError:
    job_salary = ''


In [84]:
try:
    job_remote = card.find('span','remote').text.strip()
except AttributeError:
    job_remote = ''

## Generalize the model with a function

In [90]:
# Get company details
def get_company(Url, name):
    company_response = requests.get(Url)
    company_soup = BeautifulSoup(company_response.text, 'html.parser')
    try:
        Industry = company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()
    except AttributeError:
        Industry = ''
    try:
        Size = company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()
    except AttributeError:
        Size = ''

    try:
        CEO = company_soup.find('div',text='CEO').find_next_sibling('div').text.strip()
    except AttributeError:
        CEO = ''
    try:
        Revenue = company_soup.find('div',text='Revenue').find_next_sibling('div').text.strip()
    except AttributeError:
        Revenue = ''
    try:
        Founded = company_soup.find('div',text='Founded').find_next_sibling('div').text.strip()
    except AttributeError:
        Founded = ''

    try:  # not working at now 
        Detail = company_soup.find('a',text='Learn more').find_parent('div').find_previous_sibling('div').text.strip()
        #Detail = company_soup.find('div', 'css-64l8lu eu4oa1w0')
    except AttributeError:
        Detail = ''
        
    Link =''

    company= (name, Url, Size, Revenue, Industry, CEO, Founded, Link, Detail)

    return company

In [91]:
from random import randint
def get_record(card):
    """Extract individual job post data from a single record """
    # required variables
    job_id = card.get('data-jk')
    aTag = card.h2.a
    job_title = aTag.get('title')
    
    #job_url = 'https://www.indeed.com' + aTag.get('href')
    job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
       
    job_company = card.find('span','company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    job_post_date = card.find('span','date').text.strip()
    
    # optional variables
    try: 
        company_url = 'https://www.indeed.com{}'.format(card.find('span','company').a.get('href'))
    except AttributeError:
        company_url=''
    if len(company_url)>0:    
        company_response = requests.get(company_url)
        company_soup = BeautifulSoup(company_response.text, 'html.parser')
        try:
            company_industry = company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()
        except AttributeError:
            company_industry = ''
        try:
            company_size = company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()
        except AttributeError:
            company_size = ''
        time.sleep(randint(0,1))
    else:
        company_industry = ''
        company_size = ''
        
    try:
        company_rating = card.find('span','ratingsContent').text.strip()
    except AttributeError:
        company_rating = ''
    
    try:
        job_salary = card.find('span','salaryText').text.strip()
    except AttributeError:
        job_salary = ''
    
    try:
        job_remote = card.find('span','remote').text.strip()
    except AttributeError:
        job_remote = ''
        
    # To determine if include detailed job decription: only include job description for remote job
    detailInclued = (job_remote =='Remote') | (job_location == 'Remote')
    
    if detailInclued:
        post_response = requests.get(job_url)
        post_soup = BeautifulSoup(post_response.text, 'html.parser')
        
        try:
            job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()
        except AttributeError:
            job_description = ''
                 
        try:
            job_detail = post_soup.find('div','jobsearch-JobDescriptionSection-section').text.strip()
        except:
            job_detail = '' 
         
        time.sleep(randint(1,3))
    else:
        job_description = ''  
        job_detail = ''
        
    today = datetime.today().strftime('%Y-%m-%d')
    
    record = (job_id, job_title,job_company,job_location,company_rating, company_url, company_industry, company_size, job_post_date,today,job_summary,job_salary, job_remote, job_url, job_detail, job_description)
   
   
    return record


In [92]:
len(cards)

54

In [94]:
#cards[3]

In [95]:
%time
records = []
companies =[]

for card in cards:
    #To check if the card is already stored in the records
    if not card.get('data-jk') in [r[0] for r in records]:   
        record = get_record(card)
        records.append(record)

        if (len(record[5]) > 0) & (not record[5] in [c[1] for c in companies]):
            companies.append(get_company(record[5], record[2]))

Wall time: 0 ns


In [96]:
companies

[('Kelly',
  'https://www.indeed.com/cmp/Kelly-Services',
  'more than 10,000',
  '$5B to $10B (USD)',
  'Human Resources & Staffing',
  'Peter Quigley',
  '1946',
  '',
  ''),
 ('Matlen Silver',
  'https://www.indeed.com/cmp/The-Matlen-Silver-Group,-Inc.',
  '501 to 1,000',
  '$100M to $500M (USD)',
  'Information Technology',
  '',
  '1980',
  '',
  ''),
 ('Harnham',
  'https://www.indeed.com/cmp/Harnham',
  '51 to 200',
  'less than $1M (USD)',
  'Consulting and Business Services',
  '',
  '2006',
  '',
  ''),
 ('Wework',
  'https://www.indeed.com/cmp/Wework',
  '5,001 to 10,000',
  '$100M to $500M (USD)',
  'Management & Consulting',
  'Sandeep Mathrani',
  '2010',
  '',
  ''),
 ('ON Semiconductor',
  'https://www.indeed.com/cmp/On-Semiconductor',
  'more than 10,000',
  '$1B to $5B (USD)',
  'Electronics Manufacturing',
  '',
  '1999',
  '',
  ''),
 ('Etsy',
  'https://www.indeed.com/cmp/Etsy',
  '',
  '',
  'Retail & Wholesale',
  '',
  '2005',
  '',
  ''),
 ('General Atomics and

In [97]:
type(records[11])

tuple

In [98]:
company_response = requests.get('https://www.indeed.com/cmp/Booz-Allen-Hamilton')
#company_response = requests.get('https://www.indeed.com/cmp/Amazon.com')
company_soup = BeautifulSoup(company_response.text, 'html.parser')

try:
    Industry = company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()
except AttributeError:
    Industry = ''
try:
    Size = company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()
except AttributeError:
    Size = ''

try:
    CEO = company_soup.find('div',text='CEO').find_next_sibling('div').text.strip()
except AttributeError:
    CEO = ''
try:
    Revenue = company_soup.find('div',text='Revenue').find_next_sibling('div').text.strip()
except AttributeError:
    Revenue = ''
try:
    Founded = company_soup.find('div',text='Founded').find_next_sibling('div').text.strip()
except AttributeError:
    Founded = ''
try:  # not working right now 
    Detail = company_soup.find('a',text='Learn more').find_parent('div').find_previous_sibling('div').text.strip()
    #Detail = company_soup.find('div', 'css-64l8lu eu4oa1w0')
except AttributeError:
    Detail = ''
print(Industry, Size,CEO,Revenue, Founded, Detail)

Aerospace & Defense more than 10,000 Horacio D. Rozanski $5B to $10B (USD) 1914 


In [99]:
company_soup.find('a',text='Learn more').find_parent('div').find_previous_sibling('div').text.strip()
#company_soup.find('div', 'css-64l8lu eu4oa1w0')

#company_soup.find('p')

''

## Getting the next page


In [100]:
soup

<!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<script id="polyfill-script-bundle">/* Disable minification (remove `.min` from URL path) for more info */

(function(self, undefined) {function ArrayCreate(r){if(1/r==-Infinity&&(r=0),r>Math.pow(2,32)-1)throw new RangeError("Invalid array length");var n=[];return n.length=r,n}function Call(t,l){var n=arguments.length>2?arguments[2]:[];if(!1===IsCallable(t))throw new TypeError(Object.prototype.toString.call(t)+"is not a function.");return t.apply(l,n)}function Get(n,t){return n[t]}function HasOwnProperty(r,t){return Object.prototype.hasOwnProperty.call(r,t)}function HasProperty(n,r){return r in n}function IsArray(r){return"[object Array]"===Object.prototype.toString.call(r)}function IsCallable(n){return"function"==typeof n}function RequireObjectCoercible(e){if(null===e||e===undefined)throw TypeError();return e}function SameValueNonNumber(e,n){return e===n}function ToBo

In [101]:
records =[]
url = get_job_search_url('machine learning', 1)
companies = []
print( url )
while True:
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')   
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    print(len(cards))
 
       
    for card in cards:
        #To check if the card is already stored in the records
        if not card.get('data-jk') in [r[0] for r in records]:   
            record = get_record(card)
            records.append(record)

            if (len(record[5]) > 0) & (not record[5] in [c[1] for c in companies]):
                companies.append(get_company(record[5],record[2]))
                

    print(len(records))
    if len(cards)> 0:
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
            print(url)
            print(total_jobs)
            break
        time.sleep(1)
    else: break
        
print(len(records))

https://www.indeed.com/jobs?q=machine learning&fromage=1&limit=50&filter=0
https://www.indeed.com/jobs?q=machine learning&fromage=1&limit=50&filter=0
55
55
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=50
55
109
https://www.indeed.com/jobs?q=machine+learning&limit=50&fromage=1&filter=0&start=100
0
109
109


In [102]:
np.array([r[0] for r in records])


array(['4a6ed7208e2b3abb', '499d818f32be3c42', '57fd730c99c8acc8',
       '55aa50b25abd630f', '5f8f093976d42f51', 'd44ebfb52bfc20d1',
       '2c436db2b968f741', 'e63ee90359816f45', 'e5d79751d335b449',
       'c403f506cc3189b7', '7ed38abb1db28d6f', '6f198553443178f9',
       'a8e13e80ac54d26b', '19df15b71681cf11', 'bd0c8153a2731455',
       '91f9e7a781b8cddd', '6bba278967a98f0d', '0caa11692b7ff376',
       '820c02c66ce1241e', '9a8c77627366f082', '3bec99b2b0d174fa',
       '1102a87e448c8f5a', 'ca28a0d16a7e6dec', 'a2576aca27daaccd',
       '774f9fd9dd1d79f7', '5486716317382fa7', 'e7a7f028c3f5f93b',
       '323115b5f3095c3c', '964f049349273a10', '89ba4f34e6b54613',
       'd4f45d5fdd54c8ee', '12809fcde708ff0f', 'f367e3087f544179',
       '54968060f717bdb6', 'a02e8f18fe6e57eb', '2fdf2b29e5f9f77d',
       '2b166abd976c1d03', 'f6f9e2ac0ca30543', 'da876276d18366e2',
       '68b5966c03d60ea4', '902cdcd50812c532', 'c4240427c72e39fd',
       '5b95c4e82d12c4d9', '6cea023062fb9603', '6ac6a8a36999c1

In [103]:
len(companies)

67

In [104]:
#https://www.indeed.com/cmp/North-Greenville-University
if 'https://www.indeed.com/cmp/Salesforce' in companiesURL_list: print('yes')

yes


In [105]:
import pandas as pd
import numpy as np
companiesURL_pd = pd.read_csv('../data/raw/Companies.csv',usecols=[1])
companiesURL_list= companiesURL_pd['CompanyUrl'].tolist()
de_companies = [c for c in companies if c[1] not in companiesURL_list]
#companies_np = np.genfromtxt('../data/raw/Companies.csv',skiprows=1,usecols=[0],dtype=str)
companiesURL_list
companies

[('Kelly',
  'https://www.indeed.com/cmp/Kelly-Services',
  'more than 10,000',
  '$5B to $10B (USD)',
  'Human Resources & Staffing',
  'Peter Quigley',
  '1946',
  '',
  ''),
 ('Harnham',
  'https://www.indeed.com/cmp/Harnham',
  '51 to 200',
  'less than $1M (USD)',
  'Consulting and Business Services',
  '',
  '2006',
  '',
  ''),
 ('Citizens',
  'https://www.indeed.com/cmp/Citizens-2',
  'more than 10,000',
  '$5B to $10B (USD)',
  'Financial Services',
  'Bruce Van Saun',
  '1988',
  '',
  ''),
 ('CyberCoders',
  'https://www.indeed.com/cmp/Cybercoders',
  '201 to 500',
  '$500M to $1B (USD)',
  'Human Resources & Staffing',
  'Shane Lamb',
  '1999',
  '',
  ''),
 ('Verizon',
  'https://www.indeed.com/cmp/Verizon',
  'more than 10,000',
  'more than $10B (USD)',
  'Telecommunications',
  'Hans Vestberg',
  '2000',
  '',
  ''),
 ('Wework',
  'https://www.indeed.com/cmp/Wework',
  '5,001 to 10,000',
  '$100M to $500M (USD)',
  'Management & Consulting',
  'Sandeep Mathrani',
  '201

In [106]:
len(companiesURL_list)

7232

In [109]:
companiesURL_pd['CompanyUrl'].str.contains('https://www.indeed.com/cmp/Astellas-Pharmaceuticals').sum()
print(len(de_companies),len(companies), len(companiesURL_list))

5 67 7232


In [110]:
de_companies = [c for c in companies if c[1] not in companiesURL_list]

In [111]:
de_companies

[('Audible',
  'https://www.indeed.com/cmp/Audible',
  '1001 to 5,000',
  '$500M to $1B (USD)',
  'Media & Communication',
  'Don Katz',
  '1995',
  '',
  ''),
 ('Pala Casino Spa & Resort',
  'https://www.indeed.com/cmp/Pala-Casino-Spa-&-Resort',
  '',
  '',
  'Hotels & Travel Accommodation',
  '',
  '2000',
  '',
  'Founded2000IndustryHotels & Travel Accommodation'),
 ('HRmango',
  'https://www.indeed.com/cmp/Hrmango',
  '11 to 50',
  'less than $1M (USD)',
  'Staffing & Subcontracting',
  '',
  '',
  '',
  'Company size11 to 50Revenueless than $1M (USD)IndustryStaffing & Subcontracting'),
 ('Majorel',
  'https://www.indeed.com/cmp/Majorel',
  'more than 10,000',
  '',
  'Telecommunications',
  'Fara Haron',
  '',
  '',
  ''),
 ('Thatcher Technology Group',
  'https://www.indeed.com/cmp/Thatcher-Technology-Group',
  '11 to 50',
  '$1M to $5M (USD)',
  'Internet & Web Services',
  '',
  '1999',
  '',
  'Founded1999Company size11 to 50Revenue$1M to $5M (USD)IndustryInternet & Web Servic

In [112]:
import pandas as pd
import numpy as np
#de-dup the companies already exists in the companies file
companiesURL_pd = pd.read_csv('../data/raw/Companies.csv',usecols=[1])
companiesURL_list= companiesURL_pd['CompanyUrl'].tolist()
de_companies = [c for c in companies if c[1] not in companiesURL_list]

if (len(de_companies) > 0):
    fn = '../data/raw/Companies.csv'
            
    with open(fn, 'a', newline ='', encoding ='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(de_companies)

In [113]:
len(records)


109

## Putting it all together

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time 
import numpy as np
from random import randint
import pandas as pd

def get_job_search_url(position, postedDays):
    """Generate a url from position and posted days ago
    with 50 job posts per page and no filter"""

    template = 'https://www.indeed.com/jobs?q={}&fromage={}&limit=50&filter=0'
    url=template.format(position, postedDays)
    
    return url

def get_company(Url, name):
    company_response = requests.get(Url)
    company_soup = BeautifulSoup(company_response.text, 'html.parser')
    try:
        Industry = company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()
    except AttributeError:
        Industry = ''
    try:
        Size = company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()
    except AttributeError:
        Size = ''

    try:
        CEO = company_soup.find('div',text='CEO').find_next_sibling('div').text.strip()
    except AttributeError:
        CEO = ''
    try:
        Revenue = company_soup.find('div',text='Revenue').find_next_sibling('div').text.strip()
    except AttributeError:
        Revenue = ''
    try:
        Founded = company_soup.find('div',text='Founded').find_next_sibling('div').text.strip()
    except AttributeError:
        Founded = ''

    try:  # not working very well
        Detail = company_soup.find('a',text='Learn more').find_parent('div').find_previous_sibling('div').text.strip()
        #Detail = company_soup.find('div', 'css-64l8lu eu4oa1w0')
    except AttributeError:
        Detail = ''
        
    Link =''
    
    today = datetime.today().strftime('%Y-%m-%d')

    company= (name, Url, Size, Revenue, Industry, CEO, Founded, Link, Detail, today)

    return company

def get_record(card):
    """Extract individual job post data from a single record """
    # required variables
    job_id = card.get('data-jk')
    aTag = card.h2.a
    job_title = aTag.get('title')
    
    #job_url = 'https://www.indeed.com' + aTag.get('href')
    job_url = 'https://www.indeed.com/viewjob?jk=' + job_id
       
    try: 
        job_company = card.find('span','company').text.strip()
    except AttributeError:
        job_company =''
    
    try: 
        job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    except AttributeError:
        job_location = ''
        
    try: 
        job_summary = card.find('div', 'summary').text.strip()
    except AttributeError:
        job_summary =''
        
    try: 
        job_post_date = card.find('span','date').text.strip()
    except AttributeError:
        job_post_date =''
    
    try: 
        company_url = 'https://www.indeed.com{}'.format(card.find('span','company').a.get('href'))
    except AttributeError:
        company_url=''
        
    if len(company_url)>0:    
        company_response = requests.get(company_url)
        company_soup = BeautifulSoup(company_response.text, 'html.parser')
        try:
            company_industry = company_soup.find('div',text='Industry').find_next_sibling('div').text.strip()
        except AttributeError:
            company_industry = ''
        try:
            company_size = company_soup.find('div',text='Company size').find_next_sibling('div').text.strip()
        except AttributeError:
            company_size = ''
        time.sleep(randint(0,1))
    else:
        company_industry = ''
        company_size = ''
        
    try:
        company_rating = card.find('span','ratingsContent').text.strip()
    except AttributeError:
        company_rating = ''
    
    try:
        job_salary = card.find('span','salaryText').text.strip()
    except AttributeError:
        job_salary =''
    
    try:
        job_remote = card.find('span','remote').text.strip()
    except AttributeError:
        job_remote = ''
        
    # To determine if include detailed job decription 
    detailInclued = (job_remote =='Remote') | (job_location == 'Remote')
    
    if detailInclued:
        post_response = requests.get(job_url)
        post_soup = BeautifulSoup(post_response.text, 'html.parser')
        
        try:
            job_description = post_soup.find('div','jobsearch-jobDescriptionText').text.strip()
        except AttributeError:
            job_description = ''
        try:
            job_detail = post_soup.find('div','jobsearch-JobDescriptionSection').text.strip()    
        except AttributeError:
            job_detail = ''
               
        time.sleep(randint(1,3))
    else:
        job_description = ''   
        job_detail = ''
        
    today = datetime.today().strftime('%Y-%m-%d')
    
    record = (job_id, job_title,job_company,job_location,company_rating, company_url, company_industry, company_size, job_post_date,today,job_summary,job_salary, job_remote, job_url, job_detail, job_description)
   
    return record

def main(position, postedDay, fileName):
    '''Run the main program routine'''
    records = []
    companies = []
    url = get_job_search_url(position, postedDay)

    # extract the job data
    while True:        
        print(url)
       
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        try:
            if soup.title.text.find('Captcha') != -1: 
                print('blocked by Captcha!')
                break
        except AttributeError:
            print(soup.title)      

        cards = soup.find_all('div', 'jobsearch-SerpJobCard')        
        total_jobs = soup.find('div',{'id':'searchCountPages'}).text.strip()
        print(len(cards), total_jobs)        

        for card in cards:
            #To check if the card is already stored in the records
            if not card.get('data-jk') in [r[0] for r in records]:   
                record = get_record(card)
                records.append(record)
                
                #To add company info if the company url is not in the list
                if (len(record[5]) > 0) & (not record[5] in [c[1] for c in companies]):
                    companies.append(get_company(record[5], record[2]))

        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href') 
        except AttributeError:     
            print('done!')
            break              
        
            
   
    firstRow ='{0}: {1} {2}'.format(position, str(len(records)),total_jobs[total_jobs.index('of'):])
    print(firstRow)
    # save the job data
    if (len(records) > 0):
        fn = '../data/raw/{0}{1}.csv'.format(fileName,datetime.today().strftime('%Y_%m_%d'))            
        with open(fn, 'w', newline ='', encoding ='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(firstRow)
            writer.writerow(['JobID', 'JobTitle', 'Company','Location', 'CompanyRating', 'CompanyUrl', 'CompanyIndustry','CompanySize', 'PostDate', 'ExtractDate','Summary', 'Salary', 'Remote','JobUrl','JobDetail', 'JobDescription'])
            writer.writerows(records)
            
    # save company info to companies file after de-dup
    companiesURL_pd = pd.read_csv('../data/raw/Companies.csv',usecols=[1])
    companiesURL_list= companiesURL_pd['CompanyUrl'].tolist()
    de_companies = [c for c in companies if c[1] not in companiesURL_list]

    if (len(de_companies) > 0):
        fn = '../data/raw/Companies.csv'
        with open(fn, 'a', newline ='', encoding ='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(de_companies)

Notes:

1: Indeed will not return exactly amount of jobs shown on the pagination based on some internal filtering they run, unless you add query string &filter = 0 in the url.  With the filter, it will still only can get to 20*50 =1000 records 
https://www.indeed.com/jobs?q=machine+learning&fromage=1&filter=0&start=200
"We have removed 2,463 job postings very similar to those already shown. To see these additional results, you may repeat your search with the omitted job postings included." 

2: Every midnight, run program for 3 search criterion without job detailed description for 1 day posting without filter
Observesd: the maximum number of records Indeed allow to get is 50 * 20 = 1000

3: With job detailed description, need to added some sleep time to avoid being blocked by CAPTCHA
Also, need to come up some algorithm to random sample the detailed job description, considering only scrape it of remote jobs


4: sometimes may encounter connection broken from Request

5: Modified the search string by removing the ',' in order to narrow the result according to the Law of Diminishing Marginal Utility.    2/19/21



In [2]:
# To do list:
# 1: Add a scheduler 
# 2: Add algorithm for random sampling for detailed job description : 
# option1: include detailed description only for remote job:             done!
# option2: include detailed description only for several companies
# 3: Add a log writer
# 4: Add company industry and size:     done!
# 5: Create a seperate company dateset: update companies.csv company name information

# LOG:
#2/23/2021(T):'data science','data analytics data analyst','machine learning','artificial intelligence'. 
# 4 files with description on remote jobs.  Run at 8AM 
#2/24/2021(W):'"data science"','"data analyst"','"machine learning"','"artificial intelligence"'.
# 4 files with description on remote jobs.  Run at 8AM 
#2/25/2021(TH):same as before, run at 8Am, 2863 records
#2/27/2021(Sat.): same as before, run at 9am. Add companyUrl, companyIndustry, companySize, change file name convention .
#2/28/2021
#3/7/2021: add fuction to add company info to company file
#3/17/2021: found a error in the main, it may be the root cuase for wrong company info
#5/12/2021: add company name to the Companies.csv 
#5/13/2021: fixed a bug


In [3]:
# run the main program

#time.sleep(10000)
import schedule
def RunMain():
    main('"data science"',1 ,'ds_last1d_RC_') 

    main('"data analyst"',1 ,'da_last1d_RC_')     

    main('"machine learning"', 1 ,'ml_last1d_RC_')   

    main('"artificial intelligence"', 1, 'ai_last1d_RC_') 


In [6]:
RunMain()

https://www.indeed.com/jobs?q="data science"&fromage=1&limit=50&filter=0
55 Page 1 of 132 jobs
https://www.indeed.com/jobs?q=%22data+science%22&limit=50&fromage=1&filter=0&start=50
56 Page 2 of 139 jobs
https://www.indeed.com/jobs?q=%22data+science%22&limit=50&fromage=1&filter=0&start=100
44 Page 3 of 139 jobs
done!
"data science": 144 of 139 jobs
https://www.indeed.com/jobs?q="data analyst"&fromage=1&limit=50&filter=0
38 Page 1 of 32 jobs
done!
"data analyst": 38 of 32 jobs
https://www.indeed.com/jobs?q="machine learning"&fromage=1&limit=50&filter=0
54 Page 1 of 269 jobs
https://www.indeed.com/jobs?q=%22machine+learning%22&limit=50&fromage=1&filter=0&start=50
0 Page 2 of 267 jobs
https://www.indeed.com/jobs?q=%22machine+learning%22&limit=50&fromage=1&filter=0&start=100
0 Page 3 of 267 jobs
https://www.indeed.com/jobs?q=%22machine+learning%22&limit=50&fromage=1&filter=0&start=150
0 Page 4 of 267 jobs
https://www.indeed.com/jobs?q=%22machine+learning%22&limit=50&fromage=1&filter=0&start

In [6]:
schedule.every().day.at('07:30').do(RunMain)

while True:
    
    schedule.run_pending()
    time.sleep(1)

KeyboardInterrupt: 