### Indeed

In [None]:
import csv
import datetime
import requests
from bs4 import BeautifulSoup


In [None]:
# Search-URL pattern for the ca.indeed.com job search. The two `{}` placeholders
# are the `q` and `l` query parameters; get_url() below fills them with the
# position and the location respectively.
template = 'https://ca.indeed.com/jobs?q={}&l={}'

In [None]:
def get_url(position,location):
    '''Generate a search url from position and location.

    Both values are URL-encoded before being placed in the query string, so
    pass raw, human-readable text (e.g. 'data scientist'), not text that has
    already been encoded.

    Args:
        position: job title / keywords to search for (the `q` parameter).
        location: place to search in (the `l` parameter).

    Returns:
        The ca.indeed.com search URL as a string.
    '''
    from urllib.parse import quote_plus

    template = 'https://ca.indeed.com/jobs?q={}&l={}'
    # Without encoding, characters such as '&', '+' or '#' in a search term
    # would be read as URL syntax and silently change the query.
    return template.format(quote_plus(str(position)), quote_plus(str(location)))

In [None]:
# Build the search URL for one sample query. The bare name on the last line
# echoes the value when this is run as a notebook cell.
url = get_url('data scientist','vancouver bc')
url

### Extract raw html

In [None]:
# Download the search results page. `reason` is the textual HTTP status of
# the response (e.g. "OK"), shown here as a quick check that the request worked.
response = requests.get(url)
response.reason

In [None]:
# Parse the downloaded HTML with the 'html.parser' backend so it can be searched.
soup = BeautifulSoup(response.text,'html.parser')
# soup  (uncomment to display the parsed document)

In [None]:
# Collect every <div> with the CSS class 'jobsearch-SerpJobCard' (presumably one
# per job listing on the page) and show how many were found.
cards= soup.find_all('div','jobsearch-SerpJobCard')
len(cards)

### Prototype the model with a single record

In [None]:
# Take a single card to prototype the field extraction on. Index 3 raises
# IndexError if the page yielded fewer than four cards.
card = cards[3]
# card  (uncomment to display the card's markup)

In [None]:
# The first <a> inside the card's first <h2>. Its `title` attribute is what the
# following cells store as the job title; indexing with [] raises KeyError if
# the attribute is missing.
atag = card.h2.a
atag['title']

In [None]:
# Same attribute read via .get(), which returns None instead of raising when
# the attribute is absent.
job_title=atag.get('title')
job_title

In [None]:
# Prefix the link's href with the site origin to form an absolute URL
# (assumes the href is a site-relative path starting with '/').
job_url = 'https://ca.indeed.com'+atag.get('href')

In [None]:
# Text of the first <span class="company">, with surrounding whitespace removed.
# Raises AttributeError if the card has no such element (find() returns None).
company =card.find('span','company').text.strip()
company

In [None]:
# The location is read from the 'data-rc-loc' attribute of the
# <div class="recJobLoc"> element rather than from its text.
job_location =card.find('div','recJobLoc').get('data-rc-loc')

In [None]:
# Text of the first <div class="summary">, with surrounding whitespace removed.
job_summary =card.find('div','summary').text.strip()

In [None]:
# Text of the first <span class="date">, kept exactly as it appears on the page
# (not stripped and not parsed into a date).
post_date =card.find('span','date').text

In [None]:
# Current local date as a YYYY-MM-DD string, used as the extraction date.
today = datetime.datetime.today().strftime('%Y-%m-%d')
today

In [None]:
# Not every card has a <span class="salaryText">; fall back to an empty
# string when the element is missing.
salary_tag = card.find('span','salaryText')
job_salary = '' if salary_tag is None else salary_tag.text.strip()
job_salary

### Generalize the model with a function


In [None]:
def get_record(card):
    '''Extract job data from a single record.

    Args:
        card: one parsed job-card element, as produced by
            soup.find_all('div','jobsearch-SerpJobCard').

    Returns:
        Tuple of (job_title, company, job_location, post_date, today,
        job_summary, job_salary, job_url), where `today` is the extraction
        date as YYYY-MM-DD. Any field whose element or attribute is missing
        from the card is returned as '' so that one incomplete card cannot
        abort a whole scraping run.
    '''
    def text_of(tag, css_class):
        '''Text of the first matching child element, or '' if there is none.'''
        node = card.find(tag, css_class)
        return '' if node is None else node.text

    # The title link lives at <h2><a ...>; either level may be absent.
    heading = card.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        job_title = ''
        job_url = ''
    else:
        job_title = atag.get('title', '')
        href = atag.get('href')
        job_url = ('https://ca.indeed.com' + href) if href else ''

    company = text_of('span', 'company').strip()

    # The location is stored in an attribute, not in the element text.
    location_node = card.find('div', 'recJobLoc')
    if location_node is None:
        job_location = ''
    else:
        job_location = location_node.get('data-rc-loc', '')

    job_summary = text_of('div', 'summary').strip()
    post_date = text_of('span', 'date')  # kept verbatim, as shown on the page
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    job_salary = text_of('span', 'salaryText').strip()

    record = (job_title,company,job_location,post_date,today,job_summary,job_salary,job_url)
    return record

In [None]:
# One extracted record per job card found on the first results page.
records = [get_record(job_card) for job_card in cards]

In [None]:
# Inspect the first extracted record (raises IndexError if no cards were found).
records[0]

### Getting the next page

In [None]:
# Follow the "Next" pagination link page by page, adding every page's records
# to `records`, until a page has no such link.
while True:
    next_link = soup.find('a',{'aria-label':'Next'})
    if next_link is None:
        # No "Next" link: this was the last page of results.
        break

    # The href is site-relative, so it has to be joined with the same
    # ca.indeed.com host the search started on; any other host would return
    # results from a different Indeed site.
    url = 'https://ca.indeed.com'+next_link.get('href')

    response = requests.get(url)
    soup = BeautifulSoup(response.text,'html.parser')
    cards= soup.find_all('div','jobsearch-SerpJobCard')

    for card in cards:
        record=get_record(card)
        records.append(record)
        

In [None]:
# records

### Putting it all together

In [118]:
import csv
import datetime
import requests
from bs4 import BeautifulSoup

def get_url(position,location_city,location_state):
    '''Generate a search url from position and location.

    The location is sent as "<city>, <state>". All values are URL-encoded
    here, so pass raw, human-readable text (e.g. 'data scientist'), not text
    that has already been encoded.

    Args:
        position: job title / keywords to search for (the `q` parameter).
        location_city: city part of the location (the `l` parameter).
        location_state: province/state part of the location.

    Returns:
        The ca.indeed.com search URL as a string, e.g.
        'https://ca.indeed.com/jobs?q=data+scientist&l=vancouver%2C+BC'.
    '''
    from urllib.parse import urlencode

    # urlencode escapes spaces as '+' and the comma as '%2C', and also
    # protects characters such as '&' or '#' that would otherwise be read as
    # URL syntax.
    query = urlencode([
        ('q', position),
        ('l', '{}, {}'.format(location_city, location_state)),
    ])
    return 'https://ca.indeed.com/jobs?' + query

def _find_text(card, tag, css_class, strip=True):
    '''Return the text of the first matching element, or '' when it is absent.'''
    node = card.find(tag, css_class)
    if node is None:
        return ''
    return node.text.strip() if strip else node.text


def get_record(card):
    '''Extract job data from a single record.

    Args:
        card: one parsed job-card element, as produced by
            soup.find_all('div','jobsearch-SerpJobCard').

    Returns:
        Tuple of (job_title, company, job_location, post_date, today,
        job_summary, job_salary, job_url), where `today` is the extraction
        date as YYYY-MM-DD. Any field whose element or attribute is missing
        from the card is returned as '' so that one incomplete card cannot
        abort a whole scraping run.
    '''
    # The title link lives at <h2><a ...>; either level may be absent.
    heading = card.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        job_title = ''
        job_url = ''
    else:
        job_title = atag.get('title', '')
        href = atag.get('href')
        job_url = ('https://ca.indeed.com' + href) if href else ''

    # The location is stored in an attribute, not in the element text.
    location_node = card.find('div', 'recJobLoc')
    if location_node is None:
        job_location = ''
    else:
        job_location = location_node.get('data-rc-loc', '')

    company = _find_text(card, 'span', 'company')
    job_summary = _find_text(card, 'div', 'summary')
    # The posting date is kept verbatim, as shown on the page.
    post_date = _find_text(card, 'span', 'date', strip=False)
    job_salary = _find_text(card, 'span', 'salaryText')
    today = datetime.datetime.today().strftime('%Y-%m-%d')

    record = (job_title,company,job_location,post_date,today,job_summary,job_salary,job_url)
    return record

def main(position,location_city,location_state):
    '''Run the main program routine.

    Searches ca.indeed.com for the given position and location, follows the
    "Next" pagination link until there are no more result pages, and writes
    one row per job card to 'results_indeed.csv' in the current directory
    (overwriting any existing file).

    Args:
        position: job title / keywords to search for.
        location_city: city to search in.
        location_state: province/state of that city.

    Returns:
        None. Progress (page counter and next URL) is printed as a side
        effect; records collected so far are still written if a request
        comes back with an HTTP error status.
    '''
    records = []
    url = get_url(position,location_city,location_state)
    page = 0
    while True:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # An error page contains no job cards and no "Next" link, so the
            # run would otherwise end silently with partial or empty output.
            print('Request failed ({} {}): {}'.format(
                response.status_code, response.reason, url))
            break

        soup = BeautifulSoup(response.text,'html.parser')
        cards = soup.find_all('div','jobsearch-SerpJobCard')
        records.extend(get_record(card) for card in cards)

        next_link = soup.find('a',{'aria-label':'Next'})
        if next_link is None:
            # No "Next" link: this was the last page of results.
            break
        url = 'https://ca.indeed.com'+next_link.get('href')

        print(page, url)
        page += 1

    # newline='' lets the csv module control line endings itself.
    with open('results_indeed.csv','w',newline='',encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle','Company','Location','PostDate','ExtractDate','Summary','Salary','JobURL'])
        writer.writerows(records)

In [119]:
# Run the full scrape for one sample search; results are written to
# 'results_indeed.csv' and one progress line is printed per page followed.
main('data scientist','vancouver','BC')

0 https://ca.indeed.com/jobs?q=data+scientist&l=vancouver%2C+BC&start=10
1 https://ca.indeed.com/jobs?q=data+scientist&l=vancouver%2C+BC&start=20
2 https://ca.indeed.com/jobs?q=data+scientist&l=vancouver%2C+BC&start=30
3 https://ca.indeed.com/jobs?q=data+scientist&l=vancouver%2C+BC&start=40
4 https://ca.indeed.com/jobs?q=data+scientist&l=vancouver%2C+BC&start=50
