In [1]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


In [2]:
def get_url(location):
    """Build the au.indeed.com job-search URL for a location.

    Args:
        location: Free-text place name, e.g. ``'Australia'`` or
            ``'New South Wales'``.

    Returns:
        The search URL with an empty job query (``q=``) and the location
        URL-encoded into the ``l`` parameter.
    """
    # Imported locally so this definition does not depend on the import cell.
    from urllib.parse import quote_plus

    template = 'https://au.indeed.com/jobs?q=&l={}'
    # quote_plus still turns spaces into '+', and additionally escapes
    # characters such as '&', '#' or ',' that would otherwise corrupt the
    # query string.
    return template.format(quote_plus(location))

In [3]:
# Build the search URL for all of Australia and print it as a sanity check.
url = get_url('Australia')
print(url)

https://au.indeed.com/jobs?q=&l=Australia


In [4]:
# Fetch the first results page. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() surfaces an
# HTTP error here instead of as an empty card list further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [5]:
# Parse the response body into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
# Collect every <div> with CSS class 'jobsearch-SerpJobCard'
# (presumably one per job listing on the page).
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [7]:
# Take the first card as a sample for prototyping the field extraction.
# Raises IndexError if the page yielded no cards.
card = cards[0]

In [8]:

# Job title: the 'title' attribute of the link inside the card's <h2>.
job_title = card.h2.a.get('title')

In [9]:
# Company name: text of <span class="company">, trimmed of surrounding whitespace.
company = card.find('span', 'company').text.strip()

In [10]:
# Location: the 'data-rc-loc' attribute of <div class="recJobLoc">.
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')

In [11]:
# Posting age exactly as shown on the card (raw text of <span class="date">).
post_date = card.find('span', 'date').text
# Date the extraction was run, formatted as YYYY-MM-DD.
today = datetime.today().strftime('%Y-%m-%d')

In [12]:
# Summary text, trimmed, with embedded newlines flattened to spaces.
summary = card.find('div', 'summary').text.strip().replace('\n', ' ')

In [13]:
# Not every card carries a salary, so fall back to an empty string when the
# <span class="salaryText"> tag is absent.
salary_tag = card.find('span', 'salaryText')
salary = salary_tag.text.strip() if salary_tag else ''

In [14]:
# The card link's href is site-relative, so prefix the au.indeed.com host to get a full URL.
job_url = 'https://au.indeed.com' + card.h2.a.get('href')

In [15]:
# Bundle the extracted fields for the sample card into a single tuple.
record = (job_title, company, job_location, post_date, today, summary, salary, job_url)

In [16]:
# Display the sample record to check the extracted values.
record

('Casual Customer Service Officer',
 'Queensland Government',
 'Brisbane QLD',
 '4 days ago',
 '2020-09-29',
 'Support other customer service officers and trainees in a wide range of systems, products and services relevant to a customer service centre.',
 '',
 'https://au.indeed.com/rc/clk?jk=1828eb0ad33ed47d&fccid=033d5c6dd090c8a4&vjs=3')

In [17]:
def get_record(card):
    """Extract the job data from a single search-result card.

    Args:
        card: BeautifulSoup tag for one ``jobsearch-SerpJobCard`` div.

    Returns:
        Tuple ``(job_title, company, job_location, post_date, today,
        summary, salary, job_url)``. ``today`` is the extraction date as
        ``YYYY-MM-DD``; ``salary`` is ``''`` when the card shows none.
    """
    link = card.h2.a
    job_title = link.get('title')
    # The href is site-relative, so it must be resolved against the site
    # that was scraped (au.indeed.com), not www.indeed.com.
    job_url = 'https://au.indeed.com' + link.get('href')

    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')

    # Salary is not present on every card.
    salary_tag = card.find('span', 'salaryText')
    salary = salary_tag.text.strip() if salary_tag else ''

    return (job_title, company, job_location, post_date, today, summary, salary, job_url)

In [18]:
# Turn every card on the first results page into a record tuple.
records = [get_record(card) for card in cards]

In [19]:
# Follow the "Next" pagination link until a page no longer has one.
while True:
    next_link = soup.find('a', {'aria-label': 'Next'})
    if next_link is None:
        break
    # The href is site-relative, so it has to be resolved against the site
    # being scraped (au.indeed.com) rather than www.indeed.com.
    url = 'https://au.indeed.com' + next_link.get('href')

    # The timeout keeps a stalled connection from hanging the loop forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')

    for card in cards:
        record = get_record(card)
        records.append(record)

In [1]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(location):
    """Build the au.indeed.com job-search URL for a location.

    Args:
        location: Free-text place name, e.g. ``'Australia'`` or
            ``'New South Wales'``.

    Returns:
        The search URL with an empty job query (``q=``) and the location
        URL-encoded into the ``l`` parameter.
    """
    # Imported locally so this definition does not depend on the import block.
    from urllib.parse import quote_plus

    template = 'https://au.indeed.com/jobs?q=&l={}'
    # quote_plus still turns spaces into '+', and additionally escapes
    # characters such as '&', '#' or ',' that would otherwise corrupt the
    # query string.
    return template.format(quote_plus(location))

def _tag_text(tag):
    """Return the stripped text of ``tag``, or '' when the tag is missing."""
    return tag.text.strip() if tag is not None else ''


def get_record(card):
    """Extract the job data from a single search-result card.

    A field whose tag is missing from the card becomes ``''`` instead of
    raising ``AttributeError``, so one incomplete card cannot abort a
    multi-page scrape.

    Args:
        card: BeautifulSoup tag for one ``jobsearch-SerpJobCard`` div.

    Returns:
        Tuple ``(job_title, company, job_location, post_date, today,
        summary, salary, job_url, review)``. ``today`` is the extraction
        date as ``YYYY-MM-DD``. ``review`` is the raw ``ratingsContent``
        tag (or ``None``), not its text; the notebook's later clean-up
        cells split the serialized markup to recover the rating.
    """
    heading = card.h2
    link = heading.a if heading is not None else None
    job_title = link.get('title') if link is not None else ''
    href = link.get('href') if link is not None else None
    # The href is site-relative; resolve it against the scraped site.
    job_url = 'https://au.indeed.com' + href if href else ''

    company = _tag_text(card.find('span', 'company'))
    location_tag = card.find('div', 'recJobLoc')
    job_location = location_tag.get('data-rc-loc') if location_tag is not None else ''
    date_tag = card.find('span', 'date')
    # The posting age is kept exactly as displayed (not stripped).
    post_date = date_tag.text if date_tag is not None else ''
    today = datetime.today().strftime('%Y-%m-%d')
    summary = _tag_text(card.find('div', 'summary')).replace('\n', ' ')

    # Salary and rating are not present on every card.
    salary = _tag_text(card.find('span', 'salaryText'))
    review = card.find('span', 'ratingsContent')

    return (job_title, company, job_location, post_date, today, summary, salary, job_url, review)


def main(location, output_path='result.csv', timeout=30):
    """Scrape every results page for ``location`` and save the jobs to CSV.

    Pagination stops at the first page without a "Next" link or at the
    first failed request; whatever was collected up to that point is still
    written out.

    Args:
        location: Place name passed to ``get_url``.
        output_path: Destination CSV file; defaults to ``'result.csv'``.
        timeout: Seconds to wait for each HTTP response.
    """
    records = []
    url = get_url(location)

    # extract the job data
    while url:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Keep the pages already scraped rather than losing them all.
            print('Stopping at {}: {}'.format(url, exc))
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        for card in soup.find_all('div', 'jobsearch-SerpJobCard'):
            records.append(get_record(card))

        # The Next link's href is site-relative; no link (or no href) means
        # this was the last page.
        next_link = soup.find('a', {'aria-label': 'Next'})
        href = next_link.get('href') if next_link is not None else None
        url = 'https://au.indeed.com' + href if href else None

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl','Review'])
        writer.writerows(records)

In [None]:
# Run the full scrape for Australia; main() writes its results to result.csv.
main('Australia')

In [None]:
import pandas as pd

In [None]:
# Load the scraped jobs back from the CSV file written by main().
df1 = pd.read_csv('result.csv')

In [None]:
# Preview the first five rows (head's default).
df1.head()

In [None]:
# Show column dtypes and non-null counts to see which fields have gaps.
df1.info()

In [26]:
# Fill gaps in Review with the value from the preceding row.
# NOTE(review): this copies the previous job's rating onto rows that had
# none of their own; confirm that is intended rather than leaving them empty.
df1['Review'] = df1['Review'].ffill()

In [27]:
# Re-check the non-null counts after the forward fill.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 9 columns):
JobTitle       15 non-null object
Company        15 non-null object
Location       15 non-null object
PostDate       15 non-null object
ExtractDate    15 non-null object
Summary        15 non-null object
Salary         2 non-null object
JobUrl         15 non-null object
Review         15 non-null object
dtypes: object(9)
memory usage: 1.2+ KB


In [28]:
# Keep the second line of the stored Review text. The vectorised .str
# accessor performs the same split as a per-row lambda, but passes missing
# values (and rows with no second line) through as NaN instead of raising.
df1['Review'] = df1['Review'].str.split('\n').str[1]

In [29]:
# Drop everything from the first '<svg' onward, leaving just the rating
# text. The .str accessor lets missing values pass through as NaN instead
# of raising AttributeError the way a per-row x.split() would.
df1['Review'] = df1['Review'].str.split('<svg').str[0]

In [30]:
# Inspect the cleaned ratings; the values are still strings, hence dtype object.
df1['Review']

0     4.0
1     3.4
2     3.8
3     3.8
4     3.6
5     4.0
6     3.8
7     4.2
8     4.0
9     3.8
10    3.5
11    3.5
12    4.0
13    4.0
14    4.0
Name: Review, dtype: object