In [1]:
# We will be looking at Software Developement Jobs as a practice for Beautiful Soup: a tutorial follow-a-long
# https://realpython.com/beautiful-soup-web-scraper-python/

import requests


In [2]:
# Assign the link to URL
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'

# Pull a request from the URL to grab the data and save as a Python object
page = requests.get(URL)

In [3]:
print(page)
# Shows that the request was successful

<Response [200]>


In [4]:
# Inspecting the HTML in the Developer's Tool-- the HTML can look a bit messy
# To help you make sense of it--- use a HTML Formater to clean up the data, such as https://webformatter.com/html
# Beautiful soup can only be used to parse STATIC HTML requests. Dynamic websites will generate a Javascript code, in which case you would need another program to parse the data, something like Selenium 


In [11]:
from bs4 import BeautifulSoup
#import in BeautifulSoup


In [12]:
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)

In [13]:
soup = BeautifulSoup(page.content, 'html.parser')

In [14]:
# Search for and find the container that shows all of the job results
results = soup.find(id = 'ResultsContainer')

In [15]:
#BeautifulSoup object can be prettify, which would print out all the HTML contained within the <div>
print(results.prettify)

<bound method Tag.prettify of <div class="mux-custom-scroll" data-extend="left" data-mux="customScroll" data-target="html" id="ResultsContainer">
<div class="scrollable" id="ResultsScrollable">
<script type="application/ld+json">
            {"@context":"https://schema.org","@type":"ItemList","mainEntityOfPage":{
            "@type":"CollectionPage","@id":"https://www.monster.com/jobs/search/?q=Software-Developer&amp;where=Australia"
            }
            ,"itemListElement":[

                 {"@type":"ListItem","position":1,"url":"https://job-openings.monster.com/lead-performance-engineer-software-systems-plantation-fl-sunnyvale-ca-culver-new-york-city-ca-seattle-wa-austin-tx-toronto-ny-us-magic-leap-inc/e5aff8c3-a1ff-4606-a1bc-793e8a588bc0"}
                    ,
                 {"@type":"ListItem","position":2,"url":""}
                    ,
                 {"@type":"ListItem","position":3,"url":"https://job-openings.monster.com/senior-lead-software-engineer-browser

In [16]:
# Going through the HTML, you see that the job postings are wrapped in a <section> element with class = 'card-content'
# Let's just select the job posting using this info

job_elems = results.find_all('section', class_='card-content')

# Using the find_all() returns an iterable with all the HTML for the all the job listins on that page

In [17]:
# Let's interate through the list, and print out the job_elem

for job_elem in job_elems:
    print(job_elem, end = '\n'*2)

<section class="card-content" data-jobid="e5aff8c3-a1ff-4606-a1bc-793e8a588bc0" onclick="MKImpressionTrackingMouseDownHijack(this, event)">
<div class="flex-row">
<div class="mux-company-logo thumbnail"></div>
<div class="summary">
<header class="card-header">
<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="" data-m_impr_j_jawsid="434161395" data-m_impr_j_jobid="2201018" data-m_impr_j_jpm="2" data-m_impr_j_jpt="3" data-m_impr_j_lat="0" data-m_impr_j_lid="0" data-m_impr_j_long="0" data-m_impr_j_occid="11969" data-m_impr_j_p="1" data-m_impr_j_postingid="e5aff8c3-a1ff-4606-a1bc-793e8a588bc0" data-m_impr_j_pvc="ec3a6188-6a80-441a-814d-a9e2c9b76318" data-m_impr_s_t="t" data-m_impr_uuid="e784fb42-1888-4f6e-9b54-9a563b4bd796" href="https://job-openings.monster.com/lead-performance-engineer-software-systems-plantation-fl-sunnyvale-ca-culver-new-york-city-ca-seattle-wa-austin-tx-toronto-ny-us-magic-leap-inc/e5aff8c3-a1ff-460

In [18]:
# This is still too much HTML to look through, so let's simplify with with a for loop to pull out only the info we want

In [20]:
# use the same method as you did before for the <section> element 
for job_elem in job_elems: 
    title_elem = job_elem.find('h2', class_ = 'title')
    company_elem = job_elem.find('div', class_ = 'company')
    location_elem = job_elem.find('div', class_ = 'location')
    print(title_elem)
    print(company_elem)
    print(location_elem)
    print()

<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="" data-m_impr_j_jawsid="434161395" data-m_impr_j_jobid="2201018" data-m_impr_j_jpm="2" data-m_impr_j_jpt="3" data-m_impr_j_lat="0" data-m_impr_j_lid="0" data-m_impr_j_long="0" data-m_impr_j_occid="11969" data-m_impr_j_p="1" data-m_impr_j_postingid="e5aff8c3-a1ff-4606-a1bc-793e8a588bc0" data-m_impr_j_pvc="ec3a6188-6a80-441a-814d-a9e2c9b76318" data-m_impr_s_t="t" data-m_impr_uuid="e784fb42-1888-4f6e-9b54-9a563b4bd796" href="https://job-openings.monster.com/lead-performance-engineer-software-systems-plantation-fl-sunnyvale-ca-culver-new-york-city-ca-seattle-wa-austin-tx-toronto-ny-us-magic-leap-inc/e5aff8c3-a1ff-4606-a1bc-793e8a588bc0" onclick="clickJobTitle('plid=0&amp;pcid=660&amp;poccid=11969','Software Developer',''); clickJobTitleSiteCat('{&quot;events.event48&quot;:&quot;true&quot;,&quot;eVar25&quot;:&quot;Lead Performance Engineer, Software Systems&quot;,&quot;eVar

In [21]:
# This is still WAY too much information!! (Take a 5 minute break)

In [23]:
# Good news! In BeautifulSoup, you can bypass all this HTML mumble jumble, by adding 'text' to the print()

for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    print(title_elem.text)
    print(company_elem.text)
    print(location_elem.text)
    print()

Lead Performance Engineer, Software Systems


Magic Leap, Inc.





Plantation, FL; Sunnyvale, CA; Culver New York City, CA; Seattle, WA; Austin, TX; Toronto, NY





AttributeError: 'NoneType' object has no attribute 'text'

In [26]:
# Oh no! there is a NoneType error that is being thrown. This is because the data is not uniformed, so in order to bypass this, modify the script to now read: 

for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text)
    print(company_elem.text)
    print(location_elem.text)
    print()

Lead Performance Engineer, Software Systems
Magic Leap, Inc.
Plantation, FL; Sunnyvale, CA; Culver New York City, CA; Seattle, WA; Austin, TX; Toronto, NY

Senior/Lead Software Engineer, Browser
Magic Leap, Inc.
Sunnyvale, CA; Plantation, FL (HQ); Austin, TX; Culver New York City, CA; Seattle, WA; Toronto, NY

Service Consultant REST
TAL
Sydney, NSW

Solution Delivery Manager
Zuora
Sydney, NSW

Customer Experience Technical Analyst - Sydney, New South Wales
Mediaocean
Sydney, NSW

Runner
Vacasa
The Blue Mountains, ON

Associate Strategist
Khoros, LLC
Sydney, FL

Strategist
Khoros, LLC
Sydney, FL

Scrum Product Owner (Sr Advanced Project Eng)
Honeywell
Houston, TX



In [27]:
# Wow! Now we have a list of the job titles, the commpany, and the location, however, there is a lot of whitespace, so let's .strip() it out! 

for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
    print()


Lead Performance Engineer, Software Systems
Magic Leap, Inc.
Plantation, FL; Sunnyvale, CA; Culver New York City, CA; Seattle, WA; Austin, TX; Toronto, NY

Senior/Lead Software Engineer, Browser
Magic Leap, Inc.
Sunnyvale, CA; Plantation, FL (HQ); Austin, TX; Culver New York City, CA; Seattle, WA; Toronto, NY

Service Consultant REST
TAL
Sydney, NSW

Solution Delivery Manager
Zuora
Sydney, NSW

Customer Experience Technical Analyst - Sydney, New South Wales
Mediaocean
Sydney, NSW

Runner
Vacasa
The Blue Mountains, ON

Associate Strategist
Khoros, LLC
Sydney, FL

Strategist
Khoros, LLC
Sydney, FL

Scrum Product Owner (Sr Advanced Project Eng)
Honeywell
Houston, TX



In [29]:
# Let's filter it to be a bit ore specific, and look only for Python Developer in the h2 element, and let's print the results

python_jobs = results.find_all('h2', string='Python Developer')
print(python_jobs)

[]


In [30]:
# What is going on?! The string argument looks for the SPECIFIC str that we wrote and is case sensitive, so we need to expand our search a bit more
# So let's pass a fucntion as an arument in BS to help us look for Python jobs 
# This will look for ANY instance where 'python' is listed in the job title

python_jobs = results.find_all('h2',
                               string=lambda text: 'python' in text.lower())
print(python_jobs)

[]


In [31]:
print(len(python_jobs))

0


In [32]:
# Welp, it doesn't look like we will be working this year as a Python developer...But don't worry, we can change you search to see if there are other matches. 

python_jobs = results.find_all('h2',
                               string=lambda text: 'developer' in text.lower())
print(python_jobs)

[]


In [33]:
print(len(python_jobs))


0


In [34]:
# We aren't having much luck finding a job. But in case we do find a job, it might be worthwhile to grab the link to apply for it! 

python_jobs = results.find_all('h2',
                               string=lambda text: "software" in text.lower())

for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply here: {link}\n")

Lead Performance Engineer, Software Systems
Apply here: https://job-openings.monster.com/lead-performance-engineer-software-systems-plantation-fl-sunnyvale-ca-culver-new-york-city-ca-seattle-wa-austin-tx-toronto-ny-us-magic-leap-inc/e5aff8c3-a1ff-4606-a1bc-793e8a588bc0

Senior/Lead Software Engineer, Browser
Apply here: https://job-openings.monster.com/senior-lead-software-engineer-browser-sunnyvale-ca-plantation-fl-hq-austin-tx-culver-new-york-city-ca-seattle-wa-toronto-ny-us-magic-leap-inc/36b509cf-114c-48aa-aede-e6574b6cbff5



In [35]:
# We got a hit! Great, now go ahead and apply. It doesn't matter if you're qualify, you probably aren't, so there really isn't any harm! 
# This is the end of this tutorial! I hope you liked it! 

In [None]:
# Documentation for BS: https://www.crummy.com/software/BeautifulSoup/bs4/doc/



