SIMPLE WEB SCRAPER

In [5]:
# Import necessary libraries
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd
from datetime import date

In [2]:
# This commented code outlines the basic steps for web scraping as general reference:
# create a new instance of Chrome
#chrome = Chrome()

# navigate to the website
#chrome.get('https://mx.indeed.com/jobs?q=data+science&l=Remote+-+Mexico&start=0&pp=gQAPAAAAAAAAAAAAAAACBYaYUwAZAQEBBwdtIXZKBqGEO0bEFksnjLeua_78ewAA&vjk=94621b0d50661801')

#time.sleep(10)
# close the browser
#chrome.close()

In [3]:
# Defining a function "extract" which takes an argument "page" (a number)
def extract(page):
    #'url' is a string that changes depending on the page number
    url = f'https://mx.indeed.com/jobs?q=data+science&l=Remote+-+Mexico&start={page}&pp=gQAPAAAAAAAAAAAAAAACBYaYUwAZAQEBBwdtIXZKBqGEO0bEFksnjLeua_78ewAA&vjk=94621b0d50661801'

    options = uc.ChromeOptions()
    driver = uc.Chrome(options=options)
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    #The browser is quit after extraction to free up resources
    driver.quit()
    return soup

In [34]:
# Defining a function "transform" to extract specific job-related information from the parsed HTML data
def transform(soup):
    divs = soup.find_all('div', class_ = 'slider_item')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_ = 'companyName').text.strip()
        date = item.find('span', class_ = 'date').text.strip()
        link_tag = item.find('a', class_ = 'jcs-JobTitle')
        job_url = 'https://mx.indeed.com/viewjob?' + link_tag['href'].strip('/rc/clk?')
        try:
            salary = item.find('div', class_ = 'metadata salary-snippet-container').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_ = 'job-snippet').text.strip().replace('\n', '')
    
        job = {
            'title': title,
            'company': company,
            'date' : date,
            'job_url': job_url,
            'salary': salary,
            'summary': summary
        }
        job_list.append(job)
    return

In [35]:
# Initializing an empty list "job_list" to hold all job dictionaries
job_list = []

for i in range(10,40,10): # This is for demonstration purposes so we iterate over a limited range of numbers
    print(f'Getting page, {i}')
    c = extract(i)
    transform(c)
    print(f'Waiting some seconds...')
    #The waiting time simulates human interaction and helps avoid being detected as a bot by the website
    time.sleep(np.random.randint(1,6) )
print('Done')

Getting page, 10
Waiting some seconds...
Getting page, 20
Waiting some seconds...
Getting page, 30
Waiting some seconds...
Done


In [36]:
# Converting the list of job dictionaries ("job_list") into a Pandas DataFrame "df"
df = pd.DataFrame(job_list)
print(f'Jobs extracted: {len(df)}')
#Printing the number of duplicated jobs in the DataFrame (if any)
print(f'Jobs duplicated: {df.duplicated().sum()}')
df.head()

Jobs extracted: 45
Jobs duplicated: 0


Unnamed: 0,title,company,date,job_url,salary,summary
0,BI Engineer (8 weeks project),EQUALS TRUE,PostedRecién publicado,https://mx.indeed.com/viewjob?jk=c84bfed12eb6d...,,"We believe that for Equality, Diversity, and I..."
1,Senior Data Engineer,CODIGOMX,PostedHoy,https://mx.indeed.com/viewjob?ompany/CodigoMX/...,"$48,000 a $55,000 por mes","Required Technologies AGILE(Scrum), .Git and G..."
2,Data Engineer,"Perficient, Inc",PostedPublicado hace más de 30 días,https://mx.indeed.com/viewjob?jk=bf21652f6ab6e...,,Are you theOracle Cloud ERP Applications Analy...
3,Junior FullStack Developer,Luxoft,PostedPublicado hace 22 días,https://mx.indeed.com/viewjob?jk=1ccfdaf7e07f9...,,Project Description We are looking for a Softw...
4,Machine Learning Engineer,iKraft Solutions,PostedPublicado hace 8 días,https://mx.indeed.com/viewjob?ompany/iKraft-So...,"$75,000 a $95,000 por mes","We are looking for Machine Learning Engineers,..."


In [37]:
# Printing the total number of jobs before and after deduplication (if any)
print(f'Number of jobs before deduplication: {len(df)}')
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
print(f'Number of jobs after deduplication: {len(df)}')

df.head()

Number of jobs before deduplication: 45
Number of jobs after deduplication: 45


Unnamed: 0,title,company,date,job_url,salary,summary
0,BI Engineer (8 weeks project),EQUALS TRUE,PostedRecién publicado,https://mx.indeed.com/viewjob?jk=c84bfed12eb6d...,,"We believe that for Equality, Diversity, and I..."
1,Senior Data Engineer,CODIGOMX,PostedHoy,https://mx.indeed.com/viewjob?ompany/CodigoMX/...,"$48,000 a $55,000 por mes","Required Technologies AGILE(Scrum), .Git and G..."
2,Data Engineer,"Perficient, Inc",PostedPublicado hace más de 30 días,https://mx.indeed.com/viewjob?jk=bf21652f6ab6e...,,Are you theOracle Cloud ERP Applications Analy...
3,Junior FullStack Developer,Luxoft,PostedPublicado hace 22 días,https://mx.indeed.com/viewjob?jk=1ccfdaf7e07f9...,,Project Description We are looking for a Softw...
4,Machine Learning Engineer,iKraft Solutions,PostedPublicado hace 8 días,https://mx.indeed.com/viewjob?ompany/iKraft-So...,"$75,000 a $95,000 por mes","We are looking for Machine Learning Engineers,..."


In [38]:
#Saving the DataFrame "df" as a .csv file, the filename includes today's date
date_today = date.today()
df.to_csv(f'indeed_jobs_{date_today}.cvs')