In [1]:
# Import necessary libraries
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd
from datetime import date

In [2]:
# Reference only (never executed): the commented-out lines below sketch the basic steps of a browser-driven scrape.
# create a new instance of Chrome
#chrome = Chrome()

# navigate to the website
#chrome.get('https://mx.indeed.com/jobs?q=data+science&l=Remote+-+Mexico&start=0&pp=gQAPAAAAAAAAAAAAAAACBYaYUwAZAQEBBwdtIXZKBqGEO0bEFksnjLeua_78ewAA&vjk=94621b0d50661801')

#time.sleep(10)
# close the browser
#chrome.close()

In [3]:
# Defining a function "extract" which takes an argument "page" (a number)
def extract(page):
    """Load one Indeed search-results page in Chrome and return it parsed.

    Args:
        page: Value substituted into the ``start`` query parameter of the
            search URL (the notebook passes 0, 10, 20, 30).

    Returns:
        A BeautifulSoup object built from the browser's ``page_source``
        using the ``html.parser`` backend.

    Raises:
        Whatever the driver raises while starting or loading the page; the
        browser is still shut down in that case.
    """
    # 'url' is a string that changes depending on the page number
    url = f'https://mx.indeed.com/jobs?q=data+science&l=Remote+-+Mexico&start={page}&pp=gQAPAAAAAAAAAAAAAAACBYaYUwAZAQEBBwdtIXZKBqGEO0bEFksnjLeua_78ewAA&vjk=94621b0d50661801'

    options = uc.ChromeOptions()
    driver = uc.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Quit even when loading fails, so a failed page does not leave a
        # Chrome process running in the background.
        driver.quit()
    # Parsing only needs the HTML string, so it can happen after the browser is gone.
    return BeautifulSoup(html, 'html.parser')

In [4]:
# Helper: stripped text of a parsed node, or '' when the lookup found nothing
def _node_text(node):
    """Return ``node.text.strip()``, or an empty string if ``node`` is None."""
    if node is None:
        return ''
    return node.text.strip()


# Defining a function "transform" to extract specific job-related information from the parsed HTML data
def transform(soup, jobs=None):
    """Collect one record per job card found in ``soup``.

    Every ``div`` with class ``slider_item`` yields a dict with the keys
    ``title``, ``company``, ``salary`` and ``summary``. A field whose element
    is absent from the card is stored as an empty string, so one incomplete
    card no longer aborts the whole page.

    Args:
        soup: Parsed page offering ``find_all``; each card must offer ``find``.
        jobs: List the records are appended to. When omitted, the
            module-level ``job_list`` is used, as in the original notebook.

    Returns:
        None. The result is the records appended to ``jobs``.
    """
    if jobs is None:
        # Original behaviour: accumulate into the notebook-global list.
        jobs = job_list

    for item in soup.find_all('div', class_='slider_item'):
        # The first anchor inside the card is taken as the job title.
        title = _node_text(item.find('a'))
        company = _node_text(item.find('span', class_='companyName'))
        # Not every card has a salary element; those get an empty string.
        salary = _node_text(item.find('div', class_='metadata salary-snippet-container'))
        # Line breaks inside the snippet are removed (not replaced by spaces).
        summary = _node_text(item.find('div', class_='job-snippet')).replace('\n', '')

        jobs.append({
            'title': title,
            'company': company,
            'salary': salary,
            'summary': summary
        })
    return

In [5]:
# Accumulator for every scraped job record; transform() appends to it
job_list = []

# Only four result offsets (0, 10, 20, 30) are fetched: the notebook is meant for learning and demonstration
for start in range(0, 40, 10):
    print(f'Getting page, {start}')
    page_soup = extract(start)
    transform(page_soup)
    print('Waiting some seconds...')
    # A random pause after each page imitates a human visitor and lowers the chance of being flagged as a bot
    pause_seconds = np.random.randint(1, 6)
    time.sleep(pause_seconds)

Getting page, 0
Waiting some seconds...
Getting page, 10
Waiting some seconds...
Getting page, 20
Waiting some seconds...
Getting page, 30
Waiting some seconds...


In [6]:
# Build the DataFrame "df" with one row per job dictionary collected in "job_list"
df = pd.DataFrame(job_list)
print(f'Jobs extracted: {df.shape[0]}')
# Count the rows flagged by DataFrame.duplicated() (if any)
n_duplicated = df.duplicated().sum()
print(f'Jobs duplicated: {n_duplicated}')
df.head()

Jobs extracted: 60
Jobs duplicated: 15


Unnamed: 0,title,company,salary,summary
0,Gestor de Mailing y analíticos,Intelli-Dixit MX S.C.,"Desde $18,000 por mes",Deseable: estudios en data science.Gestionar c...
1,IT Operations Analyst (Remote-Contract),GreatFit Talent Recruitment,$200 a $300 por hora,Conduct periodic access reviews and audits to ...
2,Mobile Application Developer,Integon Service Co.,"$80,181 a $88,000 por mes",Strong understanding of computer science conce...
3,Data Engineer,"Perficient, Inc",,Analyzing data to obtain new insights and hidd...
4,Machine Learning Engineer,iKraft Solutions,"$75,000 a $95,000 por mes",Searching and selecting appropriate data sets ...


In [7]:
# Printing the total number of jobs before and after deduplication (if any)
print(f'Number of jobs before deduplication: {len(df)}')
# Drop repeated rows and renumber the index in one chained expression that
# rebinds "df", rather than mutating the frame in place with inplace=True.
df = df.drop_duplicates().reset_index(drop=True)
print(f'Number of jobs after deduplication: {len(df)}')

df.head()

Number of jobs before deduplication: 60
Number of jobs after deduplication: 45


Unnamed: 0,title,company,salary,summary
0,Gestor de Mailing y analíticos,Intelli-Dixit MX S.C.,"Desde $18,000 por mes",Deseable: estudios en data science.Gestionar c...
1,IT Operations Analyst (Remote-Contract),GreatFit Talent Recruitment,$200 a $300 por hora,Conduct periodic access reviews and audits to ...
2,Mobile Application Developer,Integon Service Co.,"$80,181 a $88,000 por mes",Strong understanding of computer science conce...
3,Data Engineer,"Perficient, Inc",,Analyzing data to obtain new insights and hidd...
4,Machine Learning Engineer,iKraft Solutions,"$75,000 a $95,000 por mes",Searching and selecting appropriate data sets ...


In [10]:
# Saving the DataFrame "df" as a .csv file; the filename includes today's date
date_today = date.today()
# The extension was previously misspelled ".cvs", so the file was not recognised as CSV by extension.
df.to_csv(f'indeed_jobs_{date_today}.csv')