In [1]:
#importing required libraries
from __future__ import unicode_literals
import bs4
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re
from collections import OrderedDict


In [2]:
def amazon_job(number_page=10):
    """
    retrieve job title, job location, job posting date, and job link from every page in 
    https://amazon.jobs.

    Arguments:
    number_page -- Number of pages that one wish to retrive the data from.

    Return:
    s -- A tuple including all the job infromation for each job in each page
    """
    
    job_title=[]
    location=[]
    posting_date=[]
    job_link=[]

    for i in range(number_page):
        driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        
        #There are 10 job postings in each page. Therefore, job pages URL can be updated
        #by muliplying the counter ("i") by 10.
        URL='https://www.amazon.jobs/en/search?offset="+str(10*i)+"&result_limit=10&sort=relevant&job_type%5B%5D=Full-Time&business_category%5B%5D=amazon-web-services&distanceType=Mi&radius=24km&latitude=&longitude=&loc_group_id=&loc_query=&base_query=data%20engineer&city=&country=&region=&county=&query_options=&'
    
        driver.get(URL)
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        driver.quit()
        
         
        job_title.append([td.find('h3').text for td in soup.findAll("div", {"class": "job-tile"})])
        posting_date.append([re.sub('Posted ', '', td.text) for td in soup.findAll("h2", {"class": "posting-date"})])
        job_link.append(['https://www.amazon.jobs'+td.find('a').get('href') for td in soup.findAll("div", {"class": "job-tile"})])
        
           
        
    return job_title,location,posting_date,job_link

In [3]:
#extracting jobs information. Currently there are 346 pages in amazon.job. 
job=amazon_job(1)




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache


In [4]:
def make_list(job):
    """
    put all the job data in alist that can be used to create a DataFrame

    Arguments:
    job_list -- A tuple containing job title, job location, job posting date, and job link .

    Return:
    s -- A list containing job information
    """
    t=[]
    for i in job:
        for b in i:
            for c in b:
                t.append(c)
    return t


In [5]:
#make a list of all job data
job_list=make_list(job)
len(job_list)
print(job_list)

['Data Engineer', 'Data Engineer, Data Center Automation', 'Data Engineer', 'Data Engineer', 'Data Engineer, Data Solutions & Engineering, Security', 'Data Engineer, AWS Econ Data', 'Data Engineer, AWS Econ Data', 'Data Engineer', 'Data Engineer', 'Data Engineer', 'October 12, 2021', 'June  1, 2022', 'December  7, 2021', 'January  5, 2021', 'June  1, 2022', 'March 25, 2022', 'March 14, 2022', 'June 18, 2021', 'May  6, 2021', 'October 20, 2021', 'https://www.amazon.jobs/en/jobs/1770158/data-engineer', 'https://www.amazon.jobs/en/jobs/2085517/data-engineer-data-center-automation', 'https://www.amazon.jobs/en/jobs/1841062/data-engineer', 'https://www.amazon.jobs/en/jobs/1391585/data-engineer', 'https://www.amazon.jobs/en/jobs/2085035/data-engineer-data-solutions-engineering-security', 'https://www.amazon.jobs/en/jobs/1999795/data-engineer-aws-econ-data', 'https://www.amazon.jobs/en/jobs/1981936/data-engineer-aws-econ-data', 'https://www.amazon.jobs/en/jobs/1603572/data-engineer', 'https:/

In [6]:
#Create a dataframe from the job information list
def make_dataframe(job_list):
    """
    ceate a dataframe from the job_list
    
    Arguments:
    job_list -- A tuple containing job title, job location, job posting date, and job link .

    Return:
    df -- A dataframe containing each job description, basic qualification and preferred qualification.
    """
    
    l=int(len(job_list))
    df=pd.DataFrame(OrderedDict({'Title': job_list[:l],'Posting_date':job_list[2*l:3*l], 'job_link': job_list[3*l:]}))
    
    print(df)
    


In [7]:
#DataFrame containing job title, job location, job posting date, and job link.
df1=pd.DataFrame(job_list)
df1.transpose()
df1.to_csv('df1.csv')
print(df1)

                                                    0
0                                       Data Engineer
1               Data Engineer, Data Center Automation
2                                       Data Engineer
3                                       Data Engineer
4   Data Engineer, Data Solutions & Engineering, S...
5                        Data Engineer, AWS Econ Data
6                        Data Engineer, AWS Econ Data
7                                       Data Engineer
8                                       Data Engineer
9                                       Data Engineer
10                                   October 12, 2021
11                                      June  1, 2022
12                                  December  7, 2021
13                                   January  5, 2021
14                                      June  1, 2022
15                                     March 25, 2022
16                                     March 14, 2022
17                          

In [8]:
def job_description(job_list):
    """
    retrieving job description, basic qualification and preferred qualification.
    we get the job link from the previous job_list and then this function goes to every posted job
    page to get each job description, basic qualification and preferred qualification.

    Arguments:
    job_list -- A tuple containing job title, job location, job posting date, and job link .

    Return:
    job_information -- A list containing each job description, basic qualification and preferred qualification.
    """
    
    l=int(len(job_list)/4)
    job_link=job_list[3*l:]
    job_information=[]
    
    
    for x in range(l):
        driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        URL=job_link[x]
        driver.get(URL)
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        driver.quit()
        
        job_information.append([h2.next_sibling for h2 in soup.findAll("div", {"class": "section"})])
        
    return job_information

In [25]:
job_description=job_description(job_list)




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache





[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [C:\Users\Lene\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache


In [26]:
#create a dataframe from the job description, basic qualification and preferred qualification
df2=pd.DataFrame(job_description)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,"[[DESCRIPTION], [Job summary, [], Come and be ...","[[BASIC QUALIFICATIONS], [], [[5+ years of exp...","[[PREFERRED QUALIFICATIONS], [], [[Proven succ...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",
1,"[[DESCRIPTION], [JOB SUMMARY, [], The AI Data ...","[[BASIC QUALIFICATIONS], [], [[Bachelor's degr...","[[PREFERRED QUALIFICATIONS], [], [[Experience ...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",
2,"[[DESCRIPTION], [The Amazon Web Services’ (AWS...","[[BASIC QUALIFICATIONS], [], [[Bachelor's degr...","[[PREFERRED QUALIFICATIONS], [], [[Knowledge o...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",
3,"[[DESCRIPTION], [Job summary, [], At Amazon, S...","[[BASIC QUALIFICATIONS], [], [[Bachelor's degr...","[[PREFERRED QUALIFICATIONS], [], [[Meets/excee...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",
4,"[[DESCRIPTION], [Job summary, [], AWS is looki...","[[BASIC QUALIFICATIONS], [], [[Bachelor’s degr...","[[PREFERRED QUALIFICATIONS], [], [[Master`s de...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",
5,"[[DESCRIPTION], [Job summary, [], AWS is looki...","[[BASIC QUALIFICATIONS], [], [[Bachelor’s degr...","[[PREFERRED QUALIFICATIONS], [], [[Master`s de...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",
6,"[[DESCRIPTION], [The AWS Worldwide Revenue Ope...","[[BASIC QUALIFICATIONS], [[], [], · Bachelor's...","[[PREFERRED QUALIFICATIONS], [· Master’s degre...",,"[[], [], [], []]","[[Working At Amazon], [[<a href=""/en/landing_p...","[[Help], [[<a data-action=""FAQ"" href=""/en/faqs...","[[[<div class=""d-none d-md-block""><div class=""...",


In [32]:
#combining the two dataframes and save them in a csv file
result = pd.concat([df1[['Title','location','Posting_date']], df2[['DESCRIPTION','BASIC QUALIFICATIONS','PREFERRED QUALIFICATIONS']]], axis=1, join='inner')
result.to_csv('full_job_amazon.csv')


KeyboardInterrupt



In [None]:
ful_job=amazon_job()

In [None]:
ful_job=make_list(ful_job)

In [None]:
df1=make_dataframe(ful_job2)

In [None]:
ful_job_de=job_description(ful_job2)

In [None]:
df2=pd.DataFrame(ful_job_de, columns=['DESCRIPTION','BASIC QUALIFICATIONS','PREFERRED QUALIFICATIONS'])

df2.to_csv('job_link_des.csv')

In [None]:
result = pd.concat([df1[['Title','location','Posting_date']], df2[['DESCRIPTION','BASIC QUALIFICATIONS','PREFERRED QUALIFICATIONS']]], axis=1, join='inner')
result.to_csv('full_job_amazon_new.csv')