# SCRAPING WITH BEAUTIFULSOUP AND SELENIUM

In [None]:
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from time import sleep

# 1. Scrape URLs

In [None]:
##### ACTIVE VERSION
# Collect job-posting URLs from mycareersfuture.sg for every search term and
# merge them into the consolidated URL list stored on disk.

# SECTION 1: read in existing file
consolidated_urls = pd.read_csv('./Data/D2/consolidated_urls.csv')


# SECTION 2: define search terms (add/edit if necessary)
search_terms = ['data scientist', 'data analyst', 'business analyst', 'business intelligence', 'data architect',
                'data engineer', 'database engineer', 'research scientist', 'data governance', 'data manager',
                'python developer']


# SECTION 3: per-term dataframes are collected here and concatenated once at
# the end (DataFrame.append no longer exists in pandas 2.x)
term_frames = []


# SECTION 4: iterate through search terms
# A single browser serves every page; the `finally` clause shuts it down even
# when a page fails to load or parse, so no Chrome process is left behind.
browser = webdriver.Chrome()
try:
    for term in search_terms:

        # base url of the result pages for this term (page number is appended)
        path = 'https://www.mycareersfuture.sg/search?search=' + term + '&sortBy=new_posting_date&page='

        # SECTION 4.1: TOTAL NUMBER OF JOBS TO ITERATE THROUGH (FOR THE WHILE LOOP THAT FOLLOWS)
        counter = 0
        browser.get(path + str(counter))
        sleep(5)  # allow time for the page to load
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # the first token of the result-count element is the number of jobs
        count_spans = soup.findAll('span', {'class': 'pl2 pl0-ns f5 black-70 fw4 db lh-copy'})
        try:
            # commas are stripped in case the site formats large counts as "1,234"
            total_jobs = int(count_spans[0].text.split(" ")[0].replace(',', ''))
        except (IndexError, ValueError):
            # element missing or not numeric: skip this term instead of aborting the run
            print('Could not read the job count for "' + term + '"; skipping this term')
            total_jobs = 0

        ## SECTION 4.2: CODE TO ITERATE THROUGH THE JOBS TO GET THE URLS
        job_urls = []

        while len(job_urls) < total_jobs:

            # page 0 is already loaded from section 4.1; only later pages need a fetch
            if counter > 0:
                browser.get(path + str(counter))
                sleep(5)  # allow time for the page to load
                soup = BeautifulSoup(browser.page_source, 'lxml')

            # relative links of every job card on the page
            rel_urls = [
                item['href']
                for item in soup.findAll(
                    'a',
                    {'class': 'bg-white mb3 w-100 dib v-top pa3 no-underline flex-ns flex-wrap JobCard__card___22xP3'},
                    href=True,
                )
            ]

            # an empty page means there is nothing more to collect; without this
            # guard the loop would spin forever whenever the advertised total is
            # never reached
            if not rel_urls:
                break

            # prepend the site root to turn the relative links into full urls
            job_urls.extend('https://www.mycareersfuture.sg' + href for href in rel_urls)

            counter = counter + 1

        # keep the urls of this term together with the term that found them
        if job_urls:
            temp_df = pd.DataFrame({'url': job_urls})
            temp_df['search_type'] = term
            term_frames.append(temp_df)
finally:
    browser.quit()

# SECTION 5: combine the per-term results and remove duplicates
if term_frames:
    all_urls = pd.concat(term_frames)
else:
    all_urls = pd.DataFrame(columns=['url', 'search_type'])
all_urls.drop_duplicates('url', inplace=True)

# SECTION 6: append to consolidated_urls and export to csv
consolidated_urls = pd.concat([consolidated_urls, all_urls])
consolidated_urls.drop_duplicates('url', inplace=True)

consolidated_urls.to_csv('./Data/D2/consolidated_urls.csv', index=False)


In [None]:
# preview the first five rows of the consolidated url table
some_url = consolidated_urls.head(5)
some_url

# 2. Scrape Details

In [None]:
%%time

# Visit every consolidated url and pull the job details into one row per job.


def _first_text(soup, tag, attrs):
    """Return the text of the first `tag` element matching `attrs`, or None if absent."""
    element = soup.find(tag, attrs)
    return element.text if element is not None else None


all_jobs = []

# A single browser serves every url; the `finally` clause shuts it down even
# when a page fails, so no Chrome process is left behind.
browser = webdriver.Chrome()
try:
    for link in consolidated_urls['url']:

        # navigate to url and give the page time to load
        browser.get(link)
        sleep(5)

        # parse the rendered page source
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # salary range is the second 'dib' span and the frequency the
        # second-to-last one; pages with fewer than two spans get None for both
        salary = [item.text for item in soup.findAll('span', {'class': 'dib'})]
        has_salary = len(salary) >= 2

        # exactly one value per column, in the order of `columns` below, so a
        # missing field becomes None instead of shifting later values sideways
        all_jobs.append([
            _first_text(soup, 'p', {'name': 'company'}),                 # company
            _first_text(soup, 'h1', {'id': 'job_title'}),                # job_title
            _first_text(soup, 'span', {'class': 'black-60 db f6 fw4 mv1'}),  # job_id
            _first_text(soup, 'p', {'id': 'employment_type'}),           # job_type
            salary[1] if has_salary else None,                           # salary_range
            salary[-2] if has_salary else None,                          # salary_freq
            _first_text(soup, 'p', {'id': 'seniority'}),                 # level
            _first_text(soup, 'p', {'id': 'job-categories'}),            # industry
            _first_text(soup, 'div', {'id': 'job_description'}),         # job_description
            _first_text(soup, 'div', {'id': 'requirements'}),            # requirements
        ])
finally:
    browser.quit()


columns = ['company', 'job_title', 'job_id', 'job_type', 'salary_range', 'salary_freq', 'level', 'industry',
           'job_description', 'requirements']

# put everything into a dataframe, export to csv and preview the first 20 rows
job_df = pd.DataFrame(all_jobs, columns=columns)
job_df.to_csv('./Data/D2/job_data.csv', index=False)
job_df.head(20)

In [None]:
# display the raw scraped rows (one list of field values per job url)
all_jobs

# Updated version used for the second round of scraping

In [None]:
### IMPORT LIBRARIES -------------------------------------------

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
from time import sleep




### SCRAPE URLs -------------------------------------------------
# Collect job-posting URLs from mycareersfuture.sg for every search term and
# merge them into the consolidated URL list stored on disk.

# SECTION 1: read in existing file
consolidated_urls = pd.read_csv('./Data/consolidated_urls.csv')


# SECTION 2: define search terms (add/edit if necessary)
search_terms = ['data specialist', 'data scientist', 'data analyst', 'business analyst', 'business intelligence',
                'data architect', 'data engineer', 'database engineer', 'research scientist', 'data governance',
                'data manager', 'python developer']


# SECTION 3: per-term dataframes are collected here and concatenated once at
# the end (DataFrame.append no longer exists in pandas 2.x)
term_frames = []


# SECTION 4: iterate through search terms
# The browser is created ONCE, before the loop; creating it per term would
# leave one orphaned Chrome window behind for every search term. It is left
# open on success because the detail-scraping section below reuses `browser`,
# and only shut down here if this section fails.
browser = webdriver.Chrome()
try:
    for term in search_terms:

        # base url of the result pages for this term (page number is appended)
        path = 'https://www.mycareersfuture.sg/search?search=' + term + '&sortBy=new_posting_date&page='

        # SECTION 4.1: GET TOTAL NUMBER OF JOBS TO ITERATE THROUGH (FOR THE WHILE LOOP THAT FOLLOWS)
        counter = 0
        browser.get(path + str(counter))
        sleep(5)  # allow time for page to load
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # the first token of the result-count element is the number of jobs
        count_spans = soup.findAll('span', {'class': 'pl2 pl0-ns f5 black-70 fw4 db lh-copy'})
        try:
            # commas are stripped in case the site formats large counts as "1,234"
            total_jobs = int(count_spans[0].text.split(" ")[0].replace(',', ''))
        except (IndexError, ValueError):
            # element missing or not numeric: skip this term instead of aborting the run
            print('Could not read the job count for "' + term + '"; skipping this term')
            total_jobs = 0

        ## SECTION 4.2: START TO ITERATE THROUGH THE JOBS TO GET THE URLS
        job_urls = []

        # loop continues until the number of urls reaches the number of jobs
        # reported for the search term, or a page has no job links at all
        while len(job_urls) < total_jobs:

            # page 0 is already loaded from section 4.1; only later pages need a fetch
            if counter > 0:
                browser.get(path + str(counter))
                sleep(5)  # allow time for page to load
                soup = BeautifulSoup(browser.page_source, 'lxml')

            # relative links (href) of every job card on the page
            rel_urls = [
                item['href']
                for item in soup.findAll(
                    'a',
                    {'class': 'bg-white mb3 w-100 dib v-top pa3 no-underline flex-ns flex-wrap JobCard__card___22xP3'},
                    href=True,
                )
            ]

            # an empty page means there is nothing more to collect; without this
            # guard the loop would spin forever whenever the advertised total is
            # never reached
            if not rel_urls:
                break

            # prepend the site root to turn the relative links into full urls
            job_urls.extend('https://www.mycareersfuture.sg' + href for href in rel_urls)

            counter = counter + 1  # once completed, +1 to move to the next page

        # keep the urls together with the search term they were found under
        if job_urls:
            temp_df = pd.DataFrame({'url': job_urls})
            temp_df['search_type'] = term
            term_frames.append(temp_df)
except BaseException:
    # do not leave a Chrome process behind when the scrape aborts
    browser.quit()
    raise

# SECTION 5: combine the per-term results and remove duplicates
if term_frames:
    all_urls = pd.concat(term_frames)
else:
    all_urls = pd.DataFrame(columns=['url', 'search_type'])
all_urls.drop_duplicates('url', inplace=True)

# SECTION 6: append to consolidated_urls, drop duplicates and finally export to csv
consolidated_urls = pd.concat([consolidated_urls, all_urls])
consolidated_urls.drop_duplicates('url', inplace=True)

consolidated_urls.to_csv('./Data/consolidated_urls.csv', index=False, date_format='%Y-%m-%d')





### SCRAPE DETAILS INSIDE EACH URL--------------------------------------------
# Visit every consolidated url with the browser opened by the url-scraping
# section above and pull the job details into one row per job.


def _first_text(soup, tag, attrs):
    """Return the text of the first `tag` element matching `attrs`, or None if absent."""
    element = soup.find(tag, attrs)
    return element.text if element is not None else None


all_jobs = []

# `finally` guarantees the shared browser is shut down even if a page fails
try:
    for link in consolidated_urls['url']:  # iterate through each of the URLs

        # navigate to url and give the page time to load
        browser.get(link)
        sleep(5)

        # parse the rendered page source
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # get salary (freq + range): the range is the second 'dib' span and the
        # frequency the second-to-last one; pages with fewer than two spans get
        # None for both
        salary = [item.text for item in soup.findAll('span', {'class': 'dib'})]
        has_salary = len(salary) >= 2

        # exactly one value per column, in the order of `columns` below, so a
        # missing field becomes None instead of shifting later values sideways
        all_jobs.append([
            _first_text(soup, 'p', {'name': 'company'}),                 # company
            _first_text(soup, 'h1', {'id': 'job_title'}),                # job_title
            _first_text(soup, 'span', {'class': 'black-60 db f6 fw4 mv1'}),  # job_id
            _first_text(soup, 'p', {'id': 'employment_type'}),           # job_type
            salary[1] if has_salary else None,                           # salary_range
            salary[-2] if has_salary else None,                          # salary_freq
            _first_text(soup, 'p', {'id': 'seniority'}),                 # level
            _first_text(soup, 'p', {'id': 'job-categories'}),            # industry
            _first_text(soup, 'div', {'id': 'job_description'}),         # job_description
            _first_text(soup, 'div', {'id': 'requirements'}),            # requirements
        ])
finally:
    # close browser
    browser.quit()



# define column names
columns = ['company', 'job_title', 'job_id', 'job_type', 'salary_range', 'salary_freq', 'level', 'industry',
           'job_description', 'requirements']

# put everything into a dataframe before exporting to csv
job_df = pd.DataFrame(all_jobs, columns=columns)
job_df.to_csv('./Data/job_data.csv', index=False, date_format='%Y-%m-%d')

In [None]:
# open a fresh Chrome session (e.g. after the previous one has been closed)
browser = webdriver.Chrome()