In [1]:
"""
web scraping functionality from www.indeed.com (USA)
"""
import requests
import time
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import re

import pandas as pd

source = "indeed.com"
cookies = {'aep_usuc_f': 'region=US&site=glo&b_locale=en_US&c_tp=USD'}


def get_url(position):
    """
    Generate URL from position and company type: recruiter or direct employer; education level
    """
    url = f"https://indeed.com/jobs?q={position}"

    return url


def get_job_date(card):
    """
     extracts date from the job post record

    :param card:
    :return:
    """
    post_str = card.find('span', {'class': 'date'}).text  # text from the footer: days ago was posted
    post_days = re.findall(r'\d+', post_str)  # extracting number of days from posted_str

    if post_days:
        # calculated date of job posting if days are mentioned
        job_date = (datetime.now() - timedelta(days=int(post_days[0]))).strftime("%d/%m/%Y")
    else:
        job_date = datetime.now().strftime("%d/%m/%Y")  # if days are not mentioned - using today

    return job_date


def get_job_salaries(card):
    """
    extracts salaries
    :param card:
    :return:
    """

    try:
        salary_str = card.find('div', 'metadata salary-snippet-container').text
        salaries = re.findall(r"\b(\w+[.]\w+)", salary_str)

    except AttributeError:
        salaries = []

    return salaries


def get_record(card):
    """
    Extract job data from a single record
    """
    span_tag = card.h2.a.span
    a_tag = card.h2.a

    job_id = a_tag.get("data-jk")  # unique job id
    job_title = span_tag.get("title")  # job title
    job_url = 'https://www.indeed.com' + a_tag.get('href')  # job url
    company_name = card.find('span', {'class': 'companyName'}).text  # company name
    job_loc = card.find('div', {'class': 'companyLocation'}).text  # job location
    job_summary = card.find('div', {'class': 'job-snippet'}).text.strip()  # job description
    job_date = get_job_date(card)  # job posting date
    job_salary = get_job_salaries(card)  # job salaries if any

    record = (job_id, job_title, job_date, job_loc, job_summary, job_salary, job_url, company_name)

    return record


def get_jobs(position):
    """
    creates a DataFrame with all records (scraped jobs), scraping from all pages

    """

    url = get_url(position)
    records = []

    # extract the job data

    while True:

        response = ""
        while response == "":
            try:
                response = requests.get(url=url, cookies=cookies)
                break
            except ConnectionError:
                print("Connection refused by the server..")
                print("Let me sleep for 5 seconds")
                print("ZZzzzz...")
                time.sleep(5)
                print("Was a nice sleep, now let me continue...")
                continue

        soup = BeautifulSoup(response.text, 'html.parser')

        cards = soup.find_all('div', 'job_seen_beacon')

        for card in cards:
            record = get_record(card)
            records.append(record)

        time.sleep(3)  # making a pause before moving to the next page

        # moving to the next page - > assigning a new url
        try:
            url = 'https://indeed.com/' + soup.find('a', {'aria-label': 'Next'}).get('href')

        except AttributeError:
            break

    # save the data as DF
    columns = ['job_id',
               'job_title',
               'job_date',
               'job_loc',
               'job_summary',
               'job_salary',
               'job_url',
               'company_name']
    df = pd.DataFrame(data=records, columns=columns)

    # adding to DF columns with search parameters
    search_time = datetime.now().strftime("%d/%m/%Y, %H:%M:%S")

    df["search_time"] = search_time
    df["search_position"] = position
    df["source"] = source

    return df

In [2]:
"""
Functionality for transforming scraping results into a data-dump ready to be uploaded to a DB or saved to .csv
"""
from datetime import datetime
import pandas as pd


class DataDump:
    def __init__(self):
        self.df = pd.DataFrame()

    def merge(self, df):
        """
        Concatenates new df to the datadump

        :param df: pd.DataFrame
        :return: None
        """
        if len(self.df) != 0:
            self.df = pd.concat(objs=[self.df, df])
        else:
            self.df = df

    def remove_duplicates(self, field):
        """
        Removes duplicates from a df by the 'JobID' field.

        :return: None
        """
        self.df.sort_values(by=field, inplace=True)
        self.df.drop_duplicates(subset=[field], keep="first", inplace=True)

    def format_salaries(self, field):
        self.df[field] = self.df[field].astype("string")

    def save_to_csv(self, path=None):
        """
        Saves a DataFrame into CSV file
        :param path: string
        :return: None
        """
        suffix = datetime.now().strftime("%d%m%Y_%H%M%S")
        file_name = f"datadump_{suffix}.csv"
        self.df.to_csv(path + file_name, index=False)

In [3]:
"""
module with functionality of the logging the results of the program run.
"""
from datetime import datetime


def get_time():
    return datetime.now().strftime('%d-%m-%Y, %H:%M:%S')


def logging(time, message, logs_lst):
    log = (time, message)
    logs_lst.append(log)
    print(time + ': ' + message)


class Logger:
    def __init__(self):
        self.logs_lst = []

    def start_session(self):
        time = get_time()
        message = "Session starts"
        logging(time, message, self.logs_lst)

    def end_session(self):
        time = get_time()
        message = "Session ends."
        logging(time, message, self.logs_lst)

    def start_scraping(self, position):
        time = get_time()
        message = f"Scraping attempt with the key word: '{position}'... "

        logging(time, message, self.logs_lst)

    def scraping_result(self, df):
        time = get_time()
        message = f"....successfully implemented. Number of records found: {len(df)}"
        logging(time, message, self.logs_lst)

    def scraping_result_final(self, df):
        time = get_time()
        message = f"Full scraping is executed. The raw data dump file contains {len(df)} records.\n" \
                  f"Number of unique records is {df['job_id'].nunique()}."
        logging(time, message, self.logs_lst)

    def data_formatted(self, df):
        time = get_time()
        message = f"Data dump is formatted:\n {df.info()}"
        logging(time, message, self.logs_lst)

    def error_occurs(self, error):
        time = get_time()
        message = f"{error} occurs"
        logging(time, message, self.logs_lst)

    def save_to_txt(self):
        pass

In [4]:
import time
# import indeed_com_scraper as scraper
# from dumping import DataDump
# from logger import Logger


class ScrapingSession:
    def __init__(self, positions):
        self.positions = positions

    def run(self):
        data_dump = DataDump()
        logger = Logger()

        # scraping -> creates a compiled data dump from different search parameters
        logger.start_session()  # logging start of a session

        for position in self.positions:
            logger.start_scraping(position)  # logging each scraping attempt
            try:
                df = scraper.get_jobs(position)
            except Exception as inst:
                logger.error_occurs(inst)
            else:
                logger.scraping_result(df)  # logging scraping results
                data_dump.merge(df)  # merging all values to the data_dump
            finally:
                time.sleep(5)  # wait for 5 seconds to avoid a block
                continue

        # logging scraping results
        logger.scraping_result_final(data_dump.df)

        # data cleansing / formatting
        data_dump.remove_duplicates(field="job_id")  # removing duplicates

        # logging formatted data dump information
        logger.data_formatted(data_dump.df)

        # saving as a csv file
        data_dump.save_to_csv(path="data_dumps/")

        logger.end_session()
        logger.save_to_txt()  # function is yet to be added

In [5]:
"""
all required arguments for the program: parameters for the search
"""

# list of any chosen key-words
positions = ["auditor"]


In [6]:
# from parameters import positions
# from main import ScrapingSession

session = ScrapingSession(positions)

if __name__ == "__main__":
    session.run()

04-06-2024, 18:40:37: Session starts
04-06-2024, 18:40:37: Scraping attempt with the key word: 'auditor'... 
04-06-2024, 18:40:37: name 'scraper' is not defined occurs


KeyError: 'job_id'