### Web scraping data

In [10]:
import selenium

# Show which Selenium release is installed in this environment.
print(selenium.__version__)


3.141.0


### Code to scrape job postings from Indeed

In [None]:
import requests
import time
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import re
import pandas as pd
from pymongo import MongoClient

# Label for the site being scraped; not referenced by the code visible in this cell.
source = "indeed.com"
# Cookie installed on the HTTP session used by get_jobs; its value selects
# region US, locale en_US and currency USD.
# NOTE(review): the `aep_usuc_f` cookie name looks specific to another site -
# confirm that Indeed actually honours it.
cookies = {'aep_usuc_f': 'region=US&site=glo&b_locale=en_US&c_tp=USD'}

def get_url(position):
    """
    Generate the Indeed search URL for a position.

    Args:
        position: search terms. Words may be separated by spaces or may
            already be query-encoded (e.g. "data+scientist").

    Returns:
        The URL of the first results page for that search.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    # "+" and "%" are left untouched so queries that are already encoded
    # (such as "data+scientist") pass through unchanged. Everything else that
    # is unsafe in a query string (spaces, "&", "#", ...) is escaped so it
    # cannot truncate or corrupt the query.
    return f"https://indeed.com/jobs?q={quote(position, safe='+%')}"

def get_job_date(card):
    """
    Extracts the posting date from the job post record.

    The card footer holds a relative age such as "3 days ago"; the first
    number found in that text is taken as the number of days since posting.

    Args:
        card: parsed job card to search (anything exposing ``find``).

    Returns:
        The posting date formatted as "DD/MM/YYYY". Today's date is used when
        the footer mentions no number. ``None`` is returned when the card has
        no date footer at all.
    """
    date_tag = card.find('span', {'class': 'date'})
    if date_tag is None:
        # A card without a footer should not abort the whole scrape; report
        # the date as unknown, the same way a missing rating is reported.
        return None

    post_days = re.findall(r'\d+', date_tag.text)  # numbers in the footer text
    days_ago = int(post_days[0]) if post_days else 0  # no number -> posted today

    return (datetime.now() - timedelta(days=days_ago)).strftime("%d/%m/%Y")

def get_job_salaries(card):
    """
    Pulls every dollar amount out of the card's salary snippet.

    Returns a list of strings such as "$50,000" or "$25.50", in the order
    they appear; the list is empty when the card has no salary snippet.
    """
    try:
        snippet_text = card.find('div', 'metadata salary-snippet-container').text
    except AttributeError:
        # No salary container on this card.
        return []
    # "$" followed by digits, optional thousands groups and optional decimals.
    return re.findall(r"\$\d+(?:\,\d+)*(?:\.\d+)?", snippet_text)

def get_job_rating(card):
    """
    Reads the company rating shown on the card.

    Returns the rating text, or None when the card carries no rating.
    """
    try:
        return card.find('span', {'class': 'ratingNumber'}).text
    except AttributeError:
        # find() came back empty: this company has no rating on the card.
        return None

def get_record(card):
    """
    Builds the record dict for one job card.

    The id, title and URL come from the link inside the card's <h2>; company,
    location and summary come from class-tagged elements; date, salaries and
    rating are delegated to get_job_date, get_job_salaries and get_job_rating.
    """
    def class_text(tag_name, css_class):
        # Text of the first <tag_name class=css_class> element in the card.
        return card.find(tag_name, {'class': css_class}).text

    link = card.h2.a
    title_holder = link.span

    posting_id = link.get("data-jk")  # unique job id
    title = title_holder.get("title")
    posting_url = 'https://www.indeed.com' + link.get('href')  # href is site-relative
    employer = class_text('span', 'companyName')
    place = class_text('div', 'companyLocation')
    snippet = class_text('div', 'job-snippet').strip()
    posted_on = get_job_date(card)
    salaries = get_job_salaries(card)  # may be an empty list
    company_rating = get_job_rating(card)  # may be None

    return {
        'Description2': snippet,
        'Company_name': employer,
        'Location': place,
        'Salary_range': salaries,
        'Date': posted_on,
        'Rating': company_rating,
        'job_id': posting_id,
        'job_title': title,
        'job_url': posting_url
    }

def get_jobs(position):
    """
    Scrapes every result page for a position and collects the job records.

    Starts at the URL built by get_url(position) and keeps following the
    page's "Next" link until there is none.

    Args:
        position: search query handed to get_url.

    Returns:
        A list of dicts, one per job card, as produced by get_record.
    """
    url = get_url(position)
    records = []

    # The context manager releases the session's pooled connections on exit,
    # including when parsing a card raises.
    with requests.Session() as session:
        session.cookies.update(cookies)

        while True:
            # Keep retrying the current page until the server answers.
            while True:
                try:
                    # Without a timeout a stalled connection would block forever.
                    response = session.get(url, timeout=30)
                    break
                except (requests.ConnectionError, requests.Timeout):
                    print("Connection refused by the server..")
                    print("Let me sleep for 5 seconds")
                    print("ZZzzzz...")
                    time.sleep(5)
                    print("Was a nice sleep, now let me continue...")

            soup = BeautifulSoup(response.text, 'html.parser')
            records.extend(get_record(card) for card in soup.find_all('div', 'job_seen_beacon'))

            # Moving to the next page -> assigning a new url; stop when the
            # page has no usable "Next" link.
            next_link = soup.find('a', {'aria-label': 'Next'})
            next_href = next_link.get('href') if next_link is not None else None
            if not next_href:
                break
            url = 'https://indeed.com' + next_href

            time.sleep(3)  # pause before requesting the next page

    return records

def save_to_mongodb(data, db_name, collection_name):
    """
    Save data to MongoDB.

    Connects to the MongoDB server at localhost:27017 and inserts all
    documents in one batch.

    Args:
        data: list of dict documents to insert. Nothing is written, and no
            connection is opened, when it is empty.
        db_name: name of the target database.
        collection_name: name of the target collection.
    """
    if not data:
        # insert_many rejects an empty document list, so an empty scrape
        # would otherwise end in an exception.
        print(f"No data to save to the collection '{collection_name}' of database '{db_name}'")
        return

    # The context manager closes the client's connections once the insert is done.
    with MongoClient("mongodb://localhost:27017/") as client:
        client[db_name][collection_name].insert_many(data)
    print(f"Data saved to MongoDB in the collection '{collection_name}' of database '{db_name}'")

# Example usage: scrape the postings found for "data+scientist" and store
# them in the `indeed` collection of the `test` database.
if __name__ == "__main__":
    position = "data+scientist"  # search terms, joined with "+" as they appear in the query string
    records = get_jobs(position)  # list of job dicts gathered from all result pages
    save_to_mongodb(records, "test", "indeed")
    print("Scraping completed and data saved to MongoDB")
