## Importing all necessary libraries

In [1]:
# Importing all necessary libraries
import numpy as np
import pymongo
import json
import time
from time import sleep
from random import randint
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


#  Connection to MongoDB

In [2]:
# Connection to MongoDB
# Open a client against the local MongoDB server, then resolve the database
# and the collection handle that the scraping cell inserts documents into.
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client.get_database('job_database_2')
collection = db.get_collection('jobs_collection_3')


# Set up options for the Chrome driver and define pagination URL

In [3]:
# Set up options for the Chrome driver and define pagination URL
# Search parameters that get substituted into the URL template below.
job_ = 'Python+developer'
location = ''
# Template slots, in order: search query (q), location (l), result offset (start).
pagination_url = 'https://in.indeed.com/jobs?q={}&l={}&radius=35&filter=0&sort=date&start={}'
# Default Chrome options; shared by every driver the notebook starts.
option = webdriver.ChromeOptions()


# Open the Chrome driver to get job count and max pages

In [4]:
# Open the Chrome driver to get job count and max pages
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
try:
    driver.get(pagination_url.format(job_, location, 0))
    sleep(randint(2, 6))  # randomized pause before reading the page
    p = driver.find_element(By.CLASS_NAME, 'jobsearch-JobCountAndSortPane-jobCount').text
finally:
    # Always release the browser, even when the page load or element lookup fails.
    driver.quit()

# The first space-separated token of the pane text is parsed as the job count
# after removing thousands separators.
# NOTE(review): assumes the text starts with the number (e.g. "10,575 jobs") - confirm.
job_count = int(p.split(' ')[0].replace(',', ''))
# Ceiling division so that a final, partially filled page is still counted.
# NOTE(review): 15 is the assumed number of results per page - confirm.
max_iter_pgs = -(-job_count // 15)
print("No of pages we have to make iteration",max_iter_pgs)

No of pages we have to make iteration 705


# Start scraping job details

In [5]:
# Start scraping job details
# Number of result pages to fetch. Kept at 3 for demo purposes; use the
# max_iter_pgs variable instead to fetch all the jobs.
pages_to_scrape = 3


def _extract_salary(card):
    """Return the salary text shown on a job card, or None when there is none.

    The explicit salary snippet is tried first; the estimated salary is the
    fallback.
    """
    for class_name in ("salary-snippet-container", "estimated-salary"):
        try:
            return card.find_element(By.CLASS_NAME, class_name).text
        except NoSuchElementException:
            continue
    return None


def _extract_job(card):
    """Return a dict describing one job card (title, link, id, company, location, date).

    Raises NoSuchElementException when any of the looked-up elements is
    missing from the card.
    """
    job_title = card.find_element(By.CLASS_NAME, "jobTitle")
    link = job_title.find_element(By.CSS_SELECTOR, "a")  # looked up once, reused below
    href = link.get_attribute("href")
    return {
        'title': job_title.text,
        'href': href,
        'id': link.get_attribute("id"),
        'company': card.find_element(By.CLASS_NAME, "companyName").text,
        'location': card.find_element(By.CLASS_NAME, "companyLocation").text,
        'date': card.find_element(By.CLASS_NAME, "date").text,
        'href_full': href,  # same value as 'href'; key kept for existing consumers
    }


# job_list and salary_list are parallel lists: entry i of each describes the same card.
job_list = []
salary_list = []
full_job_details = []

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
try:
    sleep(randint(2, 6))
    for page in range(pages_to_scrape):
        # The start offset advances by 10 per page.
        driver.get(pagination_url.format(job_, location, page * 10))
        sleep(randint(2, 4))
        job_page = driver.find_element(By.ID, "mosaic-jobResults")
        for card in job_page.find_elements(By.CLASS_NAME, "job_seen_beacon"):
            job_list.append(_extract_job(card))
            salary_list.append(_extract_salary(card))
finally:
    # Release the browser as soon as scraping ends, including on errors.
    driver.quit()

# Store job and salary data in MongoDB
documents = [
    {'job_details': job, 'salary': salary}
    for job, salary in zip(job_list, salary_list)
]
if documents:
    # One bulk insert instead of one round-trip per document; skipped when
    # nothing was scraped.
    collection.insert_many(documents)
# Close the MongoDB connection
# NOTE(review): the client is created in the connection cell; re-run that cell
# before re-running this one, since a closed client may not be reusable.
client.close()

# Complete data: attach each salary to its job record. This happens after the
# insert so the stored 'job_details' sub-document does not also carry the salary.
for job, salary in zip(job_list, salary_list):
    job['salary'] = salary
    full_job_details.append(job)
full_job_details_pretty = json.dumps(full_job_details, indent=4)
print("This is the scrapped job details",full_job_details_pretty)

This is the scrapped job details [
    {
        "title": "Python and React Developer",
        "href": "https://in.indeed.com/company/Albireo-Tech-System/jobs/Python-Developer-7a78bb4a37f1b088?fccid=4495f8b04aecf27d&vjs=3",
        "id": "job_7a78bb4a37f1b088",
        "company": "Albireo Tech System",
        "location": "Remote in Remote",
        "date": "Posted\nJust posted",
        "href_full": "https://in.indeed.com/company/Albireo-Tech-System/jobs/Python-Developer-7a78bb4a37f1b088?fccid=4495f8b04aecf27d&vjs=3",
        "salary": "\u20b9500 - \u20b91,500 a day"
    },
    {
        "title": "Python Developer",
        "href": "https://in.indeed.com/company/Quantsapp-Private-Limited/jobs/Python-Developer-947c80af3a24e8df?fccid=157ea3ff67920563&vjs=3",
        "id": "job_947c80af3a24e8df",
        "company": "Quantsapp Private Limited",
        "location": "Mumbai, Maharashtra",
        "date": "Posted\nJust posted",
        "href_full": "https://in.indeed.com/company/Quantsapp-P

# Using NumPy to calculate the average salary for Python developers in your city

In [6]:
# Extract and filter data for the specific location
import re

locations = "Mumbai, Maharashtra"
# Factor that converts a quoted pay period into a yearly figure. Salary strings
# that mention neither period are skipped.
PERIOD_TO_YEARLY = {'a month': 12, 'a year': 1}


def _annual_salary(salary_str):
    """Return the yearly salary (int) described by a scraped salary string.

    Every integer in the string is scaled to a yearly amount and the floor of
    their mean is returned, so a range collapses to its midpoint.
    Returns None when the string is empty or None, mentions neither 'a month'
    nor 'a year', or contains no digits.
    """
    if not salary_str:
        return None
    for period, factor in PERIOD_TO_YEARLY.items():
        if period in salary_str:
            # Drop the currency sign and thousands separators before pulling out the numbers.
            cleaned = salary_str.replace("₹", "").replace(",", "")
            amounts = [int(number) * factor for number in re.findall(r'\d+', cleaned)]
            if not amounts:
                return None  # avoids dividing by zero on a digit-free string
            return sum(amounts) // len(amounts)
    return None


salaries = []
for item in full_job_details:
    # Only exact matches on the location text are considered.
    if item.get("location") != locations:
        continue
    yearly_salary = _annual_salary(item.get("salary"))
    if yearly_salary is not None:
        salaries.append(yearly_salary)

# Calculate the average salary using NumPy
if salaries:
    average_salary = np.mean(salaries)
    print(f"The average salary in {locations} is: ₹{average_salary:,.2f}")
else:
    # np.mean([]) would emit a RuntimeWarning and return nan; report the gap explicitly instead.
    average_salary = float('nan')
    print(f"No monthly or yearly salary data found for {locations}")


The average salary in  is: ₹543,000.00
