# Data Science Job Market Analysis in India

First, we import all the necessary tools. Most of these come from the selenium package, which drives a real browser and is therefore less likely to trip bot checks than plain HTTP requests.

In [2]:
import csv
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

To enable Safari WebDriver automation, we have to run `sudo safaridriver --enable` in the terminal. We also have to make sure the developer settings are enabled under Safari > Settings > Advanced (check the small box at the very bottom). Now we can navigate to https://www.naukri.com/data-scientist-jobs.

Next let's think about which elements we want to scrape from each job post. For example, let's say we want to scrape:
- job title
- company
- experience required
- location
- skills required

We'll right-click on each element and select "Inspect". The relevant information is the element's class name (the value of its `class` attribute), which we will use to build CSS selectors. This screenshot shows an example using the job title in the top-most post.

![Alt text](./assets/inspect.jpg "")

Now let's set up the web driver and scrape the result pages.


In [3]:
def _text_or_empty(card, css_selector):
    """Return the stripped text of the first element in `card` matching `css_selector`.

    Returns "" when the card contains no matching element, so a job post with
    a missing field still contributes one complete row. Only the "element not
    found" case is handled; any other error propagates to the caller.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except NoSuchElementException:
        return ""


driver = webdriver.Safari()
driver.maximize_window()
URL = "https://www.naukri.com/data-scientist-jobs"
NUM_PAGES = 50  # number of result PAGES to visit (each page lists many job posts)

# Column-oriented accumulator: one list per field, appended to in lockstep so
# that it can later be passed straight to pd.DataFrame.
jobs = {
    "roles": [],
    "companies": [],
    "locations": [],
    "experience": [],
    "skills": [],
}

try:
    for i in range(NUM_PAGES):
        # NOTE(review): the page suffix starts at 0 here; if the site numbers
        # its result pages from 1, page "-0" may duplicate the first page -
        # confirm against the site's pagination.
        driver.get(f"{URL}-{i}")  # navigates to page i of results

        # Explicitly wait (up to 10 s) until job cards are present on this page.
        # The full class attribute reads
        # "cust-job-tuple layout-wrapper lay-2 sjw__tuple", which is really four
        # space-separated class names; matching on the first one is enough.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.cust-job-tuple"))
        )

        job_cards = driver.find_elements(By.CSS_SELECTOR, "div.cust-job-tuple")

        # For each job card pull: role, company, experience, location, skills.
        for job in job_cards:
            role = _text_or_empty(job, "a.title")
            company = _text_or_empty(job, "a.comp-name")
            exp = _text_or_empty(job, "span.expwdth")
            location = _text_or_empty(job, "span.locWdth")

            # All <li> elements inside the card's <ul class="tags-gt">, one per
            # skill tag. find_elements returns an empty list (it does not
            # raise) when nothing matches, so no exception handling is needed.
            skills_list = job.find_elements(By.CSS_SELECTOR, "ul.tags-gt li")
            skills = ', '.join([s.text.strip() for s in skills_list])

            jobs["roles"].append(role)
            jobs["companies"].append(company)
            jobs["locations"].append(location)
            jobs["experience"].append(exp)
            jobs["skills"].append(skills)
finally:
    # Always release the browser session, even if a page load times out;
    # whatever was collected so far remains available in `jobs`.
    driver.quit()
    

    

In [22]:
import pandas as pd
import numpy as np

# Sanity check: count the entries collected for every field. The five lists
# are appended to in lockstep, so printing the first count is representative.
headers = ["roles", "companies", "locations", "experience", "skills"]
lengths = list(map(lambda name: len(jobs[name]), headers))
print(lengths[0])

# Turn the column lists into a dataframe and persist it next to the notebook.
# With its default settings, to_csv also writes the row index as a first column.
DS_jobs_df = pd.DataFrame(data=jobs)
DS_jobs_df.to_csv(path_or_buf="DataScience_jobs.csv")

1009


In [20]:
# Preview what goes into the CSV file: head() shows the first rows of the
# dataframe (five by default), which is enough to spot empty or misaligned fields.
DS_jobs_df.head()

Unnamed: 0,roles,companies,locations,experience,skills
0,GN SC&O - S&P Data Scientist - Specialist,Accenture,Bengaluru,3-7 Yrs,"project management, data analysis, spend analy..."
1,Urgent Hiring For FullStack-Data Scientist-Imm...,Genpact,"Hybrid - Gurugram, Bengaluru",7-12 Yrs,"Natural Language Processing, Machine Learning ..."
2,Sr. Data Scientist,,,,
3,Artificial Intelligence Developer,,,,
4,Data Science Manager,,,,
