# DATA SCIENCE JOB MARKET ANALYSIS

## 1. Web Scraping using Selenium

In [4]:
import csv
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


## Step-by-Step: How to Know What to Use in Selenium
### 1. Open the Website in Your Browser (Google Chrome)
For example:

https://www.naukri.com/data-scientist-jobs-1

### 2. Right-Click on the Element You Want to Scrape
Let’s say you want to grab the job title or company name.

Right-click the job title → Click “Inspect”

Your browser's Developer Tools (DevTools) will open, and the corresponding HTML element will be highlighted.

### 3. Identify the HTML Tag and Class/ID
For example:

```html
<div class="cust-job-tuple">
  <a class="title" href="...">Data Scientist</a>
  <a class="comp-name">ABC Corp</a>
  <span class="exp">
    <span class="expwdth">2-5 Yrs</span>
  </span>
  <span class="loc">
    <span class="locWdth">Bengaluru</span>
  </span>
</div>
```

From this structure, you figure out the correct CSS Selectors:

- Job card wrapper → `div.cust-job-tuple`

- Role/title → `a.title`

- Company → `a.comp-name`

- Experience → `span.exp span.expwdth`

- Location → `span.loc span.locWdth`

In [5]:
# --- Scraper configuration ---------------------------------------------------
# Result pages are addressed by a numeric suffix; numbering starts at 1
# (e.g. https://www.naukri.com/data-scientist-jobs-1).
SEARCH_URL = "https://www.naukri.com/data-scientist-jobs-{page}"
NUM_PAGES = 50          # how many result pages to visit
PAGE_LOAD_TIMEOUT = 10  # seconds to wait for job cards to appear on a page

JOB_CARD_SELECTOR = "div.cust-job-tuple"
SKILL_SELECTOR = "ul.tags-gt li"
# Output column -> CSS selector (relative to one job card) of its text element.
FIELD_SELECTORS = {
    "roles": "a.title",
    "companies": "a.comp-name",
    "locations": "span.loc span.locWdth",
    "experience": "span.exp span.expwdth",
}


def _field_text(card, selector):
    """Return the stripped text of the first element matching `selector`.

    Args:
        card: WebElement of a single job card.
        selector: CSS selector evaluated relative to `card`.

    Returns:
        The element's text, or "" when the card has no such element or the
        card went stale while it was being read (best-effort scraping).
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        return ""


def _skills_text(card):
    """Return the card's skill tags joined as "a, b, c" ("" if there are none)."""
    try:
        # find_elements returns an empty list (no exception) when nothing matches.
        skill_tags = card.find_elements(By.CSS_SELECTOR, SKILL_SELECTOR)
        return ", ".join(tag.text.strip() for tag in skill_tags)
    except StaleElementReferenceException:
        return ""


# Column-wise accumulator: every scraped job card appends exactly one value to
# each list, so index i across all lists describes the same job.
jobs = {
    "roles": [],
    "companies": [],
    "locations": [],
    "experience": [],
    "skills": []
}

driver = webdriver.Chrome()
try:
    driver.maximize_window()

    for page in range(1, NUM_PAGES + 1):
        driver.get(SEARCH_URL.format(page=page))

        # Job cards are looked up only after at least one is present in the DOM.
        # A page that never shows one is skipped instead of aborting the run.
        try:
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, JOB_CARD_SELECTOR))
            )
        except TimeoutException:
            print(f"Page {page}: no job cards after {PAGE_LOAD_TIMEOUT}s - skipped")
            continue

        for card in driver.find_elements(By.CSS_SELECTOR, JOB_CARD_SELECTOR):
            # Build the whole row first so the column lists stay aligned even
            # if reading one of the fields fails unexpectedly.
            row = {
                column: _field_text(card, selector)
                for column, selector in FIELD_SELECTORS.items()
            }
            row["skills"] = _skills_text(card)

            for column, value in row.items():
                jobs[column].append(value)
finally:
    # Always close the browser and the chromedriver process, even on error
    # or manual interruption; `jobs` keeps whatever was collected so far.
    driver.quit()

In [6]:
# Turn the column-wise `jobs` dict into a table (one row per scraped job card)
# and write it as CSV into the current working directory.
# `pd` comes from the imports cell at the top of the notebook.
DS_jobs_df = pd.DataFrame(data=jobs)
DS_jobs_df.to_csv(path_or_buf="DataScience_jobs.csv")

### The scraped data is saved to the current working directory (the folder the notebook is run from) as `DataScience_jobs.csv`.