# DATA SCIENCE JOB MARKET ANALYSIS

## 1. Web Scraping using Selenium

In [1]:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv


## Step-by-Step: How to Find the Right CSS Selectors for Selenium
### 1. Open the Website in Your Browser (Google Chrome)
For example:

https://www.naukri.com/data-scientist-jobs-1

### 2. Right-Click on the Element You Want to Scrape
Let’s say you want to grab the job title or company name.

Right-click the job title → Click “Inspect”

Your browser's Developer Tools (DevTools) will open, and the corresponding HTML element will be highlighted.

### 3. Identify the HTML Tag and Class/ID
For example:

```html
<div class="cust-job-tuple">
  <a class="title" href="...">Data Scientist</a>
  <a class="comp-name">ABC Corp</a>
  <span class="exp">
    <span class="expwdth">2-5 Yrs</span>
  </span>
  <span class="loc">
    <span class="locWdth">Bengaluru</span>
  </span>
</div>
```

From this structure, you figure out the correct CSS Selectors:

- Job card wrapper → `div.cust-job-tuple`

- Role/title → `a.title`

- Company → `a.comp-name`

- Experience → `span.exp span.expwdth`

- Location → `span.loc span.locWdth`

### The scraped data is saved to "DataScience_jobs.csv" in the current working directory.

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import time
import random
import pandas as pd

# Scrape job cards from paginated search-result pages with Selenium and save
# them to "DataScience_jobs.csv". For each card the role, company, location,
# experience and skills are collected; a field that cannot be read is stored
# as an empty string so the row is still kept.


def _safe_text(card, selector):
    """Return the stripped text of the first element in ``card`` matching ``selector``.

    Returns "" when the lookup or the text read raises a Selenium error
    (for example, the field is absent from this card). Only
    ``WebDriverException`` is caught, so a kernel interrupt still propagates.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text.strip()
    except WebDriverException:
        return ""


# Set up the Chrome driver
driver = webdriver.Chrome()

# Data storage: parallel lists, one entry per job card; each key becomes a
# DataFrame column when the results are saved.
jobs = {
    "roles": [],
    "companies": [],
    "locations": [],
    "experience": [],
    "skills": []
}

base_url = "https://www.naukri.com/data-scientist-jobs"
max_pages = 50

# The try/finally guarantees the browser is closed even if the loop is
# interrupted or fails with an error that is not handled below.
try:
    driver.maximize_window()

    for i in range(max_pages):
        # NOTE(review): page 0 uses the bare URL and page 1 the "-1" suffix; if
        # the site serves its first results page under both, those rows are
        # scraped twice — confirm against the site's pagination scheme.
        url = base_url if i == 0 else f"{base_url}-{i}"
        print(f"\nScraping page {i}: {url}")

        try:
            driver.get(url)

            # Check for CAPTCHA or bot detection (page source fetched once).
            page_html = driver.page_source.lower()
            if "captcha" in page_html or "verify" in page_html:
                print("⚠️ Captcha or bot check detected. Skipping this page.")
                continue

            # Wait for job cards to appear; raises TimeoutException after 15 s.
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.cust-job-tuple"))
            )

            job_cards = driver.find_elements(By.CSS_SELECTOR, "div.cust-job-tuple")

            # Stop early if no job listings are found
            if not job_cards:
                print("❌ No job listings found, stopping early.")
                break

            for job in job_cards:
                role = _safe_text(job, "a.title")
                company = _safe_text(job, "a.comp-name")
                exp = _safe_text(job, "span.exp span.expwdth")
                location = _safe_text(job, "span.loc span.locWdth")
                try:
                    skills_list = job.find_elements(By.CSS_SELECTOR, "ul.tags-gt li")
                    skills = ', '.join([s.text.strip() for s in skills_list])
                except WebDriverException:
                    skills = ""

                jobs["roles"].append(role)
                jobs["companies"].append(company)
                jobs["locations"].append(location)
                jobs["experience"].append(exp)
                jobs["skills"].append(skills)

            time.sleep(random.uniform(2, 5))  # Delay to reduce bot detection

        # TimeoutException is listed first because it is a subclass of
        # WebDriverException and must not be caught by the broader handler.
        except TimeoutException:
            print("❌ Timeout waiting for job cards. Skipping this page.")
            continue
        except WebDriverException as e:
            print(f"⚠️ WebDriver error: {e}")
            break
finally:
    # Always release the browser process, even on an unexpected error.
    driver.quit()

# Save using your format
DS_jobs_df = pd.DataFrame(jobs)
DS_jobs_df.to_csv("DataScience_jobs.csv", index=False)
print("\n✅ Scraping complete. Data saved to 'DataScience_jobs.csv'.")



Scraping page 0: https://www.naukri.com/data-scientist-jobs

Scraping page 1: https://www.naukri.com/data-scientist-jobs-1

Scraping page 2: https://www.naukri.com/data-scientist-jobs-2

Scraping page 3: https://www.naukri.com/data-scientist-jobs-3

Scraping page 4: https://www.naukri.com/data-scientist-jobs-4

Scraping page 5: https://www.naukri.com/data-scientist-jobs-5

Scraping page 6: https://www.naukri.com/data-scientist-jobs-6

Scraping page 7: https://www.naukri.com/data-scientist-jobs-7

Scraping page 8: https://www.naukri.com/data-scientist-jobs-8

Scraping page 9: https://www.naukri.com/data-scientist-jobs-9

Scraping page 10: https://www.naukri.com/data-scientist-jobs-10

Scraping page 11: https://www.naukri.com/data-scientist-jobs-11

Scraping page 12: https://www.naukri.com/data-scientist-jobs-12

Scraping page 13: https://www.naukri.com/data-scientist-jobs-13

Scraping page 14: https://www.naukri.com/data-scientist-jobs-14

Scraping page 15: https://www.naukri.com/data-