### <span style="color:#800000">Nakuri Scraping - Job postings</span>

### <span style="color:#FF00FF">Import libraries</span>

In [1]:
import requests
import time
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import pandas as pd

### <span style="color:#FF00FF">Configure selenium driver</span>

In [2]:
def configure_driver():
    ua = UserAgent()
    user_agent = ua.random
    print(user_agent)
    options = Options()
    options.add_argument(f'user-agent = {user_agent}')
    options.add_argument('headless')
    driver = webdriver.Chrome('chromedriver', options = options)
    return driver

In [3]:
driver = configure_driver()

Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36


### <span style="color:#FF00FF">Extract the data from HTML tags</span>

In [4]:

details_driver = configure_driver()

def get_jobdetails(job_url,domain):
    
    job_details = []
    
    details_driver.get(job_url)
    soup=BeautifulSoup(details_driver.page_source, 'lxml') 

    try:
        title = soup.find(attrs={'class':"jd-header-title"}).text
        experience = soup.find(attrs={'class':"exp"}).text
        salary = soup.find(attrs={'class':"salary"}).text
        location = soup.find(attrs={'class':'loc'}).find('a').text

        description_box = soup.find(attrs={'class':"dang-inner-html"})
        desc = description_box.get_text(separator=" ").rstrip("\n")
        desc = desc.lower().replace("job description","")
        desc = desc.lower().replace("roles and responsibilities","")


        detail = soup.find(attrs={'class':"other-details"}).findAll(attrs={'class':"details"})
        details=[]

        for i in detail:
            details.append(i.span.text)

        role = details[0]
        industry_type = details[1]
        functional_area = details[2]
        employment_type = details[3]
        role_category = details[4]

        education = soup.find(attrs={'class':"education"}).findAll(attrs={'class':'details'})
        qualification=[]
        for i in education:
            qualification.append(i.text)

        sk = soup.find(attrs={'class':"key-skill"}).findAll('a')
        skills=[]
        for i in sk:
            skills.append(i.text)

        company = soup.find(attrs={'class':"jd-header-comp-name"}).find(attrs={'class':"pad-rt-8"}).text

    except Exception as e:
        #print(e,job_url)
        pass

    job_details = (
                    domain,
                    job_url,
                    title,
                    company,
                    experience,
                    salary,
                    location,
                    desc,
                    role,
                    industry_type,
                    qualification,
                    functional_area,
                    employment_type,
                    role_category,
                    skills
                  )
    
    return job_details

Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36


### <span style="color:#FF00FF">Search Job postings with specific Domain name</span>

In [6]:
base_url = 'https://www.naukri.com'

search = ['information-technology-jobs',
         'engineering-jobs',
         'accounting-jobs',
         'architecture-jobs',
         'automobile-jobs', 
         'electrical-jobs',
         'ecommerce-jobs']
        
job_details = []

MAX_PAGE_NUM  = 10
MAX_PAGE_DIG = 3

profiles = (len(search)*MAX_PAGE_NUM*20)

with tqdm(total=profiles) as pbar:
    
    for area in search:  
        pbar.set_description(area)
        for i in range(1, MAX_PAGE_NUM+1):
            page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
            search_url = f'{base_url}/{area}-{page_num}' 
            driver.get(search_url)
            time.sleep(3)
            lst=driver.find_elements_by_css_selector(".jobTuple.bgWhite.br4.mb-8") 


            for job in lst:
                driver.implicitly_wait(10)
                try:
                    currenturl=job.find_element_by_css_selector("a.title.fw500.ellipsis").get_attribute('href')
                    details = get_jobdetails(currenturl,area)
                    job_details.append(details)
                except Exception as e:
                     pass
                pbar.update(1)

HBox(children=(FloatProgress(value=0.0, max=140.0), HTML(value='')))




### <span style="color:#FF00FF">Convert Result to DataFrame</span>

In [7]:
columns = [
    "domain",
    "link",
    "title",
    "company",
    "experience",
    "salary",
    "location",
    "description",
    "role",
    "industry_type",
    "qualification",
    "functional_area",
    "employment_type",
    "role_category",
    "skills"]


df = pd.DataFrame(job_details,columns=columns)

df.domain.value_counts()

electrical-jobs                20
automobile-jobs                20
information-technology-jobs    19
ecommerce-jobs                 19
accounting-jobs                18
engineering-jobs               18
architecture-jobs               9
Name: domain, dtype: int64

### <span style="color:#FF00FF">Export Dataset</span>

In [9]:
df.to_csv("nakuri_jobdesc.csv",index=False)