# Mastering Applied Skills in Management, Analytics and Entrepreneurship I

## DATA COLLECTION TECHNIQUES
## Part IV. Web scraping deeper dive

The JupyterHub installation includes [Selenium with Python](https://selenium-python.readthedocs.io/), which provides a simple API for writing functional/acceptance tests with Selenium WebDriver, or simply for scraping sites on the Internet.

### 1. Selenium library

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver import FirefoxOptions

In [None]:
# Start a Firefox WebDriver session in headless mode (no browser window).
opts = FirefoxOptions()
opts.add_argument('--headless')
browser = webdriver.Firefox(options=opts)

### 2. Basic demo

In [None]:
# Load the job board's landing page in the browser session.
URL = 'https://ai-jobs.net'
browser.get(URL)

In [None]:
# Take the text of the root <html> element, i.e. the text of the whole page.
text_from_site = browser.find_element('xpath', 'html').text

In [None]:
# Show the captured page text.
print(text_from_site)

In [None]:
# Number of characters captured; a baseline to compare with after loading more results.
len(text_from_site)

### 3. Click buttons

In [None]:
# Locate the link by its visible text ("Load more") and click it.
browser.find_element('link text', "Load more").click()

In [None]:
# Re-read the page text after the click.
text_from_site = browser.find_element('xpath', 'html').text

In [None]:
# Length after the click, to compare with the value measured before it.
len(text_from_site)

### 4. Click buttons wisely

In [None]:
import time

In [None]:
# Click "Load more" up to three times, reporting how much page text is
# available after each click.
# Imported in this cell because only this loop needs it.
from selenium.common.exceptions import WebDriverException

counter = 0
while counter < 3:  # or 'while True:' to keep clicking until the link is gone
    try:
        browser.find_element('link text', "Load more").click()
    except WebDriverException as e:
        # The link is missing or cannot be clicked: stop loading. Only Selenium
        # errors are handled here, so programming errors (e.g. a NameError)
        # still surface instead of being printed and ignored.
        print(e)
        break
    counter += 1
    # Pause BEFORE measuring so the newly requested results have a chance to
    # be rendered; measuring right after the click can report the old length.
    time.sleep(1)
    text_from_site = browser.find_element('xpath', 'html').text
    print('click', counter, '| text', len(text_from_site))
# Final snapshot of the page text and its length (displayed as the cell output).
text_from_site = browser.find_element('xpath', 'html').text
len(text_from_site)

In [None]:
from selenium.webdriver.common.by import By

In [None]:
# Collect every <a> element whose class attribute equals this exact string
# (the XPath test @class='...' compares the whole attribute value).
jobs = browser.find_elements(By.XPATH, "//a[@class='col list-group-item-action px-2 py-3']")
len(jobs)  # how many matching elements were found

In [None]:
# href attribute (link target) of the first matched element.
jobs[0].get_attribute(name='href')

In [None]:
# title attribute of the first matched element.
jobs[0].get_attribute(name='title')

In [None]:
# NOTE: start the XPath with './/' to search inside this element only;
# a plain '//' searches from the start of the document tree instead.
# This span's text is what the collection loop later stores under 'locate'.
jobs[0].find_element(By.XPATH, ".//span[@class='d-none d-md-block text-break job-list-item-location']").text

In [None]:
# Text of the span that the collection loop later stores under 'type'.
jobs[0].find_element(By.XPATH, ".//span[@class='badge badge-secondary badge-pill my-md-1']").text

In [None]:
# This span is not collected by the later loop.
# NOTE(review): presumably the small-screen variant of the 'level' badge
# (class ends in 'd-md-none') — confirm against the page markup.
jobs[0].find_element(By.XPATH, ".//span[@class='badge badge-info badge-pill my-md-1 d-md-none']").text

In [None]:
# Text of the span that the collection loop later stores under 'level'.
jobs[0].find_element(By.XPATH, ".//span[@class='badge badge-info badge-pill my-md-1 d-none d-md-inline-block']").text

In [None]:
# Text of the span that the collection loop later stores under 'salary_range'.
jobs[0].find_element(By.XPATH, ".//span[@class='badge badge-success badge-pill d-none d-md-inline-block']").text

In [None]:
# Text of the span that the collection loop later stores under 'salary'.
jobs[0].find_element(By.XPATH, ".//span[@class='badge badge-success badge-pill d-md-none']").text

In [None]:
# Text of the paragraph that the collection loop later stores under 'company'.
jobs[0].find_element(By.XPATH, ".//p[@class='m-0 text-muted job-list-item-company']").text

In [None]:
# Text of the heading that the collection loop later stores under 'position'.
jobs[0].find_element(By.XPATH, ".//h2[@class='h4 mb-1']").text

In [None]:
# Texts of all matching badges (find_elements returns a list, possibly empty);
# the collection loop later stores this list under 'skills'.
[x.text for x in jobs[0].find_elements(By.XPATH, ".//span[@class='badge badge-light badge-pill']")]

In [None]:
# Texts of all matching badges; the collection loop later stores this list
# under 'benefits'.
[x.text for x in jobs[0].find_elements(By.XPATH, ".//span[@class='badge badge-success badge-pill']")]

In [None]:
import time
from tqdm.auto import tqdm

In [None]:
# Build one dict per job card in `jobs` and collect them in `all_jobs`.
# Imported in this cell because only the helper below needs it.
from selenium.common.exceptions import NoSuchElementException


def _optional_text(element, xpath, default=''):
    """Return the text of the first descendant of *element* matching *xpath*.

    If no such descendant exists, *default* is returned. Only the
    "element not found" case is treated as a missing value: any other failure
    (lost browser session, stale element, a kernel interrupt, ...) propagates
    instead of silently turning into an empty field.
    """
    try:
        return element.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


all_jobs = []
for job in tqdm(jobs):
    # Key order is kept stable because it determines the DataFrame column order.
    job_dict = {
        'url': job.get_attribute(name='href'),
        'title': job.get_attribute(name='title'),
        # Required fields: a missing element raises, as it signals that the
        # page layout no longer matches these XPaths.
        'locate': job.find_element(
            By.XPATH,
            ".//span[@class='d-none d-md-block text-break job-list-item-location']",
        ).text,
        'type': job.find_element(
            By.XPATH,
            ".//span[@class='badge badge-secondary badge-pill my-md-1']",
        ).text,
        # Optional fields: not every card has them, so fall back to ''.
        'level': _optional_text(
            job,
            ".//span[@class='badge badge-info badge-pill my-md-1 d-none d-md-inline-block']",
        ),
        'salary_range': _optional_text(
            job,
            ".//span[@class='badge badge-success badge-pill d-none d-md-inline-block']",
        ),
        'salary': _optional_text(
            job,
            ".//span[@class='badge badge-success badge-pill d-md-none']",
        ),
        'company': job.find_element(
            By.XPATH,
            ".//p[@class='m-0 text-muted job-list-item-company']",
        ).text,
        'position': job.find_element(By.XPATH, ".//h2[@class='h4 mb-1']").text,
        # find_elements returns an empty list when nothing matches, so these
        # two need no exception handling.
        'skills': [
            x.text
            for x in job.find_elements(By.XPATH, ".//span[@class='badge badge-light badge-pill']")
        ],
        'benefits': [
            x.text
            for x in job.find_elements(By.XPATH, ".//span[@class='badge badge-success badge-pill']")
        ],
    }
    all_jobs.append(job_dict)

In [None]:
import pandas as pd

In [None]:
# Turn the list of job dicts into a table: one row per job, one column per key.
df = pd.DataFrame(all_jobs)

In [None]:
# Display the table.
df

### 5. Single position with Selenium

In [None]:
# URL of the first scraped job; the cells below open this page.
print(all_jobs[0]['url'])

In [None]:
# Recreate webdriver again
# Recreate the webdriver for the single-job page.
# Shut down the previous session first: rebinding `browser` on its own would
# leave the earlier headless Firefox session running in the background.
try:
    browser.quit()
except NameError:
    pass  # no earlier session in this kernel (cell run on its own)
opts = FirefoxOptions()
opts.add_argument('--headless')
browser = webdriver.Firefox(options=opts)

In [None]:
# Open the page of the first scraped job.
URL = all_jobs[0]['url']
browser.get(URL)

In [None]:
# Text of the whole job page (root <html> element).
browser.find_element('xpath', 'html').text

In [None]:
# Locate the first <script type="application/ld+json"> element on the page.
# Despite its name, `text` holds the element object, not a string.
text = browser.find_element(By.XPATH, "//script[@type='application/ld+json']")

In [None]:
# Raw contents of that <script> element, read via its innerHTML.
text.get_attribute('innerHTML')

In [None]:
import json

In [None]:
# Parse the script contents as JSON and display the parsed result.
data = json.loads(text.get_attribute('innerHTML'))
data

In [None]:
# Look up the 'baseSalary' entry of the parsed JSON.
# NOTE(review): assumes `data` is a dict that contains 'baseSalary'; a posting
# without that key would raise KeyError here — confirm it is always present.
data['baseSalary']