# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION TECHNIQUES
## Part I. Web scraping intro

### 1. Libraries

Let's start with a very basic example. We will need the [urllib](https://docs.python.org/3/library/urllib.html) library for opening and reading URLs. We will also use the [Beautiful Soup](https://beautiful-soup-4.readthedocs.io/en/latest/) Python library to parse HTML data.

In [None]:
import os
import re
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

### 2. Get text from HTML page

In [None]:
URL_2_SCRAP = 'https://ai-jobs.net'

In [None]:
# Download the page body as raw bytes. The `with` block closes the HTTP
# connection as soon as the body has been read instead of leaving it open.
request = Request(URL_2_SCRAP)
with urlopen(request) as response:
    html = response.read()

In [None]:
html

In [None]:
soup = BeautifulSoup(html, 'html.parser')

In [None]:
soup

In [None]:
soup.contents

In [None]:
soup.text

In [None]:
# Turn every newline, tab and carriage return of the page text into a plain
# space, using one translation pass instead of repeated replace() calls.
whitespace_to_space = str.maketrans({'\n': ' ', '\t': ' ', '\r': ' '})
text = soup.text.translate(whitespace_to_space)

In [None]:
' '.join(text.split())

### 3. Simple NLP analysis

In [None]:
# Lowercase the text and keep letters only: every run of characters that is
# not a Latin or Cyrillic letter is collapsed into a single space.
# The Cyrillic letters ё/Ё lie outside the а-я / А-Я code point ranges, so they
# are listed explicitly; otherwise words containing them would be split apart.
text = re.sub(r'[^а-яА-ЯёЁa-zA-Z]+', ' ', text).strip().lower()
text

In [None]:
text_as_list = text.split()
text_as_list[:5]

In [None]:
from collections import Counter

In [None]:
Counter(text_as_list)

In [None]:
freqs = dict(Counter(text_as_list))

In [None]:
freqs

In [None]:
# Re-order the word counts from most to least frequent. The sort is stable,
# so words with equal counts keep their original relative order.
ranked_pairs = list(freqs.items())
ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)
freqs = dict(ranked_pairs)
freqs

In [None]:
import matplotlib.pyplot as plt

In [None]:
freqs_bar = {k: v for k, v in freqs.items() if v >= 10}

In [None]:
plt.figure(figsize=(16, 6))
plt.bar(*zip(*freqs_bar.items()))
plt.xticks(rotation='vertical')
plt.show()

### 4. Save HTML to disk as raw data

In [None]:
# Persist the response body exactly as it was received. `html` is already a
# bytes object, so writing in binary mode avoids decoding it and re-encoding
# it with the platform's default text encoding, which can fail or alter the
# data on systems whose default is not UTF-8.
with open('ai_jobs_page.html', 'wb') as file:
    file.write(html)

In [None]:
with open('ai_jobs_page.html', 'r') as file:
    text = file.read()

In [None]:
text

### 5. More than text

In [None]:
soup

In [None]:
soup.find_all('img')

In [None]:
soup.find_all('img', attrs={'alt': 'Oak Ridge National Laboratory logo'})

In [None]:
soup.find_all('img', attrs={'alt': re.compile(r".*Gorovoy")})

In [None]:
soup.find_all('img')[19]['src']

In [None]:
from urllib.parse import urljoin

# Resolve the image's `src` against the site root. urljoin handles relative
# paths, root-relative paths ("/static/...") and already-absolute URLs, where
# plain string concatenation would produce a doubled slash or a broken URL.
# NOTE(review): index 19 is tied to the page layout at the time of writing.
URL_2_SCRAP_IMG = urljoin('https://ai-jobs.net/', soup.find_all('img')[19]['src'])
print(URL_2_SCRAP_IMG)

In [None]:
request = Request(URL_2_SCRAP_IMG)
response = urlopen(request)
img = response

In [None]:
img

In [None]:
from PIL import Image
import numpy as np

In [None]:
plt.figure(figsize=(12, 8))
img = Image.open(img)
plt.imshow(np.array(img))
plt.show()

In [None]:
# JPEG cannot store alpha or palette images (e.g. RGBA / P logos from PNG or
# GIF files); saving those directly raises OSError. Convert to RGB only when
# the current mode is not one the JPEG writer accepts.
jpeg_ready = img if img.mode in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr') else img.convert('RGB')
jpeg_ready.save('logo.jpg')

### 6. Get data

In [None]:
soup

In [None]:
soup.find('a')

In [None]:
soup.find_all('a')

In [None]:
soup.find_all('a', class_="col list-group-item-action px-2 py-3")

In [None]:
jobs_list = soup.find_all('a', class_="col list-group-item-action px-2 py-3")

In [None]:
jobs_list[0]

In [None]:
jobs_list[1]

In [None]:
jobs_list[1]['href']

In [None]:
jobs_list[1]['title']

In [None]:
jobs_list[1].find_all('span')

In [None]:
jobs_list[1].find_all('span')[0]['class']

In [None]:
jobs_list[1].find('span', class_='d-block d-md-none text-break job-list-item-location')

In [None]:
jobs_list[1].find_all('span', class_='badge badge-light badge-pill')

In [None]:
[x.text for x in jobs_list[1].find_all('span', class_='badge badge-light badge-pill')]

In [None]:
from urllib.parse import urljoin

# Print a short summary of every job card found on the listing page.
for job in jobs_list:
    print(job['title'])
    # Resolve the card's link against the base URL rather than concatenating.
    print(urljoin(URL_2_SCRAP, job['href']))
    # Not every card is guaranteed to have a location span; find() returns
    # None in that case, so fall back to an empty string instead of crashing.
    location = job.find(
        'span',
        class_='d-block d-md-none text-break job-list-item-location'
    )
    print(
        'location ->',
        location.text if location is not None else ''
    )
    print(
        'requirements ->',
        [x.text for x in job.find_all('span', class_='badge badge-light badge-pill')]
    )
    print('*' * 50)

In [None]:
from urllib.parse import urljoin

# Output key -> exact `class` attribute of the <span> that holds a single
# value for that key. Missing spans produce an empty string.
SINGLE_VALUE_SPANS = {
    'location': 'd-block d-md-none text-break job-list-item-location',
    'time': 'badge badge-secondary badge-pill my-md-1',
    'position': 'badge badge-info badge-pill my-md-1 d-md-none',
    'level': 'badge badge-info badge-pill my-md-1 d-none d-md-inline-block',
    'salary_range': 'badge badge-success badge-pill d-none d-md-inline-block',
    'salary': 'badge badge-success badge-pill d-md-none',
}

# Output key -> `class` attribute of spans that may occur several times;
# the texts of all matches are collected into a list.
MULTI_VALUE_SPANS = {
    'requirements': 'badge badge-light badge-pill',
    'offers': 'badge badge-success badge-pill',
}

# Build one dict per job card. Keys are inserted in a fixed order so the
# resulting DataFrame columns stay: description, url, location, time,
# position, level, salary_range, salary, requirements, offers.
# No variable named `time` is created here, so this cell can be re-run
# without shadowing the `time` module used by the download loop.
all_jobs = []
for job in jobs_list:
    job_dict = {
        'description': job['title'],
        # urljoin resolves the link correctly even if it is relative.
        'url': urljoin(URL_2_SCRAP, job['href']),
    }
    for field, css_class in SINGLE_VALUE_SPANS.items():
        span = job.find('span', class_=css_class)
        job_dict[field] = span.text if span is not None else ''
    for field, css_class in MULTI_VALUE_SPANS.items():
        job_dict[field] = [
            x.text for x in job.find_all('span', class_=css_class)
        ]
    all_jobs.append(job_dict)
all_jobs

In [None]:
import pandas as pd

In [None]:
pd.DataFrame(all_jobs)

### 7. Get data by single item

In [None]:
URL_2_SCRAP = 'https://ai-jobs.net/job/32218-data-science-content-intern-remote/'

In [None]:
# Download the single job page. The `with` block closes the HTTP connection
# once the body has been read.
request = Request(URL_2_SCRAP)
with urlopen(request) as response:
    html = response.read()

In [None]:
soup = BeautifulSoup(html, 'html.parser')

In [None]:
soup

In [None]:
soup.find('script', type="application/ld+json")

In [None]:
import json

In [None]:
text = soup.find('script', type="application/ld+json").text
print(text)

In [None]:
# strict=False lets the parser accept raw control characters (such as
# newlines) inside string values, matching how the download loop below
# parses the same kind of JSON-LD payload.
data = json.loads(text, strict=False)
data

In [None]:
data['baseSalary']

In [None]:
data['identifier']['value']

### 8. Saving data

In [None]:
folder = 'ai_jobs_data'
os.makedirs(folder, exist_ok=True)

In [None]:
for job in all_jobs[:3]:
    print(job['url'])

In [None]:
import time
from tqdm.auto import tqdm

In [None]:
from urllib.error import URLError

# Download every job page, extract its JSON-LD metadata and store it as
# <folder>/<job id>.json. A page that cannot be fetched or has no JSON-LD
# block is reported and skipped, so one bad page does not abort the crawl.
for job in tqdm(all_jobs):
    request = Request(job['url'])
    try:
        # The context manager closes the connection after the body is read.
        with urlopen(request) as response:
            html = response.read()
    except URLError as err:  # HTTPError is a subclass of URLError
        print(f"skipping {job['url']}: {err}")
    else:
        soup = BeautifulSoup(html, 'html.parser')
        script = soup.find('script', type="application/ld+json")
        if script is None:
            print(f"skipping {job['url']}: no JSON-LD block found")
        else:
            text = script.text
            # strict=False tolerates raw control characters in string values.
            data = json.loads(text, strict=False)

            job_id = data['identifier']['value']
            file_path = os.path.join(folder, f'{job_id}.json')
            with open(file_path, 'w', encoding='utf-8') as file:
                json.dump(data, file)

    # Pause between requests to avoid hammering the server.
    time.sleep(.7)