# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION TECHNIQUES
## Part I. Web scraping intro

### 1. Libraries

Let's start with a very basic example. We will need the [urllib](https://docs.python.org/3/library/urllib.html) library for opening and reading URLs. We will also use the [Beautiful Soup](https://beautiful-soup-4.readthedocs.io/en/latest/) Python library to parse HTML data.

In [None]:
import os
import re
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

### 2. Get text from HTML page

In [None]:
URL_2_SCRAP = 'https://ai-jobs.net'
print(URL_2_SCRAP)

In [None]:
# Download the page. The `with` block closes the HTTP connection as soon
# as the body has been read instead of leaving it open until the response
# object happens to be garbage-collected.
request = Request(URL_2_SCRAP)
with urlopen(request) as response:
    html = response.read()

In [None]:
html

In [None]:
soup = BeautifulSoup(html, 'html.parser')

In [None]:
soup

In [None]:
soup.contents

In [None]:
soup.contents[0]

In [None]:
soup.text

In [None]:
# Flatten the page text: every newline, tab and carriage return becomes a
# single space (done in one pass with a translation table).
text = soup.text.translate(str.maketrans('\n\t\r', '   '))

In [None]:
' '.join(text.split())

### 3. Simple NLP analysis

In [None]:
# Lower-case the text and keep only letters, using the regular expressions
# library: every run of non-letter characters collapses into one space.
# The ranges а-я / А-Я do not contain ё / Ё, so those two letters are
# listed explicitly; otherwise words would be split at them.

text = re.sub(r'[^а-яА-ЯёЁa-zA-Z]+', ' ', text).strip().lower()
text

In [None]:
text_as_list = text.split()
text_as_list[:5]

In [None]:
from collections import Counter

In [None]:
Counter(text_as_list)

In [None]:
freqs = dict(Counter(text_as_list))

In [None]:
freqs

In [None]:
# Rebuild the word -> count mapping ordered from the most to the least
# frequent word. The sort is stable, so words with equal counts keep their
# previous relative order.
freqs = {
    word: count
    for word, count in sorted(freqs.items(), key=lambda pair: -pair[1])
}
freqs

In [None]:
import matplotlib.pyplot as plt

In [None]:
freqs_bar = {k: v for k, v in freqs.items() if v >= 10}

In [None]:
plt.figure(figsize=(16, 6))
plt.bar(*zip(*freqs_bar.items()))
plt.xticks(rotation='vertical')
plt.show()

### 4. Save HTML to disk as raw data

In [None]:
with open('ai_jobs_page.html', 'w') as file:
    file.write(html.decode())

In [None]:
with open('ai_jobs_page.html', 'r') as file:
    text = file.read()

In [None]:
text

### 5. More than text

In [None]:
soup

In [None]:
soup.find_all('img')[1:10]

In [None]:
soup.find_all('img', attrs={'alt': 'Micron Technology logo'})

In [None]:
soup.find_all('img')[3]['src']

In [None]:
# or you can use `find` with `attrs`

soup.find('img', attrs={'alt': 'Micron Technology logo'})['src']

In [None]:
from urllib.parse import urljoin

# Resolve the logo's `src` attribute against the site root. `urljoin`
# handles root-relative ('/media/x.png'), relative ('media/x.png') and
# absolute ('https://...') values, whereas plain string concatenation
# yields a double slash or a malformed URL for the first and last case.
URL_2_SCRAP_IMG = urljoin(
    'https://ai-jobs.net/',
    soup.find('img', attrs={'alt': 'Micron Technology logo'})['src'],
)
print(URL_2_SCRAP_IMG)

In [None]:
# Open the image URL. Note that the body is NOT read here: the open
# response object itself is stored in `img`, because a later cell passes
# `img` straight to `Image.open`, which reads from it as a file-like object.
# The response therefore has to stay open until the image has been loaded.
request = Request(URL_2_SCRAP_IMG)
response = urlopen(request)
img = response

In [None]:
img

In [None]:
from PIL import Image
import numpy as np

In [None]:
plt.figure(figsize=(12, 8))
img = Image.open(img)
plt.imshow(np.array(img))
plt.show()

In [None]:
# JPEG cannot store an alpha channel or a palette image, so saving a logo
# in a mode such as RGBA or P directly raises an OSError. Converting to
# RGB first makes the save work for any source image mode.
img.convert('RGB').save('logo.jpg')

### 6. Get data

In [None]:
# look carefully at this soup and you will find that 
# job descriptions are inside the `a` tags
# like <a class="col pt-2 pb-3" 
#     href="/job/84649-senior-data-engineer/" 
#     title="View details for this job">

soup

In [None]:
soup.find('a')

In [None]:
soup.find_all('a')

In [None]:
soup.find_all('a', class_='col pt-2 pb-3')

In [None]:
jobs_list = soup.find_all('a', class_='col pt-2 pb-3')

In [None]:
# or in `dict` key-value style

jobs_list = soup.find_all('a', {'class': 'col pt-2 pb-3'})

In [None]:
jobs_list[0]

In [None]:
jobs_list[1]

In [None]:
jobs_list[1]['href']

In [None]:
jobs_list[1]['title']

In [None]:
jobs_list[1].find('h3')

In [None]:
jobs_list[1].find('h3').text

In [None]:
jobs_list[1].find_all('span')

In [None]:
jobs_list[1].find_all('span')[0]['class']

In [None]:
# location of the job 

jobs_list[1].find('span', class_='d-block d-md-none text-break')

In [None]:
# position for the job 

jobs_list[1].find('span', class_='badge rounded-pill text-bg-info my-md-1 d-md-none')

In [None]:
# requirements

jobs_list[1].find_all('span', class_='badge rounded-pill text-bg-light')

In [None]:
[x.text 
 for x in jobs_list[1].find_all(
     'span', 
     class_='badge rounded-pill text-bg-light'
 )]

In [None]:
# Print a short summary (title, link, location, position, requirements)
# for every job card found on the page.
for job in jobs_list:
    print(job['title'])
    print(URL_2_SCRAP + job['href'])

    # `find` returns None when a card has no such span. Test for that
    # explicitly instead of letting `.text` fail or hiding every possible
    # error behind a bare `except:`.
    location = job.find('span', class_='d-block d-md-none text-break')
    print(
        'location ->',
        location.text if location is not None else ''
    )

    position = job.find(
        'span',
        class_='badge rounded-pill text-bg-info my-md-1 d-md-none'
    )
    if position is not None:
        print('position ->', position.text)
    else:
        print('no position defined')

    print(
        'requirements ->',
        [x.text for x in job.find_all('span', class_='badge rounded-pill text-bg-light')]
    )
    print('*' * 50)

Let's collect the data into a list of dictionaries first and then convert it to a pandas DataFrame.

In [None]:
from tqdm.notebook import tqdm

# `class` strings of the <span> holding each single-valued field, listed in
# the order the fields should appear in the resulting records.
# NOTE(review): the 'time' selector uses 'badge-secondary badge-pill' while
# every other selector here uses the 'rounded-pill text-bg-*' naming;
# confirm it still matches the page markup.
SPAN_CLASS_BY_FIELD = {
    'location': 'd-block d-md-none text-break',
    'time': 'badge badge-secondary badge-pill my-md-1',
    'position': 'badge rounded-pill text-bg-secondary my-md-1 ms-1',
    'level': 'badge rounded-pill text-bg-info my-md-1 d-md-none',
    'salary_range': 'badge rounded-pill text-bg-info my-md-1 d-none d-md-inline-block',
    'salary': 'badge rounded-pill text-bg-success d-none d-md-inline-block',
}


def _text_or_empty(tag):
    """Return `tag.text`, or '' when `tag` is None (element not found)."""
    return tag.text if tag is not None else ''


# list for all jobs collected
all_jobs = []

for job in tqdm(jobs_list):

    # dictionary for a single job; a missing element yields '' rather than
    # an AttributeError, so one incomplete card cannot abort the collection
    job_dict = {
        'description': _text_or_empty(job.find('h3')),
        'url': URL_2_SCRAP + job['href'],
    }
    for field, css_class in SPAN_CLASS_BY_FIELD.items():
        job_dict[field] = _text_or_empty(job.find('span', class_=css_class))

    # multi-valued fields: collect the text of every matching badge
    job_dict['requirements'] = [
        x.text for x in job.find_all('span', class_='badge rounded-pill text-bg-light')
    ]
    job_dict['offers'] = [
        x.text for x in job.find_all('span', class_='badge rounded-pill text-bg-success')
    ]

    all_jobs.append(job_dict)

print('jobs collected:', len(all_jobs))

In [None]:
import pandas as pd

In [None]:
pd.DataFrame(all_jobs)

### 7. Get data by single item

In [None]:
URL_2_SCRAP_SINGLE = all_jobs[0]['url']
URL_2_SCRAP_SINGLE

In [None]:
# Download the page of a single job. The `with` block closes the HTTP
# connection as soon as the body has been read.
request = Request(URL_2_SCRAP_SINGLE)
with urlopen(request) as response:
    html = response.read()

In [None]:
soup = BeautifulSoup(html, 'html.parser')

In [None]:
soup

In [None]:
soup.find_all('script', {'type': 'application/ld+json'})

In [None]:
soup.find_all('script', {'type': 'application/ld+json'})[0]

In [None]:
soup.find_all('script', {'type': 'application/ld+json'})[1]

In [None]:
import json

In [None]:
text = soup.find_all('script', {'type': 'application/ld+json'})[1].text
print(text)

In [None]:
data = json.loads(text)
data

In [None]:
data['baseSalary']

In [None]:
data['identifier']['value']

### 8. Saving data

In [None]:
folder = 'ai_jobs_data'
os.makedirs(folder, exist_ok=True)

In [None]:
for job in all_jobs[:3]:
    print(job['url'])

In [None]:
from time import sleep
from random import uniform

In [None]:
import re

class LazyDecoder(json.JSONDecoder):
    """JSON decoder that repairs two common defects before parsing.

    * A backslash that does not start a valid JSON escape sequence
      (backslash followed by a double quote, backslash, slash, b, f, n, r,
      t, or u plus four hex digits) is doubled, so it is read as a literal
      backslash. Valid escape sequences are left untouched.
    * A trailing comma directly before a closing ``]`` is removed.

    Both repairs are plain text substitutions applied to the whole
    document, so they also affect matching text inside string values.
    """

    # A backslash, optionally followed by the remainder of a valid JSON
    # escape. Scanning left to right in whole escape units means adjacent
    # stray backslashes are each seen, and an escaped backslash is never
    # mistaken for the start of a new escape.
    _ESCAPE = re.compile(r'\\(u[0-9a-fA-F]{4}|["\\/bfnrt])?')
    _TRAILING_COMMA = re.compile(r',(\s*])')

    @staticmethod
    def _repair_escape(match):
        """Keep a valid escape as is; double a stray backslash."""
        return match.group(0) if match.group(1) else '\\\\'

    def decode(self, s, **kwargs):
        """Repair `s` as described on the class, then decode it.

        Extra keyword arguments are passed on to `JSONDecoder.decode`.
        Raises `json.JSONDecodeError` if the repaired text is still not
        valid JSON.
        """
        s = self._ESCAPE.sub(self._repair_escape, s)
        s = self._TRAILING_COMMA.sub(r'\1', s)
        return super().decode(s, **kwargs)

In [None]:
# Download every job page, extract its JSON-LD job description and store
# it as `<folder>/<job id>.json`.
for job in tqdm(all_jobs):
    request = Request(job['url'])
    # Close the HTTP connection as soon as the body has been read.
    with urlopen(request) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    ld_json_scripts = soup.find_all('script', {'type': 'application/ld+json'})
    if len(ld_json_scripts) < 2:
        # The job data is taken from the second JSON-LD block. Skip a page
        # that lacks it rather than aborting the whole run with IndexError.
        print('no job data found:', job['url'])
    else:
        text = ld_json_scripts[1].text
        try:
            # strict=False tolerates raw control characters inside strings
            data = json.loads(text, strict=False)
        except json.JSONDecodeError as e:
            # Only genuine parse errors are retried with the lenient decoder;
            # any other exception is a real problem and should surface.
            data = json.loads(text, cls=LazyDecoder)
            print(e, data)

        job_id = data['identifier']['value']
        file_path = f'{folder}/{job_id}.json'
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file)

    # random pause between requests to avoid hammering the server
    sleep(uniform(.1, 1.1))

But here is a small problem with collecting more data...