# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION AND PROCESSING TECHNIQUES
## Part I. Web scraping with API trick

### 1. Libraries and config parameters

In [None]:
import json
import os
import re
import socket
import ssl
from random import randint, uniform
from time import gmtime, sleep, strftime
from urllib.parse import quote, unquote, urlencode
from urllib.request import (
    HTTPError,
    HTTPSHandler,
    ProxyHandler,
    Request,
    URLError,
    build_opener,
    install_opener,
    urlopen,
)

from tqdm import tqdm

In [None]:
def read_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is opened explicitly as UTF-8 so that non-ASCII values are
    read the same way on every platform, instead of depending on the
    locale's default encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, encoding='utf-8') as file:
        return json.load(file)

# Scraper settings (user agent, base URL) live in a local JSON file
# rather than in the notebook itself.
creds = read_data('creds/site_scrap.json')
print(creds.keys())

In [None]:
# Pull the two request settings out of the loaded credentials in one step.
USER_AGENT, BASE_URL = creds['user_agent'], creds['base_url']
print('user agent:', USER_AGENT)
print('url to scrap:', BASE_URL)

In [None]:
# Lower and upper bounds (seconds) of the random pause after a failed request;
# url_content multiplies both by the current attempt count.
MIN_TIME_SLEEP = .5
MAX_TIME_SLEEP = 2
# Maximum number of request attempts per URL.
MAX_COUNTS = 2
# Per-request timeout (seconds) handed to urlopen.
TIMEOUT = 15

### 2. Main function for data gathering

In [None]:
def url_content(url_page, user_agent, 
                min_time_sleep, max_time_sleep, max_counts, timeout, 
                proxies=None, file_content=False, json_data=None):
    """Fetch ``url_page`` with retries and return the response body.

    Each attempt sends the given ``User-Agent`` header.  On ``HTTPError``,
    ``URLError`` or a socket timeout the failure is printed, the attempt
    counter is increased and, if attempts remain, the function sleeps for a
    random time in ``[counts * min_time_sleep, counts * max_time_sleep]``
    before retrying.

    Args:
        url_page: URL to request.
        user_agent: Value of the ``User-Agent`` request header.
        min_time_sleep: Lower bound (seconds) of the base retry pause.
        max_time_sleep: Upper bound (seconds) of the base retry pause.
        max_counts: Maximum number of attempts.
        timeout: Timeout (seconds) for each request.
        proxies: Optional mapping for ``ProxyHandler`` (scheme -> proxy URL).
            When given, the request goes through a dedicated opener and
            HTTPS certificate verification is disabled.
        file_content: If true, return the raw bytes; otherwise return the
            body decoded to ``str``.
        json_data: Optional object; when truthy it is JSON-encoded (UTF-8)
            and sent as the request body.

    Returns:
        ``bytes`` when ``file_content`` is true, ``str`` otherwise, or
        ``None`` if every attempt failed or the body could not be decoded.
    """
    # A truthy ``json_data`` becomes the request body on both code paths.
    payload = json.dumps(json_data).encode('utf-8') if json_data else None

    if proxies:
        # NOTE(review): certificate verification is switched off on the proxy
        # path, mirroring the unverified context the original code asked for.
        # Confirm this is really wanted before using it on untrusted networks.
        context = ssl.create_default_context()
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
        # A private opener is used instead of install_opener() + urlopen():
        # urlopen(context=...) builds its own opener and would skip the
        # installed proxy handler, and installing a global opener would leak
        # the proxy into later calls made without ``proxies``.
        opener = build_opener(
            ProxyHandler(proxies), HTTPSHandler(context=context)
        )
        open_url = opener.open
    else:
        open_url = urlopen

    counts = 0
    while counts < max_counts:
        request = Request(url_page, headers={'User-Agent': user_agent})
        try:
            # The context manager closes the response even if read() fails.
            with open_url(request, data=payload, timeout=timeout) as response:
                body = response.read()
                charset = response.headers.get_content_charset()
        # HTTPError is a subclass of URLError, so it must be handled first.
        except HTTPError as e:
            failure = ('HTTPError', e)
        except URLError as e:
            failure = ('URLError', e)
        except socket.timeout as e:
            failure = ('socket timeout', e)
        else:
            if file_content:
                return body
            try:
                # Responses without a declared charset are assumed UTF-8.
                return body.decode(charset or 'utf-8')
            except (LookupError, UnicodeDecodeError) as e:
                # Unknown or wrong charset: keep the best-effort None result.
                print('decode error | ', url_page, ' | ', e)
                return None

        counts += 1
        label, error = failure
        print(label + ' | ', url_page, ' | ', error, ' | counts: ', counts)
        # No point in pausing once there are no attempts left.
        if counts < max_counts:
            sleep(uniform(
                counts * min_time_sleep, counts * max_time_sleep
            ))
    return None

### 3. Data collection

#### 3.1. Macroeconomics

In [None]:
# Query for the macroeconomics topic; in English roughly:
# "macroeconomics, state, monetary policy".
search = 'макроэкономика государство денежно кредитная политика'

In [None]:
# Body of the search request: article mode, the query text, and a window of
# up to 1000 results starting at position 0.
data = {'mode': 'articles', 'q': search, 'size': 1000, 'from': 0}
print(data)

In [None]:
# Send the query to the site's search endpoint and keep the raw response
# bytes (file_content=True), with no proxy.
content = url_content(
    url_page=BASE_URL + '/api/search',
    user_agent=USER_AGENT,
    min_time_sleep=MIN_TIME_SLEEP,
    max_time_sleep=MAX_TIME_SLEEP,
    max_counts=MAX_COUNTS,
    timeout=TIMEOUT,
    proxies=None,
    file_content=True,
    json_data=data,
)

In [None]:
# Check what came back: raw bytes on success (file_content=True),
# or None if every attempt failed.
type(content)

In [None]:
# Parse the raw JSON response; the name is rebound to the decoded object.
content = json.loads(content)

In [None]:
# Check the type of the decoded response.
type(content)

In [None]:
# List the top-level keys of the decoded response.
content.keys()

In [None]:
# Number of article records returned for this query.
len(content['articles'])

In [None]:
# Peek at the first article record to see its structure.
content['articles'][0]

In [None]:
# Output directory for the downloaded article files. It is created locally
# here; an S3 object-storage location could be used instead.
folder = 'articles_data'
os.makedirs(name=folder, exist_ok=True)

In [None]:
# Store the macroeconomics results as articles_lbl_0.json.
file_path = f'{folder}/articles_lbl_0.json'
with open(file_path, mode='w') as file:
    json.dump(content, fp=file)

#### 3.2. Microeconomics

In [None]:
# Query for the microeconomics topic; in English roughly:
# "microeconomics, enterprise management, consumer".
search = 'микроэкономика управление предприятием потребитель'

In [None]:
# Same request body as before, now carrying the microeconomics query.
data = {'mode': 'articles', 'q': search, 'size': 1000, 'from': 0}
print(data)

In [None]:
# Fetch the raw search response for the microeconomics query.
content = url_content(
    url_page=BASE_URL + '/api/search',
    user_agent=USER_AGENT,
    min_time_sleep=MIN_TIME_SLEEP,
    max_time_sleep=MAX_TIME_SLEEP,
    max_counts=MAX_COUNTS,
    timeout=TIMEOUT,
    proxies=None,
    file_content=True,
    json_data=data,
)

In [None]:
# Decode the JSON response, then show how many articles it contains.
content = json.loads(content)
len(content['articles'])

In [None]:
# Peek at the first article record of the microeconomics results.
content['articles'][0]

In [None]:
# Store the microeconomics results as articles_lbl_1.json.
file_path = f'{folder}/articles_lbl_1.json'
with open(file_path, mode='w') as file:
    json.dump(content, fp=file)

#### 3.3. More topics

In [None]:
# Additional topic queries (rough English translations in the comments).
searches = [
    # "artificial intelligence, machine learning, big data"
    'искусственный интеллект машинное обучение большие данные',
    # "ecology, nature, pollution, green economy"
    'экология природа загрязнение зеленая экономика',
    # "management, personnel management"
    'менеджмент управление персоналом'
]

In [None]:
def save_search(i, folder, search, size=1000, offset=0):
    """Run one article search and save the decoded response as JSON.

    The query is sent to ``BASE_URL + '/api/search'`` through
    ``url_content`` and the decoded response is written to
    ``{folder}/articles_lbl_{i}.json``.

    Args:
        i: Index used in the output file name.
        folder: Existing directory that receives the output file.
        search: Query text, sent as the ``q`` field.
        size: Value of the ``size`` request field (defaults to 1000).
        offset: Value of the ``from`` request field (defaults to 0).

    Returns:
        The number of items under the ``'articles'`` key of the response.

    Raises:
        RuntimeError: If ``url_content`` returned ``None``, i.e. the request
            did not succeed.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the response has no ``'articles'`` key.
    """
    data = {
        'mode': 'articles',
        'q': search,
        'size': size,
        'from': offset
    }
    content = url_content(
        url_page=BASE_URL + '/api/search',
        user_agent=USER_AGENT, 
        timeout=TIMEOUT, 
        max_counts=MAX_COUNTS,
        min_time_sleep=MIN_TIME_SLEEP, 
        max_time_sleep=MAX_TIME_SLEEP,
        proxies=None, 
        file_content=True, 
        json_data=data
    )
    # url_content signals failure with None; report that clearly instead of
    # letting json.loads fail with an unrelated-looking TypeError.
    if content is None:
        raise RuntimeError(f'search request failed for query: {search!r}')
    content = json.loads(content)
    file_path = f'{folder}/articles_lbl_{i}.json'
    with open(file_path, 'w') as file:
        json.dump(content, file)
    return len(content['articles'])

In [None]:
# File indices 0 and 1 are already used by the two topics saved above,
# so the extra topics start at index 2.
for i, search in enumerate(searches):
    res = save_search(i + 2, folder, search)
    print(search, '| loaded', res, 'articles')