# Mastering Applied Skills in Management, Analytics and Entrepreneurship I

## DATA COLLECTION TECHNIQUES
## Part IV. Web scraping deeper dive

### 1. Libraries

In [None]:
# some basic libraries
import os
import re
import json
import socket
from random import randint, uniform
# for sending requests
from urllib.request import (
    Request, 
    urlopen, 
    URLError, 
    HTTPError, 
    ProxyHandler, 
    build_opener, 
    install_opener)
# to parse html data
from bs4 import BeautifulSoup
# for time delay while scraping
from time import sleep, gmtime, strftime
from time import sleep
from tqdm.notebook import tqdm
from urllib.parse import quote, unquote
# to work with the data
import pandas as pd

### 2. Tools and hints for requests

In [None]:
# --- request settings shared by the scraping cells below ---
# browser-like User-Agent header sent with every request
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/74.0.3729.169 YaBrowser/19.6.1.153 Yowser/2.5 Safari/537.36'
)
# bounds (in seconds) of the random pause made before a request
MIN_TIME_SLEEP = 0.1
MAX_TIME_SLEEP = 2
# how many failed attempts `get_content` tolerates before giving up
MAX_COUNTS = 2
# timeout (in seconds) handed to `urlopen`
TIMEOUT = 5
# bounds the per-house loop over the rate pages
MAX_PAGES = 5

In [None]:
def get_content_lite(url_page, timeout):
    """Download a page and return its raw body.

    A random pause between ``MIN_TIME_SLEEP`` and ``MAX_TIME_SLEEP`` seconds
    is made before the request so that the site is not overloaded.

    Args:
        url_page: URL of the page to download.
        timeout: timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as ``bytes`` (not decoded).

    Raises:
        Whatever ``urlopen`` raises (e.g. ``URLError``, ``HTTPError``,
        ``socket.timeout``); this lite version does not retry.
    """
    # sleep a while so as not to overload the site
    sleep(uniform(MIN_TIME_SLEEP, MAX_TIME_SLEEP))
    # make a request that looks like it comes from a regular browser
    request = Request(url_page, headers={'User-Agent': USER_AGENT})
    # read the response inside `with` so the connection is always closed,
    # even if reading fails half-way
    with urlopen(request, timeout=timeout) as response:
        return response.read()

In [None]:
url_main = 'https://piter-online.net/address'
print(url_main)
html = get_content_lite(url_main, timeout=TIMEOUT)
soup = BeautifulSoup(html, 'html.parser')

### 3. How to work with soup, examples

In [None]:
soup

In [None]:
soup.text

In [None]:
soup.find('meta')

In [None]:
soup.find_all('meta')

In [None]:
soup.find_all('div', class_="app131")

In [None]:
soup.find_all('div')

In [None]:
for element in soup.find_all('div'):
    print('*' * 50)
    print(element.text)

In [None]:
soup.find_all('div', class_="app131")[0].text

In [None]:
for element in soup.find_all('div', class_="app131"):
    print(element.text)

In [None]:
soup.a

In [None]:
soup.find_all('a')

In [None]:
soup.find_all('a', attrs={'datatest': 'top_provider_block'})

In [None]:
soup.find_all('a', attrs={'datatest': re.compile(r'top')})

### 4. Let's cook our soup

We can search with `CTRL+F` because of the power of Jupyter notebook!

#### Step 1. Regions of the city

In [None]:
# 
# <div class="app281"><a href="/address/адмиралтейский-id1192">Адмиралтейский</a></div>
# 
soup.find('div', attrs={'class': 'app281'})

In [None]:
# or another way
soup.find('div', class_ = 'app281')

In [None]:
# to find a url
soup.find('div', class_ = 'app281').a

In [None]:
soup.find('div', class_ = 'app281').a['href']

In [None]:
print('https://piter-online.net' + soup.find('div', class_ = 'app281').a['href'])

In [None]:
# now will find all regions of the city
soup.find_all('div', class_ = 'app281')

In [None]:
for item in soup.find_all('div', class_ = 'app281'):
    print(item.text, 'https://piter-online.net' + item.a['href'])

#### Step 2. Streets of the region

In [None]:
# now let's look at one region: the first one found on the page
# the href may contain Cyrillic characters (e.g. '/address/адмиралтейский-id1192'),
# so it is percent-encoded with quote() before being appended to the site root
url_main = 'https://piter-online.net' + quote(soup.find_all('div', class_ = 'app281')[0].a['href'])
print(url_main)
# download the region page; from here on `soup` holds this page
html = get_content_lite(url_main, timeout=TIMEOUT)
soup = BeautifulSoup(html, 'html.parser')

In [None]:
soup

In [None]:
# 
# <div class="app321 app322">
# <a href="/address/адмиралтейский-id1192/ул-1-я-конная-лахта-id377844"> 1-я Конная Лахта ул</a>
# </div>
# 
soup.find('div', attrs={'class': 'app321'})

In [None]:
for item in soup.find_all('div', class_ = 'app321'):
    print(item.text, 'https://piter-online.net' + item.a['href'])

#### Step 3. Houses on the street

In [None]:
# now let's look at one street: the first one found on the region page
# the href is percent-encoded with quote() before being appended to the site root
url_main = 'https://piter-online.net' + quote(soup.find_all('div', class_ = 'app321')[0].a['href'])
print(url_main)
# download the street page; from here on `soup` holds this page
html = get_content_lite(url_main, timeout=TIMEOUT)
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# list every house of the street together with its id and its link
house_divs = soup.find_all('div', class_='app258')
for item in house_divs:
    url_house = f"https://piter-online.net{item.a['href']}"
    # the id is everything after the first '=' of the link
    house_id = url_house.split('=', 1)[-1]
    print(item.text, house_id, url_house)

#### Step 3 (continued). Providers and prices for a house

In [None]:
# now let's look at the providers' rates for one house
# (`house_id` comes from the houses loop above, i.e. it is the id of the last house printed)
page = 1
url_main = f'https://piter-online.net/rates/{page}?house_id={house_id}'
print(url_main)
# download the rates page; from here on `soup` holds this page
html = get_content_lite(url_main, timeout=TIMEOUT)
soup = BeautifulSoup(html, 'html.parser')

In [None]:
soup

In [None]:
#
# <div class="app297"><a href="/providers/rostelecom/rates">Ростелеком</a></div>
#
soup.find('div', attrs={'class': 'app297'})

In [None]:
soup.find('div', attrs={'class': 'app372'})

In [None]:
for item in soup.find_all('div', class_ = 'app297'):
    print(item.text)

In [None]:
for item0, item1 in zip(
    soup.find_all('div', class_ = 'app297'),
    soup.find_all('div', class_ = 'app372')
):
    # 1st try
    print(item0.text, item1.text)
    # 2nd try
    #print(item0.text, item1.span)
    # 3rd
    #print(item0.text, item1.span.text)

## <font color='red'>INTERMEDIATE QUIZ</font>
We lost the discount price from the data, so:
1. Look at the table on the website and find the discount price in our soup
2. Update the code to print the discount price along with the base price

#### Step 4. Combine all together

In [None]:
# refresh our soup for full load
url_main = 'https://piter-online.net/address'
print(url_main)
html = get_content_lite(url_main, timeout=TIMEOUT)
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# some parameters for debug
DEBUG = True
# `end` is used as the upper slice bound (`items[:end]`) on every level:
# in debug mode only the first 2 items are scraped, otherwise all of them.
# NOTE: the "no limit" value must be None, not -1, because `items[:-1]`
# would silently drop the last item of every list.
end = 2 if DEBUG else None

In [None]:
# place to store the data
addresses = []

In [None]:
# --- crawl regions -> streets -> houses -> rate pages and collect the offers ---
def _get_soup(url):
    """Download *url* with ``get_content_lite`` and parse it with BeautifulSoup."""
    return BeautifulSoup(get_content_lite(url, timeout=TIMEOUT), 'html.parser')


# root of the site; the hrefs found on the pages are appended to it
base_url = url_main.replace('/address', '')

# NOTE: every level is parsed into its own variable, so the global `soup`
# (the page with the regions) is not overwritten and the cell can be re-run
# --- first loop is for data on the regions ---
for reg_data in tqdm(soup.find_all('div', class_='app281')[:end],
                     desc='regions'):
    url_reg = base_url + quote(reg_data.find('a')['href'])
    reg_soup = _get_soup(url_reg)
    # --- second loop is for streets ---
    for street_data in tqdm(reg_soup.find_all('div', class_='app321')[:end],
                            desc='streets of ' + reg_data.getText()):
        url_street = base_url + quote(street_data.find('a')['href'])
        street_soup = _get_soup(url_street)
        # --- this loop is for houses ---
        for houses_data in tqdm(street_soup.find_all('div', class_='app258')[:end],
                                desc='houses of ' + street_data.getText()):
            # the house id is everything after the first '=' of the link
            _, found, house_id = houses_data.find('a')['href'].partition('=')
            if not found:
                # no id in the link, so there is nothing to request for this house
                continue
            # --- NOTE here we use loop for all pages for one house ---
            # pages are numbered from 1, so MAX_PAGES itself is included
            for page in range(1, MAX_PAGES + 1):
                url_provs = f'https://piter-online.net/rates/{page}?house_id={house_id}'
                page_soup = _get_soup(url_provs)
                # an empty result list means the page has no 'app271' blocks
                # (presumably the offer cards - TODO confirm), so stop paging;
                # a list never equals 0, hence the truthiness test
                if not page_soup.find_all('div', class_='app271'):
                    break
                for prov_name, price in zip(
                    page_soup.find_all('div', class_='app297')[:end],
                    page_soup.find_all('div', class_='app372')[:end]
                ):
                    # fall back to the whole price block when it has no <span>
                    price_tag = price.span if price.span is not None else price
                    # --- FINALLY collect all the data in a form of dictionary ---
                    addresses.append(
                        {
                            'region': reg_data.getText(),
                            'street': street_data.getText(),
                            'house': houses_data.getText(),
                            'provider': prov_name.text,
                            'price': price_tag.text,
                        }
                    )

In [None]:
# convert data to dataframe
df = pd.DataFrame(addresses)
print(df.shape)
df.head()

### 5. Hints

Hints for data request:
1. Proxy
2. Exception
3. Retry strategy (unlimited or a limited number of attempts)

In [None]:
def get_content(url_page, timeout, proxies=None, file=False):
    """Download a page with retries and optional proxies.

    Args:
        url_page: URL of the page to download.
        timeout: timeout in seconds for one attempt.
        proxies: optional mapping of scheme to proxy URL understood by
            ``ProxyHandler`` (e.g. ``{'https': 'http://host:port'}``).
            The proxy is used for this request only; it is not installed
            as the global opener.
        file: if true, return the raw ``bytes``; otherwise return the body
            decoded to ``str``.

    Returns:
        The content (``bytes`` or ``str``), or ``None`` when all
        ``MAX_COUNTS`` attempts failed or the body could not be decoded.
    """
    # local imports: these names are not provided by the notebook's import cell
    import ssl
    from urllib.request import HTTPSHandler

    counts = 0
    content = None
    while counts < MAX_COUNTS:
        try:
            request = Request(url_page)
            request.add_header('User-Agent', USER_AGENT)
            if proxies:
                # NOTE(security): certificate verification is switched off for
                # proxied requests (kept from the original implementation)
                context = ssl._create_unverified_context()
                # `urlopen(..., context=...)` builds its own opener and would
                # ignore the proxy, so both handlers go into one opener
                opener = build_opener(ProxyHandler(proxies),
                                      HTTPSHandler(context=context))
                response = opener.open(request, timeout=timeout)
            else:
                response = urlopen(request, timeout=timeout)
            # `with` closes the connection even if reading fails
            with response:
                raw = response.read()
                charset = response.headers.get_content_charset()
            if file:
                content = raw
            else:
                try:
                    # servers do not always declare a charset; assume UTF-8 then
                    content = raw.decode(charset or 'utf-8')
                except (LookupError, UnicodeDecodeError):
                    content = None
            break
        # HTTPError is a subclass of URLError, so it has to be caught first
        except HTTPError as e:
            label, error = 'HTTPError', e
        except URLError as e:
            label, error = 'URLError', e
        except socket.timeout as e:
            label, error = 'socket timeout', e
        counts += 1
        print(label + ' | ', url_page, ' | ', error, ' | counts: ', counts)
        if counts < MAX_COUNTS:
            # the pause grows with every failed attempt; `uniform` is used
            # because the bounds are floats, which `randint` does not accept
            sleep(uniform(counts * MIN_TIME_SLEEP, counts * MAX_TIME_SLEEP))
    return content

Example of hints for data search within soup:

In [None]:
url_main = 'https://piter-online.net/address'
print(url_main)
html = get_content_lite(url_main, timeout=TIMEOUT)
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# print the links of all "app..." divs that lead into the /address section
# NOTE: r"app*" would mean "ap" followed by any number of "p" and so match
# every class containing "ap"; r"^app" keeps only classes starting with "app"
for x in soup.find_all('div', class_=re.compile(r"^app")):
    link = x.find('a')
    if link is None:
        # this div has no link inside
        continue
    # an <a> tag without href gives an empty string instead of an exception
    s = link.get('href', '')
    if s.startswith('/address'):
        print(s)

## LAB WORK #3

Rewrite code:
1. Add more data about providers' offers (name of the tariff)
2. Try to run code NOT in debug mode, collect more data
3. Find top provider across every street / region
4. Try `heavy` version of `get_content` function (see above)