In [1]:
from bs4 import BeautifulSoup
import requests
import re
import json

In [2]:
def get_html(url):
    response = requests.get(url)
    return response.text

In [3]:
tvs_url = 'https://rozetka.com.ua/all-tv/c80037/'

In [4]:
from urllib.parse import urlparse

def entity_name(url):
    return urlparse(url).path[1:-1].replace('/','_')

entity_name(tvs_url)

'all-tv_c80037'

In [14]:
from collections import namedtuple

CatalogPage = namedtuple('CatalogPage', 'max_page_num items')

def get_items_page(catalog_url, page_num, log = None):
    page_url = catalog_url if page_num == 0 else catalog_url + 'page={}/'.format(page_num + 1)
    
    soup = BeautifulSoup(get_html(page_url), 'lxml')
    
    title_divs = soup.find_all('div', class_ = 'g-i-tile-i-title clearfix') 

    titles = []
    for div in title_divs:
        a = div.find('a')
        titles.append((a.get("href"), a.text.strip()))
        
    page_items = soup.find_all('li', class_ = 'paginator-catalog-l-i pos-fix')

    max_page_num = [int(item.get('id').replace('page', '').strip()) for item in page_items][-1] 
    
    if not (log is None):
        print('{} {}'.format(catalog_url, page_num), file = log, end = "\n")
    
    return CatalogPage(max_page_num, titles)

In [6]:
item_url = 'https://rozetka.com.ua/philips_43pus6503_12/p39097504/'

In [7]:
entity_name(item_url)

'philips_43pus6503_12_p39097504'

In [35]:
Comment = namedtuple('Comment', 'text up_count down_count replies')


CommentsPage = namedtuple('CommentsPage', 'max_page_num comments')

def get_comments_page(item_url, page_num, log = None):
    comments_url = item_url + 'comments/'
    page_url = comments_url if page_num == 0 else comments_url + 'page={}/'.format(page_num + 1)
    
    soup = BeautifulSoup(get_html(page_url), 'lxml')
    article_divs = soup.find_all('article', class_ = 'pp-review-i')
    
    comments = []
    
    for div in article_divs:
        text = [text_div.text.strip().replace(u'\xa0', ' ') \
                for text_div in div.find_all('div', class_ = 'pp-review-text-i')]
    
        replies = [rep_div.text.strip().replace(u'\xa0', ' ') \
                   for rep_div in div.find_all('div', class_ = 'pp-replies-text')]
    
        up_count = div.find('a', class_ = 'pp-review-vote-positive').\
            find('span', class_ = 'pp-review-vote-count').text.strip()
        
        down_count = div.find('a', class_ = 'pp-review-vote-negative').\
            find('span', class_ = 'pp-review-vote-count').text.strip()
    
        comments.append(Comment(text, int(up_count) if up_count else 0, \
                                int(down_count) if down_count else 0, replies))
    
    page_items = soup.find_all('li', class_ = 'paginator-l-i ng-star-inserted')
    
    max_page_num = [int(item.text.replace('...','0').strip()) for item in page_items][-1]
    
    if not (log is None):
        print('{} {}'.format(item_url, page_num), file = log, end = "\n")
    
    return CommentsPage(max_page_num, comments)

In [None]:
<li class="paginator-l-i ng-star-inserted"><!----><a class="novisited paginator-l-link ng-star-inserted"
href="/lg_43uk6200plb1/p61571186/comments/page=9/">9</a><!----></li>

In [21]:
import time
import json

def scrape_catalog(catalog_url, data_path):
    pages = []
    
    with open(data_path + entity_name(catalog_url) + '.log', 'a') as log:
    
        pages.append(get_items_page(catalog_url, 0, log)) 
        max_page_num = pages[-1].max_page_num
    
        page_num = 1
        while page_num < max_page_num:
            time.sleep(10)
            pages.append(get_items_page(catalog_url, page_num, log))
            page_num += 1
        
    with open(data_path + entity_name(catalog_url) + '.json', 'w') as result:
        json.dump(pages, result)

In [15]:
#scrape_catalog(tvs_url, 'data/')

https://rozetka.com.ua/all-tv/c80037/ 0
https://rozetka.com.ua/all-tv/c80037/ 1
https://rozetka.com.ua/all-tv/c80037/ 2
https://rozetka.com.ua/all-tv/c80037/ 3
https://rozetka.com.ua/all-tv/c80037/ 4
https://rozetka.com.ua/all-tv/c80037/ 5
https://rozetka.com.ua/all-tv/c80037/ 6
https://rozetka.com.ua/all-tv/c80037/ 7
https://rozetka.com.ua/all-tv/c80037/ 8
https://rozetka.com.ua/all-tv/c80037/ 9
https://rozetka.com.ua/all-tv/c80037/ 10
https://rozetka.com.ua/all-tv/c80037/ 11
https://rozetka.com.ua/all-tv/c80037/ 12
https://rozetka.com.ua/all-tv/c80037/ 13
https://rozetka.com.ua/all-tv/c80037/ 14
https://rozetka.com.ua/all-tv/c80037/ 15
https://rozetka.com.ua/all-tv/c80037/ 16
https://rozetka.com.ua/all-tv/c80037/ 17
https://rozetka.com.ua/all-tv/c80037/ 18
https://rozetka.com.ua/all-tv/c80037/ 19
https://rozetka.com.ua/all-tv/c80037/ 20
https://rozetka.com.ua/all-tv/c80037/ 21
https://rozetka.com.ua/all-tv/c80037/ 22
https://rozetka.com.ua/all-tv/c80037/ 23


In [19]:
catalog = []
with open('data/' + entity_name(tvs_url) + '.json') as f_cat:
    catalog = [item for page in json.load(f_cat) for item in page[1]]

catalog[:10]

[['https://rozetka.com.ua/lg_43uk6200plb1/p61571186/',
  'Телевизор LG 43UK6200PLA Оплата частями 24 мес.!'],
 ['https://rozetka.com.ua/hisense_h43a6100_/p40090880/',
  'Телевизор Hisense H43A6100 Smart TV, 4K, 43"'],
 ['https://rozetka.com.ua/samsung_ue58nu7100uxua/p54812544/',
  'Телевизор Samsung UE58NU7100UXUA или Оплата частями на 24 мес.!'],
 ['https://rozetka.com.ua/samsung_ue49n5500auxua/p46780456/',
  'Телевизор Samsung UE49N5500AUXUA Оплата частями на 12 мес.!'],
 ['https://rozetka.com.ua/philips_50pfs5823_12/p52131804/',
  'Телевизор Philips 50PFS5823/12 + 0% Кредит на 10 мес!'],
 ['https://rozetka.com.ua/samsung_qe49q6fnauxua/p46776608/',
  'Телевизор Samsung QE49Q6FNAUXUA Оплата частями на 24 мес.!'],
 ['https://rozetka.com.ua/ergo_le40ct5530ak/p62757784/',
  'Телевизор Ergo LE40CT5530AK + Оплата частями на 10 платежей!'],
 ['https://rozetka.com.ua/panasonic_tx_32dr400/p12689378/',
  'Телевизор Panasonic Viera TX-32DR400 32", HD, T2 Оплата частями на 12 мес. или Сертификат

In [20]:
len(catalog)

758

In [26]:
def scrape_comments(catalog, data_path):
    for item in catalog:
        item_url = item[0]
        with open(data_path + entity_name(item_url) + '-comments.log', 'a') as log:
            pages = []
            pages.append(get_comments_page(item_url, 0, log))
            max_page_num = pages[-1].max_page_num
            page_num = 1
            while page_num < max_page_num:
                time.sleep(3)
                pages.append(get_comments_page(item_url, page_num, log))
                
            with open(data_path + entity_name(item_url) + '-comments.json', 'w') as result:
                json.dump(pages, result)    
        

In [36]:
get_comments_page(catalog[0][0], 0)

CommentsPage(max_page_num=9, comments=[])

In [27]:
scrape_comments(catalog, 'data/')

IndexError: list index out of range