In [87]:
from requests import get, Timeout
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
import random
from datetime import datetime as dt
import json

In [149]:
class ParserQuotes:
    """Scraper for the paginated quote listing at quotes.toscrape.com.

    Creating an instance immediately walks the listing page by page and
    stores everything it collected in ``self.content``: a list of dicts with
    the keys ``"quote"``, ``"author"`` and ``"tag"`` (a list of tag names).
    """

    # Base address of a listing page; the page number is appended to it.
    url = "https://quotes.toscrape.com/page/"
    # Headers sent with every request.
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla / 5.0(Macintosh; Intel Mac OS X 10_14_6)'
                    ' AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 98.0 .4758 .102 Safari / 537.36'
    }

    def __init__(self):
        """Run the full scrape and keep the result in ``self.content``."""
        print("START SCRAPE")
        print("*" * 12 + "\n")

        self.content = self.start_parsing(self.url, self.headers)

    def start_parsing(self, url, headers, start_page=1):
        """Scrape consecutive pages until there is no link to a next page.

        Args:
            url: Base URL; the page number is appended to it as text.
            headers: HTTP headers passed through to every request.
            start_page: Number of the first page to download.

        Returns:
            list[dict]: Quotes gathered from every page that was downloaded.
            If a page cannot be downloaded, scraping stops and the quotes
            collected up to that point are returned (never ``None``).
        """
        page = start_page
        content = []

        while True:
            page_url = url + str(page)

            print()
            print(f"Start {page} page parsing")
            print("url --> " + page_url)

            # Politeness: pause for a random 0-3 seconds before each request.
            sleep(round(random.uniform(0, 3), 3))
            soup = self.get_page_soup(page_url, headers)
            if soup is None:
                # The download failed (get_page_soup already reported why).
                # Hand back what was gathered so far instead of discarding it.
                return content

            content.extend(self.page_scrape(soup))

            page += 1
            # The current page links to "/page/<next>/" only when a next page exists.
            if soup.find('a', href=f"/page/{page}/") is None:
                print()
                print("end content")
                print("***********")
                print("END PARSING")
                return content

    def get_page_soup(self, page_url, headers):
        """Download one page and parse it.

        Args:
            page_url: Full URL of the page to fetch.
            headers: HTTP headers for the request.

        Returns:
            The parsed ``BeautifulSoup`` document when the server answers
            with status 200; ``None`` on a timeout, on any other request
            failure, or on a non-200 status.
        """
        # Imported here so this method does not rely on the notebook's import cell
        # being edited; Timeout is a subclass of RequestException.
        from requests import RequestException

        try:
            response = get(page_url, headers=headers, timeout=10)
        except Timeout:
            print("error: Превышено время ожидания ответа")
            return None
        except RequestException as exc:
            # Connection errors, invalid URLs, too many redirects, ...
            print(f"error: request failed: {exc}")
            return None

        print("Status response: " + str(response.status_code))
        if response.status_code != 200:
            print("Error: status_code")
            return None
        return bs(response.content, "html.parser")

    def page_scrape(self, soup):
        """Extract every quote found on a parsed page.

        Args:
            soup: Parsed page; each quote is a ``div.quote`` element holding a
                ``span.text``, a ``small.author`` and any number of ``a.tag``.

        Returns:
            list[dict]: One dict per quote with the keys ``"quote"``,
            ``"author"`` and ``"tag"`` (list of tag names).

        Raises:
            AttributeError: If a quote block lacks its text or author element.
        """
        print("Start scrape")

        res = []
        for quote in soup.find_all('div', class_="quote"):
            res.append({
                "quote": quote.find('span', class_="text").text,
                "author": quote.find('small', class_='author').text,
                "tag": [tag.text for tag in quote.find_all('a', class_="tag")],
            })

        print("Scrape: OK")
        return res

    def save_content(self, path=None):
        """Dump ``self.content`` to a JSON file.

        Args:
            path: Target file. When omitted, the file is created in the
                current working directory and named
                ``content_<HH:MM>_<DD.MM.YYYY>`` from the current time.
                NOTE: that default name contains ':' which Windows does not
                accept in file names; pass ``path`` explicitly there.

        Returns:
            The path the content was written to.
        """
        name_f = path if path is not None else "content_" + dt.now().strftime('%H:%M_%d.%m.%Y')
        # Explicit encoding so the result does not depend on the platform default.
        with open(name_f, 'w', encoding='utf-8') as f:
            json.dump(self.content, f)

        print()
        print("Save: OK")
        print("Name: " + str(name_f))
        return name_f

# Instantiating the parser runs the whole scrape right away (the constructor calls start_parsing).
test = ParserQuotes()



START SCRAPE
************


Start 1 page parsing
url --> https://quotes.toscrape.com/page/1
Status response: 200
Start scrape
Scrape: OK

Start 2 page parsing
url --> https://quotes.toscrape.com/page/2
Status response: 200
Start scrape
Scrape: OK

Start 3 page parsing
url --> https://quotes.toscrape.com/page/3
Status response: 200
Start scrape
Scrape: OK

Start 4 page parsing
url --> https://quotes.toscrape.com/page/4
Status response: 200
Start scrape
Scrape: OK

Start 5 page parsing
url --> https://quotes.toscrape.com/page/5
Status response: 200
Start scrape
Scrape: OK

Start 6 page parsing
url --> https://quotes.toscrape.com/page/6
Status response: 200
Start scrape
Scrape: OK

Start 7 page parsing
url --> https://quotes.toscrape.com/page/7
Status response: 200
Start scrape
Scrape: OK

Start 8 page parsing
url --> https://quotes.toscrape.com/page/8
Status response: 200
Start scrape
Scrape: OK

Start 9 page parsing
url --> https://quotes.toscrape.com/page/9
Status response: 200
Start s

In [150]:
# Report how many quotes were collected and show one of them picked at random.
# A missing (None) or empty result is tolerated instead of raising.
collected_quotes = test.content or []
print(f"Количество контента = {len(collected_quotes)}")

if collected_quotes:
    print("Вот случайная цитата:\n")
    random_quote = random.choice(collected_quotes)
    print(random_quote['quote'])
    print(random_quote['author'])
    print(random_quote['tag'])
else:
    # Nothing was scraped, so there is no quote to pick.
    print("Цитат нет — показывать нечего.")

Количество контента = 100
Вот случайная цитата:

“The trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.”
Terry Pratchett
['humor', 'open-mind', 'thinking']


In [151]:
# Write the scraped quotes to a timestamped JSON file in the current working directory.
test.save_content()


Save: OK
Name: content_11:26_12.06.2022
