In [1]:
import csv
import re
import string
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Module-level switch: set to False to silence every dprint() call.
debug = True


def dprint (str):
    '''
    Print the given value, but only while the module-level ``debug`` flag
    is truthy.

    Arg:
    str: the value to print (note that the name shadows the builtin ``str``
         inside this function).
    '''
    if not debug:
        return
    print(str)

In [3]:
def get_bs_element(soup, chain, all_instances = False):
    '''
    Retrieve an element from BeautifulSoup-parsed HTML by walking a chain
    of nested lookups.

    return: the first match of the last step of the chain, or every match
            of the last step if all_instances is True. When any step before
            the last finds nothing, the result is None (or an empty list if
            all_instances is True), so the return value can always be
            tested or iterated safely.
    raise:  IndexError if chain is empty.
    Arg:
    soup: a BeautifulSoup object (or tag) of parsed html.
    chain: a list of dictionaries, one per nesting level, with two keys:
            1) 'element': name of the element to find
            2) 'attrs': (optional) dictionary of attributes passed to find
            e.g.:
            [{'element':'ul', 'attrs':{}},
             {'element':'ul'},
             {'element':'li', 'attrs':{'class':'value'}}]
           The list is only read; it is never modified.
    all_instances: boolean, default False. If True, all instances matching
            the last step of the chain are returned; otherwise only the
            first instance is returned.
    '''
    if len(chain) == 0:
        raise IndexError('chain must contain at least one step')

    node = soup
    # Descend through every step except the last using find(); slicing leaves
    # the caller's list intact so the same chain can be reused.
    for step in chain[:-1]:
        node = node.find(step.get('element'), attrs = step.get('attrs'))
        if node is None:
            # A parent level is missing, so there is nothing below it.
            return [] if all_instances else None

    leaf = chain[-1]
    if all_instances:
        return node.find_all(leaf.get('element'), attrs = leaf.get('attrs'))
    return node.find(leaf.get('element'), attrs = leaf.get('attrs'))

In [4]:
def get_bs_element_details(soup, chain, attr = None):
    '''
    Retrieve either an attribute or the text of an element in
    BeautifulSoup-parsed HTML, located by walking a chain of nested lookups.

    return: the value of the requested attribute, or the element text if
            attr is None. None is returned when any step of the chain
            finds no element.
    raise:  IndexError if chain is empty.
    Arg:
    soup: a BeautifulSoup object (or tag) of parsed html.
    chain: a list of dictionaries, one per nesting level, with two keys:
            1) 'element': name of the element to find
            2) 'attrs': (optional) dictionary of attributes passed to find
           The list is only read; it is never modified.
    attr: name of the attribute to retrieve.
          If it is None the element text is returned instead.
    '''
    if len(chain) == 0:
        raise IndexError('chain must contain at least one step')

    element = soup
    # Iterate instead of popping so the caller's chain can be reused.
    for step in chain:
        element = element.find(step.get('element'), attrs = step.get('attrs'))
        if element is None:
            return None

    if attr is not None:
        return element.get(attr)
    return element.get_text()

In [5]:
def get_book_list(url):
    '''
    Collect every book listed under a category page, following the
    "next" pagination link until there are no more pages.

    return: a list with one [title, price, rate] entry per book, where
            title is the link's 'title' attribute, price is the numeric
            part of the price text as a string (None if no price is found)
            and rate is an int from 1 to 5 (None if the rating is missing
            or not recognised).
    raise:  requests.RequestException if a page cannot be fetched, times
            out, or answers with an HTTP error status.
    Arg:
    url: address of the first page of the listing.
    '''
    rates = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    booklist = list()

    page_url = url
    while page_url is not None:
        response = requests.get(page_url, timeout=30)
        # Stop on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        # Pass the raw bytes so BeautifulSoup works out the encoding from the
        # document itself. response.text depends on the HTTP header and falls
        # back to ISO-8859-1 for text responses without a charset, which can
        # garble non-ASCII characters such as the pound sign.
        parser = BeautifulSoup(response.content, 'html.parser')

        for item in parser.find_all('li', attrs = {'class':'col-xs-6 col-sm-4 col-md-3 col-lg-3'}):
            title = get_bs_element_details(item, [{'element':'h3'},{'element':'a'}],'title')

            # Pull out the numeric part rather than slicing off a fixed-length
            # prefix, so the value is right however many characters precede
            # the digits.
            price_text = get_bs_element_details(item, [{'element':'p', 'attrs':{'class':'price_color'}}])
            price_match = re.search(r'\d+(?:\.\d+)?', price_text or '')
            price = price_match.group(0) if price_match else None

            # The rating is read from the second entry of the element's class
            # list (e.g. 'Three'); the first is 'star-rating' itself.
            rating_classes = get_bs_element_details(item,[{'element':'p', 'attrs':{'class': 'star-rating'}}], 'class')
            if rating_classes and len(rating_classes) > 1:
                rate = rates.get(rating_classes[1])
            else:
                rate = None

            booklist.append([title, price, rate])

        # check next page
        next_page_link = get_bs_element_details(parser,[{'element':'li', 'attrs':{'class':'next'}}, {'element':'a'}], 'href')
        # urljoin resolves the relative link against the current page; a plain
        # str.replace of the last path segment would rewrite every occurrence
        # of that segment and breaks outright when the URL ends with '/'.
        page_url = urljoin(page_url, next_page_link) if next_page_link is not None else None

    return booklist

In [6]:
# Landing page of the site being scraped; later cells fetch and parse it.
url = "https://books.toscrape.com/"

In [7]:
# Fetch the landing page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the run on an HTTP error instead of
# letting an error page be parsed as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [8]:
# Build the parse tree of the landing page with Python's built-in HTML parser.
parser = BeautifulSoup(markup=response.text, features='html.parser')

In [9]:
# Seconds to wait after each category so the scrape stays gentle on the server.
CATEGORY_DELAY_SECONDS = 10

# One dict per category: {'category': name, 'url': listing url, 'books': [[title, price, rate], ...]}
categories = list()
category_items = get_bs_element(parser,[{'element':'ul', 'attrs':{'class':'nav nav-list'}} , {'element':'ul'}, {'element':'li'}], all_instances= True)
for item in category_items:
    category = item.get_text().strip()
    href = get_bs_element_details(item, [{'element':'a'}],'href')
    if href is None:
        # An entry without a link has no listing page to scrape.
        continue
    # Resolve the relative link against the page it was read from.
    c_url = urljoin(url, href)
    book_list = get_book_list (c_url)
    categories.append({'category':category, 'url':c_url, 'books': book_list})
    dprint(f'Cateory:{category}, No. Books:{len(book_list)}')
    sleep(CATEGORY_DELAY_SECONDS)


Cateory:Travel, No. Books:11
Cateory:Mystery, No. Books:32
Cateory:Historical Fiction, No. Books:26
Cateory:Sequential Art, No. Books:75
Cateory:Classics, No. Books:19
Cateory:Philosophy, No. Books:11
Cateory:Romance, No. Books:35
Cateory:Womens Fiction, No. Books:17
Cateory:Fiction, No. Books:65
Cateory:Childrens, No. Books:29
Cateory:Religion, No. Books:7
Cateory:Nonfiction, No. Books:110
Cateory:Music, No. Books:13
Cateory:Default, No. Books:152
Cateory:Science Fiction, No. Books:16
Cateory:Sports and Games, No. Books:5
Cateory:Add a comment, No. Books:67
Cateory:Fantasy, No. Books:48
Cateory:New Adult, No. Books:6
Cateory:Young Adult, No. Books:54
Cateory:Science, No. Books:14
Cateory:Poetry, No. Books:19
Cateory:Paranormal, No. Books:1
Cateory:Art, No. Books:8
Cateory:Psychology, No. Books:7
Cateory:Autobiography, No. Books:9
Cateory:Parenting, No. Books:1
Cateory:Adult Fiction, No. Books:1
Cateory:Humor, No. Books:10
Cateory:Horror, No. Books:17
Cateory:History, No. Books:18
Cate

In [12]:
# Flatten the nested category -> books structure into one CSV row per book.
with open('bookslist.csv', mode='w', newline='', encoding='UTF8') as file:
    file_writer = csv.DictWriter(file, fieldnames=['category', 'title', 'price', 'rate'])
    file_writer.writeheader()
    for category in categories:
        category_name = category.get('category')
        # Each book is a [title, price, rate] list; map it onto the columns.
        file_writer.writerows(
            {'category': category_name, 'title': book[0], 'price': book[1], 'rate': book[2]}
            for book in category.get('books')
        )
