In [1]:
import csv
import string
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [16]:
# Module-level switch consulted by dprint(); set to False to silence debug output.
debug = True


def dprint (str):
    '''Print the given value, but only while the global ``debug`` flag is truthy.'''
    if not debug:
        return
    print(str)

In [17]:
def get_bs_element(soup, chain, all_instances = False):
    '''
    Retrieve an element from BeautifulSoup-parsed HTML by walking a chain
    of nested lookups.

    return: the first element matching the last step of the chain, or, when
            all_instances is True, every element matching the last step
            (searched inside the element reached by the preceding steps).
            If an intermediate step matches nothing, None is returned
            (an empty list when all_instances is True).
    Arg:
    soup: a BeautifulSoup object (or tag) of parsed html.
    chain: a list of dictionaries, one per lookup step, each with two keys:
            1) 'element': name of the element to find
            2) 'attrs': optional dictionary of attributes passed to find
            e.g:
            [{'element':'ul', 'attrs':{}},
             {'element':'ul'},
             {'element':'li', 'attrs':{'class':'value'}}]
            The list is not modified, so it can be reused by the caller.
    all_instances: boolean, default False. When True the function returns all
            instances of the element at the leaf of the chain; otherwise it
            returns only the first instance.
    Raises:
    IndexError: if chain is empty.
    '''
    # Work on a copy: consuming the caller's list would make it unusable
    # for a second call.
    steps = list(chain)
    if not steps:
        raise IndexError('chain must contain at least one element step')

    node = soup
    # Descend through every step except the last one.
    for step in steps[:-1]:
        # Fall back to an empty dict (find's own default) when 'attrs' is absent.
        node = node.find(step.get('element'), attrs = step.get('attrs') or {})
        if node is None:
            # The path does not exist in this document.
            return [] if all_instances else None

    leaf = steps[-1]
    leaf_attrs = leaf.get('attrs') or {}
    if all_instances:
        return node.find_all(leaf.get('element'), attrs = leaf_attrs)
    return node.find(leaf.get('element'), attrs = leaf_attrs)

In [18]:
def get_bs_element_details(soup, chain, attr = None):
    '''
    Retrieve either an attribute or the text of an element found in
    BeautifulSoup-parsed HTML by walking a chain of nested lookups.

    return: the requested attribute value, or the element text if attr is
            None. None is returned when any step of the chain matches
            nothing (or when the element lacks the requested attribute).
    Arg:
    soup: a BeautifulSoup object (or tag) of parsed html.
    chain: a list of dictionaries, one per lookup step, each with two keys:
            1) 'element': name of the element to find
            2) 'attrs': optional dictionary of attributes passed to find
            The list is not modified, so it can be reused by the caller.
    attr: name of the attribute to retrieve.
          If it is None the element text is returned instead.
    Raises:
    IndexError: if chain is empty.
    '''
    # Work on a copy: consuming the caller's list would make it unusable
    # for a second call.
    steps = list(chain)
    if not steps:
        raise IndexError('chain must contain at least one element step')

    node = soup
    for step in steps:
        # Fall back to an empty dict (find's own default) when 'attrs' is absent.
        node = node.find(step.get('element'), attrs = step.get('attrs') or {})
        if node is None:
            return None

    if attr is None:
        return node.get_text()
    return node.get(attr)

In [26]:
def get_book_list(url):
    '''
    Collect the books listed on a category page and on every following page.

    return: a list of [title, price, rate] lists, in page order. price is the
            price text with its first two characters removed; rate is an
            integer 1-5 looked up from the star-rating class name. price or
            rate is None when the corresponding element is missing or the
            rating name is unknown.
    Arg:
    url: address of the first page of the category listing.
    '''
    rates = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    booklist = list()
    visited = set()   # guards against a "next" link that loops back
    page_url = url
    while page_url is not None and page_url not in visited:
        visited.add(page_url)
        response = requests.get(page_url)
        parser = BeautifulSoup(response.text, 'html.parser')
        for item in parser.find_all('li', attrs = {'class':'col-xs-6 col-sm-4 col-md-3 col-lg-3'}):
            title = get_bs_element_details(item, [{'element':'h3'},{'element':'a'}],'title')
            price_text = get_bs_element_details(item, [{'element':'p', 'attrs':{'class':'price_color'}}])
            # Drop the two leading characters of the price text
            # (presumably the currency prefix - TODO confirm).
            price = price_text[2:] if price_text is not None else None
            rating_classes = get_bs_element_details(item,[{'element':'p', 'attrs':{'class': 'star-rating'}}], 'class')
            # The rating word is read from the second class entry, e.g. ['star-rating', 'Three'].
            has_rating = rating_classes is not None and len(rating_classes) > 1
            rate = rates.get(rating_classes[1]) if has_rating else None
            booklist.append([title, price, rate])
        #check next page
        next_page_link = get_bs_element_details(parser,[{'element':'li', 'attrs':{'class':'next'}}, {'element':'a'}], 'href')
        if next_page_link is None:
            page_url = None
        else:
            # Resolve the relative link against the *current* page. Replacing
            # 'index.html' only worked on the first page; from page 2 onward
            # the URL stayed the same and the same page was fetched forever.
            page_url = urljoin(page_url, next_page_link)
            dprint(page_url)
    return booklist

In [20]:
# Address of the page that is downloaded and parsed in the following cells.
url ='https://books.toscrape.com/'

In [21]:
# Download the page at `url`; its HTML is read from response.text below.
response = requests.get(url)

In [22]:
# Parse the downloaded HTML using the 'html.parser' backend.
parser = BeautifulSoup(response.text, 'html.parser')

In [23]:
# Build one {'category', 'url'} record per <li> found under the nested <ul>
# of the 'nav nav-list' list on the parsed page.
categories = []
category_items = get_bs_element(
    parser,
    [{'element':'ul', 'attrs':{'class':'nav nav-list'}}, {'element':'ul'}, {'element':'li'}],
    all_instances=True,
)
for entry in category_items:
    name = entry.get_text().strip()
    href = get_bs_element_details(entry, [{'element':'a'}], 'href')
    # Category links are relative, so prefix them with the site root.
    categories.append({'category': name, 'url': 'https://books.toscrape.com/' + href})
dprint(categories)

[{'category': 'Travel', 'url': 'https://books.toscrape.com/catalogue/category/books/travel_2/index.html'}, {'category': 'Mystery', 'url': 'https://books.toscrape.com/catalogue/category/books/mystery_3/index.html'}, {'category': 'Historical Fiction', 'url': 'https://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html'}, {'category': 'Sequential Art', 'url': 'https://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html'}, {'category': 'Classics', 'url': 'https://books.toscrape.com/catalogue/category/books/classics_6/index.html'}, {'category': 'Philosophy', 'url': 'https://books.toscrape.com/catalogue/category/books/philosophy_7/index.html'}, {'category': 'Romance', 'url': 'https://books.toscrape.com/catalogue/category/books/romance_8/index.html'}, {'category': 'Womens Fiction', 'url': 'https://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html'}, {'category': 'Fiction', 'url': 'https://books.toscrape.com/catalogue/category/b

In [27]:
# Scrape the third entry of `categories` (index 2) and display the result.
selected_category = categories[2]
book_list = get_book_list(selected_category['url'])
book_list

https://books.toscrape.com/catalogue/category/books/historical-fiction_4/page-2.html


[['Tipping the Velvet', '53.74', 1],
 ['Forever and Forever: The Courtship of Henry Longfellow and Fanny Appleton',
  '29.69',
  3],
 ['A Flight of Arrows (The Pathfinders #2)', '55.53', 5],
 ['The House by the Lake', '36.95', 1],
 ['Mrs. Houdini', '30.25', 5],
 ['The Marriage of Opposites', '28.08', 4],
 ['Glory over Everything: Beyond The Kitchen House', '45.84', 3],
 ['Love, Lies and Spies', '20.55', 2],
 ['A Paris Apartment', '39.01', 4],
 ['Lilac Girls', '17.28', 2],
 ['The Constant Princess (The Tudor Court #1)', '16.62', 3],
 ['The Invention of Wings', '37.34', 1],
 ['World Without End (The Pillars of the Earth #2)', '32.97', 4],
 ['The Passion of Dolssa', '28.32', 5],
 ['Girl With a Pearl Earring', '26.77', 1],
 ['Voyager (Outlander #3)', '21.07', 5],
 ['The Red Tent', '35.66', 5],
 ['The Last Painting of Sara de Vos', '55.55', 2],
 ['The Guernsey Literary and Potato Peel Pie Society', '49.53', 1],
 ['Girl in the Blue Coat', '46.83', 2],
 ['Between Shades of Gray', '20.79', 5],

In [28]:
# Number of books collected for the selected category.
len(book_list)

26

In [None]:
# Build one {'category', 'url'} record per <li> found under the nested <ul>
# of the 'nav nav-list' list on the parsed page, then display the list.
categories = []
category_items = get_bs_element(
    parser,
    [{'element':'ul', 'attrs':{'class':'nav nav-list'}}, {'element':'ul'}, {'element':'li'}],
    all_instances=True,
)
for entry in category_items:
    name = entry.get_text().strip()
    href = get_bs_element_details(entry, [{'element':'a'}], 'href')
    # Category links are relative, so prefix them with the site root.
    categories.append({'category': name, 'url': 'https://books.toscrape.com/' + href})
categories

In [None]:
def get_book_list(url):
    '''
    Collect the books listed on a category page and on every following page.

    return: a list of [title, price, rate] lists, in page order. price is the
            price text with its first two characters removed; rate is an
            integer 1-5 looked up from the star-rating class name. price or
            rate is None when the corresponding element is missing or the
            rating name is unknown.
    Arg:
    url: address of the first page of the category listing.
    '''
    rates = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    booklist = list()
    visited = set()   # guards against a "next" link that loops back
    page_url = url
    while page_url is not None and page_url not in visited:
        visited.add(page_url)
        response = requests.get(page_url)
        parser = BeautifulSoup(response.text, 'html.parser')
        for item in parser.find_all('li', attrs = {'class':'col-xs-6 col-sm-4 col-md-3 col-lg-3'}):
            title = get_bs_element_details(item, [{'element':'h3'},{'element':'a'}],'title')
            price_text = get_bs_element_details(item, [{'element':'p', 'attrs':{'class':'price_color'}}])
            # Drop the two leading characters of the price text
            # (presumably the currency prefix - TODO confirm).
            price = price_text[2:] if price_text is not None else None
            rating_classes = get_bs_element_details(item,[{'element':'p', 'attrs':{'class': 'star-rating'}}], 'class')
            # The rating word is read from the second class entry, e.g. ['star-rating', 'Three'].
            has_rating = rating_classes is not None and len(rating_classes) > 1
            rate = rates.get(rating_classes[1]) if has_rating else None
            booklist.append([title, price, rate])
        #check next page
        next_page_link = get_bs_element_details(parser,[{'element':'li', 'attrs':{'class':'next'}}, {'element':'a'}], 'href')
        if next_page_link is None:
            page_url = None
        else:
            # Resolve the relative link against the *current* page. Replacing
            # 'index.html' only works on the first page; from page 2 onward
            # the URL would stay the same and the same page would be refetched.
            page_url = urljoin(page_url, next_page_link)
            dprint(page_url)
    return booklist

In [None]:
# Scrape the third entry of `categories` (index 2) and display the result.
selected_category = categories[2]
book_list = get_book_list(selected_category['url'])
book_list

In [None]:
# Show the URL stored for the first category in the list.
categories[0]['url']

In [None]:
# Number of books collected for the selected category.
len(book_list)