In [1]:
import os
import re
import requests
import pandas as pd
import json
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [2]:
def scrape_books(html_soup, url, catalogue):
    """Register every book listed on one catalogue page.

    For each ``article.product_pod`` element the link inside its ``<h3>`` is
    resolved against ``url``. The resulting URL path is split on ``/`` and the
    element at index 2 is used as the book id, which is mapped to a new empty
    dict in ``catalogue`` (replacing any value already stored under that id).

    Args:
        html_soup: Parsed listing page exposing ``select``.
        url: URL the page was fetched from; relative links are resolved
            against it.
        catalogue: Dict of book ids to update in place.

    Returns:
        The same ``catalogue`` object that was passed in.
    """
    for pod in html_soup.select('article.product_pod'):
        link = pod.find('h3').find('a')
        absolute_url = urljoin(url, link.get('href'))
        path_parts = urlparse(absolute_url).path.split('/')
        catalogue[path_parts[2]] = {}
    return catalogue


def scrape_book(html_soup, book_id, fields=('UPC',)):
    """Extract the details of a single book from its parsed product page.

    Args:
        html_soup: Parsed product page (BeautifulSoup document).
        book_id: Identifier of the book. Not used by the extraction itself;
            kept so existing callers keep working.
        fields: Collection of normalised "Product Information" row headers to
            copy into the result. A header is normalised by replacing every
            run of non-letter characters with ``_``. Defaults to ``('UPC',)``.

    Returns:
        A dict with the keys ``'title'``, ``'price'``, ``'rating'`` and
        ``'description'`` (empty string when the page has no description
        paragraph), plus one key per requested field that is present in the
        "Product Information" table. A page without that table simply yields
        no extra keys.

    Raises:
        AttributeError: If the page has no ``product_main`` element, or that
            element lacks the title, price or rating node.
    """
    main = html_soup.find(class_='product_main')
    book = {}
    book['title'] = main.find('h1').get_text(strip=True)
    book['price'] = main.find(class_='price_color').get_text(strip=True)
    # The rating is whatever remains of the element's class list once the
    # 'star-rating' marker class is removed.
    rating_classes = main.find(class_='star-rating').get('class')
    book['rating'] = ' '.join(rating_classes).replace('star-rating', '').strip()

    # The description text lives in the <p> that follows the heading element;
    # either of the two may be absent.
    book['description'] = ''
    desc = html_soup.find(id='product_description')
    if desc is not None:
        paragraph = desc.find_next_sibling('p')
        if paragraph is not None:
            book['description'] = paragraph.get_text(strip=True)

    # `string=` is the current name of the keyword formerly called `text=`.
    heading = html_soup.find(string='Product Information')
    table = heading.find_next('table') if heading is not None else None
    rows = table.find_all('tr') if table is not None else []
    for row in rows:
        header_cell = row.find('th')
        value_cell = row.find('td')
        if header_cell is None or value_cell is None:
            # Malformed row: nothing usable to record.
            continue
        header = re.sub('[^a-zA-Z]+', '_', header_cell.get_text(strip=True))
        if header in fields:
            book[header] = value_cell.get_text(strip=True)
    return book

In [4]:
# Scrape the pages in the catalogue
#
# Step 1: load the cached catalogue of book ids (if the cache file exists) and,
#         when the user asks for it, crawl the listing pages to refresh it.
# Step 2: fetch the page of every book in the catalogue and write the extracted
#         fields to a CSV file.
catalogue = {}
base_url = 'http://books.toscrape.com/'
catalogue_path = 'catalogue.json'
# Seconds to wait for the server; without a timeout a stalled connection
# would block the cell indefinitely.
request_timeout = 30

if os.path.exists(catalogue_path):
    with open(catalogue_path) as json_file:
        catalogue = json.load(json_file)

url = base_url
inp = input('Re-scrape the catalogue (y/n)? ')
if inp == 'y':
    # Follow the "next" link from page to page until there is none left.
    while True:
        print('Now scraping page:', url)
        r = requests.get(url, timeout=request_timeout)
        # Fail here with a clear HTTP error instead of parsing an error page.
        r.raise_for_status()
        html_soup = BeautifulSoup(r.text, 'html.parser')
        catalogue = scrape_books(html_soup, url, catalogue)
        next_a = html_soup.select('li.next > a')
        if not next_a or not next_a[0].get('href'):
            break
        url = urljoin(url, next_a[0].get('href'))
    # Write back to the same file the cache is read from.
    with open(catalogue_path, 'w') as json_file:
        json.dump(catalogue, json_file)

# NOTE: if the cache file is missing and the catalogue was not re-scraped,
# `catalogue` is empty and the resulting CSV contains no rows.
books = []
for book_id in catalogue.keys():
    book_url = base_url + 'catalogue/{}'.format(book_id)
    print('Now scraping book:', book_url)
    r = requests.get(book_url, timeout=request_timeout)
    r.raise_for_status()
    # Decode the body as UTF-8 rather than the encoding requests would pick.
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    dictionary = scrape_book(html_soup, book_id)
    books.append(dictionary)
df = pd.DataFrame(books)
df.to_csv('result_df.csv')


Re-scrape the catalogue (y/n)? n
Now scraping book: http://books.toscrape.com/catalogue/a-light-in-the-attic_1000
Now scraping book: http://books.toscrape.com/catalogue/tipping-the-velvet_999
Now scraping book: http://books.toscrape.com/catalogue/soumission_998
Now scraping book: http://books.toscrape.com/catalogue/sharp-objects_997
Now scraping book: http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996
Now scraping book: http://books.toscrape.com/catalogue/the-requiem-red_995
Now scraping book: http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994
Now scraping book: http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993
Now scraping book: http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992
Now scraping book: http://books.toscrape.com/catalogue/the-black-maria_991
Now scrapin