Construction of the Book description table "DF_Books" from Books.toscrape.com


saved categories : -Mystery
                   -thriller 
                   -historical 
                   -science 
                   -business

In [11]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from time import sleep
from requests.exceptions import RequestException

def get_stock_availability(book_url):
    try:
        # Access the individual book page to extract the availability
        book_page = requests.get(book_url)
        book_page.raise_for_status()  # Raise an exception for bad response status
        book_soup = BeautifulSoup(book_page.text, 'html.parser')
        
        availability_tag = book_soup.find('p', class_='instock availability')
        availability_text = availability_tag.get_text(strip=True) if availability_tag else 'Availability not available'
        
        return availability_text
    except RequestException as e:
        print(f"Error accessing book page: {e}")
        return 'Availability not available'

def get_book_price(doc):
    try:
        # Extract book price using the function
        book_price_tags = doc.find_all('p', class_='price_color')
        # Use the first tag's text and replace 'Â'
        book_price = book_price_tags[0].text.replace('Â', '').strip() if book_price_tags else 'Price not available'
        return book_price
    except IndexError:
        return 'Price not available'

# Initialize objects
books = {}

# Initial URL to start with
base_url = 'https://books.toscrape.com/'
target_categories_href = [
    'catalogue/category/books/mystery_3/index.html',
    'catalogue/category/books/thriller_6/index.html',
    'catalogue/category/books/historical-fiction_4/index.html',
    'catalogue/category/books/science_22/index.html',
    'catalogue/category/books/business_35/index.html',
]

# Iterate over target_categories_href
for category_href in target_categories_href:
    next_url = urljoin(base_url, category_href)

    # While Next button exists (while != '' or != None)
    while next_url:
        try:
            # HTTP Request and BeautifulSoup
            r = requests.get(next_url)
            r.raise_for_status()  # Raise an exception for bad response status
            soup = BeautifulSoup(r.text, 'html.parser')

            # Find the genre of books on the current page
            genre_tag = soup.find('div', class_='page-header')
            current_genre = genre_tag.find('h1').get_text(strip=True) if genre_tag and genre_tag.find('h1') else 'Genre not available'

            # Find all article elements representing books
            books_articles = soup.find_all('article', class_='product_pod')

            # For each book on each webpage, extract data
            for book_article in books_articles:
                # Extract book details
                book_id = len(books) + 1

                title_tag = book_article.find('h3').find('a')
                book_title = title_tag['title'] if title_tag and 'title' in title_tag.attrs else 'Title not available'

                relative_path = title_tag['href'] if title_tag else None
                cleaned_relative_path = relative_path.lstrip('../../../') if relative_path else None
                book_url = f"{base_url}catalogue/{cleaned_relative_path}" if cleaned_relative_path else 'URL not available'

                # Extract book stock availability using the function
                book_stock = get_stock_availability(book_url)

                # Extract book price using the function
                book_price = get_book_price(book_article)

                image_tag = book_article.find('img')
                book_image_url = urljoin(base_url, image_tag['src']) if image_tag else 'Image URL not available'

                # Extract book stars based on the color style
                star_rating_paragraph = book_article.find('p', class_='star-rating')
                if star_rating_paragraph:
                    star_rating_class = star_rating_paragraph['class']
                    num_stars = 0
                    if 'One' in star_rating_class:
                        num_stars = 1
                    elif 'Two' in star_rating_class:
                        num_stars = 2
                    elif 'Three' in star_rating_class:
                        num_stars = 3
                    elif 'Four' in star_rating_class:
                        num_stars = 4
                    elif 'Five' in star_rating_class:
                        num_stars = 5

                # Access the individual book page to extract the description
                book_page = requests.get(book_url)
                book_page.raise_for_status()
                book_soup = BeautifulSoup(book_page.text, 'html.parser')
                description_meta_tag = book_soup.find('meta', attrs={'name': 'description'})
                book_description = description_meta_tag['content'] if description_meta_tag else 'Description not available'

                # Add book to dict
                books[book_id] = {
                    'title': book_title,
                    'url': book_url,
                    'price': book_price,
                    'category': current_genre,
                    'availability': book_stock,
                    'image_url': book_image_url,
                    'num_stars': num_stars,
                    'description': book_description,
                }

            next_tag = soup.find('li', class_='next').find('a')['href']
            next_url = urljoin(base_url, next_tag)
        except (RequestException, AttributeError) as e:
            print(f"Error: {e}")
            next_url = None

        sleep(1)  # Introduce a pause of 1 second between requests
        print(f'Requesting next page for category {current_genre}: {next_url}' if next_url else 'Finished.')

# Print the extracted data
for book_id, book_info in books.items():
    print(f"\nBook {book_id}:")
    print(f"Title: {book_info['title']}")
    print(f"URL: {book_info['url']}")
    print(f"Price: {book_info['price']}")
    print(f"Category: {book_info['category']}")
    print(f"Availability: {book_info['availability']}")
    print(f"Image URL: {book_info['image_url']}")
    print(f"Number of Stars: {book_info['num_stars']}")
    print(f"Description: {book_info['description']}")

Requesting next page for category Mystery: https://books.toscrape.com/page-2.html
Error: 404 Client Error: Not Found for url: https://books.toscrape.com/page-2.html
Finished.
Error: 404 Client Error: Not Found for url: https://books.toscrape.com/catalogue/category/books/thriller_6/index.html
Finished.
Requesting next page for category Historical Fiction: https://books.toscrape.com/page-2.html
Error: 404 Client Error: Not Found for url: https://books.toscrape.com/page-2.html
Finished.
Error: 'NoneType' object has no attribute 'find'
Finished.
Error: 'NoneType' object has no attribute 'find'
Finished.

Book 1:
Title: Sharp Objects
URL: https://books.toscrape.com/catalogue/sharp-objects_997/index.html
Price: £47.82
Category: Mystery
Availability: In stock (20 available)
Image URL: https://books.toscrape.com/media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg
Number of Stars: 4
Description: 
    WICKED above her hipbone, GIRL across her heart Words are like a road map to reporter Camill

converting Df_books to Dataframe and saving as a scv file

In [12]:
import pandas as pd

# Convert the books dictionary to a pandas DataFrame
df_books = pd.DataFrame.from_dict(books, orient='index')

# Save the DataFrame to a CSV file
df_books.to_csv('books_data.csv', index=False)

print("DataFrame saved to 'books_data.csv'")

DataFrame saved to 'books_data.csv'


In [13]:
df_books

Unnamed: 0,title,url,price,category,availability,image_url,num_stars,description
1,Sharp Objects,https://books.toscrape.com/catalogue/sharp-obj...,£47.82,Mystery,In stock (20 available),https://books.toscrape.com/media/cache/32/51/3...,4,"\n WICKED above her hipbone, GIRL across he..."
2,"In a Dark, Dark Wood",https://books.toscrape.com/catalogue/in-a-dark...,£19.63,Mystery,In stock (18 available),https://books.toscrape.com/media/cache/23/85/2...,1,"\n In a dark, dark wood Nora hasn't seen Cl..."
3,The Past Never Ends,https://books.toscrape.com/catalogue/the-past-...,£56.50,Mystery,In stock (16 available),https://books.toscrape.com/media/cache/89/b8/8...,4,"\n A simple task, Attorney Chester Morgan t..."
4,A Murder in Time,https://books.toscrape.com/catalogue/a-murder-...,£16.64,Mystery,In stock (16 available),https://books.toscrape.com/media/cache/11/aa/1...,1,"\n Beautiful and brilliant, Kendra Donovan ..."
5,The Murder of Roger Ackroyd (Hercule Poirot #4),https://books.toscrape.com/catalogue/the-murde...,£44.10,Mystery,In stock (15 available),https://books.toscrape.com/media/cache/29/fe/2...,4,"\n In the village of King's Abbot, a widow'..."
...,...,...,...,...,...,...,...,...
62,Born for This: How to Find the Work You Were M...,https://books.toscrape.com/catalogue/born-for-...,£21.59,Business,In stock (11 available),https://books.toscrape.com/media/cache/aa/67/a...,5,"\n To the outside observer, it seems like t..."
63,The E-Myth Revisited: Why Most Small Businesse...,https://books.toscrape.com/catalogue/the-e-myt...,£36.91,Business,In stock (8 available),https://books.toscrape.com/media/cache/11/2c/1...,1,"\n E-Myth 'e-,'mith n 1: the entrepreneuri..."
64,"Rich Dad, Poor Dad",https://books.toscrape.com/catalogue/rich-dad-...,£51.74,Business,In stock (7 available),https://books.toscrape.com/media/cache/18/f4/1...,1,\n Personal finance author and lecturer Rob...
65,The Lean Startup: How Today's Entrepreneurs Us...,https://books.toscrape.com/catalogue/the-lean-...,£33.92,Business,In stock (3 available),https://books.toscrape.com/media/cache/39/f1/3...,3,\n Most startups fail. But many of those fa...
