In [15]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# URL that we will scrape. {} will get replaced with the correct page
BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html" 

# map word ratings to number ratings
def get_rating(rating_text):
    ratings_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5
    }
    return ratings_map.get(rating_text, 0) #default to zero if none

In [14]:
def scrape_book_page(page_number):
    #get formatted URL with right page number
    url = BASE_URL.format(page_number)
    # send our GET HTTP request, returns a 'Response' object and a status code
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error fetching page {page_number}: HTTP {response.status_code}")
        return []
        
    #create beautiful soup parser with the text
    soup = BeautifulSoup(response.text, 'html.parser') #response.text is the html data as a string, creates a parse tree
    book_list = soup.find_all('article', class_='product_pod') # looks for <article> tags with class product_pod
    
    results = [] 
    for book in book_list:
        # EXAMPLE: <a href="book1.html" title="The Great Book">Click here</a>
        
        # Title
        title_tag = book.find('h3').find('a') # the 'a' tag contains the title attribute
        title = title_tag['title'] if title_tag else 'No Title Found' #extracts the title attribute, defaults to No Title Found
        
        # Price
        price_tag = book.find('p', class_='price_color')
        price = price_tag.text.strip() if price_tag else 'N/A' #extracts the price text
        
        # Availability
        stock_tag = book.find('p', class_='instock availability')
        availability = stock_tag.text.strip() if stock_tag else 'N/A' #extracts the stock text 
        
        # Rating (e.g. "star-rating Three")
        rating_tag = book.find('p', class_='star-rating')
        if rating_tag:
            rating_classes = rating_tag.get('class', [])
            # The second class might be "One", "Two", "Three", etc.
            rating_text = rating_classes[1] if len(rating_classes) > 1 else 'No Rating'
        else:
            rating_text = 'No Rating'
        
        numeric_rating = get_rating(rating_text)
        
        results.append({
            'title': title,
            'price': price,
            'availability': availability,
            'rating': numeric_rating
        })
    
    return results

# Scrape first page
sample_books = scrape_book_page(1)
print(f"Scraped {len(sample_books)} books from page 1.")
sample_books[:3]  # Show first 3 results

Scraped 20 books from page 1.


[{'title': 'A Light in the Attic',
  'price': 'Â£51.77',
  'availability': 'In stock',
  'rating': 3},
 {'title': 'Tipping the Velvet',
  'price': 'Â£53.74',
  'availability': 'In stock',
  'rating': 1},
 {'title': 'Soumission',
  'price': 'Â£50.10',
  'availability': 'In stock',
  'rating': 1}]

In [10]:
def scrape_multiple_pages(start_page=1, end_page=3):
    all_books = []
    
    for page in range(start_page, end_page + 1):
        print(f"Scraping page {page}...")
        page_books = scrape_book_page(page)
        all_books.extend(page_books)
        
        # Polite delay
        time.sleep(1)
    
    return all_books
#
books_data = scrape_multiple_pages(1, 2)
print(f"Total books scraped: {len(books_data)}")

Scraping page 1...
Scraping page 2...
Total books scraped: 40


In [11]:
def save_to_csv(book_list, filename="books.csv"):
    fieldnames = ['title', 'price', 'availability', 'rating']
    with open(filename, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for book in book_list:
            writer.writerow(book)
            
save_to_csv(books_data, "books.csv")
print("Data saved to 'books.csv'")

Data saved to 'books.csv'
