In [1]:
import os
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time

# Douban chart sub-categories to scrape. Every chart page shares the same URL
# shape and differs only in the `subcat` query parameter, so the mapping of
# category name -> chart URL is generated from a template.
_CHART_URL_TEMPLATE = 'https://book.douban.com/chart?subcat={}&icn=index-topchart-popular'
_SUBCATEGORIES = (
    'all',
    'literary',
    'novel',
    'history',
    'social',
    'tech',
    'art',
    'drama',
    'business',
    'comics',
    'suspense_novel',
    'science_fiction',
)
categories = {subcat: _CHART_URL_TEMPLATE.format(subcat) for subcat in _SUBCATEGORIES}

# Request headers sent with every HTTP call so it looks like a desktop browser
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Accumulator for the book records collected from every category
all_books = []

# Function to fetch ISBN and book description
def fetch_isbn_and_description(book_url, delay=10, timeout=30):
    """Fetch the ISBN and description published in a book page's meta tags.

    The values are read from the ``<meta property="book:isbn">`` and
    ``<meta property="og:description">`` tags of the page at ``book_url``.
    The request is sent with the module-level ``headers``.

    Args:
        book_url: Absolute HTTP(S) URL of the book's detail page.
        delay: Seconds to sleep before sending the request, to throttle
            consecutive calls. Defaults to the original 10 seconds.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(isbn, description)`` tuple of strings. An element is ``'N/A'``
        when its meta tag (or the tag's ``content`` attribute) is missing;
        both are ``'N/A'`` when the URL is unusable, the request fails, or
        the server does not answer with status 200.
    """
    # Reject anything that is not an absolute HTTP(S) URL (for example an
    # 'N/A' placeholder) up front: requests would raise MissingSchema, and
    # there is no point in sleeping before a request that cannot be sent.
    if not isinstance(book_url, str) or not book_url.startswith(('http://', 'https://')):
        print(f"Skipping book page lookup: invalid URL {book_url!r}")
        return 'N/A', 'N/A'

    time.sleep(delay)  # Delay between requests to avoid detection

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(book_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve the book page {book_url}: {e}")
        return 'N/A', 'N/A'

    if response.status_code != 200:
        print(f"Failed to retrieve the book page. Status code: {response.status_code}")
        return 'N/A', 'N/A'

    soup = BeautifulSoup(response.content, 'html.parser')

    def meta_content(prop):
        # .get() tolerates a <meta> tag that lacks the content attribute,
        # where plain indexing would raise KeyError.
        tag = soup.find('meta', property=prop)
        content = tag.get('content') if tag is not None else None
        return content.strip() if content is not None else 'N/A'

    return meta_content('book:isbn'), meta_content('og:description')

# Function to scrape book information from a given category page
def scrape_category(category, url):
    """Scrape the top 10 books of one chart category into ``all_books``.

    Downloads the chart page at ``url``, extracts title, author, publisher,
    rating, number of ratings and detail-page URL for the first 10 entries,
    looks up ISBN and description through ``fetch_isbn_and_description`` and
    appends one dictionary per book to the module-level ``all_books`` list.

    Nothing is appended when the page cannot be retrieved or when fewer than
    10 entries are found (treated as a sign that the page layout changed).

    Args:
        category: Category name stored in each record and used in messages.
        url: URL of the category's chart page.

    Returns:
        None. Results are collected in ``all_books``.
    """
    try:
        # A timeout keeps one stalled connection from blocking the whole run.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to retrieve the page for category '{category}': {e}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve the page for category '{category}'. Status code: {response.status_code}")
        return

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the book entries in the page
    books = soup.select('li.media.clearfix, li.chart-digest-media.clearfix')

    if len(books) < 10:
        print(f"Only found {len(books)} books for category '{category}'. The structure might have changed.")
        return

    # Extract information for each of the top 10 books
    for i, book in enumerate(books[:10]):
        try:
            title_tag = book.find('h2')
            link_tag = title_tag.find('a') if title_tag else None
            # An <h2> without a link raises AttributeError here and the
            # entry is reported and skipped by the handler below.
            title = link_tag.text.strip() if title_tag else 'N/A'

            author_info_tag = book.find('p', class_='subject-abstract')
            author_info = author_info_tag.text.strip() if author_info_tag else 'N/A'

            # Split author info to get author and publisher separately.
            # NOTE(review): assumes the publisher is the third field from
            # the end of the '/'-separated abstract -- confirm against the
            # current page layout.
            author_info_parts = author_info.split('/')
            author = author_info_parts[0].strip()
            publisher = author_info_parts[-3].strip() if len(author_info_parts) > 2 else 'N/A'

            rating_tag = book.find('span', class_='font-small color-red fleft')
            rating = rating_tag.text.strip() if rating_tag else 'N/A'

            num_ratings_tag = book.find('span', class_='fleft ml8 color-gray')
            num_ratings = num_ratings_tag.text.strip()[1:-1] if num_ratings_tag else 'N/A'  # Remove parentheses

            # .get() avoids a KeyError when the link has no href attribute.
            url_tag = link_tag.get('href', 'N/A') if title_tag else 'N/A'

            # Fetch ISBN and description only when there is a usable link;
            # requesting the 'N/A' placeholder would raise MissingSchema and
            # abort the whole run. A failed lookup must not lose the record.
            isbn, description = 'N/A', 'N/A'
            if url_tag.startswith(('http://', 'https://')):
                try:
                    isbn, description = fetch_isbn_and_description(url_tag)
                except requests.RequestException as e:
                    print(f"Failed to fetch details for book at index {i} in category '{category}': {e}")

            # Store the extracted data in a dictionary
            all_books.append({
                'category': category,
                'number': i + 1,
                'title': title,
                'author': author,
                'publisher': publisher,
                'rating': rating,
                'num_ratings': num_ratings,
                'url': url_tag,
                'isbn': isbn,
                'description': description,
            })
        except AttributeError as e:
            print(f"Error parsing book at index {i} in category '{category}': {e}")

# Scrape each category; results accumulate in all_books
for category, url in categories.items():
    scrape_category(category, url)

# Get the current date in YYYYMMDD format
current_date = datetime.now().strftime('%Y%m%d')

# Specify the CSV file name with the current date and the designated folder
folder_path = 'PATH TO YOUR FOLDER'
csv_file = os.path.join(folder_path, f'top_10_books_all_categories_{current_date}.csv')

# Define the CSV column headers
csv_columns = ['category', 'number', 'title', 'author', 'publisher', 'rating', 'num_ratings', 'url', 'isbn', 'description']

# Write to the CSV file
try:
    # exist_ok=True replaces the racy "check, then create" sequence, and
    # creating the folder inside the try block means a failure here is
    # reported like any other I/O error instead of raising a traceback.
    os.makedirs(folder_path, exist_ok=True)
    # utf-8-sig writes a BOM at the start of the file.
    with open(csv_file, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(all_books)
    print(f"Top 10 books information from all categories has been written to {csv_file}")
except OSError as e:  # IOError is an alias of OSError in Python 3
    print(f"I/O error({e.errno}): {e.strerror}")


Top 10 books information from all categories has been written to C:/Users/lbjennifer/OneDrive - HKUST/Documents/Book Selection/douban - weekly\top_10_books_all_categories_20240703.csv
