In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time
import random

def scrape_goodreads_books(num_pages=5):
    base_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page={}"
    books_data = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for page in range(1, num_pages + 1):
        url = base_url.format(page)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        books = soup.find_all('tr', {'itemtype': 'http://schema.org/Book'})

        for book in books:
            title = book.find('a', {'class': 'bookTitle'}).text.strip()
            author = book.find('a', {'class': 'authorName'}).text.strip()
            rating = book.find('span', {'class': 'minirating'}).text.strip().split('avg')[0].strip()
            num_ratings = book.find('span', {'class': 'minirating'}).text.strip().split('—')[1].strip().split(' ')[0].replace(',', '')
            
            last_modified = response.headers.get('Last-Modified', None)
            if last_modified:
                timestamp = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z').strftime('%Y-%m-%d %H:%M:%S')
            else:
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            books_data.append({
                'Title': title,
                'Author': author,
                'Rating': rating,
                'Number of Ratings': num_ratings,
                'Timestamp': timestamp
            })

        time.sleep(random.uniform(1, 3))

    return books_data

def save_to_csv(data, filename='goodreads_books.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Title', 'Author', 'Rating', 'Number of Ratings', 'Timestamp']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"Data saved to {filename}")

if __name__ == "__main__":
    books_data = scrape_goodreads_books()
    save_to_csv(books_data)

Data saved to goodreads_books.csv


In [3]:
import pandas as pd
data = pd.read_csv("goodreads_books.csv")

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              500 non-null    object
 1   Author             500 non-null    object
 2   Rating             500 non-null    object
 3   Number of Ratings  500 non-null    int64 
 4   Timestamp          500 non-null    object
dtypes: int64(1), object(4)
memory usage: 19.7+ KB


In [11]:
data.iloc[12]

Title                  Wuthering Heights
Author                      Emily Brontë
Rating                              3.89
Number of Ratings                1869398
Timestamp            2024-10-13 12:35:17
Name: 12, dtype: object