In [1]:
pip install requests beautifulsoup4 pandas



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import re


In [12]:
BASE_URL = "https://books.toscrape.com/"

# Maps the word found in a book's star-rating CSS class to its numeric value.
RATING_MAP = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

def get_soup(url: str, timeout: float = 30.0) -> "BeautifulSoup":
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned; without it a stalled connection would block forever.

    Returns:
        The document parsed with the "html.parser" backend.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # Important: decode the body as UTF-8, otherwise a stray 'Â' character
    # shows up in the extracted text.
    resp.encoding = "utf-8"
    return BeautifulSoup(resp.text, "html.parser")

def clean_price(text: str):
    """Strip the stray 'Â' character and surrounding whitespace from a price string.

    Returns None when ``text`` is falsy (None or an empty string); otherwise
    returns the cleaned string.
    """
    # Nothing to clean for a missing or empty value.
    if not text:
        return None
    without_stray_char = text.replace("Â", "")
    return without_stray_char.strip()


In [13]:
def parse_book_detail(book_url: str, list_page_rating: int, list_page_title: str) -> dict:
    """Fetch one book detail page and collect its fields into a flat dict.

    Args:
        book_url: URL of the book's detail page.
        list_page_rating: Rating taken from the list page; stored as-is under "rating".
        list_page_title: Title taken from the list page; stored as-is under "title".

    Returns:
        A dict with the keys "category", "code", "cover", "title", "rating",
        "price (excl. tax)", "price (incl. tax)", "tax", "stock status",
        "number of stock available", "description" and "number of reviews".
        A field that cannot be located on the page is None.

    Raises:
        Any exception raised by ``get_soup`` while downloading the page.
    """
    soup = get_soup(book_url)

    # ---------- category ----------
    breadcrumb_links = soup.select("ul.breadcrumb li a")
    if len(breadcrumb_links) >= 3:
        # index 2 is usually the category (Home=0, Books=1, Category=2)
        category = breadcrumb_links[2].get_text(strip=True)
    elif len(breadcrumb_links) >= 2:
        category = breadcrumb_links[-1].get_text(strip=True)
    else:
        category = None

    # ---------- table info ----------
    table_data = {}
    for row in soup.select("table.table.table-striped tr"):
        th = row.find("th")
        td = row.find("td")
        if th is None or td is None:
            # find() returns None for a missing cell; skip rows that are not a
            # label/value pair instead of failing with AttributeError.
            continue
        table_data[th.get_text(strip=True)] = td.get_text(strip=True)

    code = table_data.get("UPC")
    price_excl_tax = clean_price(table_data.get("Price (excl. tax)"))
    price_incl_tax = clean_price(table_data.get("Price (incl. tax)"))
    tax = clean_price(table_data.get("Tax"))
    availability = table_data.get("Availability")
    num_reviews = table_data.get("Number of reviews")

    # ---------- stock status & number of stock ----------
    # The status keeps the full availability text; the count is pulled out of
    # its "(N available)" part when present.
    stock_status = availability
    num_stock = None
    if availability:
        match = re.search(r"\((\d+)\s+available\)", availability)
        if match:
            num_stock = int(match.group(1))

    # ---------- description ----------
    # The description text is the <p> that follows the #product_description div.
    desc = None
    desc_div = soup.find("div", id="product_description")
    if desc_div:
        p = desc_div.find_next_sibling("p")
        if p:
            desc = p.get_text(strip=True)

    # ---------- cover image ----------
    img_tag = soup.select_one("div.item.active img")
    cover_url = None
    if img_tag and img_tag.get("src"):
        # src is resolved against the page URL to obtain an absolute address
        cover_url = urljoin(book_url, img_tag["src"])

    # ---------- rating & title ----------
    rating = list_page_rating
    title = list_page_title

    # ---------- number of reviews -> int ----------
    num_reviews_int = None
    if num_reviews is not None:
        try:
            num_reviews_int = int(num_reviews)
        except ValueError:
            # non-numeric text: leave the count as None
            num_reviews_int = None

    return {
        "category": category,
        "code": code,
        "cover": cover_url,
        "title": title,
        "rating": rating,
        "price (excl. tax)": price_excl_tax,
        "price (incl. tax)": price_incl_tax,
        "tax": tax,
        "stock status": stock_status,
        "number of stock available": num_stock,
        "description": desc,
        "number of reviews": num_reviews_int
    }

In [17]:
# Upper bound on how many books are collected before the crawl stops.
MAX_BOOKS = 1000

books_data = []

current_url = BASE_URL

# Walk the paginated catalogue: scrape every book on the current list page,
# then follow the "next" link until the limit is reached or no page is left.
while True:
    print("Scraping list page:", current_url)
    soup = get_soup(current_url)

    # every book on the list page
    for article in soup.select("article.product_pod"):
        # title & detail-page URL
        a_tag = article.select_one("h3 a")
        book_title = a_tag.get("title").strip()
        book_url = urljoin(current_url, a_tag.get("href"))

        # rating from the list page, e.g. class == ['star-rating', 'Three'];
        # a missing tag or class attribute yields a rating of None
        rating_tag = article.select_one("p.star-rating")
        rating_classes = rating_tag.get("class", []) if rating_tag is not None else []
        rating_word = next((c for c in rating_classes if c in RATING_MAP), None)
        rating_value = RATING_MAP.get(rating_word)

        # parse the book's detail page
        books_data.append(parse_book_detail(book_url, rating_value, book_title))

        # stop as soon as the limit is reached
        if len(books_data) >= MAX_BOOKS:
            break

    if len(books_data) >= MAX_BOOKS:
        break

    # look for the link to the next list page
    next_link = soup.select_one("li.next a")
    if not next_link:
        break
    current_url = urljoin(current_url, next_link.get("href"))

print("Total buku yang di-scrape:", len(books_data))


Scraping list page: https://books.toscrape.com/
Scraping list page: https://books.toscrape.com/catalogue/page-2.html
Scraping list page: https://books.toscrape.com/catalogue/page-3.html
Scraping list page: https://books.toscrape.com/catalogue/page-4.html
Scraping list page: https://books.toscrape.com/catalogue/page-5.html
Scraping list page: https://books.toscrape.com/catalogue/page-6.html
Scraping list page: https://books.toscrape.com/catalogue/page-7.html
Scraping list page: https://books.toscrape.com/catalogue/page-8.html
Scraping list page: https://books.toscrape.com/catalogue/page-9.html
Scraping list page: https://books.toscrape.com/catalogue/page-10.html
Scraping list page: https://books.toscrape.com/catalogue/page-11.html
Scraping list page: https://books.toscrape.com/catalogue/page-12.html
Scraping list page: https://books.toscrape.com/catalogue/page-13.html
Scraping list page: https://books.toscrape.com/catalogue/page-14.html
Scraping list page: https://books.toscrape.com/cat

In [19]:
# Build the table in one step from the list of per-book records.
df = pd.DataFrame(books_data)

# Optional sanity check: the dimensions (expected to be (1000, 12)),
# followed by the first 5 rows.
print(df.shape, df.head(), sep="\n")

# Save the table to disk in two formats; the row index is left out of both.
# CSV
df.to_csv("books_toscrape_1000.csv", encoding="utf-8", index=False)

# Excel
df.to_excel("books_toscrape_1000.xlsx", sheet_name="Books Data", index=False)


(1000, 12)
             category              code  \
0              Poetry  a897fe39b1053632   
1  Historical Fiction  90fa61229261140a   
2             Fiction  6957f44c3847a760   
3             Mystery  e00eb4fd7b871a48   
4             History  4165285e1663650f   

                                               cover  \
0  https://books.toscrape.com/media/cache/fe/72/f...   
1  https://books.toscrape.com/media/cache/08/e9/0...   
2  https://books.toscrape.com/media/cache/ee/cf/e...   
3  https://books.toscrape.com/media/cache/c0/59/c...   
4  https://books.toscrape.com/media/cache/ce/5f/c...   

                                   title  rating price (excl. tax)  \
0                   A Light in the Attic       3            £51.77   
1                     Tipping the Velvet       1            £53.74   
2                             Soumission       1            £50.10   
3                          Sharp Objects       4            £47.82   
4  Sapiens: A Brief History of Humankind   