In [54]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from typing import List, Dict
import json
import math

In [64]:
# Root URL of the site being scraped; scrape_page builds each page URL from it.
BASE_URL = "https://fashion-studio.dicoding.dev"
# Last page number requested by scrape_all_products (pages are iterated from 1).
TOTAL_PAGES = 50

In [65]:
def parse_rating(text: str) -> float:
    """Convert a rating token into a float.

    Args:
        text: Candidate numeric token, e.g. ``"4.8"``.

    Returns:
        The parsed number, or ``math.nan`` when ``text`` cannot be converted
        (a non-numeric string, or a non-string value such as ``None``).
    """
    try:
        rating = float(text)
    except (ValueError, TypeError):
        # Unparseable input is reported as NaN rather than raised.
        rating = math.nan
    return rating

In [66]:
def _create_session() -> requests.Session:
    """Build a reusable ``requests.Session`` for the scraper.

    A single session is shared across requests so they are handled more
    efficiently than independent calls.

    Returns:
        A session whose default headers carry the ETL bot User-Agent.
    """
    http = requests.Session()
    http.headers["User-Agent"] = "Mozilla/5.0 (compatible; ETL-Bot/1.0)"
    return http

In [67]:
def scrape_page(session: requests.Session, page: int) -> List[Dict]:
    """Fetch one catalogue page and extract its product records.

    Args:
        session: HTTP session used to issue the GET request.
        page: Page number to fetch. Page 1 is served from ``BASE_URL``
            itself; every other page from ``{BASE_URL}/page{page}``.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``rating``,
        ``colors``, ``size`` and ``gender``. The list is empty when the
        request fails. A card that cannot be parsed is skipped with a
        warning instead of aborting the page.
    """
    products: List[Dict] = []
    url = BASE_URL if page == 1 else f"{BASE_URL}/page{page}"

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Network/HTTP failure: report it and return no products for this page.
        print(f"[ERROR] Failed to fetch page {page}: {e}")
        return products

    soup = BeautifulSoup(response.text, "html.parser")

    for card in soup.find_all("div", class_="product-details"):
        try:
            # Only the card's direct children are fields; exactly six are expected.
            fields = card.find_all(recursive=False)
            if len(fields) != 6:
                print(f"[WARNING] Unexpected structure: {len(fields)} elements")
                continue

            title, price, rating, colors, size, gender = (
                field.get_text() for field in fields
            )

            products.append(
                {
                    "title": title,
                    "price": price,
                    # Third token from the end is parsed as the rating value;
                    # presumably the text ends with "<value> / <max>" — TODO confirm.
                    "rating": parse_rating(rating.split()[-3]),
                    # First token of the colors text (kept as a string).
                    "colors": colors.split()[0],
                    # Second token; presumably the text is "<label> <value>" — TODO confirm.
                    "size": size.split()[1],
                    "gender": gender.split()[1],
                }
            )
        except (AttributeError, IndexError) as e:
            # IndexError covers field text with fewer tokens than the indexing
            # above expects; such a card is skipped, like any other malformed one.
            print(f"[WARNING] Skipped one product due to parsing issue: {e}")
            continue

    return products

In [68]:

def scrape_all_products() -> List[Dict]:
    """Scrape pages 1 through ``TOTAL_PAGES`` and collect every product.

    Each product dict gets a ``timestamp`` key holding one ISO-8601 UTC time
    captured once before scraping starts, so all records from a run share
    the same value.

    Returns:
        All product dicts from all pages, in page order.
    """
    # Function-scope import keeps this cell self-contained.
    from datetime import timezone

    all_products: List[Dict] = []
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC time
    # and drop tzinfo so the emitted string keeps its original offset-free format.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # The context manager closes the session's pooled connections, even on error.
    with _create_session() as session:
        for page in range(1, TOTAL_PAGES + 1):
            page_products = scrape_page(session, page)

            for product in page_products:
                product["timestamp"] = timestamp

            all_products.extend(page_products)

    return all_products

In [69]:
if __name__ == "__main__":
    data = scrape_all_products()
    print(f"Total Data: {len(data)}")
    # The result may be empty (e.g. every page request failed), so guard the
    # preview instead of indexing into an empty list.
    if data:
        print(data[0])
    else:
        print("[WARNING] No products were scraped")

Total Data: 1000
{'title': 'Unknown Product', 'price': '$100.00', 'rating': nan, 'colors': '5', 'size': 'M', 'gender': 'Men', 'timestamp': '2026-02-10T04:42:45.110720'}
