In [2]:
# Get all categories from the website based on the categories list
import json
import logging
import os
import re
import time
from typing import List, Dict, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Disable SSL warnings
# The listing requests in this notebook are sent with verify=False; this call
# silences the urllib3 warnings that would otherwise accompany them.
requests.packages.urllib3.disable_warnings()
# Module-level request headers; scrape_subcategories reads this dict directly.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Helper Functions

In [3]:
def save_subcategories_to_file(sub_categories: List[Dict], filename="sub_categories.json"):
    """Write ``sub_categories`` to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are stored as-is rather than escaped, and a
    confirmation line is printed once the file has been written.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(obj=sub_categories, fp=out_file, ensure_ascii=False, indent=2)
    confirmation = f"Subcategories saved to {filename}"
    print(confirmation)

def read_subcategories_from_file(filename="sub_categories.json") -> List[Dict]:
    """Parse ``filename`` (read as UTF-8) as JSON and return the result."""
    with open(filename, encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed
    
def read_all_url_from_file(path: str) -> List[Dict]:
    """Return the JSON content of ``path``, or an empty list if it does not exist."""
    if os.path.exists(path):
        with open(path, mode="r", encoding="utf-8") as source:
            return json.load(source)
    return []

def sanitize_filename(name: str) -> str:
    """Replace every character that is not a regex word character, '-', '_' or '.' with '_'."""
    disallowed = re.compile(r'[^\w\-_.]')
    return disallowed.sub('_', name)

def ensure_dir(path: str):
    """Create ``path`` including missing parents; an existing directory is left alone."""
    os.makedirs(name=path, exist_ok=True)

def get_logger(name, log_path):
    """Return the logger called ``name`` with a single file handler on ``log_path``.

    The logger level is set to INFO. Any handlers already attached to this
    logger (for example from an earlier run of the same notebook cell) are
    detached and closed first, so repeated calls neither duplicate log lines
    nor leave the previous log files open.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_path: File the handler appends to (UTF-8).

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Iterate over a copy: removeHandler mutates logger.handlers.
    # Closing matters here - merely clearing the list would leave each old
    # FileHandler's file descriptor open for the life of the kernel.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler = logging.FileHandler(log_path, mode='a', encoding='utf-8')
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)

    return logger

def load_fetched_urls_from_log(log_file):
    """Collect the URLs recorded as fetched in ``log_file``.

    A line counts when it contains the marker "Fetched:" exactly once; the
    stripped text after the marker is taken as the URL. A missing log file
    yields an empty set.
    """
    marker = "Fetched:"
    urls = set()
    if os.path.exists(log_file):
        with open(log_file, "r", encoding="utf-8") as log:
            for raw_line in log:
                entry = raw_line.strip()
                # e.g. "YYYY-MM-DD HH:MM:SS,mmm - INFO - Fetched: https://example.com/product/123"
                if entry.count(marker) != 1:
                    continue
                _before, _sep, after = entry.partition(marker)
                urls.add(after.strip())
    return urls

In [4]:
def scrape_subcategories(
    url: str,
    categories_table_header_class: str = "brand_cat_table",
    domain: str = "https://www.stylekorean.com",
    timeout: float = 30.0,
) -> List[Dict]:
    """Scrape the parent/child subcategory links from a category page.

    The page is expected to contain a ``<table>`` with the given class whose
    rows hold the parent category link in ``<th>`` and the child links in
    ``<td>``.

    Args:
        url: Category page to download.
        categories_table_header_class: CSS class of the subcategory table.
        domain: Base used to resolve relative ``href`` values.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"name", "url", "children": [{"name", "url"}, ...]}``
        dicts, or an empty list when the table is not found.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept because the rest of the notebook relies on it - confirm it is needed.
    # A timeout is set so an unresponsive server cannot block the cell forever.
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # an empty category.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("table", class_=categories_table_header_class)
    if not table:
        print(f"No table found with class '{categories_table_header_class}'")
        return []

    sub_categories = []
    for tr in table.find_all("tr"):
        th = tr.find("th")
        td = tr.find("td")
        if not (th and td):
            continue

        parent_link = th.find("a")
        # Rows without a usable parent link are skipped rather than raising.
        if not parent_link or not parent_link.get("href"):
            continue

        # urljoin handles relative, slash-less and already-absolute hrefs alike.
        children = [
            {"name": link.text.strip(), "url": urljoin(domain, link["href"])}
            for link in td.find_all("a", href=True)
        ]

        sub_categories.append({
            "name": parent_link.text.strip(),
            "url": urljoin(domain, parent_link["href"]),
            "children": children
        })

    return sub_categories

In [5]:
def get_pagination_info(soup: BeautifulSoup) -> Tuple[int, int]:
    """Return ``(current_page, last_page)`` read from a listing page.

    The numbers come from the ``<nav class="pg_wrap">`` element: the current
    page from its ``<strong class="pg_current">`` text, the last page from the
    ``page=<n>`` parameter in the href of its ``pg_page pg_end`` link. When the
    nav is missing the result is ``(1, 1)``; when the end link (or its page
    number) is missing, the last page equals the current page.
    """
    nav_class = "pg_wrap"
    nav = soup.find("nav", class_=nav_class)
    if not nav:
        print(f"[!] No pagination found with class '{nav_class}'")
        return 1, 1

    # Current page defaults to 1 when no marker is present.
    current = 1
    current_marker = nav.find("strong", class_="pg_current")
    if current_marker:
        current = int(current_marker.text.strip())

    # Last page falls back to the current page unless the end link says otherwise.
    last = current
    end_link = nav.find("a", class_="pg_page pg_end")
    if end_link and 'href' in end_link.attrs:
        found = re.search(r'page=(\d+)', end_link['href'])
        if found:
            last = int(found.group(1))

    return current, last


def scrape_all_products_from_paginated_category(
    base_url: str, headers: dict, delay: float = 1.0, timeout: float = 30.0
) -> List[Dict[str, str]]:
    """Collect product names and URLs from every page of a category listing.

    The base URL is downloaded once to read the pagination; that response is
    reused for the page it represents, and the remaining pages are requested
    one by one with ``delay`` seconds between requests.

    Args:
        base_url: Category listing URL.
        headers: HTTP headers sent with every request.
        delay: Seconds to sleep before each additional page request.
        timeout: Seconds to wait for the server on each request.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts, one per product link.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status. The whole category fails in that case so
            that a partial product list is never returned as if complete.
    """
    list_products = []
    grid_class = "productlist_skin"
    item_class = "sct_li"
    product_name_class = "sct_txt"

    print(f"Fetching base URL: {base_url}")
    # NOTE(review): verify=False disables TLS certificate verification - kept
    # for consistency with the other listing requests; confirm it is needed.
    response = requests.get(base_url, headers=headers, verify=False, timeout=timeout)
    response.raise_for_status()
    first_soup = BeautifulSoup(response.content, "html.parser")

    current_page, last_page = get_pagination_info(first_soup)
    print(f"Current Page: {current_page}, Last Page: {last_page}")

    # Append the page parameter correctly whether or not a query string exists.
    separator = "&" if "?" in base_url else "?"

    for page in range(1, last_page + 1):
        print(f"\nScraping page {page} of {last_page}...")
        paged_url = f"{base_url}{separator}page={page}"

        if page == current_page:
            # This page was already downloaded above; do not request it twice.
            soup = first_soup
        else:
            # Sleeping before each extra request guarantees a pause between
            # consecutive requests even when a page yields no products.
            print(f"Sleeping for {delay} seconds before next page...")
            time.sleep(delay)
            response = requests.get(paged_url, headers=headers, verify=False, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

        grid = soup.find("div", class_=grid_class)
        if not grid:
            print(f"[!] No product grid found on page {page}")
            continue

        items = grid.find_all("li", class_=item_class)
        if not items:
            print(f"[!] No items found in grid on page {page}")
            continue

        for item in items:
            product = item.find("p", class_=product_name_class)
            if not product:
                continue
            link = product.find("a")
            # Skip anchors without an href instead of raising KeyError.
            if not link or not link.get("href"):
                continue
            list_products.append({
                "name": link.text.strip(),
                # Absolute hrefs pass through; relative ones are resolved
                # against the page they were found on.
                "url": urljoin(paged_url, link["href"])
            })

    return list_products

In [6]:
def extract_product_data(soup: BeautifulSoup, domain: str = "") -> dict:
    """Extract the product fields from a parsed product-detail page.

    Every lookup is guarded: when an element is missing, the corresponding
    field falls back to a placeholder string, ``None``, ``0`` or an empty
    list instead of raising.

    Args:
        soup: Parsed HTML of one product page.
        domain: Prefix prepended to the brand link's ``href`` to build
            ``brand_url``.

    Returns:
        A dict with the keys ``code``, ``weight``, ``product_name``, ``brand``,
        ``brand_url``, ``price``, ``discount``, ``discount_price``, ``images``,
        ``notifications``, ``description``, ``extra_description``,
        ``instruction_image``, ``shipping_info`` and ``reviews``.
    """
    # Title: the <h1 id="sit_title"> text holds brand + product name; the brand
    # is the text of the first link inside it and is removed from the full text.
    title_h1 = soup.find("h1", id="sit_title")
    full_text = title_h1.text.strip() if title_h1 and title_h1.text else "No Title Found"
    brand_link = title_h1.find("a") if title_h1 else None
    brand_name = brand_link.text.strip() if brand_link and brand_link.text else "No Brand Name Found"
    product_name = full_text.replace(brand_name, '').strip()

    # Price and discount
    # NOTE(review): assumes the third row of the "sit_ov_tbl" table carries the
    # price in its first <span> and the discount in its second - confirm.
    table = soup.find("table", class_="sit_ov_tbl")
    td = table.find_all("tr") if table else []
    span = td[2].find_all('span') if len(td) > 2 else []
    price_raw = span[0].text.strip() if len(span) > 0 and span[0].text else "0"
    discount_raw = span[1].text.strip() if len(span) > 1 and span[1].text else "0%"
    discount_digits = re.sub(r"[^\d]", "", discount_raw)
    discount = int(discount_digits) if discount_digits else 0
    price_match = re.search(r"\d+(\.\d+)?", price_raw)
    discount_price = float(price_match.group()) if price_match else 0.0
    # Reconstruct the pre-discount price from the discounted one. Only done for
    # 0 < discount < 100: a 100% discount would divide by zero and anything
    # above 100 would produce a negative price.
    if 0 < discount < 100:
        price = round(discount_price / (1 - discount / 100), 2)
    else:
        price = discount_price

    # Images
    gallary_div = soup.find("div", id="sit_pvi_big")
    sit_pvi_big = gallary_div.find_all("a") if gallary_div else []
    images = []
    for link in sit_pvi_big:
        img_tag = link.find("img") if link else None
        if img_tag and img_tag.has_attr("src"):
            images.append({"url": img_tag["src"]})

    # Product info (code & weight)
    product_info = soup.find("div", class_="pro_info")
    info_text = product_info.text.replace("\xa0", " ").replace("\n", " ").strip() if product_info and product_info.text else ""

    code_match = re.search(r"Code\s*:\s*([A-Z0-9\-]+)", info_text)
    # NOTE(review): newlines were replaced by spaces above, so this pattern
    # captures everything after "Weight :" up to the end of the text (or a
    # carriage return) - verify that nothing follows the weight on the page.
    weight_match = re.search(r"Weight\s*:\s*([^\n\r]+)", info_text)
    code = code_match.group(1).strip() if code_match else None
    weight = weight_match.group(1).strip() if weight_match else None

    # Notifications
    notifications = []
    sub_noti = soup.find("div", class_="sub_noti")
    sub_noti_inners = sub_noti.find_all("div", class_="sub_noti_inner") if sub_noti else []
    for inner in sub_noti_inners:
        title = inner.find("div", class_="sub_noti_title")
        content = inner.find("div", class_="sub_noti_cont")
        notifications.append({
            "title": title.text.strip() if title and title.text else "No Title",
            "content": content.text.strip() if content and content.text else "No Content"
        })

    # Descriptions and usage
    main_taps2 = soup.find("div", id="main_taps2")
    yk0 = main_taps2.find("div", id="main_tabcontent_yk0") if main_taps2 else None
    yk1 = main_taps2.find("div", id="main_tabcontent_yk1") if main_taps2 else None

    desc_div = yk0.find("div", class_="is_contents") if yk0 else None
    description = desc_div.text.strip() if desc_div and desc_div.text else "No Description Found"

    # First <p> of the second tab is the extra description; the image inside
    # its second <p> is taken as the instruction image.
    ps = yk1.find_all("p") if yk1 else []
    extra_description = ps[0].text.strip() if len(ps) > 0 and ps[0].text else "No Extra Description Found"
    if len(ps) > 1:
        img_tag = ps[1].find("img")
        instruction_image = img_tag["src"] if img_tag and img_tag.has_attr("src") else "No Instruction Image Found"
    else:
        instruction_image = "No Instruction Image Found"

    # Shipping Info
    shiping_info = soup.find("div", id="shipping")
    shipping_text = shiping_info.text.strip() if shiping_info and shiping_info.text else "No Shipping Info Found"

    # Reviews
    # NOTE(review): the loop treats each review as a run of 7 consecutive <td>
    # cells (stars, title, email, content, -, date, -) and skips cells that
    # carry a colspan attribute - confirm against the page markup.
    review_table = soup.find("table", class_="review_table")
    tds = review_table.find_all("td") if review_table else []
    reviews = []
    i = 0
    while i < len(tds):
        if tds[i].has_attr("colspan"):
            i += 1
            continue
        try:
            star_img_tag = tds[i].find("img") if i < len(tds) else None
            star_img = star_img_tag["alt"] if star_img_tag and star_img_tag.has_attr("alt") else None
            title = tds[i + 1].text.strip() if i + 1 < len(tds) and tds[i + 1] else "No Title"
            email = tds[i + 2].text.strip() if i + 2 < len(tds) and tds[i + 2] else "No Email"
            content = tds[i + 3].decode_contents().strip() if i + 3 < len(tds) and tds[i + 3] else "No Content"
            date_text = tds[i + 5].text.strip() if i + 5 < len(tds) and tds[i + 5] else ""
            date_match = re.search(r"Posted on (\d{2}-\d{2}-\d{2})", date_text)
            date = date_match.group(1) if date_match else None

            reviews.append({
                "stars": star_img,
                "title": title,
                "email": email,
                "content": content,
                "date": date,
            })
            i += 7
        except Exception as e:
            # Best effort: a malformed review is reported and the scan resumes
            # at the next cell.
            print(f"Skipping review at index {i} due to error: {e}")
            i += 1

    return {
        "code": code,
        "weight": weight,
        "product_name": product_name,
        "brand": brand_name.strip('[]'),
        "brand_url": f"{domain}{brand_link['href']}" if brand_link and brand_link.has_attr("href") else "No URL Found",
        "price": price,
        "discount": discount,
        "discount_price": discount_price,
        "images": images,
        "notifications": notifications,
        "description": description,
        "extra_description": extra_description,
        "instruction_image": instruction_image,
        "shipping_info": shipping_text,
        "reviews": reviews
    }

In [7]:
# Constants
DOMAIN = "https://www.stylekorean.com"
DELAY_BETWEEN_FILES = 1

# Path components: <DATA_NAME>/<DATA_DIR or LOG_DIR>/<stage>
DATA_NAME = "stylekorean"
DATA_DIR = "data"
LOG_DIR = "log"
SUB_CATEGORIES = "sub_categories"
ALL_PRODUCT_URLS = "product_urls"
DETAIL_PRODUCTS = "product_details"

HEADER = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# One single-entry dict per top-level category: {display name: listing URL}
CATEGORIES = [
    {"Skincare": f"{DOMAIN}/shop/list.php?ca_id=10"},
    {"Makeup": f"{DOMAIN}/shop/list.php?ca_id=11"},
    {"Body Care": f"{DOMAIN}/shop/list.php?ca_id=12"},
    {"Hair Care": f"{DOMAIN}/shop/list.php?ca_id=13"},
    {"Beautiful Device & Tools": f"{DOMAIN}/shop/list.php?ca_id=14"}
]

# Directories: one data directory and one log directory per pipeline stage
subcat_data_dir, product_data_root, product_detail_data_dir = (
    os.path.join(DATA_NAME, DATA_DIR, stage)
    for stage in (SUB_CATEGORIES, ALL_PRODUCT_URLS, DETAIL_PRODUCTS)
)
subcat_log_dir, product_log_dir, product_detail_log_dir = (
    os.path.join(DATA_NAME, LOG_DIR, stage)
    for stage in (SUB_CATEGORIES, ALL_PRODUCT_URLS, DETAIL_PRODUCTS)
)

# Create them up front, data directories first, then log directories.
for _stage_dir in (
    subcat_data_dir,
    product_data_root,
    product_detail_data_dir,
    subcat_log_dir,
    product_log_dir,
    product_detail_log_dir,
):
    os.makedirs(_stage_dir, exist_ok=True)


# Get the subcategory links for each category

In [8]:
# Stage 1: scrape the subcategory table of every main category once and cache
# it as JSON. A category whose JSON file already exists is skipped, which makes
# the cell safe to re-run after an interruption.
for category in tqdm(CATEGORIES, desc="Main Categories", unit="category"):
    for name, url in category.items():
        sanitized_name = name.replace(' ', '_').lower()
        filename = f"{sanitized_name}_sub_categories.json"
        filepath = os.path.join(subcat_data_dir, filename)

        log_path = os.path.join(subcat_log_dir, f"{sanitized_name}.log")
        logger = get_logger(name, log_path)

        # Recovery check: announce either the skip or the start of the scrape.
        already_saved = os.path.exists(filepath)
        if already_saved:
            msg = f"[SKIP] {name} → Already scraped and saved: {filepath}"
        else:
            msg = f"[START] Scraping subcategories for {name} → {url}"
        logger.info(msg)
        tqdm.write(msg)
        if already_saved:
            continue

        try:
            sub_categories = scrape_subcategories(url)
            if not sub_categories:
                msg = f"[WARN] {name} → No subcategories found at {url}"
                logger.warning(msg)
            else:
                save_subcategories_to_file(sub_categories=sub_categories, filename=filepath)
                msg = f"[DONE] {name} → Saved {len(sub_categories)} subcategories to {filepath}"
                logger.info(msg)
            tqdm.write(msg)
        except Exception as e:
            # Best effort: a failing category is reported and the loop moves on.
            msg = f"[ERROR] {name} → Failed to scrape subcategories: {str(e)}"
            logger.error(msg)
            tqdm.write(msg)

        # Pause between categories that were actually requested.
        time.sleep(DELAY_BETWEEN_FILES)

Main Categories: 100%|██████████| 5/5 [00:00<00:00, 951.82category/s]

[SKIP] Skincare → Already scraped and saved: stylekorean/data/sub_categories/skincare_sub_categories.json
[SKIP] Makeup → Already scraped and saved: stylekorean/data/sub_categories/makeup_sub_categories.json
[SKIP] Body Care → Already scraped and saved: stylekorean/data/sub_categories/body_care_sub_categories.json
[SKIP] Hair Care → Already scraped and saved: stylekorean/data/sub_categories/hair_care_sub_categories.json
[SKIP] Beautiful Device & Tools → Already scraped and saved: stylekorean/data/sub_categories/beautiful_device_&_tools_sub_categories.json





# Read the subcategory files from the previous step to collect all product URLs

In [17]:
# Stage 2: for every cached subcategory file, scrape the product list of each
# leaf subcategory and store it as one JSON file per subcategory. Existing
# output files are skipped so the cell can be re-run after an interruption.
files = os.listdir(subcat_data_dir)

for file in tqdm(files, desc="Processing Subcategory Files", unit="file"):
    base_filename = os.path.splitext(file)[0]
    input_path = os.path.join(subcat_data_dir, file)

    output_dir = os.path.join(product_data_root, base_filename)
    os.makedirs(output_dir, exist_ok=True)

    log_path = os.path.join(product_log_dir, f"{base_filename}.log")
    logger = get_logger(base_filename, log_path)
    tqdm.write(f"Logging to: {log_path}")

    try:
        sub_categories = read_subcategories_from_file(input_path)

        # Flatten to the scrape targets: a parent's children when it has any,
        # otherwise the parent entry itself.
        all_targets = [
            target
            for sub_category in sub_categories
            for target in (sub_category['children'] if sub_category.get('children') else [sub_category])
        ]

        progress = tqdm(all_targets, desc=f"Scraping Products: {base_filename}", unit="subcategory", leave=False)
        for entry in progress:
            name = entry['name']
            url = entry['url']
            filename = sanitize_filename(name) + ".json"
            output_path = os.path.join(output_dir, filename)

            if os.path.exists(output_path):
                msg = f"[SKIP] {name} — already exists: {output_path}"
                logger.info(msg)
                tqdm.write(msg)
            else:
                msg = f"[START] Scraping: {name} → {url}"
                logger.info(msg)
                tqdm.write(msg)

                try:
                    products = scrape_all_products_from_paginated_category(url, HEADER, delay=DELAY_BETWEEN_FILES)
                    with open(output_path, "w", encoding="utf-8") as out_file:
                        json.dump(products, out_file, ensure_ascii=False, indent=2)
                    msg = f"[DONE] {name} — saved {len(products)} products to {filename}"
                    logger.info(msg)
                    tqdm.write(msg)
                except Exception as e:
                    # Best effort: report the failing subcategory and continue.
                    msg = f"[ERROR] {name} ({url}) — {str(e)}"
                    logger.error(msg)
                    tqdm.write(msg)

    except Exception as e:
        # Anything that breaks the whole file (e.g. unreadable JSON) lands here.
        logger.error(f"[FATAL ERROR] Failed to process {file}: {str(e)}")

    tqdm.write(f"[COMPLETE] Finished scraping: {base_filename}")
    time.sleep(DELAY_BETWEEN_FILES)

Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]

Logging to: stylekorean/log/product_urls/skincare_sub_categories.log



Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]                      
Processing Subcategory Files:   0%|          | 0/5 [00:00<?, ?file/s]          

[SKIP] Sheet Masks (270) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Sheet_Masks__270_.json
[SKIP] Sleeping Masks (23) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Sleeping_Masks__23_.json
[SKIP] Pad (96) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Pad__96_.json
[SKIP] Patch (68) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Patch__68_.json
[SKIP] Wash off Masks (82) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Wash_off_Masks__82_.json
[SKIP] Nose Pack (4) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Nose_Pack__4_.json
[SKIP] Toner (213) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Toner__213_.json
[SKIP] Emulsion (47) — already exists: stylekorean/data/product_urls/skincare_sub_categories/Emulsion__47_.json
[SKIP] Essence & Serum (563) — already exists: stylekorean/data/product_urls/skincare_sub_ca

Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]

Logging to: stylekorean/log/product_urls/makeup_sub_categories.log



Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processing Subcategory Files:  20%|██        | 1/5 [00:01<00:04,  1.05s/file]            
Processin

[SKIP] Foundation (27) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Foundation__27_.json
[SKIP] Makeup Base (21) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Makeup_Base__21_.json
[SKIP] Powder & Pact (13) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Powder___Pact__13_.json
[SKIP] Concealer (10) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Concealer__10_.json
[SKIP] Cushion (64) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Cushion__64_.json
[SKIP] Blusher & Highlighter (43) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Blusher___Highlighter__43_.json
[SKIP] Eyebrows (13) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Eyebrows__13_.json
[SKIP] Eyeshadow (40) — already exists: stylekorean/data/product_urls/makeup_sub_categories/Eyeshadow__40_.json
[SKIP] Eyeliner (11) — already exists: stylekorean/data/product_urls/mak

Processing Subcategory Files:  40%|████      | 2/5 [00:02<00:03,  1.05s/file]

Logging to: stylekorean/log/product_urls/body_care_sub_categories.log



Processing Subcategory Files:  40%|████      | 2/5 [00:02<00:03,  1.05s/file]              
Processing Subcategory Files:  40%|████      | 2/5 [00:02<00:03,  1.05s/file]              
Processing Subcategory Files:  40%|████      | 2/5 [00:02<00:03,  1.05s/file]              
Processing Subcategory Files:  40%|████      | 2/5 [00:02<00:03,  1.05s/file]              
Processing Subcategory Files:  40%|████      | 2/5 [00:02<00:03,  1.05s/file]              

[SKIP] Body Lotion & Oils (44) — already exists: stylekorean/data/product_urls/body_care_sub_categories/Body_Lotion___Oils__44_.json
[SKIP] Body wash (28) — already exists: stylekorean/data/product_urls/body_care_sub_categories/Body_wash__28_.json
[SKIP] Body Scrub (2) — already exists: stylekorean/data/product_urls/body_care_sub_categories/Body_Scrub__2_.json
[SKIP] Hand & Foot & Oral Care (33) — already exists: stylekorean/data/product_urls/body_care_sub_categories/Hand___Foot___Oral_Care__33_.json
[SKIP] Body Mist & Fragrance (11) — already exists: stylekorean/data/product_urls/body_care_sub_categories/Body_Mist___Fragrance__11_.json
[COMPLETE] Finished scraping: body_care_sub_categories


Processing Subcategory Files:  60%|██████    | 3/5 [00:03<00:02,  1.04s/file]

Logging to: stylekorean/log/product_urls/hair_care_sub_categories.log



Processing Subcategory Files:  60%|██████    | 3/5 [00:03<00:02,  1.04s/file]              
Processing Subcategory Files:  60%|██████    | 3/5 [00:03<00:02,  1.04s/file]              
Processing Subcategory Files:  60%|██████    | 3/5 [00:03<00:02,  1.04s/file]              
Processing Subcategory Files:  60%|██████    | 3/5 [00:03<00:02,  1.04s/file]              

[SKIP] Shampoo & Conditioner (40) — already exists: stylekorean/data/product_urls/hair_care_sub_categories/Shampoo___Conditioner__40_.json
[SKIP] Hair Treatment (25) — already exists: stylekorean/data/product_urls/hair_care_sub_categories/Hair_Treatment__25_.json
[SKIP] Hair Essence & Serum (18) — already exists: stylekorean/data/product_urls/hair_care_sub_categories/Hair_Essence___Serum__18_.json
[SKIP] Hair Color & Styling (8) — already exists: stylekorean/data/product_urls/hair_care_sub_categories/Hair_Color___Styling__8_.json
[COMPLETE] Finished scraping: hair_care_sub_categories


Processing Subcategory Files:  80%|████████  | 4/5 [00:04<00:01,  1.03s/file]

Logging to: stylekorean/log/product_urls/beautiful_device_&_tools_sub_categories.log



Processing Subcategory Files:  80%|████████  | 4/5 [00:04<00:01,  1.03s/file]                             
Processing Subcategory Files:  80%|████████  | 4/5 [00:04<00:01,  1.03s/file]                             
Processing Subcategory Files:  80%|████████  | 4/5 [00:04<00:01,  1.03s/file]                             
Processing Subcategory Files:  80%|████████  | 4/5 [00:04<00:01,  1.03s/file]                             

[SKIP] Beauty Device (10) — already exists: stylekorean/data/product_urls/beautiful_device_&_tools_sub_categories/Beauty_Device__10_.json
[SKIP] Cotton pad (3) — already exists: stylekorean/data/product_urls/beautiful_device_&_tools_sub_categories/Cotton_pad__3_.json
[SKIP] Brush (41) — already exists: stylekorean/data/product_urls/beautiful_device_&_tools_sub_categories/Brush__41_.json
[SKIP] Others (25) — already exists: stylekorean/data/product_urls/beautiful_device_&_tools_sub_categories/Others__25_.json
[COMPLETE] Finished scraping: beautiful_device_&_tools_sub_categories


Processing Subcategory Files: 100%|██████████| 5/5 [00:05<00:00,  1.03s/file]


# Get product details

In [None]:
# Stage 3: download every product page listed in the stage-2 files and store
# the extracted details as one JSON file per subcategory. Progress is resumable:
# URLs already present in the output file, or logged as "Fetched:", are skipped.
for folder in tqdm(os.listdir(product_data_root), desc="Processing Folders", unit="folder"):
    folder_path = os.path.join(product_data_root, folder)
    if not os.path.isdir(folder_path):
        continue

    for filename in tqdm(os.listdir(folder_path), desc=f"Processing Files in {folder}", unit="file", leave=False):
        input_file = os.path.join(folder_path, filename)
        urls_data = read_all_url_from_file(input_file)
        if not urls_data:
            continue

        output_folder = os.path.join(product_detail_data_dir, folder)
        ensure_dir(output_folder)

        log_folder = os.path.join(product_detail_log_dir, folder)
        ensure_dir(log_folder)

        output_file = os.path.join(output_folder, filename)
        log_file = os.path.join(log_folder, f"{os.path.splitext(filename)[0]}.log")

        logger = get_logger(f"{folder}_{filename}", log_file)
        logger.info(f"Processing: {input_file}")
        tqdm.write(f"Processing: {input_file}")

        # Load existing data
        if os.path.exists(output_file):
            with open(output_file, "r", encoding="utf-8") as f:
                extracted_items = json.load(f)
        else:
            extracted_items = []

        existing_urls = {item.get("original_url") for item in extracted_items}
        fetched_urls = load_fetched_urls_from_log(log_file)

        for product in tqdm(urls_data, desc=f"Fetching Products in {filename}", unit="product", leave=False):
            name = product.get("name")
            url = product.get("url")

            if not url or url in fetched_urls or url in existing_urls:
                msg = f"[SKIP] Already fetched: {url}"
                logger.info(msg)
                tqdm.write(msg)
                continue

            try:
                msg = f"[FETCH] {url}"
                logger.info(msg)
                tqdm.write(msg)

                # Send the same browser User-Agent as the listing requests;
                # certificate verification stays enabled for these requests.
                res = requests.get(url, headers=HEADER, timeout=10)
                res.raise_for_status()
                soup = BeautifulSoup(res.text, "html.parser")

                data = extract_product_data(soup, DOMAIN)
                data["original_name"] = name
                data["original_url"] = url
                data["category"] = folder
                data["subcategory"] = os.path.splitext(filename)[0]

                extracted_items.append(data)
                # Remember the URL so a duplicate entry in the same list is
                # not downloaded a second time.
                existing_urls.add(url)

                # Save incrementally. Writing to a temporary file and swapping
                # it in keeps the previous output intact if the write is
                # interrupted, so the next run never loads truncated JSON.
                tmp_file = f"{output_file}.tmp"
                with open(tmp_file, "w", encoding="utf-8") as f:
                    json.dump(extracted_items, f, indent=2, ensure_ascii=False)
                os.replace(tmp_file, output_file)

                # Record the URL as fetched only after the data is on disk;
                # this log line is what the resume logic trusts.
                logger.info(f"Fetched: {url}")

            except Exception as e:
                # Best effort: report the failing product and continue.
                err_msg = f"[ERROR] Fetching {url}: {e}"
                logger.error(err_msg)
                tqdm.write(err_msg)

            time.sleep(DELAY_BETWEEN_FILES)

        logger.info(f"[DONE] Saved final data: {output_file}")
        tqdm.write(f"[DONE] Saved final data: {output_file}")

        # Each file gets its own logger; release its log file handle now that
        # the file is finished instead of keeping it open for the whole run.
        for log_handler in list(logger.handlers):
            logger.removeHandler(log_handler)
            log_handler.close()