<a href="https://colab.research.google.com/github/yesminehe/AI-Inventory-Management/blob/main/sephora_scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install selenium beautifulsoup4

Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Downloading urllib3-2.6.3-py3-none-any.whl (131 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.6/131.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.20
    Uninstalling urllib3-1.26.20:
      Successfully uninstalled urllib3-1.26.20
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyppeteer 2.0.0 requires pyee<12.0.0,>=11.0.0, but you have pyee 13.0.0 which is incompatible.
pyppeteer 2.0.0 requires urllib3<2.0.0,>=1.25.8, but you have urllib3 2.6.3 which is incompatible.
google-adk 1.21.0 requires websockets<16.0.0,>=15.0.1, but you have websockets 10.4 which is incompatible.[0

In [3]:
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
import uuid
from datetime import datetime
import time

def create_robust_session():
    """Build a ``requests.Session`` that retries transient HTTP failures.

    The session uses a urllib3 ``Retry`` policy with ``total=5`` and
    ``backoff_factor=2`` that is triggered by responses with status
    429, 500, 502, 503 or 504.

    The retrying adapter is mounted for both ``https://`` and ``http://`` so
    that plain-HTTP URLs (for example an HTTP hop in a redirect chain) get
    the same retry policy instead of the session's default adapter.

    Returns:
        requests.Session: the configured session.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    for scheme in ("https://", "http://"):
        session.mount(scheme, adapter)
    return session

def get_product_details(session, url):
    """Fetch a product page and extract its description and main image URL.

    This is a best-effort lookup: any ordinary failure (network error, HTTP
    error status, unexpected markup) is reported on stdout and results in a
    pair of empty strings, so a single bad product page never aborts the
    caller's crawl. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
    NOT swallowed so the user can still stop a long-running scrape.

    Args:
        session: object exposing ``get(url, headers=..., timeout=...)``,
            typically a ``requests.Session``.
        url: absolute URL of the product page.

    Returns:
        tuple[str, str]: ``(description, image_url)``; each element is ``""``
        when the corresponding element is missing or the fetch failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0 Safari/537.36'}
    try:
        response = session.get(url, headers=headers, timeout=30)
        # Do not try to scrape an error page (404, 403, ...) as a product.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prefer the short description; fall back to the description tab.
        desc_box = (
            soup.select_one('.woocommerce-product-details__short-description')
            or soup.select_one('#tab-description')
        )
        description = desc_box.get_text(separator="\n", strip=True) if desc_box else ""

        # Prefer the gallery image; fall back to the post thumbnail.
        img_elem = (
            soup.select_one('.woocommerce-product-gallery__image img')
            or soup.select_one('.wp-post-image')
        )
        # An <img> without a src attribute yields None; normalise to "".
        img_url = (img_elem.get('src') or "") if img_elem else ""
        return description, img_url
    except Exception as exc:
        # Best-effort: report and fall back to empty details.
        print(f"  Warning: could not fetch details for {url}: {exc}")
        return "", ""

def _parse_price_text(text):
    """Convert a displayed price string into a float, or 0.0 if unparsable."""
    # Treat ',' as a decimal point and drop currency symbols / whitespace.
    cleaned = ''.join(
        c for c in text.replace(',', '.') if c.isdigit() or c == '.'
    ).strip('.')
    # Several separators (e.g. "1.234,500"): only the last one is the decimal
    # point, the others are thousands separators.
    if cleaned.count('.') > 1:
        whole, _, fraction = cleaned.rpartition('.')
        cleaned = whole.replace('.', '') + '.' + fraction
    try:
        return float(cleaned)
    except ValueError:
        return 0.0


def _extract_prices(price_container):
    """Return ``(regular, special)`` prices read from a ``.price`` element."""
    if price_container is None:
        return 0.0, 0.0
    del_tag = price_container.select_one('del bdi')
    ins_tag = price_container.select_one('ins bdi')
    if del_tag is not None and ins_tag is not None:
        # Discounted product: <del> holds the old price, <ins> the new one.
        return (
            _parse_price_text(del_tag.get_text()),
            _parse_price_text(ins_tag.get_text()),
        )
    bdi_tag = price_container.select_one('bdi')
    if bdi_tag is not None:
        # No discount: the single price is both regular and special.
        val = _parse_price_text(bdi_tag.get_text())
        return val, val
    return 0.0, 0.0


def scrape_sephora_to_schema(base_url, max_pages=1):
    """Scrape product listing pages and map each product to the catalog schema.

    For every listing page the function selects ``li.product`` items, reads
    the name, product link and prices, fetches the product page for the
    description and main image, and builds one schema dictionary per product.
    It sleeps one second after each product.

    A failure while processing a page (request error, HTTP error status,
    unexpected exception) is printed and stops the crawl; the products
    collected so far are still returned. An unparsable price no longer raises:
    it is recorded as ``0.0`` so one odd price cannot abort the crawl.

    Args:
        base_url: URL of the first listing page. Later pages are requested at
            ``<base_url>/page/<n>/``; a missing trailing slash is tolerated.
        max_pages: number of listing pages to visit, starting at page 1.

    Returns:
        list[dict]: one schema dictionary per scraped product.
    """
    session = create_robust_session()
    products = []

    for page in range(1, max_pages + 1):
        if page > 1:
            page_url = f"{base_url.rstrip('/')}/page/{page}/"
        else:
            page_url = base_url
        print(f"--- Processing Page {page} ---")

        try:
            response = session.get(page_url, timeout=20)
            # A 404 past the last listing page lands in the except below and
            # ends the crawl instead of being parsed as an empty listing.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            items = soup.select('li.product')

            for item in items:
                name_elem = item.select_one('.woocommerce-loop-product__title')
                link_elem = item.select_one('a.woocommerce-LoopProduct-link')
                if not name_elem or not link_elem:
                    continue

                name = name_elem.get_text(strip=True)
                product_url = link_elem.get('href')

                reg_val, spec_val = _extract_prices(item.select_one('.price'))

                print(f"  Mapping: {name[:30]}... [Reg: {reg_val} / Spec: {spec_val}]")
                full_desc, main_image = get_product_details(session, product_url)

                product_id = uuid.uuid4().hex[:16]
                products.append({
                    "id": product_id,
                    "name": name,
                    "description": full_desc,
                    "shortDescription": name,
                    "extraDescription": "",
                    "tags": ["Sephora"],
                    "type": "simple_product",
                    "createdAt": datetime.now().isoformat(),
                    "status": "1",
                    "categories": {"id": "beauty_and_health", "name": "beauty_and_health"},
                    "price": spec_val,  # Top level price is the one the user pays
                    "rating": 0.0,
                    "noOfRating": 0,
                    "images": {"main": main_image, "others": [main_image] if main_image else []},
                    "videoUrl": "",
                    "quantityConfig": {"minimum": "1", "totalAllowed": "5", "stepSize": "1"},
                    "warrantyPeriod": "3 months",
                    "isReturnable": "1",
                    "isCancelable": "1",
                    "isAttachmentRequired": "0",
                    "simpleProductData": {
                        "price": {
                            "regular": str(reg_val),
                            "special": str(spec_val)
                        },
                        "stock": {"sku": f"SEPHORA_{product_id[:8].upper()}", "status": "1", "totalStock": 999},
                        "dimensions": {"weight": "0", "length": "0", "width": "0", "height": "0"}
                    },
                    "attributes": [],
                    "variableProductData": None,
                    "vendorId": "sephora_tn"
                })
                # Pause between product-page requests.
                time.sleep(1)
        except Exception as e:
            print(f"Error: {e}")
            break
    return products

if __name__ == "__main__":
    # Brand listing page to crawl; the first three listing pages are scraped.
    url = "https://www.drest.tn/marques/sephora/"
    results = scrape_sephora_to_schema(url, max_pages=3)

    # ensure_ascii=False keeps non-ASCII characters in product names as-is in
    # the UTF-8 output file instead of \uXXXX escapes.
    with open('sephora_final2.json', 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(results, ensure_ascii=False, indent=2))

--- Processing Page 1 ---
  Mapping: Sephora – Pack The Only One – ... [Reg: 745.0 / Spec: 89.9]
  Mapping: Sephora – Pack Catchi 5 – (1 A... [Reg: 695.0 / Spec: 69.0]
  Mapping: Sephora – Pack Electric Eye & ... [Reg: 420.0 / Spec: 54.0]
  Mapping: Sephora – Pack MACMAC – (1 Ach... [Reg: 681.0 / Spec: 42.0]
  Mapping: Sephora – Pack Playful Plum Po... [Reg: 320.0 / Spec: 42.0]
  Mapping: Sephora – Pack The Big One – (... [Reg: 845.0 / Spec: 99.0]
  Mapping: Sephora – Pack The Modern Natu... [Reg: 135.0 / Spec: 36.0]
  Mapping: Sephora – Rouge à Levres Rouge... [Reg: 50.0 / Spec: 12.0]
  Mapping: Sephora – Pack 0002... [Reg: 200.0 / Spec: 35.0]
  Mapping: Sephora – Rouge à Levres Rouge... [Reg: 50.0 / Spec: 12.0]
  Mapping: Sephora – Base Yeux – 02 Matte... [Reg: 50.0 / Spec: 12.0]
  Mapping: Sephora – Fond De Teint Perfec... [Reg: 80.0 / Spec: 12.0]
  Mapping: Sephora – Vernis A Ongle Color... [Reg: 15.0 / Spec: 5.9]
  Mapping: Sephora – Pack All In One – (1... [Reg: 161.0 / Spec: 29.