In [None]:
# ==============================
# STEP 1: Install dependencies
# ==============================
!apt-get update -qq
!apt-get install -y chromium-chromedriver
!pip install selenium beautifulsoup4

# ==============================
# STEP 2: Imports
# ==============================
import sqlite3
import json
import time
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ==============================
# STEP 3: Create Driver Function
# (No webdriver_manager, no explicit Service path)
# ==============================
def create_driver():
    """Create a headless Chrome WebDriver.

    The browser is started without a UI and with the GPU, the sandbox and
    /dev/shm usage disabled. No driver path is passed explicitly; the
    system chromedriver is expected to be discoverable by Selenium.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    flags = (
        '--headless',               # No UI
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    opts = Options()
    for flag in flags:
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

# ==============================
# STEP 4: Database Setup
# ==============================
# Root used to turn relative product hrefs into absolute URLs.
base_url = 'https://www.fairprice.com.sg'

# Schema of the single table holding one row per scraped product page.
# `url` is UNIQUE so that re-inserting the same product can be ignored.
_PRODUCTS_TABLE_DDL = '''
    CREATE TABLE IF NOT EXISTS products (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT,
        price REAL,
        size TEXT,
        ratings REAL,
        brand TEXT,
        origin TEXT,
        key_information TEXT,
        additional_information TEXT,
        dietary TEXT,
        nutritional_data TEXT,
        ingredients TEXT,
        url TEXT UNIQUE,
        category TEXT
    )
'''

# Module-level connection/cursor shared with the main execution step.
conn = sqlite3.connect('fairprice_products.db')
cursor = conn.cursor()
cursor.execute(_PRODUCTS_TABLE_DDL)
conn.commit()

# ==============================
# STEP 5: Helper for extracting section info
# ==============================
def extract_info(soup, header_list):
    """Return the text of the section that follows a matching ``<h2>`` header.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        header_list: Header titles to look for. The first ``<h2>`` whose
            stripped text equals any entry is used.

    Returns:
        The section text, or ``None`` when the header is absent, has no next
        sibling element, or the section contains no text. For a ``<ul>``
        sibling the non-empty items are joined with newlines.
    """
    header = soup.find('h2', string=lambda t: t and t.strip() in header_list)
    if not header:
        return None
    content = header.find_next_sibling()
    if not content:
        return None

    if content.name == 'ul':
        # Prefer the known item spans, but fall back to the <li> elements so
        # that a changed generated class name does not silently produce an
        # empty section (the <div> branch below already falls back this way).
        nodes = content.select("li span.sc-aa673588-1.gRHrCx") or content.find_all("li")
        items = [
            node.get_text(strip=True).strip("•").strip("*").strip()
            for node in nodes
        ]
        text = "\n".join(item for item in items if item)
    elif content.name == 'div':
        # Try the two known span selectors, then the whole block.
        target = (
            content.find("span", class_="sc-d6741239-1 sc-704a0dea-3 ddJuGi jhbHHq")
            or content.select_one(".sc-aa673588-1.gRHrCx")
            or content
        )
        text = target.get_text(strip=True)
    else:
        text = content.get_text(strip=True)

    # Normalise "nothing found" to None so every miss looks the same to callers.
    return text or None

def extract_dietary_info(soup):
    """Collect the dietary labels (e.g. "Halal") shown on a product page.

    Each ``div.sc-a0a7679c-0.dIEbVv`` contributes the text of its first
    ``span.sc-d6741239-1.dxIZHM``; containers without such a span are skipped.

    Returns:
        The labels joined with newlines, or ``None`` when none were found.
    """
    badges = (
        box.select_one("span.sc-d6741239-1.dxIZHM")
        for box in soup.select("div.sc-a0a7679c-0.dIEbVv")
    )
    labels = [badge.get_text(strip=True) for badge in badges if badge]
    if not labels:
        return None
    return "\n".join(labels)

# ==============================
# STEP 6: Product Detail Scraping
# ==============================
def _parse_number(text):
    """Return ``text`` as a float after dropping ``$`` and ``,``; ``None`` if not numeric."""
    try:
        return float(text.replace('$', '').replace(',', '').strip())
    except ValueError:
        return None


def _extract_nutrition(soup):
    """Return the nutrition table as a JSON object string, or ``None`` if absent/empty."""
    nutri_ul = soup.find('ul', class_='sc-ad6d339b-0 lhIfvG')
    if not nutri_ul:
        return None
    nutritional_data = {}
    # Skip the first <li>, which holds the column headers.
    for li in nutri_ul.find_all('li')[1:]:
        spans = li.find_all('span')
        if len(spans) == 2:
            nutritional_data[spans[0].text.strip()] = spans[1].text.strip()
    return json.dumps(nutritional_data) if nutritional_data else None


def _parse_product_page(soup):
    """Extract every product field (except category and url) from a parsed product page."""
    name_element = soup.find('span', class_='sc-d6741239-1 eWUoWV')
    price_element = soup.find('span', class_='sc-d6741239-1 sc-747538d2-5 jybCaJ eQpgNG')
    size_element = soup.find('span', class_='sc-d6741239-1 sc-e94e62e6-3 dVWVNZ cvQTxq')
    ratings_element = soup.find('span', class_='sc-6fe931dc-4 gnxVUm pdp')
    brand_element = soup.select_one("[data-testid='brandDetails'] a")

    # The size text lives in a nested <span> of the size element.
    inner_span = size_element.find("span") if size_element else None

    return {
        'name': name_element.text.strip() if name_element else None,
        # A malformed price/rating becomes None instead of discarding the product.
        'price': _parse_number(price_element.text) if price_element else None,
        'size': inner_span.get_text(strip=True) if inner_span else None,
        'ratings': _parse_number(ratings_element.text) if ratings_element else None,
        'brand': brand_element.get_text(strip=True) if brand_element else None,
        'origin': extract_info(soup, ['COUNTRY/PLACE OF ORIGIN']),
        'key_information': extract_info(soup, ['KEY INFORMATION']),
        'additional_information': extract_info(soup, ['ADDITIONAL INFORMATION']),
        'dietary': extract_dietary_info(soup),
        'ingredients': extract_info(soup, ['INGREDIENTS']),
        'nutritional_data': _extract_nutrition(soup),
    }


def scrape_product_detail(link, category_name):
    """Scrape a single product page with its own short-lived browser.

    Args:
        link: Absolute URL of the product page.
        category_name: Category label stored with the product and used in logs.

    Returns:
        A dict with the keys ``category``, ``name``, ``price``, ``size``,
        ``ratings``, ``brand``, ``origin``, ``key_information``,
        ``additional_information``, ``dietary``, ``ingredients``,
        ``nutritional_data`` and ``url`` (missing fields are ``None``), or
        ``None`` when the page could not be loaded or parsed. Returning
        ``None`` instead of a half-filled dict lets callers filter failures
        with a plain truthiness check.

    Raises:
        Whatever ``create_driver()`` raises if the browser cannot be started;
        errors after that point are logged and reported as ``None``.
    """
    driver = create_driver()
    try:
        driver.get(link)
        # Wait for product name or other critical element
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'sc-d6741239-1'))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        product_data = {'category': category_name}
        product_data.update(_parse_product_page(soup))
        product_data['url'] = link
    except Exception as e:
        print(f"Error scraping {link} under '{category_name}': {e}")
        return None
    finally:
        # Always release the browser, on success and on failure alike.
        driver.quit()

    print(f"[{category_name}] Scraped: {product_data['name'] or 'Unnamed product'}")
    return product_data

# ==============================
# STEP 7: Category Page Scraping
# - Now fetch ALL products by scrolling
# ==============================
def scrape_category_all(category_url, category_name, *,
                        max_scrolls=60, max_stalls=3, scroll_pause=3):
    """Scroll a category page until it stops growing and return its product links.

    Args:
        category_url: URL of the category listing page.
        category_name: Category label, used only in log output.
        max_scrolls: Upper bound on scroll cycles.
        max_stalls: Number of consecutive cycles without new products after
            which scrolling stops early.
        scroll_pause: Seconds to wait after each scroll for new items to load.

    Returns:
        A list of unique absolute product URLs in page order. The list is
        empty when the page could not be loaded or scraped (the error is
        printed, not raised).

    Raises:
        Whatever ``create_driver()`` raises if the browser cannot be started.
    """
    driver = create_driver()
    product_links = []
    try:
        driver.get(category_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.sc-84b21786-3.gmQwVN"))
        )
        container = driver.find_element(By.CSS_SELECTOR, "div.sc-84b21786-3.gmQwVN")

        previous_count = 0
        stalls = 0
        for _ in range(max_scrolls):
            products = container.find_elements(By.CSS_SELECTOR, "a.sc-e68f503d-3.jReLWP")
            current_count = len(products)

            # Count consecutive cycles that loaded nothing new; give up after
            # `max_stalls` of them.
            if current_count <= previous_count:
                stalls += 1
                if stalls >= max_stalls:
                    break
            else:
                stalls = 0
            previous_count = current_count

            # Scrolling the last product into view triggers loading of more items.
            if products:
                driver.execute_script("arguments[0].scrollIntoView(true);", products[-1])

            # Wait briefly for new items to load
            time.sleep(scroll_pause)

        # Parse the fully loaded HTML once instead of querying the live DOM per link.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        anchors = soup.select("div.sc-84b21786-3.gmQwVN a.sc-e68f503d-3.jReLWP")

        # dict.fromkeys de-duplicates while keeping first-seen order.
        product_links = list(dict.fromkeys(
            urljoin(base_url, a.get('href')) for a in anchors if a.get('href')
        ))

    except Exception as e:
        print(f"Error scraping category page {category_url}: {e}")
    finally:
        driver.quit()

    print(f"[{category_name}] Found {len(product_links)} total product links.")
    return product_links

# ==============================
# STEP 8: Main Execution
# ==============================
if __name__ == "__main__":
    # Entry point: collect product links per category, scrape each product
    # page in a small thread pool, print a short sample, then store
    # everything in the `products` table.
    category_data = {
        "Rice, Noodles & Cooking Ingredients": "https://www.fairprice.com.sg/category/rice-noodles-cooking-ingredients",
        "Meat & Seafood": "https://www.fairprice.com.sg/category/meat-seafood",
        "Fruits & Vegetables": "https://www.fairprice.com.sg/category/fruits-vegetables"
    }

    all_scraped_products = []

    print("Starting FULL scraping - loading all items per category...")

    for category_name, cat_url in category_data.items():
        # Instead of a small limit, we call scrape_category_all to get everything
        product_links = scrape_category_all(cat_url, category_name)

        # You can adjust max_workers if you want more parallelism
        with ThreadPoolExecutor(max_workers=5) as executor:
            future_to_link = {
                executor.submit(scrape_product_detail, link, category_name): link
                for link in product_links
            }
            for future in as_completed(future_to_link):
                # result() re-raises anything the worker raised (e.g. the
                # browser failing to start); one bad product must not abort
                # the run and throw away everything scraped so far.
                try:
                    data = future.result()
                except Exception as e:
                    print(f"Error scraping {future_to_link[future]} under '{category_name}': {e}")
                    continue
                # Keep only complete records: a failed scrape yields nothing
                # usable and would otherwise become a row with a NULL url.
                if data and data.get('url'):
                    all_scraped_products.append(data)

    # Print the scraped data (just a summary)
    summary_fields = (
        ("Category", 'category'),
        ("Name", 'name'),
        ("Price", 'price'),
        ("Size", 'size'),
        ("Ratings", 'ratings'),
        ("Brand", 'brand'),
        ("Origin", 'origin'),
        ("Key Information", 'key_information'),
        ("Add. Information", 'additional_information'),
        ("Dietary", 'dietary'),
        ("Ingredients", 'ingredients'),
        ("Nutritional Data", 'nutritional_data'),
        ("Product URL", 'url'),
    )
    print("\n=== SAMPLE SCRAPED DATA (Full Version) ===")
    print(f"Total items scraped across all categories: {len(all_scraped_products)}")
    for product in all_scraped_products[:10]:  # just show the first 10 for brevity
        print("-------------------------------------------------")
        for label, key in summary_fields:
            # .get() so a missing field prints as None instead of raising KeyError.
            print(f"{label:<17}: {product.get(key)}")

    # Insert scraped data into SQLite; column order must match the INSERT below.
    columns = (
        'name', 'price', 'size', 'ratings', 'brand', 'origin',
        'key_information', 'additional_information', 'dietary',
        'nutritional_data', 'ingredients', 'url', 'category',
    )
    bulk_data = [tuple(p.get(col) for col in columns) for p in all_scraped_products]
    try:
        # OR IGNORE skips products whose (UNIQUE) url is already stored.
        cursor.executemany('''
            INSERT OR IGNORE INTO products
                (name, price, size, ratings, brand, origin, key_information,
                 additional_information, dietary, nutritional_data, ingredients, url, category)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', bulk_data)
        conn.commit()
    finally:
        # Release the database handle even if the insert fails.
        conn.close()

    print("\nFull scraping completed. Check your console output and database for verification!")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
chromium-chromedriver is already the newest version (1:85.0.4183.83-0ubuntu2.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.
Starting FULL scraping - loading all items per category...
[Rice, Noodles & Cooking Ingredients] Found 999 total product links.
[Rice, Noodles & Cooking Ingredients] Scraped: Tai Sun Rice Vermicelli
[Rice, Noodles & Cooking Ingredients] Scraped: Golden Chef Paste - Singapore Rendang
[Rice, Noodles & Cooking Ingredients] Scraped: Prima Flour Packet Flour - Plain
[Rice, Noodles & Cooking Ingredients] Scraped: Golden Chef Paste - Singapore Hainanese Chicken Rice
[Rice, Noodles & Cooking Ingredients] Scraped: Royal Umbrella Thai Hom Mali Rice
[Rice, Noodles & C