In [1]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Fragrance"
CATEGORY = "Beauty and Care"
TARGET_COUNT = 5
CSV_FILE = "fragrance_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV. An existing file is left untouched,
# so rows saved by an earlier run are kept and count towards TARGET_COUNT.
if not os.path.exists(CSV_FILE):
    empty_frame = pd.DataFrame(
        columns=["product_name", "product_price", "overall_rating", "product_url", "category"]
    )
    empty_frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")


def get_csv_count():
    """Return the number of data rows (header excluded) currently in CSV_FILE.

    The file is re-read from disk on every call.
    """
    frame = pd.read_csv(CSV_FILE)
    return frame.shape[0]


# ===============================
# SELENIUM SETUP
# ===============================
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

# The value returned by ChromeDriverManager().install() is handed to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout), reused for the popup and pagination.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN FLIPKART
# ===============================
url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(url)
time.sleep(5)  # fixed pause before interacting with the page

# Close login popup (best effort).
# The close glyph is written as the \u2715 escape ("✕") so it cannot be
# corrupted by a file re-encoding; a mis-encoded literal never matches the
# button and only burns the full wait timeout.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'\u2715')]"))
    ).click()
except Exception:
    # The button was not found/clickable within the timeout: carry on.
    # `except Exception` (not a bare `except:`) lets Ctrl+C / kernel
    # interrupts propagate instead of being silently swallowed.
    pass


# ===============================
# MAIN SCRAPING LOOP
# ===============================
# Resume support: rows already in the CSV count towards TARGET_COUNT, so their
# URLs are loaded up front; otherwise a re-run appends the same products again.
# NOTE(review): this de-duplicates on exact URL equality only.
existing_rows = pd.read_csv(CSV_FILE)
saved_count = len(existing_rows)  # kept in memory instead of re-reading the CSV per card
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print(f"\nüéØ Target reached ({saved_count}). Stopping.")
            break

        print(f"\nüîÑ Page {page} | CSV count: {saved_count}")

        # Scroll down in steps, pausing after each one
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(2)

        # Card container: every div that carries a data-id attribute
        cards = driver.find_elements(By.XPATH, "//div[@data-id]")
        print("Cards detected:", len(cards))

        for card in cards:

            if saved_count >= TARGET_COUNT:
                break

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Any lookup failure on this card: skip it.
                continue

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Class-name lookups are scoped to the card. A card without a name
            # element is skipped; a missing price or rating is stored as None.
            try:
                name = card.find_element(By.CLASS_NAME, "pIpigb").text
            except Exception:
                continue

            try:
                price = card.find_element(By.CLASS_NAME, "hZ3P6w").text
            except Exception:
                price = None

            try:
                rating = card.find_element(By.CLASS_NAME, "MKiFS6").text
            except Exception:
                rating = None

            # Append immediately so an interrupted run keeps what it scraped.
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"‚úÖ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            # Clicked through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
            page += 1
        except Exception:
            print("‚ùå No more pages")
            break
finally:
    # ===============================
    # EXIT
    # ===============================
    # Runs on success, error and interrupt alike, so the browser never leaks.
    driver.quit()

print("\nüéâ DONE")
print("Final rows:", get_csv_count())
print("File:", CSV_FILE)



🎯 Target reached (100). Stopping.

🎉 DONE
Final rows: 100
File: fragrance_100_products.csv


In [9]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Bath and Body"
CATEGORY = "Beauty and Care"
TARGET_COUNT = 100
CSV_FILE = "bathandbody_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV. An existing file is left untouched,
# so rows saved by an earlier run are kept and count towards TARGET_COUNT.
if not os.path.exists(CSV_FILE):
    empty_frame = pd.DataFrame(
        columns=["product_name", "product_price", "overall_rating", "product_url", "category"]
    )
    empty_frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")


def get_csv_count():
    """Return the number of data rows (header excluded) currently in CSV_FILE.

    The file is re-read from disk on every call.
    """
    frame = pd.read_csv(CSV_FILE)
    return frame.shape[0]


# ===============================
# SELENIUM SETUP
# ===============================
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

# The value returned by ChromeDriverManager().install() is handed to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout), reused for the popup and pagination.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN FLIPKART
# ===============================
url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(url)
time.sleep(5)  # fixed pause before interacting with the page

# Close login popup (best effort).
# The close glyph is written as the \u2715 escape ("✕") so it cannot be
# corrupted by a file re-encoding; a mis-encoded literal never matches the
# button and only burns the full wait timeout.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'\u2715')]"))
    ).click()
except Exception:
    # The button was not found/clickable within the timeout: carry on.
    # `except Exception` (not a bare `except:`) lets Ctrl+C / kernel
    # interrupts propagate instead of being silently swallowed.
    pass


# ===============================
# MAIN SCRAPING LOOP
# ===============================
# Resume support: rows already in the CSV count towards TARGET_COUNT, so their
# URLs are loaded up front; otherwise a re-run appends the same products again.
# NOTE(review): this de-duplicates on exact URL equality only.
existing_rows = pd.read_csv(CSV_FILE)
saved_count = len(existing_rows)  # kept in memory instead of re-reading the CSV per card
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print(f"\nüéØ Target reached ({saved_count}). Stopping.")
            break

        print(f"\nüîÑ Page {page} | CSV count: {saved_count}")

        # Scroll down in steps, pausing after each one
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(2)

        # Card container: every div that carries a data-id attribute
        cards = driver.find_elements(By.XPATH, "//div[@data-id]")
        print("Cards detected:", len(cards))

        for card in cards:

            if saved_count >= TARGET_COUNT:
                break

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Any lookup failure on this card: skip it.
                continue

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Class-name lookups are scoped to the card. A card without a name
            # element is skipped; a missing price or rating is stored as None.
            try:
                name = card.find_element(By.CLASS_NAME, "pIpigb").text
            except Exception:
                continue

            try:
                price = card.find_element(By.CLASS_NAME, "hZ3P6w").text
            except Exception:
                price = None

            try:
                rating = card.find_element(By.CLASS_NAME, "MKiFS6").text
            except Exception:
                rating = None

            # Append immediately so an interrupted run keeps what it scraped.
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"‚úÖ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            # Clicked through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
            page += 1
        except Exception:
            print("‚ùå No more pages")
            break
finally:
    # ===============================
    # EXIT
    # ===============================
    # Runs on success, error and interrupt alike, so the browser never leaks.
    driver.quit()

print("\nüéâ DONE")
print("Final rows:", get_csv_count())
print("File:", CSV_FILE)



üîÑ Page 1 | CSV count: 5
Cards detected: 40
‚úÖ Saved: 6 / 100
‚úÖ Saved: 7 / 100
‚úÖ Saved: 8 / 100
‚úÖ Saved: 9 / 100
‚úÖ Saved: 10 / 100
‚úÖ Saved: 11 / 100
‚úÖ Saved: 12 / 100
‚úÖ Saved: 13 / 100
‚úÖ Saved: 14 / 100
‚úÖ Saved: 15 / 100
‚úÖ Saved: 16 / 100
‚úÖ Saved: 17 / 100
‚úÖ Saved: 18 / 100
‚úÖ Saved: 19 / 100
‚úÖ Saved: 20 / 100
‚úÖ Saved: 21 / 100
‚úÖ Saved: 22 / 100
‚úÖ Saved: 23 / 100
‚úÖ Saved: 24 / 100
‚úÖ Saved: 25 / 100
‚úÖ Saved: 26 / 100
‚úÖ Saved: 27 / 100
‚úÖ Saved: 28 / 100
‚úÖ Saved: 29 / 100
‚úÖ Saved: 30 / 100
‚úÖ Saved: 31 / 100
‚úÖ Saved: 32 / 100
‚úÖ Saved: 33 / 100
‚úÖ Saved: 34 / 100
‚úÖ Saved: 35 / 100
‚úÖ Saved: 36 / 100
‚úÖ Saved: 37 / 100
‚úÖ Saved: 38 / 100
‚úÖ Saved: 39 / 100
‚úÖ Saved: 40 / 100
‚úÖ Saved: 41 / 100
‚úÖ Saved: 42 / 100
‚úÖ Saved: 43 / 100
‚úÖ Saved: 44 / 100
‚úÖ Saved: 45 / 100

üîÑ Page 2 | CSV count: 45
Cards detected: 40
‚úÖ Saved: 46 / 100
‚úÖ Saved: 47 / 100
‚úÖ Saved: 48 / 100
‚úÖ Saved: 49 / 100
‚úÖ Saved: 50 / 100
‚úÖ Saved

In [11]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Hair Care"
CATEGORY = "Beauty and Care"
TARGET_COUNT = 100
CSV_FILE = "haircare_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV. An existing file is left untouched,
# so rows saved by an earlier run are kept and count towards TARGET_COUNT.
if not os.path.exists(CSV_FILE):
    empty_frame = pd.DataFrame(
        columns=["product_name", "product_price", "overall_rating", "product_url", "category"]
    )
    empty_frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")


def get_csv_count():
    """Return the number of data rows (header excluded) currently in CSV_FILE.

    The file is re-read from disk on every call.
    """
    frame = pd.read_csv(CSV_FILE)
    return frame.shape[0]


# ===============================
# SELENIUM SETUP
# ===============================
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

# The value returned by ChromeDriverManager().install() is handed to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout), reused for the popup and pagination.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN FLIPKART
# ===============================
url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(url)
time.sleep(5)  # fixed pause before interacting with the page

# Close login popup (best effort).
# The close glyph is written as the \u2715 escape ("✕") so it cannot be
# corrupted by a file re-encoding; a mis-encoded literal never matches the
# button and only burns the full wait timeout.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'\u2715')]"))
    ).click()
except Exception:
    # The button was not found/clickable within the timeout: carry on.
    # `except Exception` (not a bare `except:`) lets Ctrl+C / kernel
    # interrupts propagate instead of being silently swallowed.
    pass


# ===============================
# MAIN SCRAPING LOOP
# ===============================
# Resume support: rows already in the CSV count towards TARGET_COUNT, so their
# URLs are loaded up front; otherwise a re-run appends the same products again.
# NOTE(review): this de-duplicates on exact URL equality only.
existing_rows = pd.read_csv(CSV_FILE)
saved_count = len(existing_rows)  # kept in memory instead of re-reading the CSV per card
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print(f"\nüéØ Target reached ({saved_count}). Stopping.")
            break

        print(f"\nüîÑ Page {page} | CSV count: {saved_count}")

        # Scroll down in steps, pausing after each one
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(2)

        # Card container: every div that carries a data-id attribute
        cards = driver.find_elements(By.XPATH, "//div[@data-id]")
        print("Cards detected:", len(cards))

        for card in cards:

            if saved_count >= TARGET_COUNT:
                break

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Any lookup failure on this card: skip it.
                continue

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Class-name lookups are scoped to the card. A card without a name
            # element is skipped; a missing price or rating is stored as None.
            try:
                name = card.find_element(By.CLASS_NAME, "pIpigb").text
            except Exception:
                continue

            try:
                price = card.find_element(By.CLASS_NAME, "hZ3P6w").text
            except Exception:
                price = None

            try:
                rating = card.find_element(By.CLASS_NAME, "MKiFS6").text
            except Exception:
                rating = None

            # Append immediately so an interrupted run keeps what it scraped.
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"‚úÖ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            # Clicked through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
            page += 1
        except Exception:
            print("‚ùå No more pages")
            break
finally:
    # ===============================
    # EXIT
    # ===============================
    # Runs on success, error and interrupt alike, so the browser never leaks.
    driver.quit()

print("\nüéâ DONE")
print("Final rows:", get_csv_count())
print("File:", CSV_FILE)



üîÑ Page 1 | CSV count: 5
Cards detected: 40
‚úÖ Saved: 6 / 100
‚úÖ Saved: 7 / 100
‚úÖ Saved: 8 / 100
‚úÖ Saved: 9 / 100
‚úÖ Saved: 10 / 100
‚úÖ Saved: 11 / 100
‚úÖ Saved: 12 / 100
‚úÖ Saved: 13 / 100
‚úÖ Saved: 14 / 100
‚úÖ Saved: 15 / 100
‚úÖ Saved: 16 / 100
‚úÖ Saved: 17 / 100
‚úÖ Saved: 18 / 100
‚úÖ Saved: 19 / 100
‚úÖ Saved: 20 / 100
‚úÖ Saved: 21 / 100
‚úÖ Saved: 22 / 100
‚úÖ Saved: 23 / 100
‚úÖ Saved: 24 / 100
‚úÖ Saved: 25 / 100
‚úÖ Saved: 26 / 100
‚úÖ Saved: 27 / 100
‚úÖ Saved: 28 / 100
‚úÖ Saved: 29 / 100
‚úÖ Saved: 30 / 100
‚úÖ Saved: 31 / 100
‚úÖ Saved: 32 / 100
‚úÖ Saved: 33 / 100
‚úÖ Saved: 34 / 100
‚úÖ Saved: 35 / 100
‚úÖ Saved: 36 / 100
‚úÖ Saved: 37 / 100
‚úÖ Saved: 38 / 100
‚úÖ Saved: 39 / 100
‚úÖ Saved: 40 / 100
‚úÖ Saved: 41 / 100
‚úÖ Saved: 42 / 100
‚úÖ Saved: 43 / 100
‚úÖ Saved: 44 / 100
‚úÖ Saved: 45 / 100

üîÑ Page 2 | CSV count: 45
Cards detected: 40
‚úÖ Saved: 46 / 100
‚úÖ Saved: 47 / 100
‚úÖ Saved: 48 / 100
‚úÖ Saved: 49 / 100
‚úÖ Saved: 50 / 100
‚úÖ Saved

In [13]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Men Grooming"
CATEGORY = "Beauty and Care"
TARGET_COUNT = 100
CSV_FILE = "mengrooming_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV. An existing file is left untouched,
# so rows saved by an earlier run are kept and count towards TARGET_COUNT.
if not os.path.exists(CSV_FILE):
    empty_frame = pd.DataFrame(
        columns=["product_name", "product_price", "overall_rating", "product_url", "category"]
    )
    empty_frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")


def get_csv_count():
    """Return the number of data rows (header excluded) currently in CSV_FILE.

    The file is re-read from disk on every call.
    """
    frame = pd.read_csv(CSV_FILE)
    return frame.shape[0]


# ===============================
# SELENIUM SETUP
# ===============================
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

# The value returned by ChromeDriverManager().install() is handed to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout), reused for the popup and pagination.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN FLIPKART
# ===============================
url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(url)
time.sleep(5)  # fixed pause before interacting with the page

# Close login popup (best effort).
# The close glyph is written as the \u2715 escape ("✕") so it cannot be
# corrupted by a file re-encoding; a mis-encoded literal never matches the
# button and only burns the full wait timeout.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'\u2715')]"))
    ).click()
except Exception:
    # The button was not found/clickable within the timeout: carry on.
    # `except Exception` (not a bare `except:`) lets Ctrl+C / kernel
    # interrupts propagate instead of being silently swallowed.
    pass


# ===============================
# MAIN SCRAPING LOOP
# ===============================
# Resume support: rows already in the CSV count towards TARGET_COUNT, so their
# URLs are loaded up front; otherwise a re-run appends the same products again.
# NOTE(review): this de-duplicates on exact URL equality only.
existing_rows = pd.read_csv(CSV_FILE)
saved_count = len(existing_rows)  # kept in memory instead of re-reading the CSV per card
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print(f"\nüéØ Target reached ({saved_count}). Stopping.")
            break

        print(f"\nüîÑ Page {page} | CSV count: {saved_count}")

        # Scroll down in steps, pausing after each one
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(2)

        # Card container: every div that carries a data-id attribute
        cards = driver.find_elements(By.XPATH, "//div[@data-id]")
        print("Cards detected:", len(cards))

        for card in cards:

            if saved_count >= TARGET_COUNT:
                break

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Any lookup failure on this card: skip it.
                continue

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Class-name lookups are scoped to the card. A card without a name
            # element is skipped; a missing price or rating is stored as None.
            try:
                name = card.find_element(By.CLASS_NAME, "pIpigb").text
            except Exception:
                continue

            try:
                price = card.find_element(By.CLASS_NAME, "hZ3P6w").text
            except Exception:
                price = None

            try:
                rating = card.find_element(By.CLASS_NAME, "MKiFS6").text
            except Exception:
                rating = None

            # Append immediately so an interrupted run keeps what it scraped.
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"‚úÖ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            # Clicked through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
            page += 1
        except Exception:
            print("‚ùå No more pages")
            break
finally:
    # ===============================
    # EXIT
    # ===============================
    # Runs on success, error and interrupt alike, so the browser never leaks.
    driver.quit()

print("\nüéâ DONE")
print("Final rows:", get_csv_count())
print("File:", CSV_FILE)



üîÑ Page 1 | CSV count: 5
Cards detected: 40
‚úÖ Saved: 6 / 100
‚úÖ Saved: 7 / 100
‚úÖ Saved: 8 / 100
‚úÖ Saved: 9 / 100
‚úÖ Saved: 10 / 100
‚úÖ Saved: 11 / 100
‚úÖ Saved: 12 / 100
‚úÖ Saved: 13 / 100
‚úÖ Saved: 14 / 100
‚úÖ Saved: 15 / 100
‚úÖ Saved: 16 / 100
‚úÖ Saved: 17 / 100
‚úÖ Saved: 18 / 100
‚úÖ Saved: 19 / 100
‚úÖ Saved: 20 / 100
‚úÖ Saved: 21 / 100
‚úÖ Saved: 22 / 100
‚úÖ Saved: 23 / 100
‚úÖ Saved: 24 / 100
‚úÖ Saved: 25 / 100
‚úÖ Saved: 26 / 100
‚úÖ Saved: 27 / 100
‚úÖ Saved: 28 / 100
‚úÖ Saved: 29 / 100
‚úÖ Saved: 30 / 100
‚úÖ Saved: 31 / 100
‚úÖ Saved: 32 / 100
‚úÖ Saved: 33 / 100
‚úÖ Saved: 34 / 100
‚úÖ Saved: 35 / 100
‚úÖ Saved: 36 / 100
‚úÖ Saved: 37 / 100
‚úÖ Saved: 38 / 100
‚úÖ Saved: 39 / 100
‚úÖ Saved: 40 / 100
‚úÖ Saved: 41 / 100
‚úÖ Saved: 42 / 100
‚úÖ Saved: 43 / 100
‚úÖ Saved: 44 / 100
‚úÖ Saved: 45 / 100

üîÑ Page 2 | CSV count: 45
Cards detected: 40
‚úÖ Saved: 46 / 100
‚úÖ Saved: 47 / 100
‚úÖ Saved: 48 / 100
‚úÖ Saved: 49 / 100
‚úÖ Saved: 50 / 100
‚úÖ Saved

In [15]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Women Skincare"
CATEGORY = "Beauty and Care"
TARGET_COUNT = 100
CSV_FILE = "womenskincare_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV. An existing file is left untouched,
# so rows saved by an earlier run are kept and count towards TARGET_COUNT.
if not os.path.exists(CSV_FILE):
    empty_frame = pd.DataFrame(
        columns=["product_name", "product_price", "overall_rating", "product_url", "category"]
    )
    empty_frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")


def get_csv_count():
    """Return the number of data rows (header excluded) currently in CSV_FILE.

    The file is re-read from disk on every call.
    """
    frame = pd.read_csv(CSV_FILE)
    return frame.shape[0]


# ===============================
# SELENIUM SETUP
# ===============================
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

# The value returned by ChromeDriverManager().install() is handed to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout), reused for the popup and pagination.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN FLIPKART
# ===============================
url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(url)
time.sleep(5)  # fixed pause before interacting with the page

# Close login popup (best effort).
# The close glyph is written as the \u2715 escape ("✕") so it cannot be
# corrupted by a file re-encoding; a mis-encoded literal never matches the
# button and only burns the full wait timeout.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'\u2715')]"))
    ).click()
except Exception:
    # The button was not found/clickable within the timeout: carry on.
    # `except Exception` (not a bare `except:`) lets Ctrl+C / kernel
    # interrupts propagate instead of being silently swallowed.
    pass


# ===============================
# MAIN SCRAPING LOOP
# ===============================
# Resume support: rows already in the CSV count towards TARGET_COUNT, so their
# URLs are loaded up front; otherwise a re-run appends the same products again.
# NOTE(review): this de-duplicates on exact URL equality only.
existing_rows = pd.read_csv(CSV_FILE)
saved_count = len(existing_rows)  # kept in memory instead of re-reading the CSV per card
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print(f"\nüéØ Target reached ({saved_count}). Stopping.")
            break

        print(f"\nüîÑ Page {page} | CSV count: {saved_count}")

        # Scroll down in steps, pausing after each one
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(2)

        # Card container: every div that carries a data-id attribute
        cards = driver.find_elements(By.XPATH, "//div[@data-id]")
        print("Cards detected:", len(cards))

        for card in cards:

            if saved_count >= TARGET_COUNT:
                break

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Any lookup failure on this card: skip it.
                continue

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Class-name lookups are scoped to the card. A card without a name
            # element is skipped; a missing price or rating is stored as None.
            try:
                name = card.find_element(By.CLASS_NAME, "pIpigb").text
            except Exception:
                continue

            try:
                price = card.find_element(By.CLASS_NAME, "hZ3P6w").text
            except Exception:
                price = None

            try:
                rating = card.find_element(By.CLASS_NAME, "MKiFS6").text
            except Exception:
                rating = None

            # Append immediately so an interrupted run keeps what it scraped.
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"‚úÖ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            # Clicked through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
            page += 1
        except Exception:
            print("‚ùå No more pages")
            break
finally:
    # ===============================
    # EXIT
    # ===============================
    # Runs on success, error and interrupt alike, so the browser never leaks.
    driver.quit()

print("\nüéâ DONE")
print("Final rows:", get_csv_count())
print("File:", CSV_FILE)



üîÑ Page 1 | CSV count: 5
Cards detected: 40
‚úÖ Saved: 6 / 100
‚úÖ Saved: 7 / 100
‚úÖ Saved: 8 / 100
‚úÖ Saved: 9 / 100
‚úÖ Saved: 10 / 100
‚úÖ Saved: 11 / 100
‚úÖ Saved: 12 / 100
‚úÖ Saved: 13 / 100
‚úÖ Saved: 14 / 100
‚úÖ Saved: 15 / 100
‚úÖ Saved: 16 / 100
‚úÖ Saved: 17 / 100
‚úÖ Saved: 18 / 100
‚úÖ Saved: 19 / 100
‚úÖ Saved: 20 / 100
‚úÖ Saved: 21 / 100
‚úÖ Saved: 22 / 100
‚úÖ Saved: 23 / 100
‚úÖ Saved: 24 / 100
‚úÖ Saved: 25 / 100
‚úÖ Saved: 26 / 100
‚úÖ Saved: 27 / 100
‚úÖ Saved: 28 / 100
‚úÖ Saved: 29 / 100
‚úÖ Saved: 30 / 100
‚úÖ Saved: 31 / 100
‚úÖ Saved: 32 / 100
‚úÖ Saved: 33 / 100
‚úÖ Saved: 34 / 100
‚úÖ Saved: 35 / 100
‚úÖ Saved: 36 / 100
‚úÖ Saved: 37 / 100
‚úÖ Saved: 38 / 100
‚úÖ Saved: 39 / 100
‚úÖ Saved: 40 / 100
‚úÖ Saved: 41 / 100
‚úÖ Saved: 42 / 100
‚úÖ Saved: 43 / 100
‚úÖ Saved: 44 / 100
‚úÖ Saved: 45 / 100

üîÑ Page 2 | CSV count: 45
Cards detected: 40
‚úÖ Saved: 46 / 100
‚úÖ Saved: 47 / 100
‚úÖ Saved: 48 / 100
‚úÖ Saved: 49 / 100
‚úÖ Saved: 50 / 100
‚úÖ Saved

In [17]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Women Makeup"
CATEGORY = "Beauty and Care"
TARGET_COUNT = 100
CSV_FILE = "womenmakeup_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV. An existing file is left untouched,
# so rows saved by an earlier run are kept and count towards TARGET_COUNT.
if not os.path.exists(CSV_FILE):
    empty_frame = pd.DataFrame(
        columns=["product_name", "product_price", "overall_rating", "product_url", "category"]
    )
    empty_frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")


def get_csv_count():
    """Return the number of data rows (header excluded) currently in CSV_FILE.

    The file is re-read from disk on every call.
    """
    frame = pd.read_csv(CSV_FILE)
    return frame.shape[0]


# ===============================
# SELENIUM SETUP
# ===============================
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

# The value returned by ChromeDriverManager().install() is handed to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout), reused for the popup and pagination.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN FLIPKART
# ===============================
url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(url)
time.sleep(5)  # fixed pause before interacting with the page

# Close login popup (best effort).
# The close glyph is written as the \u2715 escape ("✕") so it cannot be
# corrupted by a file re-encoding; a mis-encoded literal never matches the
# button and only burns the full wait timeout.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'\u2715')]"))
    ).click()
except Exception:
    # The button was not found/clickable within the timeout: carry on.
    # `except Exception` (not a bare `except:`) lets Ctrl+C / kernel
    # interrupts propagate instead of being silently swallowed.
    pass


# ===============================
# MAIN SCRAPING LOOP
# ===============================
# Resume support: rows already in the CSV count towards TARGET_COUNT, so their
# URLs are loaded up front; otherwise a re-run appends the same products again.
# NOTE(review): this de-duplicates on exact URL equality only.
existing_rows = pd.read_csv(CSV_FILE)
saved_count = len(existing_rows)  # kept in memory instead of re-reading the CSV per card
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print(f"\nüéØ Target reached ({saved_count}). Stopping.")
            break

        print(f"\nüîÑ Page {page} | CSV count: {saved_count}")

        # Scroll down in steps, pausing after each one
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(2)

        # Card container: every div that carries a data-id attribute
        cards = driver.find_elements(By.XPATH, "//div[@data-id]")
        print("Cards detected:", len(cards))

        for card in cards:

            if saved_count >= TARGET_COUNT:
                break

            try:
                link = card.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Any lookup failure on this card: skip it.
                continue

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Class-name lookups are scoped to the card. A card without a name
            # element is skipped; a missing price or rating is stored as None.
            try:
                name = card.find_element(By.CLASS_NAME, "pIpigb").text
            except Exception:
                continue

            try:
                price = card.find_element(By.CLASS_NAME, "hZ3P6w").text
            except Exception:
                price = None

            try:
                rating = card.find_element(By.CLASS_NAME, "MKiFS6").text
            except Exception:
                rating = None

            # Append immediately so an interrupted run keeps what it scraped.
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"‚úÖ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            # Clicked through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
            page += 1
        except Exception:
            print("‚ùå No more pages")
            break
finally:
    # ===============================
    # EXIT
    # ===============================
    # Runs on success, error and interrupt alike, so the browser never leaks.
    driver.quit()

print("\nüéâ DONE")
print("Final rows:", get_csv_count())
print("File:", CSV_FILE)



üîÑ Page 1 | CSV count: 5
Cards detected: 40
‚úÖ Saved: 6 / 100
‚úÖ Saved: 7 / 100
‚úÖ Saved: 8 / 100
‚úÖ Saved: 9 / 100
‚úÖ Saved: 10 / 100
‚úÖ Saved: 11 / 100
‚úÖ Saved: 12 / 100
‚úÖ Saved: 13 / 100
‚úÖ Saved: 14 / 100
‚úÖ Saved: 15 / 100
‚úÖ Saved: 16 / 100
‚úÖ Saved: 17 / 100
‚úÖ Saved: 18 / 100
‚úÖ Saved: 19 / 100
‚úÖ Saved: 20 / 100
‚úÖ Saved: 21 / 100
‚úÖ Saved: 22 / 100
‚úÖ Saved: 23 / 100
‚úÖ Saved: 24 / 100
‚úÖ Saved: 25 / 100
‚úÖ Saved: 26 / 100
‚úÖ Saved: 27 / 100
‚úÖ Saved: 28 / 100
‚úÖ Saved: 29 / 100
‚úÖ Saved: 30 / 100
‚úÖ Saved: 31 / 100
‚úÖ Saved: 32 / 100
‚úÖ Saved: 33 / 100
‚úÖ Saved: 34 / 100
‚úÖ Saved: 35 / 100
‚úÖ Saved: 36 / 100
‚úÖ Saved: 37 / 100
‚úÖ Saved: 38 / 100
‚úÖ Saved: 39 / 100
‚úÖ Saved: 40 / 100
‚úÖ Saved: 41 / 100
‚úÖ Saved: 42 / 100
‚úÖ Saved: 43 / 100
‚úÖ Saved: 44 / 100
‚úÖ Saved: 45 / 100

üîÑ Page 2 | CSV count: 45
Cards detected: 40
‚úÖ Saved: 46 / 100
‚úÖ Saved: 47 / 100
‚úÖ Saved: 48 / 100
‚úÖ Saved: 49 / 100
‚úÖ Saved: 50 / 100
‚úÖ Saved