In [6]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
# Search phrase sent to Flipkart, the label written into the "category"
# column, the number of rows to collect, and the output file.
SEARCH_QUERY = "Mens Footwear"
CATEGORY = "Footwear"
TARGET_COUNT = 100
CSV_FILE = "mensfootwear_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV so later rows can be appended with
# header=False. An existing file is left untouched.
if not os.path.exists(CSV_FILE):
    pd.DataFrame(
        columns=[
            "product_name",
            "product_price",
            "overall_rating",
            "product_url",
            "category",
        ]
    ).to_csv(
        CSV_FILE,
        encoding="utf-8-sig",
        index=False,
    )


def get_csv_count(path=None):
    """Return the number of data rows (header excluded) in a CSV file.

    Args:
        path: CSV file to count. When omitted, the module-level ``CSV_FILE``
            is used (looked up at call time), so zero-argument calls behave
            exactly as before.

    Returns:
        int: Number of rows ``pandas.read_csv`` parses from the file.
    """
    target = CSV_FILE if path is None else path
    return len(pd.read_csv(target))


# ===============================
# SELENIUM SETUP
# ===============================
# Local stdlib import so this block stays self-contained.
from urllib.parse import quote_plus

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout) reused by the rest of the script.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN SEARCH PAGE
# ===============================
# quote_plus turns spaces into '+' (same result as before for plain words)
# and also escapes characters such as '&' or '#' that would otherwise
# corrupt the query string.
search_url = f"https://www.flipkart.com/search?q={quote_plus(SEARCH_QUERY)}"
try:
    driver.get(search_url)
except Exception:
    # Do not leave an orphaned browser behind if the first navigation fails.
    driver.quit()
    raise

# Close login popup (best effort). The button label is the '✕' glyph.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'✕')]"))
    ).click()
except Exception:
    # The popup is optional: no button within the timeout just means there is
    # nothing to dismiss. `Exception` (not a bare except) keeps Ctrl-C working.
    pass


# ===============================
# MAIN SCRAPING LOGIC
# ===============================
def _text_or_none(locate):
    """Return ``locate().text``, or None when the element lookup fails."""
    try:
        return locate().text
    except Exception:  # not a bare except, so Ctrl-C still stops the run
        return None


# Resume support: the CSV is appended to across runs, so start from the URLs
# and the row count it already holds instead of re-saving the same products.
# NOTE(review): this is an exact-string match; if Flipkart varies tracking
# parameters between sessions, some repeats may still get through.
existing_rows = pd.read_csv(CSV_FILE)
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()

# Local counter: avoids re-reading the whole CSV after every saved row.
saved_count = len(existing_rows)
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print("\n🎯 Target reached. Stopping.")
            break

        print(f"\n🔄 Page {page} | CSV count: {saved_count}")

        # Collect the not-yet-seen product links on the current results page.
        product_elements = driver.find_elements(By.XPATH, "//a[contains(@href,'/p/')]")

        links = []
        for el in product_elements:
            try:
                href = el.get_attribute("href")
            except Exception:
                # The anchor may have gone stale while the page re-rendered.
                continue
            if href and href not in seen_links and href not in links:
                links.append(href)

        for link in links:

            # Stop mid-page once the target is reached, so the CSV does not
            # overshoot TARGET_COUNT.
            if saved_count >= TARGET_COUNT:
                break
            seen_links.add(link)

            # ===============================
            # OPEN PRODUCT PAGE
            # ===============================
            driver.execute_script("window.open(arguments[0]);", link)
            driver.switch_to.window(driver.window_handles[1])

            # -------- PRODUCT DETAILS (EXACT CLASSES) --------
            try:
                # Only the name lookup waits; price and rating are read
                # immediately once the page has rendered.
                name = _text_or_none(lambda: wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, "LMizgS"))
                ))
                price = _text_or_none(lambda: driver.find_element(
                    By.XPATH, "//div[contains(@class,'hZ3P6w')]"
                ))
                rating = _text_or_none(lambda: driver.find_element(
                    By.XPATH, "//div[contains(@class,'MKiFS6')]"
                ))
            finally:
                # Always close the product tab and return to the results tab,
                # even if something unexpected was raised above.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            # ===============================
            # SAVE REAL-TIME CSV
            # ===============================
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"✅ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            driver.execute_script("arguments[0].click();", next_btn)
            page += 1
        except Exception:
            print("❌ No more pages")
            break

        # Wait until the previous page's first product anchor is detached, so
        # the next iteration reads the new results rather than the old ones.
        if product_elements:
            try:
                wait.until(EC.staleness_of(product_elements[0]))
            except Exception:
                # Continue anyway: already-seen links are filtered out above.
                print("Results did not refresh within the timeout; continuing.")
finally:
    # ===============================
    # CLEAN EXIT
    # ===============================
    # Runs on success, error, or Ctrl-C so the browser is never left open.
    driver.quit()

print("\n🎉 DONE")
print("Final rows:", get_csv_count())
print("File saved:", CSV_FILE)


🔄 Page 1 | CSV count: 51
✅ Saved: 52 / 100
✅ Saved: 53 / 100
✅ Saved: 54 / 100
✅ Saved: 55 / 100
✅ Saved: 56 / 100
✅ Saved: 57 / 100
✅ Saved: 58 / 100
✅ Saved: 59 / 100
✅ Saved: 60 / 100
✅ Saved: 61 / 100
✅ Saved: 62 / 100
✅ Saved: 63 / 100
✅ Saved: 64 / 100
✅ Saved: 65 / 100
✅ Saved: 66 / 100
✅ Saved: 67 / 100
✅ Saved: 68 / 100
✅ Saved: 69 / 100
✅ Saved: 70 / 100
✅ Saved: 71 / 100
✅ Saved: 72 / 100
✅ Saved: 73 / 100
✅ Saved: 74 / 100
✅ Saved: 75 / 100
✅ Saved: 76 / 100
✅ Saved: 77 / 100
✅ Saved: 78 / 100
✅ Saved: 79 / 100
✅ Saved: 80 / 100
✅ Saved: 81 / 100
✅ Saved: 82 / 100
✅ Saved: 83 / 100
✅ Saved: 84 / 100
✅ Saved: 85 / 100
✅ Saved: 86 / 100
✅ Saved: 87 / 100
✅ Saved: 88 / 100
✅ Saved: 89 / 100
✅ Saved: 90 / 100
✅ Saved: 91 / 100
✅ Saved: 92 / 100
✅ Saved: 93 / 100
✅ Saved: 94 / 100
✅ Saved: 95 / 100
✅ Saved: 96 / 100

🔄 Page 2 | CSV count: 96
✅ Saved: 97 / 100
✅ Saved: 98 / 100
‚ú

In [1]:
import os
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# ===============================
# CONFIG
# ===============================
# Search phrase sent to Flipkart, the label written into the "category"
# column, the number of rows to collect, and the output file.
SEARCH_QUERY = "Womens Footwear"
CATEGORY = "Footwear"
TARGET_COUNT = 100
CSV_FILE = "womensfootwear_100_products.csv"


# ===============================
# CSV INIT
# ===============================
# First run only: write a header-only CSV so later rows can be appended with
# header=False. An existing file is left untouched.
if not os.path.exists(CSV_FILE):
    pd.DataFrame(
        columns=[
            "product_name",
            "product_price",
            "overall_rating",
            "product_url",
            "category",
        ]
    ).to_csv(
        CSV_FILE,
        encoding="utf-8-sig",
        index=False,
    )


def get_csv_count(path=None):
    """Return the number of data rows (header excluded) in a CSV file.

    Args:
        path: CSV file to count. When omitted, the module-level ``CSV_FILE``
            is used (looked up at call time), so zero-argument calls behave
            exactly as before.

    Returns:
        int: Number of rows ``pandas.read_csv`` parses from the file.
    """
    target = CSV_FILE if path is None else path
    return len(pd.read_csv(target))


# ===============================
# SELENIUM SETUP
# ===============================
# Local stdlib import so this block stays self-contained.
from urllib.parse import quote_plus

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Shared explicit wait (20 s timeout) reused by the rest of the script.
wait = WebDriverWait(driver, 20)


# ===============================
# OPEN SEARCH PAGE
# ===============================
# quote_plus turns spaces into '+' (same result as before for plain words)
# and also escapes characters such as '&' or '#' that would otherwise
# corrupt the query string.
search_url = f"https://www.flipkart.com/search?q={quote_plus(SEARCH_QUERY)}"
try:
    driver.get(search_url)
except Exception:
    # Do not leave an orphaned browser behind if the first navigation fails.
    driver.quit()
    raise

# Close login popup (best effort). The button label is the '✕' glyph.
try:
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'✕')]"))
    ).click()
except Exception:
    # The popup is optional: no button within the timeout just means there is
    # nothing to dismiss. `Exception` (not a bare except) keeps Ctrl-C working.
    pass


# ===============================
# MAIN SCRAPING LOGIC
# ===============================
def _text_or_none(locate):
    """Return ``locate().text``, or None when the element lookup fails."""
    try:
        return locate().text
    except Exception:  # not a bare except, so Ctrl-C still stops the run
        return None


# Resume support: the CSV is appended to across runs, so start from the URLs
# and the row count it already holds instead of re-saving the same products.
# NOTE(review): this is an exact-string match; if Flipkart varies tracking
# parameters between sessions, some repeats may still get through.
existing_rows = pd.read_csv(CSV_FILE)
if "product_url" in existing_rows.columns:
    seen_links = set(existing_rows["product_url"].dropna())
else:
    seen_links = set()

# Local counter: avoids re-reading the whole CSV after every saved row.
saved_count = len(existing_rows)
page = 1

try:
    while True:

        if saved_count >= TARGET_COUNT:
            print("\n🎯 Target reached. Stopping.")
            break

        print(f"\n🔄 Page {page} | CSV count: {saved_count}")

        # Collect the not-yet-seen product links on the current results page.
        product_elements = driver.find_elements(By.XPATH, "//a[contains(@href,'/p/')]")

        links = []
        for el in product_elements:
            try:
                href = el.get_attribute("href")
            except Exception:
                # The anchor may have gone stale while the page re-rendered.
                continue
            if href and href not in seen_links and href not in links:
                links.append(href)

        for link in links:

            # Stop mid-page once the target is reached, so the CSV does not
            # overshoot TARGET_COUNT.
            if saved_count >= TARGET_COUNT:
                break
            seen_links.add(link)

            # ===============================
            # OPEN PRODUCT PAGE
            # ===============================
            driver.execute_script("window.open(arguments[0]);", link)
            driver.switch_to.window(driver.window_handles[1])

            # -------- PRODUCT DETAILS (EXACT CLASSES) --------
            try:
                # Only the name lookup waits; price and rating are read
                # immediately once the page has rendered.
                name = _text_or_none(lambda: wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, "LMizgS"))
                ))
                price = _text_or_none(lambda: driver.find_element(
                    By.XPATH, "//div[contains(@class,'hZ3P6w')]"
                ))
                rating = _text_or_none(lambda: driver.find_element(
                    By.XPATH, "//div[contains(@class,'MKiFS6')]"
                ))
            finally:
                # Always close the product tab and return to the results tab,
                # even if something unexpected was raised above.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            # ===============================
            # SAVE REAL-TIME CSV
            # ===============================
            pd.DataFrame([{
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            }]).to_csv(
                CSV_FILE,
                mode="a",
                header=False,
                index=False,
                encoding="utf-8-sig"
            )
            saved_count += 1

            print(f"✅ Saved: {saved_count} / {TARGET_COUNT}")

        if saved_count >= TARGET_COUNT:
            break

        # Next page
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a/span[text()='Next']"))
            )
            driver.execute_script("arguments[0].click();", next_btn)
            page += 1
        except Exception:
            print("❌ No more pages")
            break

        # Wait until the previous page's first product anchor is detached, so
        # the next iteration reads the new results rather than the old ones.
        if product_elements:
            try:
                wait.until(EC.staleness_of(product_elements[0]))
            except Exception:
                # Continue anyway: already-seen links are filtered out above.
                print("Results did not refresh within the timeout; continuing.")
finally:
    # ===============================
    # CLEAN EXIT
    # ===============================
    # Runs on success, error, or Ctrl-C so the browser is never left open.
    driver.quit()

print("\n🎉 DONE")
print("Final rows:", get_csv_count())
print("File saved:", CSV_FILE)


🔄 Page 1 | CSV count: 51
✅ Saved: 52 / 100
✅ Saved: 53 / 100
✅ Saved: 54 / 100
✅ Saved: 55 / 100
✅ Saved: 56 / 100
✅ Saved: 57 / 100
✅ Saved: 58 / 100
✅ Saved: 59 / 100
✅ Saved: 60 / 100
✅ Saved: 61 / 100
✅ Saved: 62 / 100
✅ Saved: 63 / 100
✅ Saved: 64 / 100
✅ Saved: 65 / 100
✅ Saved: 66 / 100
✅ Saved: 67 / 100
✅ Saved: 68 / 100
✅ Saved: 69 / 100
✅ Saved: 70 / 100
✅ Saved: 71 / 100
✅ Saved: 72 / 100
✅ Saved: 73 / 100
✅ Saved: 74 / 100
✅ Saved: 75 / 100
✅ Saved: 76 / 100
✅ Saved: 77 / 100
✅ Saved: 78 / 100
✅ Saved: 79 / 100
✅ Saved: 80 / 100
✅ Saved: 81 / 100
✅ Saved: 82 / 100
✅ Saved: 83 / 100
✅ Saved: 84 / 100
✅ Saved: 85 / 100
✅ Saved: 86 / 100
✅ Saved: 87 / 100
✅ Saved: 88 / 100
✅ Saved: 89 / 100
✅ Saved: 90 / 100
✅ Saved: 91 / 100
✅ Saved: 92 / 100
✅ Saved: 93 / 100
✅ Saved: 94 / 100
✅ Saved: 95 / 100
✅ Saved: 96 / 100

🔄 Page 2 | CSV count: 96
✅ Saved: 97 / 100
✅ Saved: 98 / 100
‚ú