In [5]:
import re
import time
import pandas as pd
import nltk

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from nltk.sentiment import SentimentIntensityAnalyzer

# ===============================
# NLTK SETUP
# ===============================
# Make sure the VADER lexicon is installed before building the analyzer.
# The lookup names the zip archive: checking for the bare
# "sentiment/vader_lexicon" name can raise LookupError even when the package
# is already installed, which re-triggers the download on every run.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Shared analyzer instance used by get_sentiment().
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Label *text* as "Positive", "Negative" or "Neutral".

    Empty/None input and strings shorter than 10 characters are reported as
    "Neutral" without being scored. Otherwise the VADER compound score from
    the module-level ``sia`` analyzer decides: >= 0.05 is positive,
    <= -0.05 is negative, anything in between is neutral.
    """
    has_enough_text = bool(text) and len(text) >= 10
    if not has_enough_text:
        return "Neutral"

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# ===============================
# REVIEW PARSER
# ===============================
def parse_review_block(raw_text):
    """Extract rating statistics and review text from a block of page text.

    Args:
        raw_text: Newline-separated text of a ratings/reviews section.
            May be empty or None.

    Returns:
        dict with the keys ``overall_rating`` (float or None),
        ``total_ratings`` / ``total_reviews`` (int or None),
        ``rating_5`` .. ``rating_1`` (int, 0 when not found) and
        ``review_text`` / ``review_summary`` (str; both stay
        "No review available" when no line longer than 20 characters exists).
    """
    data = {
        "overall_rating": None,
        "total_ratings": None,
        "total_reviews": None,
        "rating_5": 0,
        "rating_4": 0,
        "rating_3": 0,
        "rating_2": 0,
        "rating_1": 0,
        "review_text": "No review available",
        "review_summary": "No review available"
    }

    if not raw_text:
        return data

    # The star glyph is spelled as an escape (U+2605 BLACK STAR) so the
    # patterns keep working even if this file is re-saved or re-read with the
    # wrong encoding.
    m = re.search(r"(\d\.\d)\u2605", raw_text)
    if m:
        data["overall_rating"] = float(m.group(1))

    m = re.search(r"([\d,]+)\s*Ratings\s*&\s*([\d,]+)\s*Reviews", raw_text)
    if m:
        data["total_ratings"] = int(m.group(1).replace(",", ""))
        data["total_reviews"] = int(m.group(2).replace(",", ""))

    def star_count(star):
        # The lookbehind stops the fractional digit of an overall rating such
        # as "4.5<star>" from being mistaken for the "5<star>" histogram row.
        m = re.search(rf"(?<![\d.]){star}\u2605\s*([\d,]+)", raw_text)
        return int(m.group(1).replace(",", "")) if m else 0

    data["rating_5"] = star_count(5)
    data["rating_4"] = star_count(4)
    data["rating_3"] = star_count(3)
    data["rating_2"] = star_count(2)
    data["rating_1"] = star_count(1)

    # Measure the length after stripping so whitespace-only lines cannot
    # produce an empty "review".
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    lines = [line for line in stripped_lines if len(line) > 20]
    if lines:
        data["review_text"] = " ".join(lines)
        data["review_summary"] = data["review_text"][:120]

    return data

# ===============================
# SENTIMENT LOGIC
# ===============================
def sentiment_from_rating(rating):
    """Map a star rating to a sentiment label.

    Args:
        rating: Anything ``float()`` accepts (a number or numeric string).
            None, non-numeric values and NaN count as "no rating".

    Returns:
        "Positive" for ratings >= 4, "Neutral" for ratings in [3, 4) or when
        no usable rating is given, "Negative" for anything lower.
    """
    try:
        value = float(rating)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no rating"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return "Neutral"

    # NaN compares unequal to itself; without this guard it would fall
    # through every comparison and be reported as "Negative".
    if value != value:
        return "Neutral"

    if value >= 4:
        return "Positive"
    if value >= 3:
        return "Neutral"
    return "Negative"

def hybrid_sentiment(text, rating, r5, r4, r1, r2):
    """Combine text, overall-rating and rating-distribution signals.

    The three signals are consulted in that order and the first one that is
    not "Neutral" is returned; "Neutral" is returned when all of them are.
    The distribution signal compares ``r5 + r4`` against ``r1 + r2``.
    """
    from_text = get_sentiment(text)
    from_rating = sentiment_from_rating(rating)

    favourable = r5 + r4
    unfavourable = r1 + r2
    if favourable > unfavourable:
        from_distribution = "Positive"
    elif unfavourable > favourable:
        from_distribution = "Negative"
    else:
        from_distribution = "Neutral"

    candidates = (from_text, from_rating, from_distribution)
    return next((label for label in candidates if label != "Neutral"), "Neutral")

# ===============================
# SELENIUM SETUP
# ===============================
# Imported here so this cell carries the extra names it now relies on.
from urllib.parse import quote_plus

from selenium.common.exceptions import WebDriverException

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

wait = WebDriverWait(driver, 15)

# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Air Conditioners"
TARGET_COUNT = 100
CATEGORY = "Electronics"
# Single source of truth for the output path, so the final message cannot
# drift from the file that is actually written.
OUTPUT_FILE = "AirConditionerselectronics_dataset_flipkart.csv"

# The query is URL-encoded because it may contain spaces or reserved characters.
base_url = f"https://www.flipkart.com/search?q={quote_plus(SEARCH_QUERY)}"

results = []
seen_urls = set()
page = 1
saved_count = 0

# ===============================
# MAIN LOOP
# ===============================
try:
    driver.get(base_url)
    time.sleep(5)

    # Close login popup (best effort: it is not always shown). The close
    # glyph is U+2715, written as an escape to stay encoding-safe.
    try:
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(),'\u2715')]")
            )
        ).click()
    except WebDriverException:
        pass

    while saved_count < TARGET_COUNT:

        print(f"\n\U0001F504 Page {page}")

        # Scroll in steps so lazily loaded product cards get rendered.
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1500);")
            time.sleep(2)

        products = driver.find_elements(
            By.XPATH,
            "//a[contains(@href,'/p/')]"
        )

        print("Found products:", len(products))

        new_links = 0  # links on this page that were not seen before

        for a in products:
            if saved_count >= TARGET_COUNT:
                break

            try:
                link = a.get_attribute("href")
            except WebDriverException:
                continue

            if not link or link in seen_urls:
                continue
            seen_urls.add(link)
            new_links += 1

            try:
                name = a.text.strip()
            except WebDriverException:
                continue
            if not name:
                continue

            # Open the product page in a new tab; the finally clause makes
            # sure the tab is closed and focus returns to the search results
            # even if scraping the page fails unexpectedly.
            search_tab = driver.current_window_handle
            driver.execute_script("window.open(arguments[0]);", link)
            driver.switch_to.window(driver.window_handles[-1])
            try:
                time.sleep(4)

                # U+20B9 is the rupee sign.
                try:
                    price = driver.find_element(
                        By.XPATH, "//div[contains(text(),'\u20b9')]"
                    ).text
                except WebDriverException:
                    price = None

                try:
                    rating = driver.find_element(
                        By.XPATH, "//div[contains(@class,'MKiFS6')]"
                    ).text
                except WebDriverException:
                    rating = None

                raw_review = ""
                try:
                    blocks = driver.find_elements(
                        By.XPATH, "//div[contains(@class,'xgU6qg')]"
                    )
                    raw_review = "\n".join(b.text for b in blocks[:5])
                except WebDriverException:
                    pass

                parsed = parse_review_block(raw_review)

                if parsed["review_text"] == "No review available":
                    parsed["review_text"] = name

                # NOTE(review): the sentiment label is computed but never
                # written to the CSV; left as-is to keep the output schema
                # unchanged. Confirm whether it should be a column.
                sentiment = hybrid_sentiment(
                    parsed["review_text"],
                    parsed["overall_rating"],
                    parsed["rating_5"],
                    parsed["rating_4"],
                    parsed["rating_1"],
                    parsed["rating_2"]
                )
            finally:
                driver.close()
                driver.switch_to.window(search_tab)

            saved_count += 1

            results.append({
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            })

            print(f"\u2705 Saved: {saved_count}")

        if saved_count >= TARGET_COUNT:
            break

        # A page without any unseen product link means the results are
        # exhausted; without this check the loop would never terminate.
        if new_links == 0:
            print("No new products found; stopping early.")
            break

        page += 1
        driver.get(f"{base_url}&page={page}")
        time.sleep(5)
finally:
    # ===============================
    # SAVE CSV
    # ===============================
    # Persist whatever was collected and always release the browser, even
    # when scraping aborts part-way through.
    try:
        pd.DataFrame(results).to_csv(
            OUTPUT_FILE,
            index=False,
            encoding="utf-8-sig"
        )
    finally:
        driver.quit()

print("\n\U0001F389 DONE")
print(f"Clean file saved: {OUTPUT_FILE}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SURABHI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



ðŸ”„ Page 1
Found products: 29
âœ… Saved: 1
âœ… Saved: 2
âœ… Saved: 3
âœ… Saved: 4
âœ… Saved: 5
âœ… Saved: 6
âœ… Saved: 7
âœ… Saved: 8
âœ… Saved: 9
âœ… Saved: 10
âœ… Saved: 11
âœ… Saved: 12
âœ… Saved: 13
âœ… Saved: 14
âœ… Saved: 15
âœ… Saved: 16
âœ… Saved: 17
âœ… Saved: 18
âœ… Saved: 19
âœ… Saved: 20
âœ… Saved: 21
âœ… Saved: 22
âœ… Saved: 23
âœ… Saved: 24
âœ… Saved: 25
âœ… Saved: 26
âœ… Saved: 27
âœ… Saved: 28
âœ… Saved: 29

ðŸ”„ Page 2
Found products: 29
âœ… Saved: 30
âœ… Saved: 31
âœ… Saved: 32
âœ… Saved: 33
âœ… Saved: 34
âœ… Saved: 35
âœ… Saved: 36
âœ… Saved: 37
âœ… Saved: 38
âœ… Saved: 39
âœ… Saved: 40
âœ… Saved: 41
âœ… Saved: 42
âœ… Saved: 43
âœ… Saved: 44
âœ… Saved: 45
âœ… Saved: 46
âœ… Saved: 47
âœ… Saved: 48
âœ… Saved: 49
âœ… Saved: 50
âœ… Saved: 51
âœ… Saved: 52
âœ… Saved: 53
âœ… Saved: 54
âœ… Saved: 55
âœ… Saved: 56
âœ… Saved: 57

ðŸ”„ Page 3
Found products: 29
âœ… Saved: 58
âœ… Saved: 59
âœ… Saved: 60
âœ… Saved: 61
âœ… Saved: 62
âœ… Saved: 63
âœ… Saved: 64
âœ… Saved: 65
âœ…

In [7]:
import re
import time
import pandas as pd
import nltk

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from nltk.sentiment import SentimentIntensityAnalyzer

# ===============================
# NLTK SETUP
# ===============================
# Make sure the VADER lexicon is installed before building the analyzer.
# The lookup names the zip archive: checking for the bare
# "sentiment/vader_lexicon" name can raise LookupError even when the package
# is already installed, which re-triggers the download on every run.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Shared analyzer instance used by get_sentiment().
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Label *text* as "Positive", "Negative" or "Neutral".

    Empty/None input and strings shorter than 10 characters are reported as
    "Neutral" without being scored. Otherwise the VADER compound score from
    the module-level ``sia`` analyzer decides: >= 0.05 is positive,
    <= -0.05 is negative, anything in between is neutral.
    """
    has_enough_text = bool(text) and len(text) >= 10
    if not has_enough_text:
        return "Neutral"

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# ===============================
# REVIEW PARSER
# ===============================
def parse_review_block(raw_text):
    """Extract rating statistics and review text from a block of page text.

    Args:
        raw_text: Newline-separated text of a ratings/reviews section.
            May be empty or None.

    Returns:
        dict with the keys ``overall_rating`` (float or None),
        ``total_ratings`` / ``total_reviews`` (int or None),
        ``rating_5`` .. ``rating_1`` (int, 0 when not found) and
        ``review_text`` / ``review_summary`` (str; both stay
        "No review available" when no line longer than 20 characters exists).
    """
    data = {
        "overall_rating": None,
        "total_ratings": None,
        "total_reviews": None,
        "rating_5": 0,
        "rating_4": 0,
        "rating_3": 0,
        "rating_2": 0,
        "rating_1": 0,
        "review_text": "No review available",
        "review_summary": "No review available"
    }

    if not raw_text:
        return data

    # The star glyph is spelled as an escape (U+2605 BLACK STAR) so the
    # patterns keep working even if this file is re-saved or re-read with the
    # wrong encoding.
    m = re.search(r"(\d\.\d)\u2605", raw_text)
    if m:
        data["overall_rating"] = float(m.group(1))

    m = re.search(r"([\d,]+)\s*Ratings\s*&\s*([\d,]+)\s*Reviews", raw_text)
    if m:
        data["total_ratings"] = int(m.group(1).replace(",", ""))
        data["total_reviews"] = int(m.group(2).replace(",", ""))

    def star_count(star):
        # The lookbehind stops the fractional digit of an overall rating such
        # as "4.5<star>" from being mistaken for the "5<star>" histogram row.
        m = re.search(rf"(?<![\d.]){star}\u2605\s*([\d,]+)", raw_text)
        return int(m.group(1).replace(",", "")) if m else 0

    data["rating_5"] = star_count(5)
    data["rating_4"] = star_count(4)
    data["rating_3"] = star_count(3)
    data["rating_2"] = star_count(2)
    data["rating_1"] = star_count(1)

    # Measure the length after stripping so whitespace-only lines cannot
    # produce an empty "review".
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    lines = [line for line in stripped_lines if len(line) > 20]
    if lines:
        data["review_text"] = " ".join(lines)
        data["review_summary"] = data["review_text"][:120]

    return data

# ===============================
# SENTIMENT LOGIC
# ===============================
def sentiment_from_rating(rating):
    """Map a star rating to a sentiment label.

    Args:
        rating: Anything ``float()`` accepts (a number or numeric string).
            None, non-numeric values and NaN count as "no rating".

    Returns:
        "Positive" for ratings >= 4, "Neutral" for ratings in [3, 4) or when
        no usable rating is given, "Negative" for anything lower.
    """
    try:
        value = float(rating)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no rating"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return "Neutral"

    # NaN compares unequal to itself; without this guard it would fall
    # through every comparison and be reported as "Negative".
    if value != value:
        return "Neutral"

    if value >= 4:
        return "Positive"
    if value >= 3:
        return "Neutral"
    return "Negative"

def hybrid_sentiment(text, rating, r5, r4, r1, r2):
    """Combine text, overall-rating and rating-distribution signals.

    The three signals are consulted in that order and the first one that is
    not "Neutral" is returned; "Neutral" is returned when all of them are.
    The distribution signal compares ``r5 + r4`` against ``r1 + r2``.
    """
    from_text = get_sentiment(text)
    from_rating = sentiment_from_rating(rating)

    favourable = r5 + r4
    unfavourable = r1 + r2
    if favourable > unfavourable:
        from_distribution = "Positive"
    elif unfavourable > favourable:
        from_distribution = "Negative"
    else:
        from_distribution = "Neutral"

    candidates = (from_text, from_rating, from_distribution)
    return next((label for label in candidates if label != "Neutral"), "Neutral")

# ===============================
# SELENIUM SETUP
# ===============================
# Imported here so this cell carries the extra names it now relies on.
from urllib.parse import quote_plus

from selenium.common.exceptions import WebDriverException

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

wait = WebDriverWait(driver, 15)

# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Laptops"
TARGET_COUNT = 100
CATEGORY = "Electronics"
# Single source of truth for the output path, so the final message cannot
# drift from the file that is actually written.
OUTPUT_FILE = "laptop_dataset_flipkart.csv"

# The query is URL-encoded because it may contain spaces or reserved characters.
base_url = f"https://www.flipkart.com/search?q={quote_plus(SEARCH_QUERY)}"

results = []
seen_urls = set()
page = 1
saved_count = 0

# ===============================
# MAIN LOOP
# ===============================
try:
    driver.get(base_url)
    time.sleep(5)

    # Close login popup (best effort: it is not always shown). The close
    # glyph is U+2715, written as an escape to stay encoding-safe.
    try:
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(),'\u2715')]")
            )
        ).click()
    except WebDriverException:
        pass

    while saved_count < TARGET_COUNT:

        print(f"\n\U0001F504 Page {page}")

        # Scroll in steps so lazily loaded product cards get rendered.
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1500);")
            time.sleep(2)

        products = driver.find_elements(
            By.XPATH,
            "//a[contains(@href,'/p/')]"
        )

        print("Found products:", len(products))

        new_links = 0  # links on this page that were not seen before

        for a in products:
            if saved_count >= TARGET_COUNT:
                break

            try:
                link = a.get_attribute("href")
            except WebDriverException:
                continue

            if not link or link in seen_urls:
                continue
            seen_urls.add(link)
            new_links += 1

            try:
                name = a.text.strip()
            except WebDriverException:
                continue
            if not name:
                continue

            # Open the product page in a new tab; the finally clause makes
            # sure the tab is closed and focus returns to the search results
            # even if scraping the page fails unexpectedly.
            search_tab = driver.current_window_handle
            driver.execute_script("window.open(arguments[0]);", link)
            driver.switch_to.window(driver.window_handles[-1])
            try:
                time.sleep(4)

                # U+20B9 is the rupee sign.
                try:
                    price = driver.find_element(
                        By.XPATH, "//div[contains(text(),'\u20b9')]"
                    ).text
                except WebDriverException:
                    price = None

                try:
                    rating = driver.find_element(
                        By.XPATH, "//div[contains(@class,'MKiFS6')]"
                    ).text
                except WebDriverException:
                    rating = None

                raw_review = ""
                try:
                    blocks = driver.find_elements(
                        By.XPATH, "//div[contains(@class,'xgU6qg')]"
                    )
                    raw_review = "\n".join(b.text for b in blocks[:5])
                except WebDriverException:
                    pass

                parsed = parse_review_block(raw_review)

                if parsed["review_text"] == "No review available":
                    parsed["review_text"] = name

                # NOTE(review): the sentiment label is computed but never
                # written to the CSV; left as-is to keep the output schema
                # unchanged. Confirm whether it should be a column.
                sentiment = hybrid_sentiment(
                    parsed["review_text"],
                    parsed["overall_rating"],
                    parsed["rating_5"],
                    parsed["rating_4"],
                    parsed["rating_1"],
                    parsed["rating_2"]
                )
            finally:
                driver.close()
                driver.switch_to.window(search_tab)

            saved_count += 1

            results.append({
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            })

            print(f"\u2705 Saved: {saved_count}")

        if saved_count >= TARGET_COUNT:
            break

        # A page without any unseen product link means the results are
        # exhausted; without this check the loop would never terminate.
        if new_links == 0:
            print("No new products found; stopping early.")
            break

        page += 1
        driver.get(f"{base_url}&page={page}")
        time.sleep(5)
finally:
    # ===============================
    # SAVE CSV
    # ===============================
    # Persist whatever was collected and always release the browser, even
    # when scraping aborts part-way through.
    try:
        pd.DataFrame(results).to_csv(
            OUTPUT_FILE,
            index=False,
            encoding="utf-8-sig"
        )
    finally:
        driver.quit()

print("\n\U0001F389 DONE")
print(f"Clean file saved: {OUTPUT_FILE}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SURABHI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



ðŸ”„ Page 1
Found products: 29
âœ… Saved: 1
âœ… Saved: 2
âœ… Saved: 3
âœ… Saved: 4
âœ… Saved: 5
âœ… Saved: 6
âœ… Saved: 7
âœ… Saved: 8
âœ… Saved: 9
âœ… Saved: 10
âœ… Saved: 11
âœ… Saved: 12
âœ… Saved: 13
âœ… Saved: 14
âœ… Saved: 15
âœ… Saved: 16
âœ… Saved: 17
âœ… Saved: 18
âœ… Saved: 19
âœ… Saved: 20
âœ… Saved: 21
âœ… Saved: 22
âœ… Saved: 23
âœ… Saved: 24
âœ… Saved: 25
âœ… Saved: 26
âœ… Saved: 27
âœ… Saved: 28
âœ… Saved: 29

ðŸ”„ Page 2
Found products: 29
âœ… Saved: 30
âœ… Saved: 31
âœ… Saved: 32
âœ… Saved: 33
âœ… Saved: 34
âœ… Saved: 35
âœ… Saved: 36
âœ… Saved: 37
âœ… Saved: 38
âœ… Saved: 39
âœ… Saved: 40
âœ… Saved: 41
âœ… Saved: 42
âœ… Saved: 43
âœ… Saved: 44
âœ… Saved: 45
âœ… Saved: 46
âœ… Saved: 47
âœ… Saved: 48
âœ… Saved: 49
âœ… Saved: 50
âœ… Saved: 51
âœ… Saved: 52
âœ… Saved: 53
âœ… Saved: 54
âœ… Saved: 55
âœ… Saved: 56

ðŸ”„ Page 3
Found products: 29
âœ… Saved: 57
âœ… Saved: 58
âœ… Saved: 59
âœ… Saved: 60
âœ… Saved: 61
âœ… Saved: 62
âœ… Saved: 63
âœ… Saved: 64
âœ… Saved: 65
âœ…

In [9]:
import re
import time
import pandas as pd
import nltk

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from nltk.sentiment import SentimentIntensityAnalyzer

# ===============================
# NLTK SETUP
# ===============================
# Make sure the VADER lexicon is installed before building the analyzer.
# The lookup names the zip archive: checking for the bare
# "sentiment/vader_lexicon" name can raise LookupError even when the package
# is already installed, which re-triggers the download on every run.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Shared analyzer instance used by get_sentiment().
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Label *text* as "Positive", "Negative" or "Neutral".

    Empty/None input and strings shorter than 10 characters are reported as
    "Neutral" without being scored. Otherwise the VADER compound score from
    the module-level ``sia`` analyzer decides: >= 0.05 is positive,
    <= -0.05 is negative, anything in between is neutral.
    """
    has_enough_text = bool(text) and len(text) >= 10
    if not has_enough_text:
        return "Neutral"

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# ===============================
# REVIEW PARSER
# ===============================
def parse_review_block(raw_text):
    """Extract rating statistics and review text from a block of page text.

    Args:
        raw_text: Newline-separated text of a ratings/reviews section.
            May be empty or None.

    Returns:
        dict with the keys ``overall_rating`` (float or None),
        ``total_ratings`` / ``total_reviews`` (int or None),
        ``rating_5`` .. ``rating_1`` (int, 0 when not found) and
        ``review_text`` / ``review_summary`` (str; both stay
        "No review available" when no line longer than 20 characters exists).
    """
    data = {
        "overall_rating": None,
        "total_ratings": None,
        "total_reviews": None,
        "rating_5": 0,
        "rating_4": 0,
        "rating_3": 0,
        "rating_2": 0,
        "rating_1": 0,
        "review_text": "No review available",
        "review_summary": "No review available"
    }

    if not raw_text:
        return data

    # The star glyph is spelled as an escape (U+2605 BLACK STAR) so the
    # patterns keep working even if this file is re-saved or re-read with the
    # wrong encoding.
    m = re.search(r"(\d\.\d)\u2605", raw_text)
    if m:
        data["overall_rating"] = float(m.group(1))

    m = re.search(r"([\d,]+)\s*Ratings\s*&\s*([\d,]+)\s*Reviews", raw_text)
    if m:
        data["total_ratings"] = int(m.group(1).replace(",", ""))
        data["total_reviews"] = int(m.group(2).replace(",", ""))

    def star_count(star):
        # The lookbehind stops the fractional digit of an overall rating such
        # as "4.5<star>" from being mistaken for the "5<star>" histogram row.
        m = re.search(rf"(?<![\d.]){star}\u2605\s*([\d,]+)", raw_text)
        return int(m.group(1).replace(",", "")) if m else 0

    data["rating_5"] = star_count(5)
    data["rating_4"] = star_count(4)
    data["rating_3"] = star_count(3)
    data["rating_2"] = star_count(2)
    data["rating_1"] = star_count(1)

    # Measure the length after stripping so whitespace-only lines cannot
    # produce an empty "review".
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    lines = [line for line in stripped_lines if len(line) > 20]
    if lines:
        data["review_text"] = " ".join(lines)
        data["review_summary"] = data["review_text"][:120]

    return data

# ===============================
# SENTIMENT LOGIC
# ===============================
def sentiment_from_rating(rating):
    """Map a star rating to a sentiment label.

    Args:
        rating: Anything ``float()`` accepts (a number or numeric string).
            None, non-numeric values and NaN count as "no rating".

    Returns:
        "Positive" for ratings >= 4, "Neutral" for ratings in [3, 4) or when
        no usable rating is given, "Negative" for anything lower.
    """
    try:
        value = float(rating)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no rating"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return "Neutral"

    # NaN compares unequal to itself; without this guard it would fall
    # through every comparison and be reported as "Negative".
    if value != value:
        return "Neutral"

    if value >= 4:
        return "Positive"
    if value >= 3:
        return "Neutral"
    return "Negative"

def hybrid_sentiment(text, rating, r5, r4, r1, r2):
    """Combine text, overall-rating and rating-distribution signals.

    The three signals are consulted in that order and the first one that is
    not "Neutral" is returned; "Neutral" is returned when all of them are.
    The distribution signal compares ``r5 + r4`` against ``r1 + r2``.
    """
    from_text = get_sentiment(text)
    from_rating = sentiment_from_rating(rating)

    favourable = r5 + r4
    unfavourable = r1 + r2
    if favourable > unfavourable:
        from_distribution = "Positive"
    elif unfavourable > favourable:
        from_distribution = "Negative"
    else:
        from_distribution = "Neutral"

    candidates = (from_text, from_rating, from_distribution)
    return next((label for label in candidates if label != "Neutral"), "Neutral")

# ===============================
# SELENIUM SETUP
# ===============================
# Imported here so this cell carries the extra names it now relies on.
from urllib.parse import quote_plus

from selenium.common.exceptions import WebDriverException

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

wait = WebDriverWait(driver, 15)

# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Refrigerators"
TARGET_COUNT = 100
CATEGORY = "Electronics"
# Single source of truth for the output path: the final message previously
# printed a differently-cased name than the file that was written.
OUTPUT_FILE = "refrigerators_dataset_flipkart.csv"

# The query is URL-encoded because it may contain spaces or reserved characters.
base_url = f"https://www.flipkart.com/search?q={quote_plus(SEARCH_QUERY)}"

results = []
seen_urls = set()
page = 1
saved_count = 0

# ===============================
# MAIN LOOP
# ===============================
try:
    driver.get(base_url)
    time.sleep(5)

    # Close login popup (best effort: it is not always shown). The close
    # glyph is U+2715, written as an escape to stay encoding-safe.
    try:
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(),'\u2715')]")
            )
        ).click()
    except WebDriverException:
        pass

    while saved_count < TARGET_COUNT:

        print(f"\n\U0001F504 Page {page}")

        # Scroll in steps so lazily loaded product cards get rendered.
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1500);")
            time.sleep(2)

        products = driver.find_elements(
            By.XPATH,
            "//a[contains(@href,'/p/')]"
        )

        print("Found products:", len(products))

        new_links = 0  # links on this page that were not seen before

        for a in products:
            if saved_count >= TARGET_COUNT:
                break

            try:
                link = a.get_attribute("href")
            except WebDriverException:
                continue

            if not link or link in seen_urls:
                continue
            seen_urls.add(link)
            new_links += 1

            try:
                name = a.text.strip()
            except WebDriverException:
                continue
            if not name:
                continue

            # Open the product page in a new tab; the finally clause makes
            # sure the tab is closed and focus returns to the search results
            # even if scraping the page fails unexpectedly.
            search_tab = driver.current_window_handle
            driver.execute_script("window.open(arguments[0]);", link)
            driver.switch_to.window(driver.window_handles[-1])
            try:
                time.sleep(4)

                # U+20B9 is the rupee sign.
                try:
                    price = driver.find_element(
                        By.XPATH, "//div[contains(text(),'\u20b9')]"
                    ).text
                except WebDriverException:
                    price = None

                try:
                    rating = driver.find_element(
                        By.XPATH, "//div[contains(@class,'MKiFS6')]"
                    ).text
                except WebDriverException:
                    rating = None

                raw_review = ""
                try:
                    blocks = driver.find_elements(
                        By.XPATH, "//div[contains(@class,'xgU6qg')]"
                    )
                    raw_review = "\n".join(b.text for b in blocks[:5])
                except WebDriverException:
                    pass

                parsed = parse_review_block(raw_review)

                if parsed["review_text"] == "No review available":
                    parsed["review_text"] = name

                # NOTE(review): the sentiment label is computed but never
                # written to the CSV; left as-is to keep the output schema
                # unchanged. Confirm whether it should be a column.
                sentiment = hybrid_sentiment(
                    parsed["review_text"],
                    parsed["overall_rating"],
                    parsed["rating_5"],
                    parsed["rating_4"],
                    parsed["rating_1"],
                    parsed["rating_2"]
                )
            finally:
                driver.close()
                driver.switch_to.window(search_tab)

            saved_count += 1

            results.append({
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            })

            print(f"\u2705 Saved: {saved_count}")

        if saved_count >= TARGET_COUNT:
            break

        # A page without any unseen product link means the results are
        # exhausted; without this check the loop would never terminate.
        if new_links == 0:
            print("No new products found; stopping early.")
            break

        page += 1
        driver.get(f"{base_url}&page={page}")
        time.sleep(5)
finally:
    # ===============================
    # SAVE CSV
    # ===============================
    # Persist whatever was collected and always release the browser, even
    # when scraping aborts part-way through.
    try:
        pd.DataFrame(results).to_csv(
            OUTPUT_FILE,
            index=False,
            encoding="utf-8-sig"
        )
    finally:
        driver.quit()

print("\n\U0001F389 DONE")
print(f"Clean file saved: {OUTPUT_FILE}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SURABHI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



ðŸ”„ Page 1
Found products: 29
âœ… Saved: 1
âœ… Saved: 2
âœ… Saved: 3
âœ… Saved: 4
âœ… Saved: 5
âœ… Saved: 6
âœ… Saved: 7
âœ… Saved: 8
âœ… Saved: 9
âœ… Saved: 10
âœ… Saved: 11
âœ… Saved: 12
âœ… Saved: 13
âœ… Saved: 14
âœ… Saved: 15
âœ… Saved: 16
âœ… Saved: 17
âœ… Saved: 18
âœ… Saved: 19
âœ… Saved: 20
âœ… Saved: 21
âœ… Saved: 22
âœ… Saved: 23
âœ… Saved: 24
âœ… Saved: 25
âœ… Saved: 26
âœ… Saved: 27
âœ… Saved: 28
âœ… Saved: 29

ðŸ”„ Page 2
Found products: 29
âœ… Saved: 30
âœ… Saved: 31
âœ… Saved: 32
âœ… Saved: 33
âœ… Saved: 34
âœ… Saved: 35
âœ… Saved: 36
âœ… Saved: 37
âœ… Saved: 38
âœ… Saved: 39
âœ… Saved: 40
âœ… Saved: 41
âœ… Saved: 42
âœ… Saved: 43
âœ… Saved: 44
âœ… Saved: 45
âœ… Saved: 46
âœ… Saved: 47
âœ… Saved: 48
âœ… Saved: 49
âœ… Saved: 50
âœ… Saved: 51
âœ… Saved: 52
âœ… Saved: 53
âœ… Saved: 54
âœ… Saved: 55
âœ… Saved: 56
âœ… Saved: 57
âœ… Saved: 58

ðŸ”„ Page 3
Found products: 29
âœ… Saved: 59
âœ… Saved: 60
âœ… Saved: 61
âœ… Saved: 62
âœ… Saved: 63
âœ… Saved: 64
âœ… Saved: 65
âœ…

In [11]:
import re
import time
import pandas as pd
import nltk

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from nltk.sentiment import SentimentIntensityAnalyzer

# ===============================
# NLTK SETUP
# ===============================
# Make sure the VADER lexicon is installed before building the analyzer.
# The lookup names the zip archive: checking for the bare
# "sentiment/vader_lexicon" name can raise LookupError even when the package
# is already installed, which re-triggers the download on every run.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Shared analyzer instance used by get_sentiment().
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Label *text* as "Positive", "Negative" or "Neutral".

    Empty/None input and strings shorter than 10 characters are reported as
    "Neutral" without being scored. Otherwise the VADER compound score from
    the module-level ``sia`` analyzer decides: >= 0.05 is positive,
    <= -0.05 is negative, anything in between is neutral.
    """
    has_enough_text = bool(text) and len(text) >= 10
    if not has_enough_text:
        return "Neutral"

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# ===============================
# REVIEW PARSER
# ===============================
def parse_review_block(raw_text):
    """Extract rating statistics and review text from a block of page text.

    Args:
        raw_text: Newline-separated text of a ratings/reviews section.
            May be empty or None.

    Returns:
        dict with the keys ``overall_rating`` (float or None),
        ``total_ratings`` / ``total_reviews`` (int or None),
        ``rating_5`` .. ``rating_1`` (int, 0 when not found) and
        ``review_text`` / ``review_summary`` (str; both stay
        "No review available" when no line longer than 20 characters exists).
    """
    data = {
        "overall_rating": None,
        "total_ratings": None,
        "total_reviews": None,
        "rating_5": 0,
        "rating_4": 0,
        "rating_3": 0,
        "rating_2": 0,
        "rating_1": 0,
        "review_text": "No review available",
        "review_summary": "No review available"
    }

    if not raw_text:
        return data

    # The star glyph is spelled as an escape (U+2605 BLACK STAR) so the
    # patterns keep working even if this file is re-saved or re-read with the
    # wrong encoding.
    m = re.search(r"(\d\.\d)\u2605", raw_text)
    if m:
        data["overall_rating"] = float(m.group(1))

    m = re.search(r"([\d,]+)\s*Ratings\s*&\s*([\d,]+)\s*Reviews", raw_text)
    if m:
        data["total_ratings"] = int(m.group(1).replace(",", ""))
        data["total_reviews"] = int(m.group(2).replace(",", ""))

    def star_count(star):
        # The lookbehind stops the fractional digit of an overall rating such
        # as "4.5<star>" from being mistaken for the "5<star>" histogram row.
        m = re.search(rf"(?<![\d.]){star}\u2605\s*([\d,]+)", raw_text)
        return int(m.group(1).replace(",", "")) if m else 0

    data["rating_5"] = star_count(5)
    data["rating_4"] = star_count(4)
    data["rating_3"] = star_count(3)
    data["rating_2"] = star_count(2)
    data["rating_1"] = star_count(1)

    # Measure the length after stripping so whitespace-only lines cannot
    # produce an empty "review".
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    lines = [line for line in stripped_lines if len(line) > 20]
    if lines:
        data["review_text"] = " ".join(lines)
        data["review_summary"] = data["review_text"][:120]

    return data

# ===============================
# SENTIMENT LOGIC
# ===============================
def sentiment_from_rating(rating):
    """Map a star rating to a sentiment label.

    Args:
        rating: Anything ``float()`` accepts (a number or numeric string).
            None, non-numeric values and NaN count as "no rating".

    Returns:
        "Positive" for ratings >= 4, "Neutral" for ratings in [3, 4) or when
        no usable rating is given, "Negative" for anything lower.
    """
    try:
        value = float(rating)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no rating"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return "Neutral"

    # NaN compares unequal to itself; without this guard it would fall
    # through every comparison and be reported as "Negative".
    if value != value:
        return "Neutral"

    if value >= 4:
        return "Positive"
    if value >= 3:
        return "Neutral"
    return "Negative"

def hybrid_sentiment(text, rating, r5, r4, r1, r2):
    """Combine text, overall-rating and rating-distribution signals.

    The three signals are consulted in that order and the first one that is
    not "Neutral" is returned; "Neutral" is returned when all of them are.
    The distribution signal compares ``r5 + r4`` against ``r1 + r2``.
    """
    from_text = get_sentiment(text)
    from_rating = sentiment_from_rating(rating)

    favourable = r5 + r4
    unfavourable = r1 + r2
    if favourable > unfavourable:
        from_distribution = "Positive"
    elif unfavourable > favourable:
        from_distribution = "Negative"
    else:
        from_distribution = "Neutral"

    candidates = (from_text, from_rating, from_distribution)
    return next((label for label in candidates if label != "Neutral"), "Neutral")

# ===============================
# SELENIUM SETUP
# ===============================
# Imported here so this cell carries the extra names it now relies on.
from urllib.parse import quote_plus

from selenium.common.exceptions import WebDriverException

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--disable-notifications")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

wait = WebDriverWait(driver, 15)

# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Smartphones"
TARGET_COUNT = 100
CATEGORY = "Electronics"
# Single source of truth for the output path: the final message previously
# printed a differently-cased name than the file that was written.
OUTPUT_FILE = "smartphones_dataset_flipkart.csv"

# The query is URL-encoded because it may contain spaces or reserved characters.
base_url = f"https://www.flipkart.com/search?q={quote_plus(SEARCH_QUERY)}"

results = []
seen_urls = set()
page = 1
saved_count = 0

# ===============================
# MAIN LOOP
# ===============================
try:
    driver.get(base_url)
    time.sleep(5)

    # Close login popup (best effort: it is not always shown). The close
    # glyph is U+2715, written as an escape to stay encoding-safe.
    try:
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(text(),'\u2715')]")
            )
        ).click()
    except WebDriverException:
        pass

    while saved_count < TARGET_COUNT:

        print(f"\n\U0001F504 Page {page}")

        # Scroll in steps so lazily loaded product cards get rendered.
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1500);")
            time.sleep(2)

        products = driver.find_elements(
            By.XPATH,
            "//a[contains(@href,'/p/')]"
        )

        print("Found products:", len(products))

        new_links = 0  # links on this page that were not seen before

        for a in products:
            if saved_count >= TARGET_COUNT:
                break

            try:
                link = a.get_attribute("href")
            except WebDriverException:
                continue

            if not link or link in seen_urls:
                continue
            seen_urls.add(link)
            new_links += 1

            try:
                name = a.text.strip()
            except WebDriverException:
                continue
            if not name:
                continue

            # Open the product page in a new tab; the finally clause makes
            # sure the tab is closed and focus returns to the search results
            # even if scraping the page fails unexpectedly.
            search_tab = driver.current_window_handle
            driver.execute_script("window.open(arguments[0]);", link)
            driver.switch_to.window(driver.window_handles[-1])
            try:
                time.sleep(4)

                # U+20B9 is the rupee sign.
                try:
                    price = driver.find_element(
                        By.XPATH, "//div[contains(text(),'\u20b9')]"
                    ).text
                except WebDriverException:
                    price = None

                try:
                    rating = driver.find_element(
                        By.XPATH, "//div[contains(@class,'MKiFS6')]"
                    ).text
                except WebDriverException:
                    rating = None

                raw_review = ""
                try:
                    blocks = driver.find_elements(
                        By.XPATH, "//div[contains(@class,'xgU6qg')]"
                    )
                    raw_review = "\n".join(b.text for b in blocks[:5])
                except WebDriverException:
                    pass

                parsed = parse_review_block(raw_review)

                if parsed["review_text"] == "No review available":
                    parsed["review_text"] = name

                # NOTE(review): the sentiment label is computed but never
                # written to the CSV; left as-is to keep the output schema
                # unchanged. Confirm whether it should be a column.
                sentiment = hybrid_sentiment(
                    parsed["review_text"],
                    parsed["overall_rating"],
                    parsed["rating_5"],
                    parsed["rating_4"],
                    parsed["rating_1"],
                    parsed["rating_2"]
                )
            finally:
                driver.close()
                driver.switch_to.window(search_tab)

            saved_count += 1

            results.append({
                "product_name": name,
                "product_price": price,
                "overall_rating": rating,
                "product_url": link,
                "category": CATEGORY
            })

            print(f"\u2705 Saved: {saved_count}")

        if saved_count >= TARGET_COUNT:
            break

        # A page without any unseen product link means the results are
        # exhausted; without this check the loop would never terminate.
        if new_links == 0:
            print("No new products found; stopping early.")
            break

        page += 1
        driver.get(f"{base_url}&page={page}")
        time.sleep(5)
finally:
    # ===============================
    # SAVE CSV
    # ===============================
    # Persist whatever was collected and always release the browser, even
    # when scraping aborts part-way through.
    try:
        pd.DataFrame(results).to_csv(
            OUTPUT_FILE,
            index=False,
            encoding="utf-8-sig"
        )
    finally:
        driver.quit()

print("\n\U0001F389 DONE")
print(f"Clean file saved: {OUTPUT_FILE}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SURABHI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



ðŸ”„ Page 1
Found products: 29
âœ… Saved: 1
âœ… Saved: 2
âœ… Saved: 3
âœ… Saved: 4
âœ… Saved: 5
âœ… Saved: 6
âœ… Saved: 7
âœ… Saved: 8
âœ… Saved: 9
âœ… Saved: 10
âœ… Saved: 11
âœ… Saved: 12
âœ… Saved: 13
âœ… Saved: 14
âœ… Saved: 15
âœ… Saved: 16
âœ… Saved: 17
âœ… Saved: 18
âœ… Saved: 19
âœ… Saved: 20
âœ… Saved: 21
âœ… Saved: 22
âœ… Saved: 23
âœ… Saved: 24
âœ… Saved: 25
âœ… Saved: 26
âœ… Saved: 27
âœ… Saved: 28
âœ… Saved: 29

ðŸ”„ Page 2
Found products: 29
âœ… Saved: 30
âœ… Saved: 31
âœ… Saved: 32
âœ… Saved: 33
âœ… Saved: 34
âœ… Saved: 35
âœ… Saved: 36
âœ… Saved: 37
âœ… Saved: 38
âœ… Saved: 39
âœ… Saved: 40
âœ… Saved: 41
âœ… Saved: 42
âœ… Saved: 43
âœ… Saved: 44
âœ… Saved: 45
âœ… Saved: 46
âœ… Saved: 47
âœ… Saved: 48
âœ… Saved: 49
âœ… Saved: 50
âœ… Saved: 51
âœ… Saved: 52
âœ… Saved: 53
âœ… Saved: 54
âœ… Saved: 55
âœ… Saved: 56
âœ… Saved: 57
âœ… Saved: 58

ðŸ”„ Page 3
Found products: 29
âœ… Saved: 59
âœ… Saved: 60
âœ… Saved: 61
âœ… Saved: 62
âœ… Saved: 63
âœ… Saved: 64
âœ… Saved: 65
âœ…

In [15]:
import re
import time
import pandas as pd
import nltk

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from nltk.sentiment import SentimentIntensityAnalyzer

# ===============================
# NLTK SETUP
# ===============================
# Make sure the VADER lexicon is installed before building the analyzer.
# The lookup names the zip archive: checking for the bare
# "sentiment/vader_lexicon" name can raise LookupError even when the package
# is already installed, which re-triggers the download on every run.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Shared analyzer instance used by get_sentiment().
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Label *text* as "Positive", "Negative" or "Neutral".

    Empty/None input and strings shorter than 10 characters are reported as
    "Neutral" without being scored. Otherwise the VADER compound score from
    the module-level ``sia`` analyzer decides: >= 0.05 is positive,
    <= -0.05 is negative, anything in between is neutral.
    """
    has_enough_text = bool(text) and len(text) >= 10
    if not has_enough_text:
        return "Neutral"

    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# ===============================
# REVIEW PARSER
# ===============================
def parse_review_block(raw_text):
    """Extract rating statistics and review text from a review section's text.

    Args:
        raw_text: Visible text of the ratings/reviews area, newline separated.
            May be empty or None.

    Returns:
        A dict with these keys:
            overall_rating: float such as 4.3, or None when not found.
            total_ratings, total_reviews: ints, or None when not found.
            rating_5 .. rating_1: per-star counts as ints (0 when not found).
            review_text: all lines longer than 20 characters joined with
                spaces, or "No review available".
            review_summary: first 120 characters of review_text, or
                "No review available".
    """
    data = {
        "overall_rating": None,
        "total_ratings": None,
        "total_reviews": None,
        "rating_5": 0,
        "rating_4": 0,
        "rating_3": 0,
        "rating_2": 0,
        "rating_1": 0,
        "review_text": "No review available",
        "review_summary": "No review available"
    }

    if not raw_text:
        return data

    # BLACK STAR (U+2605), written as an escape so the patterns keep working
    # even if this file is re-saved with the wrong encoding.
    star_glyph = "\u2605"

    m = re.search(rf"(\d\.\d){star_glyph}", raw_text)
    if m:
        data["overall_rating"] = float(m.group(1))

    # Counts must start with a digit so a stray "," can never reach int().
    m = re.search(
        r"(\d[\d,]*)\s*Ratings\s*&\s*(\d[\d,]*)\s*Reviews", raw_text
    )
    if m:
        data["total_ratings"] = int(m.group(1).replace(",", ""))
        data["total_reviews"] = int(m.group(2).replace(",", ""))

    def star_count(star):
        # The lookbehind stops e.g. the "3" of an overall rating "4.3<star>"
        # from being read as the 3-star histogram row.
        m = re.search(
            rf"(?<![\d.]){star}{star_glyph}\s*(\d[\d,]*)", raw_text
        )
        return int(m.group(1).replace(",", "")) if m else 0

    data["rating_5"] = star_count(5)
    data["rating_4"] = star_count(4)
    data["rating_3"] = star_count(3)
    data["rating_2"] = star_count(2)
    data["rating_1"] = star_count(1)

    # Short lines are labels/counters; only longer lines are kept as review text.
    stripped = (line.strip() for line in raw_text.split("\n"))
    lines = [line for line in stripped if len(line) > 20]
    if lines:
        data["review_text"] = " ".join(lines)
        data["review_summary"] = data["review_text"][:120]

    return data

# ===============================
# SENTIMENT LOGIC
# ===============================
def sentiment_from_rating(rating):
    """Map a numeric star rating to a sentiment label.

    Args:
        rating: Anything float() accepts (number or numeric string). None,
            non-numeric values and NaN are treated as "no rating".

    Returns:
        "Positive" for ratings >= 4, "Neutral" for ratings >= 3 (and for
        missing/unparseable ratings), otherwise "Negative".
    """
    import math

    try:
        rating = float(rating)
    except (TypeError, ValueError):
        # Only conversion failures mean "no rating"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return "Neutral"

    # NaN compares False against everything and would fall through to
    # "Negative"; a missing rating carries no sentiment.
    if math.isnan(rating):
        return "Neutral"

    if rating >= 4:
        return "Positive"
    elif rating >= 3:
        return "Neutral"
    else:
        return "Negative"

def hybrid_sentiment(text, rating, r5, r4, r1, r2):
    """Combine three sentiment signals, returning the first decisive one.

    The signals are checked in this order: sentiment of ``text``, sentiment
    implied by ``rating``, and the balance of high (r5 + r4) versus low
    (r1 + r2) star counts. The first signal that is not "Neutral" is
    returned; if all three are neutral the result is "Neutral".
    """
    from_text = get_sentiment(text)
    from_rating = sentiment_from_rating(rating)

    favourable = r5 + r4
    unfavourable = r1 + r2
    if favourable > unfavourable:
        from_counts = "Positive"
    elif unfavourable > favourable:
        from_counts = "Negative"
    else:
        from_counts = "Neutral"

    signals = (from_text, from_rating, from_counts)
    return next(
        (label for label in signals if label != "Neutral"),
        "Neutral",
    )

# ===============================
# SELENIUM SETUP
# ===============================
# Chrome starts maximized with notification prompts disabled.
options = webdriver.ChromeOptions()
for chrome_flag in ("--start-maximized", "--disable-notifications"):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the driver path for the Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Explicit-wait helper bound to this driver with a 15 second timeout.
wait = WebDriverWait(driver, 15)

# ===============================
# CONFIG
# ===============================
SEARCH_QUERY = "Televisions"   # search term sent to Flipkart
TARGET_COUNT = 100             # number of products to collect before stopping
CATEGORY = "Electronics"       # constant label written to every output row

base_url = f"https://www.flipkart.com/search?q={SEARCH_QUERY}"
driver.get(base_url)
time.sleep(5)

# Close login popup (best effort: it is not always shown).
# "\u2715" is the multiplication-X glyph on the close button; it is written as
# an escape so the XPath still matches if this file is re-encoded.
try:
    wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, "//button[contains(text(),'\u2715')]")
        )
    ).click()
except Exception:
    # No popup (timeout) or it could not be clicked; carry on. Unlike a bare
    # `except:`, this still lets Ctrl-C / SystemExit through.
    pass

results = []        # one dict per saved product
seen_urls = set()   # product links already processed
page = 1            # current search-results page number
saved_count = 0     # products saved so far

# ===============================
# MAIN LOOP (FIXED)
# ===============================
while saved_count < TARGET_COUNT:

    print(f"\n\U0001F504 Page {page}")

    # Scroll down in steps so product cards further down the page are present
    # before the links are collected.
    for _ in range(5):
        driver.execute_script("window.scrollBy(0, 1500);")
        time.sleep(2)

    # Product detail links contain "/p/" in their href.
    products = driver.find_elements(
        By.XPATH,
        "//a[contains(@href,'/p/')]"
    )

    print("Found products:", len(products))

    saved_before_page = saved_count
    listing_tab = driver.current_window_handle

    for a in products:
        if saved_count >= TARGET_COUNT:
            break

        try:
            link = a.get_attribute("href")
        except Exception:
            continue

        if not link or link in seen_urls:
            continue

        try:
            name = a.text.strip()
        except Exception:
            continue
        if not name:
            # Text-less anchor. Do not mark the URL as seen yet, so a later
            # anchor with the same href that does carry the name is still used.
            continue
        seen_urls.add(link)

        # Open the product page in a new tab and locate that tab explicitly
        # instead of assuming it is at index 1.
        driver.execute_script("window.open(arguments[0]);", link)
        new_tabs = [h for h in driver.window_handles if h != listing_tab]
        if not new_tabs:
            # The tab did not open; skip rather than close the listing tab.
            continue
        driver.switch_to.window(new_tabs[-1])

        try:
            time.sleep(4)

            try:
                price = driver.find_element(
                    By.XPATH, "//div[contains(text(),'\u20b9')]"  # rupee sign
                ).text
            except Exception:
                price = None

            try:
                rating = driver.find_element(
                    By.XPATH, "//div[contains(@class,'MKiFS6')]"
                ).text
            except Exception:
                rating = None

            raw_review = ""
            try:
                blocks = driver.find_elements(
                    By.XPATH, "//div[contains(@class,'xgU6qg')]"
                )
                raw_review = "\n".join([b.text for b in blocks[:5]])
            except Exception:
                pass
        finally:
            # Always close the product tab and return to the listing, even if
            # scraping is interrupted, so tabs do not pile up.
            driver.close()
            driver.switch_to.window(listing_tab)

        parsed = parse_review_block(raw_review)

        if parsed["review_text"] == "No review available":
            parsed["review_text"] = name

        # NOTE(review): `parsed` and `sentiment` are computed but not written
        # to `results` below - confirm whether the CSV should include them.
        sentiment = hybrid_sentiment(
            parsed["review_text"],
            parsed["overall_rating"],
            parsed["rating_5"],
            parsed["rating_4"],
            parsed["rating_1"],
            parsed["rating_2"]
        )

        saved_count += 1

        results.append({
            "product_name": name,
            "product_price": price,
            "overall_rating": rating,
            "product_url": link,
            "category": CATEGORY
        })

        print(f"\u2705 Saved: {saved_count}")

    if saved_count >= TARGET_COUNT:
        # Target reached; no need to load another results page.
        break

    if saved_count == saved_before_page:
        # Nothing new on this page: the search is exhausted (or blocked).
        # Without this guard the loop would paginate forever.
        print("No new products found on this page; stopping early.")
        break

    page += 1
    driver.get(f"{base_url}&page={page}")
    time.sleep(5)

# ===============================
# SAVE CSV
# ===============================
# Single source of truth for the output path so the message below always
# names the file that was actually written.
OUTPUT_CSV = "televisions_dataset_flipkart.csv"

try:
    # utf-8-sig writes a BOM so the non-ASCII text (e.g. the rupee sign in
    # prices) is detected as UTF-8 by tools that rely on the BOM.
    pd.DataFrame(results).to_csv(
        OUTPUT_CSV,
        index=False,
        encoding="utf-8-sig"
    )
finally:
    # Shut the browser down even if writing the CSV fails.
    driver.quit()

print("\n\U0001F389 DONE")
print(f"Clean file saved: {OUTPUT_CSV}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SURABHI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



🔄 Page 1
Found products: 29
✅ Saved: 1
✅ Saved: 2
✅ Saved: 3
✅ Saved: 4
✅ Saved: 5
✅ Saved: 6
✅ Saved: 7
✅ Saved: 8
✅ Saved: 9
✅ Saved: 10
✅ Saved: 11
✅ Saved: 12
✅ Saved: 13
✅ Saved: 14
✅ Saved: 15
✅ Saved: 16
✅ Saved: 17
✅ Saved: 18
✅ Saved: 19
✅ Saved: 20
✅ Saved: 21
✅ Saved: 22
✅ Saved: 23
✅ Saved: 24
✅ Saved: 25
✅ Saved: 26
✅ Saved: 27
✅ Saved: 28
✅ Saved: 29

🔄 Page 2
Found products: 29
✅ Saved: 30
✅ Saved: 31
✅ Saved: 32
✅ Saved: 33
✅ Saved: 34
✅ Saved: 35
✅ Saved: 36
✅ Saved: 37
✅ Saved: 38
✅ Saved: 39
✅ Saved: 40
✅ Saved: 41
✅ Saved: 42
✅ Saved: 43
✅ Saved: 44
✅ Saved: 45
✅ Saved: 46
✅ Saved: 47
✅ Saved: 48
✅ Saved: 49
✅ Saved: 50
✅ Saved: 51
✅ Saved: 52
✅ Saved: 53
✅ Saved: 54
✅ Saved: 55
✅ Saved: 56
✅ Saved: 57
✅ Saved: 58

🔄 Page 3
Found products: 29
✅ Saved: 59
✅ Saved: 60
✅ Saved: 61
✅ Saved: 62
✅ Saved: 63
✅ Saved: 64
✅ Saved: 65
✅