In [12]:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import pandas as pd
from bs4 import BeautifulSoup
import time
from datetime import datetime

# Setup headless Chrome
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Single source of truth for the output path, so the save call and the
# confirmation message below cannot drift apart.
output_csv = "amazon_data61.csv"

all_data = []

# try/finally guarantees the browser process is shut down even when a page
# load or a parsing step raises.
try:
    for page in range(1, 5):  # Pages 1 to 4
        print(f"Scraping page {page}...")
        # The page number has to travel in its own `page` query parameter.
        # Previously it was appended to the `ref=` value, so no page selector
        # was ever sent with the request.
        url = f"https://www.amazon.in/s?k=laptop&crid=2VQ9M2CVOWZF8&sprefix=laptop%2Caps%2C330&ref=nb_sb_noss_2&page={page}"
        driver.get(url)
        time.sleep(5)  # Wait for JS to render

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            # When a tag is absent, `item.h2` / `item.find(...)` yield None and
            # the `.text` access raises AttributeError. Only that case falls
            # back to "N/A"; anything else (e.g. KeyboardInterrupt) propagates.
            try:
                name = item.h2.text.strip()
            except AttributeError:
                name = "N/A"

            try:
                # Strip thousands separators so the price parses as a number later
                price = item.find('span', 'a-price-whole').text.strip().replace(",", "")
            except AttributeError:
                price = "N/A"

            try:
                review = item.find('span', {'class': 'a-icon-alt'}).text.strip()
            except AttributeError:
                review = "N/A"

            all_data.append({
                "Name": name,
                "Price": price,
                "Reviews": review,
                "Date": datetime.now().strftime("%Y-%m-%d")
            })
finally:
    # Quit driver
    driver.quit()

# Save to CSV
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False, encoding='utf-8')
print(f"Data saved to {output_csv}")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Data saved to amazon_laptop_data.csv


In [13]:
# Preview up to the first 25 rows of the freshly scraped frame
df.head(25)

Unnamed: 0,Name,Price,Reviews,Date
0,"HP 15, AMD Ryzen 3 7320U (8GB LPDDR5, 512GB SS...",30290,4.0 out of 5 stars,2025-07-11
1,HP 255 G10 ‎‎Laptop (AMD Athlon Silver 7120U/ ...,22990,3.9 out of 5 stars,2025-07-11
2,"Apple MacBook Air Laptop: Apple M1 chip, 13.3-...",56990,4.6 out of 5 stars,2025-07-11
3,Walker Best Student &Office Work Laptop|Thin &...,12490,3.4 out of 5 stars,2025-07-11
4,"HP Professional 15, 12th Gen Intel Core Celero...",22110,4.7 out of 5 stars,2025-07-11
5,Lenovo {SmartChoice)Chromebook Intel Celeron N...,13990,3.8 out of 5 stars,2025-07-11
6,"Lenovo V15 Intel Celeron N4500 15.6"" (39.62 cm...",21890,3.9 out of 5 stars,2025-07-11
7,"HP 15, 13th Gen Intel Core i3-1315U Laptop (8G...",35990,4.1 out of 5 stars,2025-07-11
8,"HP 15 Laptop,13th Gen Intel Core i7-1355U Lapt...",67999,5.0 out of 5 stars,2025-07-11
9,Lenovo Ideapad 1 AMD Ryzen 5 5625U (16GB RAM/5...,38999,4.1 out of 5 stars,2025-07-11


# Data Cleaning Process

In [15]:
# Load scraped data
df = pd.read_csv("amazon_data61.csv")

# Remove rows with missing names (likely ads or irrelevant blocks).
# pd.read_csv parses the literal "N/A" as NaN by default, and NaN != "N/A"
# evaluates to True, so a plain string comparison keeps those rows.
# Checking notna() as well makes the placeholder rows actually drop out.
# .copy() gives later cells an independent frame to add columns to.
df = df[df["Name"].notna() & (df["Name"] != "N/A")].copy()


In [16]:
# Preview the first rows after loading the CSV and filtering on Name
df.head()

Unnamed: 0,Name,Price,Reviews,Date
0,"HP 15, AMD Ryzen 3 7320U (8GB LPDDR5, 512GB SS...",30290,4.0 out of 5 stars,2025-07-11
1,HP 255 G10 ‎‎Laptop (AMD Athlon Silver 7120U/ ...,22990,3.9 out of 5 stars,2025-07-11
2,"Apple MacBook Air Laptop: Apple M1 chip, 13.3-...",56990,4.6 out of 5 stars,2025-07-11
3,Walker Best Student &Office Work Laptop|Thin &...,12490,3.4 out of 5 stars,2025-07-11
4,"HP Professional 15, 12th Gen Intel Core Celero...",22110,4.7 out of 5 stars,2025-07-11


In [17]:
# Pull the first number out of the review text (e.g. "4.0" from "4.0 out of 5 stars")
rating_matches = df["Reviews"].str.extract(r"(\d+(\.\d+)?)")
# Column 0 is the outer capture group, i.e. the whole number including decimals
df["Rating"] = rating_matches[0].astype(float)

In [18]:
# Preview the first rows to check the new Rating column
df.head()

Unnamed: 0,Name,Price,Reviews,Date,Rating
0,"HP 15, AMD Ryzen 3 7320U (8GB LPDDR5, 512GB SS...",30290,4.0 out of 5 stars,2025-07-11,4.0
1,HP 255 G10 ‎‎Laptop (AMD Athlon Silver 7120U/ ...,22990,3.9 out of 5 stars,2025-07-11,3.9
2,"Apple MacBook Air Laptop: Apple M1 chip, 13.3-...",56990,4.6 out of 5 stars,2025-07-11,4.6
3,Walker Best Student &Office Work Laptop|Thin &...,12490,3.4 out of 5 stars,2025-07-11,3.4
4,"HP Professional 15, 12th Gen Intel Core Celero...",22110,4.7 out of 5 stars,2025-07-11,4.7


In [19]:
# Drop old 'Reviews' column.
# Rebinding (instead of inplace=True) together with errors="ignore" lets this
# cell be re-run without raising KeyError once the column is already gone.
df = df.drop(columns=["Reviews"], errors="ignore")

In [20]:
# Preview the first rows after removing the Reviews column
df.head()

Unnamed: 0,Name,Price,Date,Rating
0,"HP 15, AMD Ryzen 3 7320U (8GB LPDDR5, 512GB SS...",30290,2025-07-11,4.0
1,HP 255 G10 ‎‎Laptop (AMD Athlon Silver 7120U/ ...,22990,2025-07-11,3.9
2,"Apple MacBook Air Laptop: Apple M1 chip, 13.3-...",56990,2025-07-11,4.6
3,Walker Best Student &Office Work Laptop|Thin &...,12490,2025-07-11,3.4
4,"HP Professional 15, 12th Gen Intel Core Celero...",22110,2025-07-11,4.7


In [21]:
# Parse prices as numbers; values that cannot be parsed become NaN
parsed_price = pd.to_numeric(df["Price"], errors="coerce")
df["Price"] = parsed_price

# Filter unwanted rows: keep prices of at least 1000 (NaN fails the comparison and is dropped)
price_ok = df["Price"].ge(1000)
df = df[price_ok]

In [22]:
# Write the cleaned frame (without the index column) and confirm on stdout
cleaned_csv = "cleaned_amazon_laptops62.csv"
df.to_csv(cleaned_csv, index=False)
print(f"Cleaned data saved to {cleaned_csv}")

Cleaned data saved to cleaned_amazon_laptops62.csv
