In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import random

In [4]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'}).text.strip()
        

    except AttributeError:
        title = ""

    return title

# Function to extract Product Price
def get_price(soup):

    try:
        price = soup.find("span", attrs={'class':'a-price-whole'}).text.strip()

    except AttributeError:
        price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find('span',attrs = {'class':'a-icon-alt'}).text.strip()
    
    except AttributeError:
        rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).text.strip()

    except AttributeError:
        review_count = ""	

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available

In [5]:
if __name__ == "__main__":
    # User-Agent Header
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "en-US, en;q=0.5",
    }

    # Base URL
    BASE_URL = "https://www.amazon.in/s?k=apple+laptop&i=electronics&rh=n%3A976419031%2Cp_123%3A110955&dc"

    # Initialize data storage
    data = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

    # Loop through pages
    for page in range(1, 6):  # Adjust the range to scrape more pages
        # Update the URL with the page number
        url = f"{BASE_URL}&page={page}"
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")

        # Fetch product links on the current page
        links = soup.find_all("a", attrs={"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
        links_list = [link.get("href") for link in links]

        # Loop through each product link and scrape details
        for link in links_list:
            product_url = "https://www.amazon.in" + link
            product_response = requests.get(product_url, headers=HEADERS)
            product_soup = BeautifulSoup(product_response.content, "html.parser")

            # Collect product details
            data["title"].append(get_title(product_soup))
            data["price"].append(get_price(product_soup))
            data["rating"].append(get_rating(product_soup))
            data["reviews"].append(get_review_count(product_soup))
            data["availability"].append(get_availability(product_soup))

    # Save data to a DataFrame
    df = pd.DataFrame(data)

    # Clean and save data
    df["title"].replace("", np.nan, inplace=True)
    df.dropna(subset=["title"], inplace=True)
    df.to_csv("apple_laptops.csv", index=False)

    print("Scraping completed. Data saved to 'apple_laptops.csv'.")

Scraping completed. Data saved to 'apple_laptops.csv'.


In [6]:
df.head()

Unnamed: 0,title,price,rating,reviews,availability
0,"Apple MacBook Air Laptop: Apple M1 chip, 13.3-...",56990.0,4.6 out of 5 stars,"4,607 ratings",In stock
1,Apple 2024 MacBook Pro Laptop with M4 chip wit...,169900.0,5.0 out of 5 stars,1 rating,In stock
2,"Apple 2024 MacBook Air (13-inch, Apple M3 chip...",114900.0,Previous page,,In stock
3,Apple 2024 MacBook Pro Laptop with M4 Max chip...,319900.0,Previous page,,In stock
4,Apple 2024 MacBook Air 15″ Laptop with M3 chip...,134900.0,Previous page,,In stock
