In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests


In [2]:
# Number of bestseller pages to scrape; the scraping loop requests pages 1..no_pages
no_pages = 2

In [3]:


# Define a function to retrieve data from a specific page on Amazon
def get_data(pageNo):
    """Scrape one page of the Amazon.in beauty bestsellers list.

    Args:
        pageNo: Page number; it is interpolated into the request URL.

    Returns:
        A list with one entry per product found on the page. Each entry is a
        list of four strings: ``[name, rating, users_rated, price]``. A missing
        rating is reported as ``'-1'``; a missing review count or price is
        reported as ``'0'``. Containers without a product name are skipped.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    # Set headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1"
    }

    url = ('https://www.amazon.in/gp/bestsellers/beauty/ref=zg_bs_pg_' + str(pageNo) +
           '?ie=UTF8&pg=' + str(pageNo))

    # A timeout keeps the notebook from hanging forever on a stalled connection
    r = requests.get(url, headers=headers, timeout=30)

    # Fail loudly on an error response instead of silently returning no rows
    r.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(r.content, 'html.parser')

    # NOTE(review): the '_cDEzb_...' class names look auto-generated; if the
    # scraper starts returning an empty list, re-check them against the live page.
    alls = []
    for d in soup.find_all('div', attrs={'class': '_cDEzb_iveVideoWrapper_JJ34T'}):
        name = d.find('div', attrs={'class': '_cDEzb_p13n-sc-css-line-clamp-3_g3dy1'})
        rating = d.find('span', attrs={'class': 'a-icon-alt'})
        users_rated = d.find('span', attrs={'class': 'a-size-small'})
        price = d.find('span', attrs={'class': '_cDEzb_p13n-sc-price_3mJ9Z'})

        # Without a name there is nothing to identify the product by
        if name is None:
            continue

        # Substitute a placeholder for every optional field so each row
        # always has exactly four entries
        alls.append([
            name.text,
            rating.text if rating is not None else '-1',
            users_rated.text if users_rated is not None else '0',
            price.text if price is not None else '0',
        ])

    # Return the list of product information for the current page
    return alls


In [21]:
# Quick check of the scraper: fetch page 1 and display the rows it returns
get_data(1)

[['The Derma Co 1% Hyaluronic Sunscreen Aqua Ultra Light Gel with SPF 50 PA++++ For Broad Spectrum, UV A, UV B & Blue Light Protection - 50g(dermaco)',
  '4.3 out of 5 stars',
  '14,645',
  '₹448.00'],
 ['NIVEA Nourishing Body Milk 600ml Body Lotion | 48 H Moisturization | With 2X Almond Oil | Smooth and Healthy Looking Skin |For Very Dry Skin',
  '4.4 out of 5 stars',
  '42,525',
  '₹337.00'],
 ['Aqualogica Glow+ Dewy Sunscreen SPF 50 PA++++ | UVA/B & Blue Light Protection for Men & Women | Oily, Dry, Sensitive & Combination Skin | Fragrance-Free | 50g',
  '4.3 out of 5 stars',
  '10,962',
  '₹394.00'],
 ['NIVEA Soft Light Moisturizer, 300 ml, for Face, Hand & Body, Non-Greasy Cream with Vitamin E & Jojoba Oil for Instant Hydration',
  '4.4 out of 5 stars',
  '58,262',
  '₹275.00'],
 ["Simple Kind To Skin Refreshing Facial Wash 150 ml | 100% Soap-Free Facewash that doesn't dry out your skin| For All Skin Types",
  '4.3 out of 5 stars',
  '15,766',
  '₹270.00'],
 ['Dettol Liquid Handwa

In [4]:
# Collect one list of product rows per scraped page (pages are numbered from 1)
results = []
for page in range(1, no_pages + 1):
    results.append(get_data(page))


def flatten(l):
    """Merge a list of lists into one flat list, keeping the original order."""
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat


# Build a single table holding the products from every page
df = pd.DataFrame(
    flatten(results),
    columns=['Product Name', 'Rating', 'Customers_Rated', 'Price'],
)

# Write the table to 'amazon_products.csv' without the index column
df.to_csv('amazon_products.csv', index=False, encoding='utf-8')


In [5]:
# Read the data from the CSV file 'amazon_products.csv' into a DataFrame.
# The three columns that are cleaned as text further down are pinned to str:
# left to dtype inference, pandas loads a column as numeric whenever every value
# happens to look like a number (for example when no review count contains a
# comma), and the string-based cleaning steps would then fail.
df = pd.read_csv(
    "amazon_products.csv",
    dtype={'Rating': str, 'Customers_Rated': str, 'Price': str},
)


In [6]:
# Preview the first five rows to check the columns that were loaded
df.head(5)


Unnamed: 0,Product Name,Rating,Customers_Rated,Price
0,The Derma Co 1% Hyaluronic Sunscreen Aqua Ultr...,4.3 out of 5 stars,14645,₹448.00
1,NIVEA Nourishing Body Milk 600ml Body Lotion |...,4.4 out of 5 stars,42525,₹337.00
2,Aqualogica Glow+ Dewy Sunscreen SPF 50 PA++++ ...,4.3 out of 5 stars,10962,₹394.00
3,"NIVEA Soft Light Moisturizer, 300 ml, for Face...",4.4 out of 5 stars,58262,₹275.00
4,Simple Kind To Skin Refreshing Facial Wash 150...,4.3 out of 5 stars,15766,₹270.00


In [7]:
# Show the DataFrame's dimensions as a (rows, columns) tuple
df.shape


(60, 4)

Data preprocessing: clean the `Rating`, `Customers_Rated`, and `Price` columns and convert them to numeric values.


In [8]:
# Keep only the leading number of each 'Rating' string (e.g. '4.3 out of 5 stars' -> '4.3').
# The vectorised .str accessor passes missing values through as NaN, whereas a
# per-element lambda would raise on them.
df['Rating'] = df['Rating'].str.split().str[0]


In [9]:
# Parse the cleaned 'Rating' strings into numbers
df['Rating'] = df['Rating'].pipe(pd.to_numeric)


In [10]:
# Remove the '₹' symbol from the 'Price' column in the DataFrame.
# regex=False makes this a literal replacement regardless of the pandas version
# (the default for `regex` differs between pandas 1.x and 2.x).
df["Price"] = df["Price"].str.replace('₹', '', regex=False)


In [11]:
# Remove thousands separators (commas) from the 'Price' column in the DataFrame.
# regex=False makes this a literal replacement regardless of the pandas version
# (the default for `regex` differs between pandas 1.x and 2.x).
df["Price"] = df["Price"].str.replace(',', '', regex=False)


In [12]:
# Keep only the part of each 'Price' string before the decimal point (e.g. '448.00' -> '448').
# The vectorised .str accessor passes missing values through as NaN, whereas a
# per-element lambda would raise on them.
df['Price'] = df['Price'].str.split('.').str[0]


In [13]:
# Cast the cleaned 'Price' strings to integers
df['Price'] = df['Price'].astype('int')


In [14]:
# Remove thousands separators (commas) from the 'Customers_Rated' column in the DataFrame.
# regex=False makes this a literal replacement regardless of the pandas version
# (the default for `regex` differs between pandas 1.x and 2.x).
df["Customers_Rated"] = df["Customers_Rated"].str.replace(',', '', regex=False)


In [15]:
# Parse the cleaned 'Customers_Rated' strings into numbers
df['Customers_Rated'] = df['Customers_Rated'].pipe(pd.to_numeric)


In [17]:
# Display up to the first 50 rows to check the columns after the conversions above
df.head(50)

Unnamed: 0,Product Name,Rating,Customers_Rated,Price
0,The Derma Co 1% Hyaluronic Sunscreen Aqua Ultr...,4.3,14645,448
1,NIVEA Nourishing Body Milk 600ml Body Lotion |...,4.4,42525,337
2,Aqualogica Glow+ Dewy Sunscreen SPF 50 PA++++ ...,4.3,10962,394
3,"NIVEA Soft Light Moisturizer, 300 ml, for Face...",4.4,58262,275
4,Simple Kind To Skin Refreshing Facial Wash 150...,4.3,15766,270
5,Dettol Liquid Handwash Refill – Skincare Hand ...,4.5,48330,192
6,Cetaphil Face Wash Gentle Skin Cleanser for Dr...,4.3,17474,375
7,Minimalist Sunscreen SPF 50 PA++++ | Clinicall...,4.1,17216,379
8,"Himalaya Purifying Neem Face Wash, 400 ml",4.3,72733,297
9,Parachute Advansed Soft Touch Body Lotion for ...,4.1,21793,115
