In [1]:
# Import libraries

from bs4 import BeautifulSoup
import re
import pandas as pd
import requests

import smtplib

import datetime
import time

# Capture the current local date and print it beneath the cell
today = datetime.date.today()
print(today)

2024-08-13


In [2]:
# Connect to Website

# Search-results page for the keyword "sunscreen" on amazon.sg
URL = 'https://www.amazon.sg/s?k=sunscreen&s=exact-aware-popularity-rank&crid=26XGF0ETVPQUI&qid=1723367023&sprefix=sun%2Caps%2C833&ref=sr_st_exact-aware-popularity-rank&ds=v1%3A3pDT%2BAG86jcKv9fBxlUHlsFORcPucnxPmCLjzaZBAa4'

# Browser-like request headers. Accept-Language is a comma-separated list whose
# entries may carry a `q=` weight; the earlier value 'en-US en;=0.5' had neither
# the comma nor the `q`, so it was not a valid header value.
headers = ({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36", 'Accept-Language': 'en-US, en;q=0.5'})

# Without a timeout requests waits indefinitely for an unresponsive server
page = requests.get(URL, headers=headers, timeout=30)

# Parse the raw response body, then re-parse its prettified form
soup = BeautifulSoup(page.content, "html.parser")
soup2 = BeautifulSoup(soup.prettify(), "html.parser")

In [3]:
# Functions to scrape the Title, Price, Rating, Reviews, and Monthly Sales of a Product on the Product Page

def get_title(soup):
    """Return the product title found in a parsed product page.

    Looks up the ``<span id="productTitle">`` element and returns its text
    with surrounding whitespace removed. If the lookup or the text access
    raises ``AttributeError`` (for example because no such element exists),
    an empty string is returned instead.
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        # Typically means find() returned None
        return ""
    

def get_price(soup):
    """Return the product price as a float, or "N/A" when none can be read.

    Reads the text of the first element with class ``a-offscreen`` and pulls
    the first number out of it. The number is located with a regular
    expression rather than by slicing off a fixed two-character prefix, so
    the result does not depend on the length of the currency symbol, and
    thousands separators ("1,234.56") are accepted.

    Returns:
        float: the parsed price.
        str: the literal "N/A" if the element is missing or holds no number.
    """
    price_element = soup.find(class_='a-offscreen')
    if not price_element:
        return "N/A"

    # Digits with optional thousands commas and decimals, or a bare ".99"
    number = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', price_element.text)
    if number is None:
        return "N/A"

    return float(number.group().replace(',', ''))

def get_rating(soup):
    """Return the star rating shown on a parsed product page.

    Takes the text of the first element with class ``a-icon-alt``, keeps the
    part before ' out of 5 stars' and strips surrounding whitespace. When the
    text does not contain that phrase, the whole stripped text is returned.
    Returns the string "N/A" if no such element is found.
    """
    star_label = soup.find(class_='a-icon-alt')
    if not star_label:
        return "N/A"

    # partition() yields the text before the first separator, or the whole
    # text when the separator is absent
    leading_text, _, _ = star_label.text.partition(' out of 5 stars')
    return leading_text.strip()
    
def get_reviews(soup):
    """Return the number of customer ratings shown on a parsed product page.

    Reads the text of ``<span id="acrCustomerReviewText">`` and returns the
    part that precedes the word "rating"/"ratings", stripped of surrounding
    whitespace. Splitting on ' rating' (no trailing "s") covers both the
    plural "460 ratings" and the singular "1 rating"; splitting on
    ' ratings' alone left the singular text unchanged.

    Returns:
        str: the count as displayed (e.g. "460" or "1,234"), or "N/A" when
        the element is missing.
    """
    reviews_element = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
    if not reviews_element:
        return "N/A"

    return reviews_element.text.split(' rating')[0].strip()

def get_sales(soup):
    """Return the sales blurb shown on a parsed product page.

    Reads the first element matched by the class string
    'a-size-small social-proofing-faceout-title-text' and returns its text
    with surrounding whitespace removed, consistent with the other
    extractors in this cell. Returns the string "N/A" when no such element
    is found.
    """
    sales = soup.find(class_='a-size-small social-proofing-faceout-title-text')
    if not sales:
        return "N/A"

    return sales.text.strip()

In [4]:
# Base URL of the site (without the page number)
base_url = 'https://www.amazon.sg/s?k=sunscreen&s=exact-aware-popularity-rank&crid=26XGF0ETVPQUI&qid=1723484528&sprefix=sun%2Caps%2C833&ref=sr_pg_'

# Header used to get request to scrape the webpage.
# Accept-Language is a comma-separated list with optional `q=` weights; the
# earlier value 'en-US en;=0.5' was missing both the comma and the `q`.
headers = ({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36", 'Accept-Language': 'en-US, en;q=0.5'})

# Define the number of pages you want to scrape
num_pages = 7

# Seconds to wait for a response; without a timeout requests can block forever
request_timeout = 30

# Create a dictionary to store product details
d = {'title': [], 'price': [], 'rating': [], 'reviews': [], 'monthly_sales': [], 'URL': []}

# Loop through each page.
# The loop variable is `page_number` rather than `page` so it does not
# overwrite the `page` response object created in the connection cell.
for page_number in range(1, num_pages + 1):
    # NOTE(review): the base URL only varies the `ref=sr_pg_N` tag, which looks
    # like a tracking label rather than a page selector, so the results page is
    # also requested explicitly with `page=N` - confirm against the live site.
    url = f'{base_url}{page_number}&page={page_number}'

    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        # A network failure on one results page should not discard the rows
        # already collected from earlier pages
        print(f'Skipping results page {page_number}: {error}')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    links = soup.find_all('a', attrs={'class': 'a-link-normal s-no-outline'})

    for link in links:
        href = link.get('href')
        if not href:
            # Anchor without an href: nothing to follow, and concatenating
            # None to a string would raise TypeError
            continue

        product_url = "https://www.amazon.sg" + href

        try:
            new_product = requests.get(product_url, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            print(f'Skipping product {product_url}: {error}')
            time.sleep(2)
            continue

        product_soup = BeautifulSoup(new_product.content, "html.parser")

        d['title'].append(get_title(product_soup))
        d['price'].append(get_price(product_soup))
        d['rating'].append(get_rating(product_soup))
        d['reviews'].append(get_reviews(product_soup))
        d['monthly_sales'].append(get_sales(product_soup))
        d['URL'].append(product_url)

        # Pause between product requests
        time.sleep(2)

In [5]:
# Create DataFrame
# Each key of `d` becomes a column; the last five rows are shown as a preview
amazon_df = pd.DataFrame(data=d)
amazon_df.tail(5)

Unnamed: 0,title,price,rating,reviews,monthly_sales,URL
331,,,,,,https://www.amazon.sg/Thinkbaby-Sunscreen-Resi...
332,,,,,,https://www.amazon.sg/Banana-Sunscreen-Lotion-...
333,MDSolarSciences Mineral Moisture Defense SPF 5...,62.68,4.4,460.0,,https://www.amazon.sg/MDSolarSciences-Mineral-...
334,"Cetaphil Sun Kids SPF50+ Liposomal Lotion, 150ml",,4.7,142.0,,https://www.amazon.sg/Cetaphil-SPF50-Liposomal...
335,,,,,,https://www.amazon.sg/Banana-Sport-Ultra-Sunsc...


In [6]:
import csv

# Creating the csv file
# pandas opens, writes and closes the file itself when given a path. The
# earlier surrounding `with open(..., 'w')` held a second, unused handle on
# the same file while pandas opened it again.
amazon_df.to_csv('AmazonWebScraper_SunscreenSingapore.csv', index=False, encoding='utf-8')

PermissionError: [Errno 13] Permission denied: 'AmazonWebScraper_SunscreenSingapore.csv'

In [None]:
# Appending data to the csv
# Iterating over a DataFrame yields its column labels, so passing the frame to
# csv.writer.writerow() appended one row of column names instead of the data.
# to_csv in append mode writes every data row; header=False avoids repeating
# the column names inside the existing file.
amazon_df.to_csv('AmazonWebScraper_SunscreenSingapore.csv', mode='a', header=False, index=False, encoding='utf-8')