In [1]:
import requests
import time
from bs4 import BeautifulSoup 
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import csv

In [25]:
# Configure and launch the shared Chrome session. `driver` is a module-level
# global that the scraping helpers defined in later cells use to load pages.
options = Options()
options.add_argument('--disable-gpu')
# Location of the chromedriver binary that is passed to webdriver.Chrome.
DRIVER_PATH = 'chromedriver'
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object — confirm the installed Selenium version before changing it.
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

# Helpers to scrape list of item page URLs from product list page

In [28]:
# start_url is the URL of the first product-listing page of the category to scrape.
# It is used both to find out how many listing pages there are in total and as the
# template for the other pages' URLs (only the trailing page number changes).
start_url = "https://www.sephora.com/shop/moisturizing-cream-oils-mists?pageSize=300&currentPage=1"

def find_last_page_number(starting_url):
    """Return the number shown on the last pagination button of a listing page.

    The page at ``starting_url`` is loaded in the shared Selenium ``driver`` so
    that its JavaScript runs, scrolled to the bottom, and the rendered HTML is
    then parsed with BeautifulSoup.

    Raises IndexError when no button carries the expected CSS class, and
    ValueError when the last button's text is not an integer.
    """
    # This class name changes between product listing pages. If this function
    # fails, updating this value is most probably all that is needed.
    pagination_button_class = 'css-ck6e0v eanm77i0'

    driver.get(starting_url)
    # Jump to the bottom of the page before reading the rendered source.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    rendered_page = BeautifulSoup(driver.page_source, "html.parser")
    pagination_buttons = rendered_page.find_all("button", class_=pagination_button_class)
    return int(pagination_buttons[-1].text)

def get_product_urls(page_url):
    """Return the absolute URLs of the products shown on one listing page.

    ``page_url`` is the URL of a single product-listing page (such as
    ``start_url``). The page is loaded in the shared Selenium ``driver`` and
    scrolled slowly from top to bottom so that the lazy-loaded products are
    rendered before the HTML is parsed.
    """
    def scroll_through(start, stop):
        # Move down in 5-pixel steps so lazy loading is triggered on the way.
        for offset in range(start, stop, 5):
            driver.execute_script("window.scrollTo(0, {});".format(offset))

    driver.get(page_url)

    initial_height = int(driver.execute_script("return document.body.scrollHeight"))
    scroll_through(1, initial_height)
    # The page grows while products load, so a second pass covers the part
    # that was added; this accounts for the last few products.
    grown_height = int(driver.execute_script("return document.body.scrollHeight"))
    scroll_through(initial_height, grown_height)

    # With everything loaded, the links can be read from the rendered HTML.
    listing = BeautifulSoup(driver.page_source, "html.parser")
    urls = []
    for group in listing.find_all("div", class_="css-dkxsdo"):
        # Only the direct children of a group are individual product tiles.
        for tile in group.findChildren("div", class_="css-12egk0t", recursive=False):
            links = tile.findChildren("a", recursive=True)
            if links:
                # The first link inside a tile points at the product page.
                urls.append("https://www.sephora.com" + links[0]["href"])
    return urls

def get_all_product_urls(start_url):
    """Return the URLs of all products across every listing page of a category.

    This wrapper is typically the only call needed to collect the individual
    product URLs of one category (e.g. moisturizers). It asks
    find_last_page_number() how many listing pages exist and concatenates the
    results of get_product_urls() for pages 1..last, in page order.

    ``start_url`` is the URL of a listing page. The URL of page N is derived
    by replacing the number after the last ``currentPage=`` in ``start_url``,
    wherever that parameter sits in the query string and however many digits
    it has. If ``start_url`` has no ``currentPage=`` parameter, the previous
    behaviour is kept: the final character of the URL is replaced by N.
    """
    page_param = "currentPage="
    # Split once, outside the loop: everything before the parameter, and
    # whatever follows the old page number (e.g. "&pageSize=300").
    before_param, has_param, after_param = start_url.rpartition(page_param)
    after_page_number = after_param.lstrip("0123456789")

    last_page_number = find_last_page_number(start_url)
    result = []
    for page in range(1, last_page_number + 1):
        if has_param:
            current_url = before_param + page_param + str(page) + after_page_number
        else:
            # Legacy fallback: assume the URL ends in a single-digit page number.
            current_url = start_url[:-1] + str(page)
        # extend() avoids re-copying the accumulated list for every page.
        result.extend(get_product_urls(current_url))
    return result

In [29]:
# Collect the URL of every product in the category. This drives the browser
# through each listing page, so it is the slow step of the notebook.
all_urls = get_all_product_urls(start_url)
# Number of product URLs collected.
len(all_urls)

706

# Helpers to scrape each individual product page for desired product features.

In [143]:
def get_product_soup(product_url):
    """Load ``product_url`` in the shared driver and return it as a BeautifulSoup document.

    The page source is taken from Selenium after the page has been requested
    and is parsed with the "lxml" parser.
    """
    driver.get(product_url)
    rendered_html = driver.page_source
    return BeautifulSoup(rendered_html, "lxml")

def get_brand_name(doc):
    """Return the brand name shown on a product page.

    ``doc`` is the BeautifulSoup document of a product page. The brand is the
    text of the first link carrying the brand CSS class.

    Returns an empty string when the link is missing or the lookup fails.
    Only ordinary errors are suppressed: KeyboardInterrupt and SystemExit
    propagate, so a long scraping run can still be interrupted.
    """
    brand_css = "css-nc375s e65zztl0"
    try:
        brand_link = doc.find('a', class_=brand_css)
        if brand_link is None:
            return ""
        return brand_link.text
    except Exception:
        return ""

def scrape_about_product_section(doc):
    """Return the bold (<b>) elements inside the "about the product" section.

    ``doc`` is the BeautifulSoup document of a product page. The bold elements
    are the field headings that the other get_* helpers look for.

    Returns an empty list when the section is missing or the lookup fails.
    Only ordinary errors are suppressed: KeyboardInterrupt and SystemExit
    propagate.
    """
    about_product_css = "css-cnj3lw eanm77i0"
    try:
        about_product_container = doc.find("div", class_=about_product_css)
        if about_product_container is None:
            return []
        return about_product_container.find_all("b")
    except Exception:
        return []

        
    
def get_skin_type(doc):
    """Return the "Skin Type:" description from the "about the product" section.

    ``doc`` is the BeautifulSoup document of a product page. Fields may be
    left blank on a page, so every bold heading is checked; the value is the
    text that directly follows the first heading reading exactly "Skin Type:",
    with surrounding whitespace removed.

    Returns an empty string when the field is absent, when the heading is not
    followed by text, or on error. KeyboardInterrupt and SystemExit propagate.
    """
    try:
        for field in scrape_about_product_section(doc):
            if field.text == "Skin Type:":
                value = field.next_sibling
                # The value is expected to be a text node (a str subclass);
                # anything else (nothing, or another tag) means "not given".
                if isinstance(value, str):
                    return value.strip()
                return ""
        return ""
    except Exception:
        return ""

    
def get_skincare_concerns(doc):
    """Return the "Skincare Concerns:" description from the "about the product" section.

    ``doc`` is the BeautifulSoup document of a product page. The value is the
    text that directly follows the first bold heading reading exactly
    "Skincare Concerns:", with surrounding whitespace removed.

    Returns an empty string when the field is absent, when the heading is not
    followed by text, or on error. KeyboardInterrupt and SystemExit propagate.
    """
    try:
        for field in scrape_about_product_section(doc):
            if field.text == "Skincare Concerns:":
                value = field.next_sibling
                # The value is expected to be a text node (a str subclass);
                # anything else (nothing, or another tag) means "not given".
                if isinstance(value, str):
                    return value.strip()
                return ""
        return ""
    except Exception:
        return ""

def get_is_vegan(doc):
    """Return True if the product's "Ingredient Callouts:" field says it is vegan.

    ``doc`` is the BeautifulSoup document of a product page. Only the first
    bold heading reading exactly "Ingredient Callouts:" is consulted; the
    product counts as vegan when the text that follows it contains
    "is vegan" (case-sensitive).

    Returns False when the field is absent, does not mention it, or on error.
    KeyboardInterrupt and SystemExit propagate.
    """
    try:
        for field in scrape_about_product_section(doc):
            if field.text == "Ingredient Callouts:":
                callouts = field.next_sibling
                # A heading that is not followed by text carries no callouts.
                return isinstance(callouts, str) and "is vegan" in callouts
        return False
    except Exception:
        return False
    
def get_is_cruelty_free(doc):
    """Return True if the product's "Ingredient Callouts:" field says it is cruelty-free.

    ``doc`` is the BeautifulSoup document of a product page. Only the first
    bold heading reading exactly "Ingredient Callouts:" is consulted; the
    product counts as cruelty-free when the text that follows it contains
    "cruelty-free" (case-sensitive).

    Returns False when the field is absent, does not mention it, or on error.
    KeyboardInterrupt and SystemExit propagate.
    """
    try:
        for field in scrape_about_product_section(doc):
            if field.text == "Ingredient Callouts:":
                callouts = field.next_sibling
                # A heading that is not followed by text carries no callouts.
                return isinstance(callouts, str) and "cruelty-free" in callouts
        return False
    except Exception:
        return False

    
def get_award_wins(doc):
    """Return the number of award wins listed on a product page (0 by default).

    ``doc`` is the BeautifulSoup document of a product page. Inside the
    "about the product" section the awards are listed as plain text, one per
    line, after a line reading exactly "Award Wins:". Lines are separated by
    <br> tags, e.g.

        Award Wins:<br>some award 1<br>some award 2<br>  ==> 2 awards

    Starting at the <br> that directly follows the heading, every <br> that is
    followed by some text counts as one award. Counting stops at the first
    <br> followed by a <b> element, which is the heading of the next
    subsection. A <br> followed by another <br> or by nothing at all (blank
    line, end of the section) is not an award and does not discard the awards
    counted so far.

    Returns 0 when the section or the heading is missing, or on error.
    KeyboardInterrupt and SystemExit propagate.
    """
    about_product_css = "css-cnj3lw eanm77i0"
    try:
        about_product_container = doc.find("div", class_=about_product_css)
        if about_product_container is None:
            return 0
        br_elems = about_product_container.find_all("br")
        for idx, br in enumerate(br_elems):
            # The heading is the text node directly BEFORE this <br>.
            is_heading = br.previous_sibling == "Award Wins:"
            if not is_heading:
                continue
            total_awards = 0
            for award_br in br_elems[idx:]:
                follower = award_br.next_sibling
                # Whitespace-only text between tags is not content; look past it.
                while isinstance(follower, str) and not follower.strip():
                    follower = follower.next_sibling
                if follower is None:
                    # Nothing after this <br>: no award on this line.
                    continue
                if isinstance(follower, str):
                    total_awards += 1
                elif follower.name == "b":
                    # Reached the heading of the next subsection.
                    break
                elif follower.name != "br" and follower.get_text().strip():
                    # Award text wrapped in an inline tag (e.g. a link).
                    total_awards += 1
            return total_awards
        return 0
    except Exception:
        return 0

def get_is_clean_at_sephora(doc):
    """Return True if the product page is marked "Clean at Sephora".

    ``doc`` is the BeautifulSoup document of a product page. The product is
    considered marked when any bold heading in the "about the product"
    section reads exactly "Clean at Sephora".

    Returns False otherwise or on error. KeyboardInterrupt and SystemExit
    propagate.
    """
    try:
        return any(
            field.text == "Clean at Sephora"
            for field in scrape_about_product_section(doc)
        )
    except Exception:
        return False

def get_clinical_results(doc):
    """Return the text of the "Clinical Results:" subsection of a product page.

    ``doc`` is the BeautifulSoup document of a product page. Starting after
    the first bold heading reading exactly "Clinical Results:", every line of
    text is collected (each followed by a single space) until the heading of
    the next subsection, i.e. a <b> element at the start of a line (directly
    after a <br>). The section also ends when the enclosing element ends, so
    results that are the last subsection are returned rather than lost. Text
    inside inline tags (e.g. links or italics) is included.

    The collected text is returned with surrounding whitespace removed.
    Returns an empty string when no results are specified or on error.
    KeyboardInterrupt and SystemExit propagate.
    """
    try:
        for field in scrape_about_product_section(doc):
            if field.text != "Clinical Results:":
                continue
            results = ""
            # True while only <br>s (and whitespace) were seen since the last
            # piece of content, i.e. the next node starts a new line.
            at_line_start = False
            node = field.next_sibling
            while node is not None:
                if isinstance(node, str):
                    # Text nodes are str subclasses; keep them verbatim.
                    results += node + " "
                    if node.strip():
                        at_line_start = False
                elif node.name == "br":
                    at_line_start = True
                elif node.name == "b" and at_line_start:
                    # Heading of the next subsection: the results end here.
                    break
                else:
                    results += node.get_text() + " "
                    at_line_start = False
                node = node.next_sibling
            return results.strip()
        return ""
    except Exception:
        return ""

In [33]:
# Parse the first scraped product page so the helpers can be tried on it.
bs = get_product_soup(all_urls[0])
# NOTE(review): displaying the whole document dumps the full page HTML into
# the cell output — consider showing only a small part of it instead.
bs

<html class="css-mao3d8 no-touch" lang="en"><head data-comp="Head "><style data-vjs-version="7.11.6" id="bc-style-vjs" type="text/css">@charset "UTF-8";.video-js .vjs-big-play-button .vjs-icon-placeholder:before,.video-js .vjs-modal-dialog,.vjs-button>.vjs-icon-placeholder:before,.vjs-modal-dialog .vjs-modal-dialog-content{position:absolute;top:0;left:0;width:100%;height:100%}.video-js .vjs-big-play-button .vjs-icon-placeholder:before,.vjs-button>.vjs-icon-placeholder:before{text-align:center}@font-face{font-family:VideoJS;src:url(data:application/font-woff;charset=utf-8;base64,d09GRgABAAAAABDkAAsAAAAAG6gAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAABHU1VCAAABCAAAADsAAABUIIslek9TLzIAAAFEAAAAPgAAAFZRiV3hY21hcAAAAYQAAADaAAADPv749/pnbHlmAAACYAAAC3AAABHQZg6OcWhlYWQAAA3QAAAAKwAAADYZw251aGhlYQAADfwAAAAdAAAAJA+RCLFobXR4AAAOHAAAABMAAACM744AAGxvY2EAAA4wAAAASAAAAEhF6kqubWF4cAAADngAAAAfAAAAIAE0AIFuYW1lAAAOmAAAASUAAAIK1cf1oHBvc3QAAA/AAAABJAAAAdPExYuNeJxjYGRgYOBiMGCwY2BycfMJYeDLSSzJY5BiYGGAAJA8MpsxJzM9kYEDxgPKsYB

In [144]:
# Try get_clinical_results on the product page parsed into `bs`.
get_clinical_results(bs)

'In an independent consumer-testing panel of 50 women ages 25 to 55: - 100% agreed their skin felt replenished and nourished  - 97% agreed their skin felt moisturized - 97% agreed their skin texture felt soft and smooth'