In [263]:
import requests
import time
from bs4 import BeautifulSoup 
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import csv

In [25]:
# Configure the Chrome session; the scraping helpers in this notebook all drive this one `driver`.
options = Options()
# Pass Chrome's --disable-gpu switch to the browser.
options.add_argument('--disable-gpu')
# Name/path of the chromedriver binary handed to Selenium
# (a bare name is presumably resolved through PATH — TODO confirm).
DRIVER_PATH = 'chromedriver'
# NOTE(review): `executable_path` is the Selenium 3 style keyword; newer Selenium releases
# expect a Service object instead — confirm against the installed Selenium version.
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

# Helpers to scrape the list of product page URLs from a product listing page

In [28]:
# First we need to determine how many listing pages there are in total.
# start_url is the URL of the first product-listing page; get_all_product_urls
# varies its `currentPage` value to visit the remaining pages.
start_url = "https://www.sephora.com/shop/moisturizing-cream-oils-mists?pageSize=300&currentPage=1"

def find_last_page_number(starting_url):
    """Return the number printed on the last pagination button of a listing page.

    The page at ``starting_url`` is opened in the shared Selenium ``driver`` (so that
    JavaScript-rendered markup is available), scrolled to the bottom, and the rendered
    HTML is then parsed with BeautifulSoup.

    The pagination CSS class changes sometimes across different product listing pages;
    if this function fails, that class most probably just needs updating.
    """
    driver.get(starting_url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    rendered_page = BeautifulSoup(driver.page_source, "html.parser")
    pagination_buttons = rendered_page.find_all("button", class_='css-ck6e0v eanm77i0')
    # the final pagination button carries the highest page number
    return int(pagination_buttons[-1].text)

def get_product_urls(page_url):
    """Return the absolute URLs of all products linked from one listing page.

    ``page_url`` is the URL of a specific product-listing page (such as ``start_url``).
    The page is opened in the shared Selenium ``driver`` and slow-scrolled to the bottom
    so that lazily loaded products are rendered before the HTML is parsed.
    """
    def page_height():
        return int(driver.execute_script("return document.body.scrollHeight"))

    def slow_scroll(start, stop):
        # walk down the page in 5px steps
        for offset in range(start, stop, 5):
            driver.execute_script("window.scrollTo(0, {});".format(offset))

    driver.get(page_url)
    first_height = page_height()
    slow_scroll(1, first_height)
    # the height is measured again and a second pass covers the remainder,
    # to account for the last few products
    slow_scroll(first_height, page_height())

    # with all products loaded, the links can be read straight from the markup
    listing = BeautifulSoup(driver.page_source, "html.parser")
    urls = []
    for group in listing.find_all("div", class_="css-dkxsdo"):
        # each group holds the individual product tiles as direct children
        for tile in group.findChildren("div", class_="css-12egk0t", recursive=False):
            links = tile.findChildren("a", recursive=True)
            if links:
                urls.append("https://www.sephora.com" + links[0]["href"])
    return urls

def get_all_product_urls(start_url):
    """Return the product URLs found on every page of one listing category.

    Combines find_last_page_number and get_product_urls: the number of pages is read
    from ``start_url``, then each page is visited in turn. This wrapper is typically the
    only function that has to be called to collect all individual product URLs for a
    category (e.g. moisturizers).

    Args:
        start_url: URL of a product-listing page. Its ``currentPage`` query parameter is
            rewritten for each page; if the parameter is absent it is appended.

    Returns:
        A list of product URLs, in page order.
    """
    import re  # local import: only this helper needs it

    # Matches the currentPage parameter wherever it sits in the query string, with any
    # number of digits, instead of assuming it is the final single character of the URL.
    page_param = re.compile(r"([?&]currentPage=)\d*")

    last_page_number = find_last_page_number(start_url)
    result = []
    for page in range(1, last_page_number + 1):
        if page_param.search(start_url):
            current_url = page_param.sub(
                lambda match: match.group(1) + str(page), start_url, count=1
            )
        else:
            separator = "&" if "?" in start_url else "?"
            current_url = "{}{}currentPage={}".format(start_url, separator, page)
        # extend in place rather than rebuilding the list on every page
        result.extend(get_product_urls(current_url))
    return result

In [29]:
# Scrape the URLs of every product in the category that start_url points at.
all_urls = get_all_product_urls(start_url)
# Show how many product URLs were collected.
len(all_urls)

706

# Helpers to scrape each individual product page for desired product features.

In [271]:
def get_product_doc(product_url):
    """Load a product page and return it as a BeautifulSoup document.

    The page is requested through the shared Selenium ``driver`` and its page source
    is parsed with the "lxml" parser.
    """
    driver.get(product_url)
    page_html = driver.page_source
    return BeautifulSoup(page_html, "lxml")

def get_brand_name(doc):
    """Return the brand name shown on a product page.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The text of the brand link, or "" when the link is missing or parsing fails.
    """
    brand_css = "css-nc375s e65zztl0"
    try:
        brand_link = doc.find('a', class_=brand_css)
        return brand_link.text if brand_link is not None else ""
    except Exception:
        # Best-effort field: parsing problems yield "", but KeyboardInterrupt and
        # SystemExit are no longer swallowed, so a long scrape can still be stopped.
        return ""

def scrape_about_product_section(doc):
    """Return the bold elements inside the "about the product" section.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        All b tags found anywhere inside the section's container, or an empty list
        when the container is missing or parsing fails.
    """
    about_product_css = "css-cnj3lw eanm77i0"
    try:
        about_product_container = doc.find("div", class_=about_product_css)
        if about_product_container is None:
            return []
        return about_product_container.find_all("b")
    except Exception:
        # Best-effort: parsing problems yield no fields, but KeyboardInterrupt and
        # SystemExit still propagate.
        return []

        
    
def _about_field_text(doc, label):
    """Return the stripped text following the bold heading ``label`` in "about the product".

    Some fields may be left blank on a product page, so every bold heading returned by
    scrape_about_product_section is checked in turn; only the first heading whose text
    equals ``label`` exactly is used.

    Returns "" when the heading is absent, when it is not followed by a text node,
    or when parsing fails.
    """
    try:
        for heading in scrape_about_product_section(doc):
            if heading.text == label:
                value = heading.next_sibling
                # only a text node carries the field's value; a tag or a missing
                # sibling means there is nothing usable to return
                return value.strip() if isinstance(value, str) else ""
        return ""
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return ""


def get_skin_type(doc):
    """Return the "Skin Type:" description from the "about the product" section.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The description text, or "" if the field isn't available or on error.
    """
    return _about_field_text(doc, "Skin Type:")


def get_skincare_concerns(doc):
    """Return the "Skincare Concerns:" description from the "about the product" section.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The description text, or "" if the field isn't available or on error.
    """
    return _about_field_text(doc, "Skincare Concerns:")


def get_is_vegan(doc):
    """Return True if the "Ingredient Callouts:" field says the product is vegan.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        True when the field's text contains "is vegan"; False when it does not,
        when the field is absent, or on error.
    """
    return "is vegan" in _about_field_text(doc, "Ingredient Callouts:")


def get_is_cruelty_free(doc):
    """Return True if the "Ingredient Callouts:" field says the product is cruelty-free.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        True when the field's text contains "cruelty-free"; False when it does not,
        when the field is absent, or on error.
    """
    return "cruelty-free" in _about_field_text(doc, "Ingredient Callouts:")

    
def get_award_wins(doc):
    """Return the number of award wins listed in the "about the product" section.

    The section's br tags are scanned for one whose previous sibling is the text
    "Award Wins:". From that br onward, every br counts as one award until a br is
    followed by a b tag (the start of the next subsection) or by nothing at all
    (the end of the container).
    e.g. "Award Wins:" br "award 1" br "award 2" br  ====> 2 awards

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The number of awards as an int; 0 when the section or the "Award Wins:"
        heading is missing, or on error.
    """
    about_product_css = "css-cnj3lw eanm77i0"
    try:
        about_product_container = doc.find("div", class_=about_product_css)
        if about_product_container is None:
            return 0
        br_elems = about_product_container.find_all("br")
        for idx, br in enumerate(br_elems):
            if br.previous_sibling == "Award Wins:":
                total_awards = 0
                # the heading's br is the last br in the section: nothing is listed after it
                if idx + 1 == len(br_elems):
                    return total_awards
                for entry_br in br_elems[idx:]:
                    follower = entry_br.next_sibling
                    # A missing follower is the end of the container and a b tag is the
                    # next subsection; either way the awards counted so far are kept
                    # instead of being lost to an AttributeError.
                    if follower is None or getattr(follower, "name", None) == "b":
                        return total_awards
                    total_awards += 1
                return total_awards
        return 0
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return 0

def get_is_clean_at_sephora(doc):
    """Return True if the product page carries a "Clean at Sephora" bold heading.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        True when one of the bold elements of the "about the product" section has
        exactly the text "Clean at Sephora"; otherwise False (also on error).
    """
    try:
        return any(
            bold.text == "Clean at Sephora"
            for bold in scrape_about_product_section(doc)
        )
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return False

def get_clinical_results(doc):
    """Return the text of the "Clinical Results:" subsection of "about the product".

    Starting at the bold "Clinical Results:" heading, the following sibling nodes are
    collected until the next b tag (the start of another subsection) or the end of the
    container. br tags are skipped; the remaining pieces are joined with single spaces.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The collected text with surrounding whitespace stripped, or "" if the
        subsection is absent or on error.
    """
    try:
        for heading in scrape_about_product_section(doc):
            if heading.text != "Clinical Results:":
                continue
            pieces = []
            for node in heading.next_siblings:
                node_name = getattr(node, "name", None)
                if node_name == "b":
                    # a new bold heading marks the end of the clinical results
                    break
                if node_name == "br":
                    continue
                # text nodes are used as they are; any other tag contributes its text,
                # so inline markup no longer discards the whole section
                pieces.append(node if isinstance(node, str) else node.get_text())
            # reaching the end of the container also ends the section, so results that
            # are the last subsection on the page are kept
            return " ".join(pieces).strip()
        return ""
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return ""

def get_ingredients(doc):
    """Return the entire text of the ingredients section.

    Can be used to derive other features, such as whether the product contains alcohol.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        All text inside the first div nested in the ingredients container, or "" if
        the page does not describe ingredients or parsing fails.
    """
    ingredients_css = "css-1ue8dmw eanm77i0"
    try:
        ingredients_section = doc.find("div", class_=ingredients_css)
        if ingredients_section is None:
            return ""
        ingredients_container = ingredients_section.find("div")
        if ingredients_container is None:
            return ""
        # scrape all the text in this section
        return ingredients_container.get_text()
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return ""

def get_price(doc):
    """Return the price of the product as a float.

    The price is read from the bold element inside the price container. Surrounding
    whitespace is ignored, the first character (the currency symbol) is dropped, and
    thousands separators are removed before conversion.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The price, or 0.0 when it is missing or cannot be parsed.
    """
    price_css = "css-1lzahen"
    try:
        price_container = doc.find("span", class_=price_css)
        if price_container is None:
            return 0.0
        price_label = price_container.find("b")
        if price_label is None:
            return 0.0
        # "$1,250.00" -> "1250.00"; without removing the comma, float() would reject it
        amount = price_label.text.strip()[1:].replace(",", "")
        return float(amount)
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return 0.0

def get_highlights(doc):
    """Return all product highlights as one comma-separated string.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The text of every highlight element joined with "," (no trailing comma), or
        "" if the page shows no highlights or parsing fails.
    """
    highlight_css = "css-aiipho eanm77i0"
    try:
        highlights_elems = doc.find_all("div", class_=highlight_css)
        return ",".join(h.get_text() for h in highlights_elems)
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return ""

def _count_marketing_assets(product_url, thumbnail_css):
    """Count the sidebar assets whose entry contains a div with class ``thumbnail_css``.

    The product page is fully reloaded, the "see all" button is clicked, and the page
    source is parsed again, because clicking the button changes the document.

    Returns 0 when the button or the assets container cannot be found, or on any
    other scraping error.
    """
    try:
        # need to first fully refresh the product page
        driver.get(product_url)
        # for some reason this element only works with xpath
        see_all_button_xpath = "/html/body/div[1]/div[2]/div/main/div/div[1]/div[2]/div[1]/div[2]/button"
        # find_element(By.XPATH, ...) is used rather than the find_element_by_xpath
        # shortcut, which newer Selenium releases no longer provide
        see_all_button = driver.find_element(By.XPATH, see_all_button_xpath)
        # the button has to be clicked before the assets can be scraped
        see_all_button.click()
        assets_container_css = "css-1gxowto eanm77i0"
        refreshed_doc = BeautifulSoup(driver.page_source, "lxml")
        assets_container = refreshed_doc.find("div", class_=assets_container_css)
        if assets_container is None:
            return 0
        # only direct children of the container are individual assets
        return sum(
            1
            for asset in assets_container.find_all(recursive=False)
            if asset.find("div", class_=thumbnail_css) is not None
        )
    except Exception:
        # Best-effort: a missing button or stale page yields 0, but KeyboardInterrupt
        # and SystemExit propagate so a page load can still be interrupted.
        return 0


def get_image_count(product_url, doc):
    """Return the number of images shown in the product's sidebar (marketing assets).

    Args:
        product_url: URL of the product page; the page is reloaded from it.
        doc: Unused; kept so existing callers keep working.

    Returns:
        The number of image assets, or 0 if none are available.
    """
    return _count_marketing_assets(product_url, "css-19cgo3r eanm77i0")


def get_video_count(product_url, doc):
    """Return the number of videos shown in the product's sidebar (marketing assets).

    Args:
        product_url: URL of the product page; the page is reloaded from it.
        doc: Unused; kept so existing callers keep working.

    Returns:
        The number of video assets, or 0 if none are available.
    """
    return _count_marketing_assets(product_url, "css-eizhj7 eanm77i0")

def get_review_count(doc):
    """Return the total number of reviews for this product as an int.

    NOTE: this should probably NOT be used in a final model, since this attribute won't
    exist for actual new products whose ratings we hope to predict.

    Counts written with a trailing "K" (e.g. "5.3K") are expanded to thousands.
    Surrounding whitespace and thousands separators are ignored.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The review count, or 0 when it is missing or cannot be parsed.
    """
    try:
        review_count_css = "css-1coslxg"
        review_count_elem = doc.find("span", class_=review_count_css)
        if review_count_elem is None:
            return 0
        review_count_text = review_count_elem.text.strip().replace(",", "")
        if review_count_text.upper().endswith("K"):
            # round instead of truncating, so binary floating-point error in the
            # multiplication cannot drop the count by one
            return int(round(float(review_count_text[:-1]) * 1000))
        return int(review_count_text)
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return 0
    
    
def get_num_sizing_options(doc):
    """Return the number of different sizing options available for the product.

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The number of swatch-group divs inside the sizing container, or 1 when the
        container is missing or parsing fails.
    """
    try:
        sizing_container_css = "css-1npgsxx e65zztl0"
        sizing_container = doc.find("div", class_=sizing_container_css)
        if sizing_container is None:
            return 1
        # NOTE(review): the attribute value is matched with its trailing space exactly
        # as before — presumably that is how the site emits it; confirm before "fixing".
        sizing_options = sizing_container.find_all('div', attrs={'data-comp' : "SwatchGroup "})
        return len(sizing_options)
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return 1

def get_overall_ratings(doc):
    """Return the overall rating (the dependent variable) as a float.

    The rating is read from the ratings container's aria-label, which is usually shown
    as "X stars"; the number before the first space is returned. No rounding is done
    here (the site's value is presumably already rounded to the nearest 0.5 — TODO confirm).

    Args:
        doc: BeautifulSoup document of a product page.

    Returns:
        The rating, or 0.0 when it is missing or cannot be parsed.
    """
    try:
        ratings_css = "css-jp4jy6"
        ratings_container = doc.find("div", class_=ratings_css)
        if ratings_container is None:
            return 0.0
        ratings_text = ratings_container['aria-label']
        # keep only the leading number of "X stars"
        return float(ratings_text.split(" ")[0])
    except Exception:
        # Best-effort field, but KeyboardInterrupt/SystemExit are not swallowed.
        return 0.0

def generate_data_row(product_url):
    """Scrape one product page into a single-row DataFrame.

    Wrapper that combines all of the helpers above: the page is fetched once with
    get_product_doc, every independent and dependent feature is extracted from it,
    and the image and video counts are gathered last from ``product_url`` itself.
    """
    page = get_product_doc(product_url)
    # dict values are evaluated top to bottom, so the helpers run in the listed order
    features = {
        "product_url": product_url,
        "overall_ratings": get_overall_ratings(page),
        "brand": get_brand_name(page),
        "skin_type": get_skin_type(page),
        "skincare_concerns": get_skincare_concerns(page),
        "is_vegan": get_is_vegan(page),
        "is_cruelty_free": get_is_cruelty_free(page),
        "award_wins": get_award_wins(page),
        "is_clean_product": get_is_clean_at_sephora(page),
        "cleanical_results": get_clinical_results(page),
        "ingredients": get_ingredients(page),
        "price": get_price(page),
        "highlights": get_highlights(page),
        "review_count": get_review_count(page),
        "sizing_options_count": get_num_sizing_options(page),
        "image_count": get_image_count(product_url, page),
        "video_count": get_video_count(product_url, page),
    }
    return pd.DataFrame(data=features, index=[0])

In [273]:
# Build the feature row for the first collected product URL as a trial run.
sample_row = generate_data_row(all_urls[0])

In [278]:
# Display the single-row DataFrame as the cell output.
sample_row

Unnamed: 0,product_url,overall_ratings,brand,skin_type,skincare_concerns,is_vegan,is_cruelty_free,award_wins,is_clean_product,cleanical_results,ingredients,price,highlights,review_count,sizing_options_count,image_count,video_count
0,https://www.sephora.com/product/protini-tm-polypeptide-cream-P427421?icid2=products grid:p427421,4.0,Drunk Elephant,"Normal, Dry, Combination, and Oily","Dryness, Dullness and Uneven Texture, and Loss of Firmness and Elasticity",True,True,4,True,In an independent consumer-testing panel of 50 women ages 25 to 55: - 100% agreed their skin felt replenished and nourished - 97% agreed their skin felt moisturized - 97% agreed their skin texture felt soft and smooth,"-Signal Peptide Complex (Growth Factors): Nine signal peptides (made up of aminoacids, which form proteins in the skin) bind moisture to skin—visibly plumping, firming,and restoring bounce.-Pygmy Waterlily Stem Cell Extract: An excellent source of antioxidants and nutrients to replenish the skin; extremely moisturizing, soothing, and calming; and effective in supporting healthy-, youthful-looking skin.-Soybean Folic Acid Ferment Extract: This B vitamin plays a central role in maintaining youthful-, elastic-looking skin. Skin is thirsty for folic acid, especially after exposure to sun.Water/Aqua/Eau, Dicaprylyl Carbonate, Glycerin, Cetearyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Sclerocarya Birrea Seed Oil, Bacillus/Soybean/ Folic Acid Ferment Extract, Nymphaea Alba Root Extract, sh-Oligopeptide-1, sh-Oligopeptide-2, sh-Polypeptide-1, sh-Polypeptide-9, sh-Polypeptide-11, Copper Palmitoyl Heptapeptide-14, Heptapeptide-15 Palmitate, Palmitoyl Tetrapeptide-7, Palmitoyl Tripeptide-1, Alanine, Arginine, Glycine, Histidine, Isoleucine, Phenylalanine, Proline, Serine, Threonine, Valine, Acetyl Glutamine, Coconut Alkanes, Coco-Caprylate/Caprate, Sodium Hyaluronate, Aspartic Acid, Linoleic Acid, Linolenic Acid, Lecithin, Butylene Glycol, Polyvinyl Alcohol, Sodium Lactate, Sodium PCA, PCA, Sorbitan Isostearate, Carbomer, Polysorbate 20, Polysorbate 60, Lactic Acid/Glycolic Acid Copolymer, Hydroxyethyl Acrylate/Sodium Acryloyldimethyl Taurate Copolymer, Xanthan Gum, Isomalt, 1,2-Hexanediol, Caprylyl Glycol, 
Chlorphenesin, Phenoxyethanol, Tocopherol, Sodium Benzoate, Phenylpropanol, Glyceryl Caprylate, Symphytum Officinale Callus Culture Extract.Clean at Sephora products are formulated without:Sulfates—SLS + SLES, Parabens, Formaldehydes, Formaldehyde-releasing agents, Phthalates, Mineral Oil, Retinyl Palmitate, Oxybenzone, Coal Tar, Hydroquinone, Triclosan, Triclocarban, Undisclosed synthetic fragrances (Products can be formulated with disclosed synthetic fragrances that meet the following two criteria: (1) the synthetic fragrances do not include any of the ingredients listed in numbers 1 through 12 above and (2) the synthetic fragrances are at a concentration below 1% of the total formula) The following type of acrylates: (ethyl acrylate, ethyl methacrylate, methyl methacrylate, butyl methacrylate, hydroxypropyl methacrylate, tetrahydrofurfuryl methacrylate, trimethylolpropane trimethacrylate , aluminum salts), Animal Oils/Musks/Fats, Benzophenone + Related Compounds, Butoxyethanol, Carbon Black, Lead/Lead Acetate, Methyl Cellosolve + Methoxyethanol, Methylchloroisothiazolinone & Methylisothiazolinone, Mercury + Mercury Compounds (Thimerisol), Insoluble Plastic Microbeads (This prohibited ingredient applies to products that are meant to be rinsed off ), Resorcinol, Talc (Talc that is free of any asbestos can be used in the formulation provided that Brand conducts testing to ensure that talc is free of any asbestos.), Toluene, Butylated hydroxyanisole (BHA), Butylated hydroxytoluene (BHT) that is 0.1% or more of total formula, Ethanolamines DEA/TEA/MEA/ETA, Nanoparticles as defined by the European Commission, Petrolatum and Parrafin that is not USP grade, Phenoxyethanol that is 1% or more of total formulation, Polyacrylamide & Acrylamide, The following types of Styrene (Bromostyrene, Deastyrene/acrylates/dvbcopolymer, sodium styrene/divinylbenzene copolymer , styrene oxide, styrene), 1,4 Dioxane in final formulas must comply with the thresholds as follows: (10 or < 
ppm for products that are meant to be rinsed off, wiped off or removed, 3ppm or < for products that are meant to remain on the skin).",68.0,"Good for: Loss of firmness,Good for: Dullness/Uneven Texture,Good for: Anti-Aging,Good for: Dryness,Clean at Sephora,Community Favorite",5300,3,8,1


# Now we can begin scraping all products.

### First scrape all moisturizers.

In [280]:
# Listing page for the moisturizer category (the same URL as start_url above).
moisturizers_start_url = "https://www.sephora.com/shop/moisturizing-cream-oils-mists?pageSize=300&currentPage=1"
# Collect every moisturizer product URL across all listing pages
# (this runs the full listing scrape again).
all_moisturizer_urls = get_all_product_urls(moisturizers_start_url)

706

In [None]:
# Scrape every moisturizer product page and stack the single-row frames into one table.
# The rows are gathered in a list and concatenated once: the earlier loop passed the
# pandas module itself to pd.concat (instead of the frame being built), which raises,
# and growing a DataFrame inside a loop copies all previous rows on every iteration.
moisturizer_rows = [generate_data_row(u) for u in all_moisturizer_urls]
# pd.concat rejects an empty list, so fall back to an empty frame when nothing was scraped.
df = pd.concat(moisturizer_rows, ignore_index=True) if moisturizer_rows else pd.DataFrame()