In [42]:
import json
import os
from time import sleep

import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Notebook-wide pandas display settings.
pd.set_option("display.max_columns", None)  # never hide columns
# None disables truncation of cell contents.  The legacy -1 sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.4f}".format)  # floats with 4 decimals
pd.set_option("display.max_info_rows", 200)

In [2]:
import sephora_scrape as scrape
%load_ext autoreload
%autoreload 2

In this notebook, we will scrape product information from Sephora and reviews from Bazaarvoice.

# Brand List

Upon checking, there is no API that can give us all the products from Sephora, so we need to scrape them from the website instead.  We can collect every product by going through the site brand by brand.  First, let's get a list of all brands and their corresponding URLs.

In [15]:
# Page that lists every brand sold on the site.
sephora_brands_url = "https://www.sephora.com/brands-list"

# Open that page with the project scraper and collect the brands; they are
# read back afterwards from `sephora.brand_list`.
sephora = scrape.Sephora()
sephora.open_url(sephora_brands_url)
sephora.get_brands()

Currently, there are 338 brands on Sephora:

In [19]:
# Number of scraped brands, plus the first ten entries as a quick sanity check.
len(sephora.brand_list), sephora.brand_list[:10]

(338,
 [{'brand_name': 'Acqua di Parma',
   'brand_link': 'https://www.sephora.com/brand/acqua-di-parma'},
  {'brand_name': 'adwoa beauty',
   'brand_link': 'https://www.sephora.com/brand/adwoa-beauty'},
  {'brand_name': 'AERIN',
   'brand_link': 'https://www.sephora.com/brand/aerin-perfume'},
  {'brand_name': 'Aether Beauty',
   'brand_link': 'https://www.sephora.com/brand/aether-beauty'},
  {'brand_name': 'Algenist',
   'brand_link': 'https://www.sephora.com/brand/algenist'},
  {'brand_name': 'Alpha-H',
   'brand_link': 'https://www.sephora.com/brand/alpha-h'},
  {'brand_name': 'alpyn beauty',
   'brand_link': 'https://www.sephora.com/brand/alpyn-beauty'},
  {'brand_name': 'ALTERNA Haircare',
   'brand_link': 'https://www.sephora.com/brand/alterna'},
  {'brand_name': 'amika', 'brand_link': 'https://www.sephora.com/brand/amika'},
  {'brand_name': 'AMOREPACIFIC',
   'brand_link': 'https://www.sephora.com/brand/amorepacific'}])

In [17]:
# Persist the scraped brands as a CSV checkpoint (no index column).
df_brands = pd.DataFrame(data=sephora.brand_list)
df_brands.to_csv(path_or_buf="brands.csv", index=False)

# The brand scraper's browser is not needed past this point.
sephora.browser.quit()

In [3]:
# Reload the brand list from the CSV checkpoint and preview the first rows.
df_brands = pd.read_csv("brands.csv")
df_brands.head()

Unnamed: 0,brand_name,brand_link
0,Acqua di Parma,https://www.sephora.com/brand/acqua-di-parma
1,adwoa beauty,https://www.sephora.com/brand/adwoa-beauty
2,AERIN,https://www.sephora.com/brand/aerin-perfume
3,Aether Beauty,https://www.sephora.com/brand/aether-beauty
4,Algenist,https://www.sephora.com/brand/algenist


# Product List & Detailed Information

After getting the brands and the corresponding URLs, we can move on to scraping the products under each brand.  There is a script tag at the bottom of the page source that contains the products in JSON format. If that doesn't work, we can fall back to extracting the products from the `a` tags in the page HTML.

In [9]:
# Scrape the product listing of every brand into `df_products`.
# A CSV checkpoint is rewritten after each brand so that an interrupted run
# keeps everything collected so far.
df_products = pd.DataFrame()
product_frames = []  # one DataFrame per non-empty listing page

brand = scrape.Sephora()
brand_urls = df_brands.brand_link.unique()

try:
    for brand_url in tqdm(brand_urls):

        # Listing pages are requested 300 items at a time, at most 9 pages
        # per brand.
        for page in range(1, 10):

            full_url = brand_url + "?pageSize=300"
            if page > 1:
                full_url += "&currentPage=" + str(page)

            brand.open_url(full_url)  # open url
            brand.get_productsJSON()  # get products from text/json
            if len(brand.product_list) == 0:
                # nothing found in the JSON: fall back to the page's a tags
                brand.get_productsHTML()

            if len(brand.product_list) > 0:
                product_frames.append(pd.DataFrame(brand.product_list))

            if len(brand.product_list) < 300:
                # a short page is the last page of this brand
                break

        # Build the combined frame once per brand instead of growing it with
        # DataFrame.append on every page (quadratic, and removed in pandas 2).
        if product_frames:
            df_products = pd.concat(product_frames, ignore_index=True)
        df_products.to_csv("product_list.csv", index=False)
finally:
    # Always release the browser, even if the scrape fails or is interrupted.
    brand.browser.quit()

# Drop the loop temporaries from the notebook namespace.
del brand, brand_urls, full_url, brand_url, page, product_frames

HBox(children=(IntProgress(value=0, max=338), HTML(value='')))




In [138]:
# Reload the product listing from its CSV checkpoint and display it.
df_products = pd.read_csv("product_list.csv")
df_products

Unnamed: 0,brand_name,product_id,product_sku,product_name,product_price_low,product_price_high,product_rating,product_image,product_link
0,8Greens,P437988,2162444,8G Dietary Supplement,14.0000,78.0000,4.4219,https://www.sephora.com/productimages/sku/s2162444-main-Lhero.jpg,https://www.sephora.com/product/8greens-dietary-supplement-P437988
1,8Greens,P452918,2298891,8Greens Gummies Dietary Supplement,45.0000,45.0000,4.2000,https://www.sephora.com/productimages/sku/s2298891-main-Lhero.jpg,https://www.sephora.com/product/8greens-8greens-gummies-dietary-supplement-P452918
2,8Greens,P455912,2321461,8Greens Skin Tablet,16.0000,16.0000,3.8333,https://www.sephora.com/productimages/sku/s2321461-main-Lhero.jpg,https://www.sephora.com/product/8greens-8greens-skin-tablet-P455912
3,AERIN,P388762,1738582,Ikat Jasmine,30.0000,180.0000,4.4211,https://www.sephora.com/productimages/sku/s1738582-main-Lhero.jpg,https://www.sephora.com/product/ikat-jasmine-P388762
4,AERIN,P388764,1639210,Gardenia Rattan,130.0000,130.0000,4.1667,https://www.sephora.com/productimages/sku/s1639210-main-Lhero.jpg,https://www.sephora.com/product/gardenia-rattan-P388764
...,...,...,...,...,...,...,...,...,...
8988,trèStiQue,P416573,1901289,Magic Mattifying Balm & Blotting Sheet Duo,28.0000,28.0000,3.8889,https://www.sephora.com/productimages/sku/s1901289-main-Lhero.jpg,https://www.sephora.com/product/magic-mattifying-balm-blotting-sheet-duo-P416573
8989,trèStiQue,P429026,2041945,Mini Plumping Lip Balm,12.0000,12.0000,2.9307,https://www.sephora.com/productimages/sku/s2041945-main-Lhero.jpg,https://www.sephora.com/product/mini-plumping-lip-balm-P429026
8990,trèStiQue,P429276,2060762,Sugar Polish + Plumping Balm,24.0000,24.0000,3.2727,https://www.sephora.com/productimages/sku/s2060762-main-Lhero.jpg,https://www.sephora.com/product/sugar-polish-plumping-balm-P429276
8991,trèStiQue,P431762,2084358,Prime + Color Lip Glaze,25.0000,25.0000,3.6140,https://www.sephora.com/productimages/sku/s2084358-main-Lhero.jpg,https://www.sephora.com/product/prime-color-lip-glaze-P431762


In [4]:
# Output folder handed to `get_product_info` when saving product JSON.
# NOTE(review): absolute, machine-specific path; adjust before running on
# another machine.
json_folder = "/Users/valmadrid/DataScienceBootcamp/Projects/Gift Recommendation/Gift-Recommendation/web_scraping/json/"
# makedirs also creates missing parent folders and, with exist_ok=True, is a
# no-op when the folder is already there.
os.makedirs(json_folder, exist_ok=True)

In [5]:
# Output folder handed to `get_image` when saving product images.
# NOTE(review): absolute, machine-specific path; adjust before running on
# another machine.
images_folder = "/Users/valmadrid/DataScienceBootcamp/Projects/Gift Recommendation/Gift-Recommendation/web_scraping/images/"
# makedirs also creates missing parent folders and, with exist_ok=True, is a
# no-op when the folder is already there.
os.makedirs(images_folder, exist_ok=True)

In [164]:
# Visit every product page and save its detail JSON and its image.
product = scrape.Sephora()

try:
    for i in tqdm(range(len(df_products))):

        # Columns of df_products: product_link, product_id, product_sku.
        product.open_url(df_products.product_link.iloc[i])  # open url
        product.get_product_info(df_products.product_id.iloc[i], json_folder)  # save json
        product.get_image(df_products.product_sku.iloc[i], images_folder)  # save image
        sleep(2)  # pause two seconds between products
finally:
    # Always release the browser, even if the scrape fails or is interrupted.
    product.browser.quit()

del product

# Product Categories & List

In [142]:
# Category landing pages to scrape.  The order must match `categories_name`,
# because the two lists are zipped together when scraping.
# URLs carry no query string here: the scraping loop appends
# "?pageSize=300" itself, so a trailing "?" would yield "??pageSize=300".
categories_urls = [
    "https://www.sephora.com/beauty/new-beauty-products",
    "https://www.sephora.com/shop/gifts-for-her",
    "https://www.sephora.com/shop/gifts-for-men",
    "https://www.sephora.com/shop/gifts-for-them",
    "https://www.sephora.com/shop/gifts-for-teenage-girls",
    "https://www.sephora.com/shop/gift-sets-for-men",
    "https://www.sephora.com/shop/travel-size-toiletries",
    "https://www.sephora.com/shop/value-sets",
    "https://www.sephora.com/shop/editors-picks-gifts",
    "https://www.sephora.com/shop/luxury-gifts",
    "https://www.sephora.com/shop/exclusive-products",
    "https://www.sephora.com/shop/mens-perfume",
    "https://www.sephora.com/shop/mens-facial-products",
    "https://www.sephora.com/shop/mens-grooming",
    "https://www.sephora.com/shop/mens-hair-care",
    "https://www.sephora.com/shop/mens-personal-care",
    "https://www.sephora.com/beauty/new-cologne-for-men",
    "https://www.sephora.com/beauty/best-selling-mens-products",
]

In [172]:
# Short keys for the scraped categories, listed in the same order as
# `categories_urls` (the two lists are paired positionally).
categories_name = [
    "just_arrived",
    "gifts_her",
    "gifts_him",
    "gifts_them",
    "gifts_teens",
    "gift_sets_him",
    "travel_size",
    "gift_sets",
    "editors_picks",
    "luxury_gifts",
    "sephora_exclusives",
    "perfume_him",
    "facial_him",
    "grooming_him",
    "hair_him",
    "personal_care_him",
    "just_arrived_him",
    "best_seller_him",
]

In [157]:
# Build `categories`: category key -> list of product ids found on that
# category's listing pages.  The mapping is rewritten to categories.json
# after each category so an interrupted run keeps its progress.
categories = {}

category = scrape.Sephora()

try:
    # zip() has no length, so give tqdm the total explicitly to get a real
    # progress bar instead of an indeterminate one.
    for url, cat in tqdm(zip(categories_urls, categories_name),
                         total=min(len(categories_urls), len(categories_name))):

        # A trailing "?" would otherwise produce "??pageSize=300".
        base_url = url.rstrip("?")
        products = []

        # Listing pages are requested 300 items at a time, at most 9 pages
        # per category.
        for page in range(1, 10):

            full_url = base_url + "?pageSize=300"
            if page > 1:
                full_url += "&currentPage=" + str(page)

            category.open_url(full_url)  # open url
            category.get_productsJSON()  # get products from text/json
            if len(category.product_list) == 0:
                # nothing found in the JSON: fall back to the page's a tags
                category.get_productsHTML()

            products.extend([item["product_id"] for item in category.product_list])

            if len(category.product_list) < 300:
                # a short page is the last page of this category
                break

        categories[cat] = products
        with open("categories.json", "w") as outfile:
            json.dump(categories, outfile, indent=2)
finally:
    # Always release the browser, even if the scrape fails or is interrupted.
    category.browser.quit()

del category

[autoreload of sephora_scrape failed: Traceback (most recent call last):
  File "/Applications/anaconda3/envs/learn-env/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/Applications/anaconda3/envs/learn-env/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 450, in superreload
    update_generic(old_obj, new_obj)
  File "/Applications/anaconda3/envs/learn-env/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 387, in update_generic
    update(a, b)
  File "/Applications/anaconda3/envs/learn-env/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 357, in update_class
    update_instances(old, new)
  File "/Applications/anaconda3/envs/learn-env/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 312, in update_instances
    update_instances(old, new, obj.__dict__, visited)
  File "/Applications/anaconda3/envs/learn-env/lib/python3.6/site

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Product Stats & Reviews

In [152]:
# Output folder handed to `get_reviews` when scraping reviews.
# NOTE(review): absolute, machine-specific path; adjust before running on
# another machine.
reviews_folder = "/Users/valmadrid/DataScienceBootcamp/Projects/Gift Recommendation/Gift-Recommendation/web_scraping/reviews/"
# makedirs also creates missing parent folders and, with exist_ok=True, is a
# no-op when the folder is already there.
os.makedirs(reviews_folder, exist_ok=True)

In [165]:
# Fetch the reviews of every distinct product id in df_products.
reviews = scrape.Sephora()

product_ids = df_products.product_id.unique()

try:
    for product_id in tqdm(product_ids):
        reviews.get_reviews(product_id, reviews_folder)
finally:
    # This run is long and may be interrupted by hand; release the browser
    # in every case so none is left running.
    reviews.browser.quit()

del reviews, product_ids

HBox(children=(IntProgress(value=0, max=8993), HTML(value='')))

KeyboardInterrupt: 