<h1>
Web Scraping for Sephora</h1>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re
import random

<h2>
Input the url of a product and get all info</h2>

<h3>
Function 1: get general information of a product </h3>

In [2]:
def get_general(product_page):
    """Scrape the headline facts of one Sephora product page.

    Parameters
    ----------
    product_page : str
        Path that is appended to 'https://www.sephora.com'; it must contain
        an id of the form ``P<digits>``.

    Returns
    -------
    list
        ``[product_id, brand_name, product_name, category, subcategory,
        loves, price]``; the scraped fields are returned as page text.

    Raises
    ------
    IndexError, AttributeError, TypeError
        When the page does not have the expected scripts/tags or the path
        contains no ``P<digits>`` id (nothing is caught here).
    """
    page = requests.get('https://www.sephora.com' + product_page)
    soup = BeautifulSoup(page.content, 'lxml')

    # category: search the sixth <script> tag for "displayName" entries; the
    # last match is taken as the category, the one before it as subcategory
    script_text = soup.find_all('script')[5].get_text()
    display_names = re.findall(r'"displayName":"([a-zA-Z &]+)"', script_text)
    category, subcategory = display_names[-1], display_names[-2]

    # product general info: the first two <span>s of the name heading
    heading = soup.find('h1', {'data-comp': "DisplayName Flex Box"})
    spans = heading.find_all('span')
    brand_name = spans[0].get_text()
    product_name = spans[1].get_text()
    loves = soup.find('span', {'data-at': "product_love_count"}).get_text()
    # first token of the price box: the original price rather than the discounted one
    price = soup.find('div', {'data-comp': "Price Box"}).get_text().split()[0]

    # the product id is embedded in the path itself
    product_id = re.search(r'P\d+', product_page)[0]
    return [product_id, brand_name, product_name, category, subcategory, loves, price]

<h3>
Function 2: get general stats for all of a product's reviews
</h3>

In [4]:
def get_review_stats(result_json):
    """Extract the summary review statistics from a decoded reviews response.

    Parameters
    ----------
    result_json : dict
        Decoded JSON response. The statistics are read from the *last* entry
        of ``result_json['Includes']['Products']`` (its 'ReviewStatistics').

    Returns
    -------
    list
        ``[total_review, avg_rating, recommended_count, onestar_count,
        twostar_count, threestar_count, fourstar_count, fivestar_count]``.
        Star ratings absent from 'RatingDistribution' count as 0. If the
        expected structure is missing or malformed, all eight values are 0.
    """
    try:
        products = list(result_json['Includes']['Products'].values())
        review_stats = products[-1]['ReviewStatistics']  # the conclusive review stats
        total_review = review_stats['TotalReviewCount']
        avg_rating = review_stats['AverageOverallRating']
        recommended_count = review_stats['RecommendedCount']
        # Each distribution entry is read positionally: first value is the
        # star rating, second value is how many reviews gave that rating.
        star_counts = {}
        for item in review_stats['RatingDistribution']:
            values = list(item.values())
            star_counts[values[0]] = values[1]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Only structural problems mean "no statistics"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return [0] * 8
    return [total_review, avg_rating, recommended_count,
            *(star_counts.get(star, 0) for star in range(1, 6))]

<h3>
Function 3: get detailed information of a single review
</h3>

In [3]:
def get_reviews(url):
    """Download one page of reviews and return the details of each review.

    Parameters
    ----------
    url : str
        Reviews URL whose JSON response has a 'Results' list.

    Returns
    -------
    dict or None
        ``{review_id: [rating, title, review_text, level, helpfulness,
        positive_count, negative_count, recommended]}`` for every entry of
        'Results', or ``None`` when the HTTP request itself fails.
        ``level`` is the ``biTier=<letters>`` fragment of the reviewer's
        'sociallockup' field, or ``None`` when that field is absent or does
        not contain it.

    Raises
    ------
    KeyError
        If the response lacks 'Results' or a review lacks a required field.
    """
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException:
        # network-level failure only; programming errors and Ctrl-C propagate
        return None
    result_json = json.loads(response.content.decode('utf-8'))

    output = {}
    for review_content in result_json['Results']:
        try:  # membership level (None when missing or not matching)
            level_string = review_content['AdditionalFields']['sociallockup']['Value']
            level = re.search(r'biTier=[a-zA-Z]+', level_string)[0]
        except (KeyError, TypeError):
            # KeyError: field missing; TypeError: value is None or no regex match
            level = None
        output[review_content['Id']] = [
            review_content['Rating'],
            review_content['Title'],
            review_content['ReviewText'],
            level,
            review_content['Helpfulness'],
            review_content['TotalPositiveFeedbackCount'],
            review_content['TotalNegativeFeedbackCount'],
            review_content['IsRecommended'],
        ]
    return output

<h3>
Function 4: go to bazaarvoice and get all the review details and stats
</h3>

In [5]:
def bazaarvoice(product_id):
    """Fetch the review statistics and every review of one product.

    Parameters
    ----------
    product_id : str
        Product id inserted into the Bazaarvoice ``ProductId`` filter.

    Returns
    -------
    tuple
        ``(review_stats, all_reviews)`` where ``review_stats`` is the list
        produced by ``get_review_stats`` and ``all_reviews`` is a dict merged
        from ``get_reviews`` over every page of 100 reviews. Pages whose
        request failed are skipped instead of aborting the whole product.
    """
    # NOTE(review): the API passkey is hard-coded in the URL; consider moving
    # it to configuration.
    def page_url(offset):
        # One URL template for both the stats request and the paging loop.
        return f"https://api.bazaarvoice.com/data/reviews.json?Filter=contentlocale%3Aen*&Filter=ProductId%3A{product_id}&Sort=TotalPositiveFeedbackCount%3Adesc&Limit=100&Offset={offset}&Include=Products%2CComments&Stats=Reviews&passkey=rwbw526r2e7spptqd2qzbkp7&apiversion=5.4&Locale=en_US"

    response = requests.get(page_url(0))
    result_json = json.loads(response.content.decode('utf-8'))
    review_stats = get_review_stats(result_json)
    total_review = review_stats[0]  # total number of reviews

    all_reviews = dict()
    for offset in range(0, total_review, 100):  # loop over all review pages
        page_reviews = get_reviews(page_url(offset))
        if page_reviews is None:
            # get_reviews returns None on a failed request; dict.update(None)
            # would raise TypeError
            continue
        all_reviews.update(page_reviews)
    return review_stats, all_reviews

<h3>
Function 5: finalize to one function and get all the related info of a product
</h3>

In [6]:
def one_product_final(product_page):
    """Collect everything scraped for one product into a single flat row.

    Parameters
    ----------
    product_page : str
        Product path, passed unchanged to ``get_general``.

    Returns
    -------
    list
        The items returned by ``get_general``, followed by the items of the
        review statistics from ``bazaarvoice``, followed by one final element:
        the dict of all reviews.
    """
    general = get_general(product_page)
    # the first general field is the product id used for the review lookup
    review_stats, all_reviews = bazaarvoice(general[0])
    return [*general, *review_stats, all_reviews]

<h2>
Web Scraping for all allure products</h2>

<h3>
STEP 1: get a dictionary of all the urls of allure products
</h3>

In [4]:
def get_url(url):
    """Map every product on an Allure listing page to its product URL.

    NOTE: later cells of this notebook define other functions with this same
    name (taking a second argument); once run, they replace this one.

    Parameters
    ----------
    url : str
        Address of the listing page.

    Returns
    -------
    dict or None
        ``{'<brand name>/<product name>': target_url}``, or ``None`` when the
        response status is not 200 or the page cannot be parsed as expected.
    """
    page = requests.get(url)
    if page.status_code != 200:
        return None
    try:
        soup = BeautifulSoup(page.content, 'lxml')
        link_json = soup.find('script', {'data-comp': "PageJSON", 'id': "linkJSON", 'type': "text/json"}).text
        # the product list sits at a fixed position inside the embedded page JSON
        skus = json.loads(link_json)[3]['props']['items'][1]['skus']
        return {
            entry['sku']['brandName'] + '/' + entry['sku']['productName']: entry['sku']['targetUrl']
            for entry in skus
        }
    except:
        return None

<h3>
STEP 2: get all allure products info
</h3>

In [None]:
#Don't run this!!!!!!!
# Scrapes every product of every Allure "Best of Beauty" category listing and
# appends one row per product (see one_product_final) to all_allure.
categories = ['makeup','skin-care','fragrance','hair','bath-body','tools-brushes']
all_allure = list()
for category in categories:
    category_url = 'https://www.sephora.com/beauty/allure-best-of-beauty-' + category
    url_dic = get_url(category_url)
    if url_dic is None:
        # get_url returns None on a non-200 response or a parse failure;
        # skip the category instead of crashing on None.values()
        print(category, 'listing could not be fetched or parsed - skipped')
        continue
    l = list(url_dic.values())
    n = len(l)
    for i, item in enumerate(l):
        print(category, i,'/',n)
        all_allure.append(one_product_final(item))

<h3>
STEP 3: transform the data into df and write into csv</h3>

In [None]:
# One row per scraped Allure product; the last column holds the dict of reviews.
all_allure_df = pd.DataFrame(all_allure, columns = ['product_id', 'brand_name', 'product_name', 'category', 'subcategory', 'loves', 'price', 'total_review', 'avg_rating', 'recommended_count', 'onestar_count', 'twostar_count', 'threestar_count', 'fourstar_count', 'fivestar_count', 'all_reviews'])
# Remove duplicates by product_id, keeping the first copy of each product.
# A whole-row comparison would have to hash the dict-valued all_reviews column,
# and keep=False would throw away every copy of a repeated product.
all_allure_df = all_allure_df.drop_duplicates(subset='product_id').reset_index(drop=True)
all_allure_df.to_csv('all_allure.csv',encoding='utf-8',index=False)

<h2>
Web Scraping for all non-allure products</h2>

<h3>
STEP 1: get the amounts of non-allure products needed to scrape for each category
</h3>

In [None]:
# Work on the Allure table under a shorter name (same object, not a copy).
df = all_allure_df
# NOTE(review): `c` is not referenced by any later cell shown in this notebook.
c = df.groupby('category')
# category name -> {sub-category URL slug: number of Allure products};
# filled in by the per-category cells below.
allure_amounts = dict()

In [None]:
# makeup: count the Allure products per sub-category (largest first) and
# normalise the labels to lower-case, '&'-free, hyphenated form.
data1 = df[df['category'] == 'Makeup']
data2 = data1.groupby('subcategory').size().sort_values(ascending=False)
data2.index = data2.index.str.lower()
data2.index = data2.index.str.replace(' &', '')
data2.index = data2.index.str.replace(' ', '-')
makeup = dict(data2)
# fold the palettes count into 'face'
# NOTE(review): raises KeyError if either label is absent from the data.
makeup['face'] += makeup.pop('makeup-palettes')
# The computed dict is replaced by this hand-written mapping, keyed by URL
# slug (each key is later appended to 'https://www.sephora.com/shop').
makeup = {'/eye-makeup': 37, '/face-makeup': 33, '/lips-makeup': 25, '/makeup-applicators': 7, '/cheek-makeup': 4, '/nails-makeup': 1, '/makeup-accessories': 1}
allure_amounts['Makeup'] = makeup

In [None]:
# skincare: count the Allure products per sub-category (largest first),
# with lower-cased labels.
data1 = df[df['category'] == 'Skincare']
data2 = data1.groupby('subcategory').size().sort_values(ascending=False)
data2.index = data2.index.str.lower()
skincare = dict(data2)
# The computed dict is replaced by this hand-written mapping, keyed by URL
# slug (each key is later appended to 'https://www.sephora.com/shop').
skincare = {'/moisturizing-cream-oils-mists': 18,
 '/face-mask': 14,
 '/facial-treatments': 12,
 '/eye-treatment-dark-circle-treatment': 8,
 '/cleanser': 6,
 '/self-tanning-products': 4,
 '/sunscreen-sun-protection': 2,
 '/skin-care-tools': 1}
allure_amounts['Skincare'] = skincare

In [None]:
# hair: count the Allure products per sub-category (largest first),
# with lower-cased labels.
data1 = df[df['category'] == 'Hair']
data2 = data1.groupby('subcategory').size().sort_values(ascending=False)
data2.index = data2.index.str.lower()
hair = dict(data2)
# The computed dict is replaced by this hand-written mapping, keyed by URL
# slug (each key is later appended to 'https://www.sephora.com/shop').
hair = {'/hair-products-treatments': 17, '/shampoo-conditioner': 13, '/hair-styling-tools': 5}
allure_amounts['Hair'] = hair

In [None]:
# fragrance: count the Allure products per sub-category (largest first),
# with lower-cased labels.
data1 = df[df['category'] == 'Fragrance']
data2 = data1.groupby('subcategory').size().sort_values(ascending=False)
data2.index = data2.index.str.lower()
fragrance = dict(data2)
# The computed dict is replaced by this hand-written mapping, keyed by URL
# slug (each key is later appended to 'https://www.sephora.com/shop').
fragrance = {'/fragrances-for-women': 16, '/fragrances-for-men': 3}
allure_amounts['Fragrance'] = fragrance

In [None]:
# bath & body: count the Allure products per sub-category (largest first),
# with lower-cased labels.
data1 = df[df['category'] == 'Bath & Body']
data2 = data1.groupby('subcategory').size().sort_values(ascending=False)
data2.index = data2.index.str.lower()
bath_body = dict(data2)
# The computed dict is replaced by this hand-written mapping, keyed by URL
# slug (each key is later appended to 'https://www.sephora.com/shop').
bath_body = {'/body-moisturizers': 6, '/body-care': 2, '/bronzer-self-tanner-bath-body': 1, '/bath-and-body-soap': 1}
allure_amounts['Bath & Body'] = bath_body

In [None]:
# tools & brushes: count the Allure products per sub-category (largest first),
# with lower-cased labels.
data1 = df[df['category'] == 'Tools & Brushes']
data2 = data1.groupby('subcategory').size().sort_values(ascending=False)
data2.index = data2.index.str.lower()
tools_brushes = dict(data2)
# The computed dict is replaced by this hand-written mapping, keyed by URL
# slug (each key is later appended to 'https://www.sephora.com/shop').
tools_brushes = {'/small-tools': 2, '/professional-beauty-tools': 1}
allure_amounts['Tools & Brushes'] = tools_brushes

<h3>get all non-allure products based on allure_amounts
</h3>

In [None]:
allure_id = list(df['product_id'])
def get_url(url,n):
    """Randomly sample ``5*n`` reviewed, non-Allure products from a category.

    Reads the globals ``allure_id`` (ids to exclude) and ``all_url`` (names
    already collected); ``all_url`` must exist before this is called.

    Parameters
    ----------
    url : str
        Category listing URL (pages are requested with ``pageSize=300``).
    n : int
        Number of Allure products in the category; ``5*n`` products are drawn.

    Returns
    -------
    dict or None
        ``{'<brand name>/<product name>': target_url}``, or ``None`` when the
        first response is not status 200 or anything fails while sampling.

    Notes
    -----
    The loop runs until ``5*n`` distinct eligible products have been found, so
    it does not terminate if the category holds fewer eligible products.
    """
    url_dic = dict()
    response = requests.get(url)
    if not response.status_code == 200:
        return None
    try:
        results_page = BeautifulSoup(response.content,'lxml')
        number_of_product = int(results_page.find('span',{'data-at':"number_of_products"}).get_text().split()[0]) #get total number of products
        num = 5*n
        page_products = dict()  # page number -> product list, so each page is downloaded once
        while len(url_dic) < num:
            chosen_product = random.randint(1,number_of_product)#randomly select one product
            # 1-based product number -> (1-based page, 0-based position on it).
            # divmod keeps the page an int (true division would yield
            # 'currentPage=1.0' for multiples of 300).
            chosen_page, chosen_product_loc = divmod(chosen_product - 1, 300)
            chosen_page += 1
            if chosen_page not in page_products:
                current_page_url = url + '?pageSize=300&currentPage='+str(chosen_page)
                current_response = requests.get(current_page_url)
                current_result_page = BeautifulSoup(current_response.content,'lxml')
                script_list = current_result_page.find('script',{'data-comp':"PageJSON", 'id':"linkJSON", 'type':"text/json"}).text
                page_products[chosen_page] = json.loads(script_list)[2]['props']['products']
            chosen_product_info = page_products[chosen_page][chosen_product_loc]
            productId = chosen_product_info['productId']
            brand_name = chosen_product_info['brandName']
            product_name = chosen_product_info['displayName']
            name = brand_name + '/' + product_name
            product_url = chosen_product_info['targetUrl']
            if productId in allure_id or name in all_url: #ensure it is non-allure and unique
                continue
            review_url = f"https://api.bazaarvoice.com/data/reviews.json?Filter=contentlocale%3Aen*&Filter=ProductId%3A{productId}&Sort=TotalPositiveFeedbackCount%3Adesc&Limit=100&Offset=0&Include=Products%2CComments&Stats=Reviews&passkey=rwbw526r2e7spptqd2qzbkp7&apiversion=5.4&Locale=en_US"
            review_response = requests.get(review_url)
            result_json = json.loads(review_response.content.decode('utf-8'))
            try:
                x = len(list(result_json['Includes']['Products'].values())) #avoid 0 review products
            except (KeyError, TypeError, AttributeError):
                x = 0
            if x != 0:
                url_dic[name] = product_url
        return url_dic
    except Exception:
        # best-effort: any scraping failure yields None, but Ctrl-C still
        # interrupts the (potentially long) sampling loop
        return None

<h3>get all non-allure urls</h3>

In [None]:
#Don't run this !!!
# For every sub-category in allure_amounts, sample 5x as many non-Allure
# products as there are Allure ones and merge them into all_url.
all_url = dict()
for key1, sub_amounts in allure_amounts.items():
    for key2, amount in sub_amounts.items():
        sub_url = 'https://www.sephora.com/shop' + key2
        output = get_url(sub_url,amount)
        if output is None:
            # get_url returns None on a non-200 response or a scraping error;
            # report it instead of crashing on update(None)
            print(sub_url,5*amount,'failed')
            continue
        all_url.update(output)
        print(sub_url,5*amount,len(output.keys()))

<h3>get all non-allure products info</h3>

In [None]:
#Don't run this!!!!!
# Scrape the full record of every sampled non-Allure product, printing a
# progress counter before each one.
l = [*all_url.values()]
n, i = len(l), 0
all_non_allure = []
for item in l:
    print(i,'/',n)
    all_non_allure.append(one_product_final(item))
    i += 1

<h3>write non-allures into df and csv</h3>

In [None]:
# Build the table from the non-Allure rows (all_non_allure), not from the
# Allure list, so that non_allure.csv does not duplicate all_allure.csv.
non_allure_df = pd.DataFrame(all_non_allure, columns = ['product_id', 'brand_name', 'product_name', 'category', 'subcategory', 'loves', 'price', 'total_review', 'avg_rating', 'recommended_count', 'onestar_count', 'twostar_count', 'threestar_count', 'fourstar_count', 'fivestar_count', 'all_reviews'])
non_allure_df.to_csv('non_allure.csv',encoding='utf-8',index=False)

<h2>
Web Scraping for all men products</h2>

In [None]:
def get_url(url,n):
    """Draw ``n`` random products from a category and keep the reviewed ones.

    Reads the global ``all_url`` (names already collected), which must exist
    before this is called.

    Parameters
    ----------
    url : str
        Category listing URL (pages are requested with ``pageSize=300``).
    n : int
        Number of random draws.

    Returns
    -------
    dict or None
        ``{'<brand name>/<product name>': target_url}``, or ``None`` when the
        first response is not status 200 or anything fails while sampling.

    Notes
    -----
    Draws are made with replacement, so the result can hold fewer than ``n``
    products even when every draw is eligible.
    """
    url_dic = dict()
    response = requests.get(url)
    if not response.status_code == 200:
        return None
    try:
        results_page = BeautifulSoup(response.content,'lxml')
        number_of_product = int(results_page.find('span',{'data-at':"number_of_products"}).get_text().split()[0])
        page_products = dict()  # page number -> product list, so each page is downloaded once
        for _ in range(n):
            chosen_product = random.randint(1,number_of_product)
            # 1-based product number -> (1-based page, 0-based position on it).
            # divmod keeps the page an int (true division would yield
            # 'currentPage=1.0' for multiples of 300).
            chosen_page, chosen_product_loc = divmod(chosen_product - 1, 300)
            chosen_page += 1
            if chosen_page not in page_products:
                current_page_url = url + '?pageSize=300&currentPage='+str(chosen_page)
                current_response = requests.get(current_page_url)
                current_result_page = BeautifulSoup(current_response.content,'lxml')
                script_list = current_result_page.find('script',{'data-comp':"PageJSON", 'id':"linkJSON", 'type':"text/json"}).text
                page_products[chosen_page] = json.loads(script_list)[2]['props']['products']
            chosen_product_info = page_products[chosen_page][chosen_product_loc]
            productId = chosen_product_info['productId']
            brand_name = chosen_product_info['brandName']
            product_name = chosen_product_info['displayName']
            name = brand_name + '/' + product_name
            product_url = chosen_product_info['targetUrl']
            if name in all_url:  # already collected for another category
                continue
            review_url = f"https://api.bazaarvoice.com/data/reviews.json?Filter=contentlocale%3Aen*&Filter=ProductId%3A{productId}&Sort=TotalPositiveFeedbackCount%3Adesc&Limit=100&Offset=0&Include=Products%2CComments&Stats=Reviews&passkey=rwbw526r2e7spptqd2qzbkp7&apiversion=5.4&Locale=en_US"
            review_response = requests.get(review_url)
            result_json = json.loads(review_response.content.decode('utf-8'))
            try:
                x = len(list(result_json['Includes']['Products'].values()))  # 0 -> product has no reviews
            except (KeyError, TypeError, AttributeError):
                x = 0
            if x != 0:
                url_dic[name] = product_url
        return url_dic
    except Exception:
        # best-effort: any scraping failure yields None, but Ctrl-C still
        # interrupts the sampling loop
        return None

<h3>get all men urls</h3>

In [None]:
#don't run this!!!!
# Sample products from each men's category; the number attached to each URL
# slug is how many random draws get_url makes for it.
all_url = dict()
allure_amounts = {'/gift-sets-for-men':27,'/mens-perfume':246,'/mens-facial-products':106,'/mens-grooming':52,'/mens-hair-care':49,'/mens-personal-care':64}
for key, amount in allure_amounts.items():
    sub_url = 'https://www.sephora.com/shop' + key
    output = get_url(sub_url,amount)
    if output is None:
        # get_url returns None on a non-200 response or a scraping error;
        # skip the category instead of crashing on update(None)
        print(sub_url,'failed')
        continue
    all_url.update(output)

<h3>get all men products info</h3>

In [None]:
#don't run this!!!!!!
# Scrape the full record of every collected men's product, printing a
# progress counter before each one.
l = [*all_url.values()]
n, i = len(l), 0
all_man = []
for item in l:
    print(i,'/',n)
    all_man.append(one_product_final(item))
    i += 1

<h3>write into df and csv</h3>

In [None]:
# Tabulate the men's product rows (one row per product) and save them as CSV.
all_man_df = pd.DataFrame(
    all_man,
    columns=[
        'product_id', 'brand_name', 'product_name', 'category', 'subcategory',
        'loves', 'price',
        'total_review', 'avg_rating', 'recommended_count',
        'onestar_count', 'twostar_count', 'threestar_count',
        'fourstar_count', 'fivestar_count',
        'all_reviews',
    ],
)
all_man_df.to_csv('all_man.csv', encoding='utf-8', index=False)