<h1>Code for Web Scraping</h1>
The following functions scrape the URLs of a skincare category's sub-categories, the URLs of its products, product info, and product reviews. <br>
<b><span style="color:darkred">CAUTION: </span></b>A sample output for each function has been provided since it will take a very long time for some functions to run!
<li>get_subkinds: given a link of a big skincare category, scrape the urls of its sub-categories
<li>get_product_url: given a link of a category, scrape the names and urls of all products under this category
<li>get_product_info: given a link of a product, scrape its info
<li>get_product_reviews: given a link of a product, scrape all of its reviews: review content, reviewer profile, review time, etc.
<li>scrape_execution: given a dictionary (key = product name, value = product url), scrape the reviews and relevant information of all products in this dictionary; returns a dataframe and an error dictionary (listing the products that were not successfully scraped) 

### get_subkinds

In [5]:
import requests
from bs4 import BeautifulSoup
def get_subkinds(url, timeout=30):
    """Scrape the sub-category links shown on a category page.

    Parameters
    ----------
    url : str
        URL of the parent category page.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict or None
        Mapping of sub-category name -> absolute URL, built from every
        ``<a class="css-h6ss0r">`` element on the page. ``None`` is returned
        when the response status is not 200 or the page cannot be parsed.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    try:
        results_page = BeautifulSoup(response.content, 'lxml')
    except Exception:
        # Parser unavailable or markup unusable: keep the "None on failure" contract.
        return None

    all_kinds = {}  # subkinds of the big category
    for kind in results_page.find_all('a', class_='css-h6ss0r'):
        href = kind.get('href')
        if not href:
            # An anchor without a target cannot be followed; skip it rather
            # than discarding every other sub-category.
            continue
        all_kinds[kind.get_text()] = 'https://www.sephora.com' + href
    return all_kinds
# Demo: fetch the sub-category URLs of the "cleanser" category (performs a live HTTP request).
print('URLs for categories under Cleanser: \n')
get_subkinds('https://www.sephora.com/shop/cleanser')

URLs for categories under Cleanser: 



{'Face Wash & Cleansers': 'https://www.sephora.com/shop/face-wash-facial-cleanser',
 'Exfoliators': 'https://www.sephora.com/shop/exfoliating-scrub-exfoliator',
 'Makeup Removers': 'https://www.sephora.com/shop/makeup-remover-skincare',
 'Face Wipes': 'https://www.sephora.com/shop/face-wipes',
 'Toners': 'https://www.sephora.com/shop/facial-toner-skin-toner',
 'Blotting Papers': 'https://www.sephora.com/shop/blotting-paper-oil-control-skincare'}

### get_product_url

In [8]:
#!pip install selenium
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
# Chrome launch flags shared by the webdriver.Chrome(...) instances created by the scraping functions below.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Location of the chromedriver binary; passed as executable_path when a browser is started.
path = '/usr/local/bin/chromedriver'   #modify the path to your own 
def get_product_url(url):
    """Collect the name and link of every product listed under a category.

    The category page is opened in Chrome, the page count is read from the
    pagination control, and each results page is then loaded, scrolled so the
    product grid is present in the page source, and parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        URL of the category page (without paging query parameters; they are
        appended here).

    Returns
    -------
    dict
        Mapping of product name (the anchor's ``aria-label``) -> absolute
        product URL. Later products overwrite earlier ones with the same name.
    """
    from selenium.common.exceptions import WebDriverException

    browser = webdriver.Chrome(executable_path=path, options=options)
    try:
        browser.get(url)

        # The log-in dialog is not always shown; its absence must not abort the scrape.
        try:
            browser.find_element(
                By.XPATH,
                '//*[@id="modalDialog"]/div[2]/form/div[3]/div/div[1]/button',
            ).click()
        except WebDriverException:
            pass

        # calculate total number of pages (last pagination entry holds the page count)
        try:
            next_pages = browser.find_elements(By.CLASS_NAME, "css-1f9ivf5")
            total_pages_count = int(next_pages[-1].text)
        except (IndexError, ValueError):
            # No pagination control, or its label is not a number: single page.
            total_pages_count = 1

        # start scraping product names and links
        all_products = {}
        for i in range(1, total_pages_count + 1):
            browser.get(url + "?pageSize=60&currentPage=" + str(i))

            # Scroll down before reading the page source.
            # NOTE(review): presumably the grid is lazy-loaded on scroll -- confirm.
            actions = ActionChains(browser)
            for _ in range(10):
                actions.send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(0.1)

            results_page = BeautifulSoup(browser.page_source, 'lxml')
            product_grid = results_page.find('div', {'data-comp': "ProductGrid"})
            if product_grid is None:
                # Page rendered without a grid; nothing to collect here.
                continue

            for section in product_grid.find_all('div', class_="css-dkxsdo"):
                for product in section.find_all('div', class_="css-12egk0t"):
                    anchor = product.find('a')
                    if not anchor or not anchor.get('href'):
                        continue
                    # The href is used as-is; it may contain a literal space
                    # (e.g. "icid2=products grid:...").
                    all_products[anchor.get('aria-label')] = (
                        'https://www.sephora.com' + anchor.get('href')
                    )
        return all_products
    finally:
        # Always release the Chrome process, even when scraping fails midway.
        browser.quit()

# Demo: collect product names/links for the lip-treatments category (launches Chrome; slow).
get_product_url('https://www.sephora.com/shop/lip-treatments')

{'LANEIGE Lip Sleeping Mask': 'https://www.sephora.com/product/lip-sleeping-mask-P420652?icid2=products grid:p420652',
 'Fresh Sugar Lip Legends Gift Set': 'https://www.sephora.com/product/sugar-lip-legends-gift-set-P451273?icid2=products grid:p451273',
 'LANEIGE Kiss Me All Day': 'https://www.sephora.com/product/kiss-me-all-day-P449870?icid2=products grid:p449870',
 'Dior Dior Lip Glow': 'https://www.sephora.com/product/dior-addict-lip-glow-color-reviver-balm-P236816?icid2=products grid:p236816',
 'Fresh Sugar Lip Treatment Sunscreen SPF 15': 'https://www.sephora.com/product/sugar-lip-treatment-spf-15-P57002?icid2=products grid:p57002',
 'Buxom Powerplump Lip Balm': 'https://www.sephora.com/product/powerplump-lip-balm-P420163?icid2=products grid:p420163',
 'Bite Beauty Agave Lip Mask': 'https://www.sephora.com/product/agave-lip-mask-P384629?skuId=1575042&icid2=products grid:p384629',
 'FARSÁLI Unicorn Antioxidant Lip Mask': 'https://www.sephora.com/product/unicorn-antioxidant-lip-mask

### get_product_info

In [14]:
def get_product_info(product_url, timeout=30):
    """Scrape the summary attributes of a single product page.

    Parameters
    ----------
    product_url : str
        URL of the product page.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns
    -------
    dict
        Keys, in order: ``'product name'``, ``'brand name'`` (lower-cased
        text), ``'item id'`` (int), ``'price'`` (float), ``'loves'`` (int),
        ``'rating'``, ``'category 1'``, ``'category 2'``. The last three are
        ``''`` when the corresponding element is not on the page.

    Raises
    ------
    AttributeError
        When one of the required elements (name, brand, item id, price,
        loves) is missing from the page.
    ValueError
        When item id, price or love count is not numeric.
    """
    response = requests.get(product_url, timeout=timeout)
    results_page = BeautifulSoup(response.content, 'lxml')

    # Failure modes of an optional lookup: tag not found (AttributeError on
    # None), attribute absent (TypeError on None[0]), too few matches or an
    # empty string (IndexError). Anything else is a real error and propagates.
    missing = (AttributeError, IndexError, TypeError)

    info = dict()
    info['product name'] = results_page.find('span', {'class': 'css-0'}).get_text().lower()
    info['brand name'] = results_page.find('span', {'class': 'css-euydo4'}).get_text().lower()

    # The item id is the last whitespace-separated token of this element's text.
    id_text = results_page.find('div', {'class': 'css-1qf1va5'}).get_text(separator=" ")
    info['item id'] = int(id_text.split()[-1])

    # Drop the leading currency symbol and any thousands separators.
    price_text = results_page.find('div', {'class': 'css-14hdny6'}).get_text()
    info['price'] = float(price_text[1:].replace(',', ''))

    info['loves'] = int(results_page.find('span', {'data-at': 'product_love_count'}).get_text())

    try:
        # the rating tag class is modified (was 'css-ca013b')
        # NOTE(review): only the first character of the aria-label is kept,
        # so a fractional rating would be truncated -- confirm this is intended.
        rating = results_page.find('div', {'class': 'css-r17a09'}).get('aria-label')[0]
    except missing:
        rating = ''
    info['rating'] = rating

    try:
        # Second matching link is used as the first category.
        category1 = results_page.find_all('a', {'class': 'css-1ylrown'})[1].get_text().lower()
    except missing:
        category1 = ''
    info['category 1'] = category1

    try:
        category2 = results_page.find('a', {'class': 'css-iasgl9'}).get_text().lower()
    except missing:
        category2 = ''
    info['category 2'] = category2

    return info
# Demo: scrape the summary info of one product page (performs a live HTTP request).
get_product_info('https://www.sephora.com/product/vitamin-enriched-face-base-P270594?icid2=products%20grid:p270594')

{'product name': 'vitamin enriched face base priming moisturizer',
 'brand name': 'bobbi brown',
 'item id': 1292820,
 'price': 60.0,
 'loves': 100251,
 'rating': '4',
 'category 1': 'moisturizers',
 'category 2': 'moisturizers'}

### get_product_reviews

In [15]:
# chromedriver location used by get_product_reviews; same value as assigned in the get_product_url cell.
path = '/usr/local/bin/chromedriver'   #modify the path to your own 
def _review_tag_text(parent, name, attrs):
    """Return the text of the first matching descendant tag, or '' when absent."""
    tag = parent.find(name, attrs)
    return tag.get_text() if tag is not None else ''


def _parse_review(review):
    """Extract the per-review fields from one review box.

    Every field starts from its own default for each review, so nothing can
    leak in from a previously parsed review.

    Returns a list ordered as: user_name, skin_type, skin_tone, age,
    review_title, review_content, review_rating, not_helpful, helpful,
    free_product, recommend, review_time.
    """
    import re

    review_content = _review_tag_text(review, 'div', {'class': "css-1p4f59m"})
    review_title = _review_tag_text(review, 'div', {'class': "css-ai9pjd"})
    user_name = _review_tag_text(review, 'span', {'data-at': "nickname"})

    # Reviewer attributes appear as "<label> <value>" boxes; unknown labels are ignored.
    skin_tone = skin_type = age = ''
    for user_info in review.find_all('div', {'class': "css-15415he"}):
        one_info = user_info.get_text()
        if one_info.startswith('Skin Tone'):
            skin_tone = one_info[10:]
        elif one_info.startswith('Skin Type'):
            skin_type = one_info[10:]
        elif one_info.startswith('Age'):
            age = one_info[4:]

    # The rating is the first character of the aria-label.
    rating_tag = review.find('div', {'class': "css-5quttm"})
    rating_label = rating_tag.get('aria-label') if rating_tag is not None else None
    review_rating = rating_label[0] if rating_label else ''

    # Two buttons, e.g. 'Helpful (60)' and 'Not Helpful (4)'; pull out the counts.
    not_helpful = helpful = ''
    for button in review.find_all('button', {'class': "css-1v7gxuq"}):
        label = button.get_text()
        not_match = re.search(r'^Not Helpful\D+(\d+)', label)
        if not_match:
            not_helpful = not_match.group(1)
            continue
        yes_match = re.search(r'^Helpful\D+(\d+)', label)
        if yes_match:
            helpful = yes_match.group(1)

    # 1 when the 'Received free product' / 'Recommends this product' tag has text, else 0.
    free_product = 1 if _review_tag_text(review, 'div', {'class': "css-dlcc30gf"}) else 0
    recommend = 1 if _review_tag_text(review, 'div', {'class': "css-ue839"}) else 0

    review_time = _review_tag_text(review, 'span', {'class': "css-1bk7s0g"})

    return [user_name, skin_type, skin_tone, age, review_title, review_content,
            review_rating, not_helpful, helpful, free_product, recommend, review_time]


def get_product_reviews(product_url):
    """Scrape every review of one product into a DataFrame.

    The product page is opened in Chrome, scrolled, and the "show more
    reviews" button is clicked until it can no longer be found or clicked.
    The resulting page source is then parsed with BeautifulSoup.

    Parameters
    ----------
    product_url : str
        URL of the product page.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns ``item_id``, ``total_reviews``,
        ``user_name``, ``skin_type``, ``skin_tone``, ``age_range``,
        ``review_title``, ``review``, ``review_rating``, ``not_helpful``,
        ``helpful``, ``free_product``, ``recommend``, ``review_time``.
        Text fields that are missing on the page are ``''``; ``free_product``
        and ``recommend`` are 0/1.

    Raises
    ------
    AttributeError
        When the element holding the item id is not on the page.
    """
    import time
    import pandas as pd
    from bs4 import BeautifulSoup
    from selenium.common.exceptions import WebDriverException

    browser = webdriver.Chrome(executable_path=path, options=options)  # user-specific Chrome webdriver path
    try:
        browser.get(product_url)

        try:  # close the log-in window if it is shown
            browser.find_element(
                By.XPATH,
                '//*[@id="modalDialog"]/div[2]/form/div[3]/div/div[1]/button',
            ).click()
        except WebDriverException:
            pass

        actions = ActionChains(browser)  # drag the page down to the bottom
        for _ in range(10):
            time.sleep(0.1)
            actions.send_keys(Keys.PAGE_DOWN).perform()

        try:
            review_num = browser.find_element(By.CLASS_NAME, 'css-mzsag6').text
            # Keep the text before ' review'; partition leaves the text intact
            # when the marker is absent instead of chopping its last character.
            review_num = review_num.partition(' review')[0]
        except WebDriverException:
            review_num = ''

        try:  # load all reviews by clicking 'show 6 more reviews' until it disappears
            while True:
                browser.find_element(By.CLASS_NAME, 'css-1phfyoj').click()
                time.sleep(0.1)
        except WebDriverException:
            pass

        source = browser.page_source
    finally:
        # Always release the Chrome process; this function is called once per product.
        browser.quit()

    results_page = BeautifulSoup(source, 'lxml')  # use BeautifulSoup to locate sections

    # the product id of these reviews: last token of this element's text
    item_id = int(results_page.find('div', {'class': 'css-1qf1va5'}).get_text(separator=" ").split()[-1])

    # associate all review information with the product id, one row per review box
    all_reviews = [
        [item_id, review_num] + _parse_review(review)
        for review in results_page.find_all('div', {'data-comp': "Review"})
    ]

    columns = ['item_id', 'total_reviews', 'user_name', 'skin_type', 'skin_tone', 'age_range', 'review_title', 'review',
               'review_rating', 'not_helpful', 'helpful', 'free_product', 'recommend', 'review_time']

    return pd.DataFrame(all_reviews, columns=columns)
# Demo: scrape all reviews of one product (launches Chrome and clicks through every review page; slow).
get_product_reviews('https://www.sephora.com/product/ice-ceramide-moisturizing-cream-P448937?icid2=products%20grid:p448937')

Unnamed: 0,item_id,total_reviews,user_name,skin_type,skin_tone,age_range,review_title,review,review_rating,not_helpful,helpful,free_product,recommend,review_time
0,2266260,122,1jazzychick,Dry,Deep,,DRENCHING WITH GOODNESS!!,I’m using 85% of Sunday Riley’s line (good gen...,5,17,184,0,1,13 Sep 2019
1,2266260,122,edithh,Dry,Porcelain,,,This cream feels great. It kind of the same co...,5,6,72,0,1,7 Sep 2019
2,2266260,122,DiveJ,Normal,Olive,,Another winner from Sunday Riley!!,"Ok, I’ve only used this twice. But wanted to a...",5,3,50,0,1,6 Sep 2019
3,2266260,122,morehi,Dry,Light,,Miracle cream,Miracle cream. I almost never take the time to...,5,5,48,0,1,10 Sep 2019
4,2266260,122,Kaiminz,Normal,Fair,,,Just don't like the smell personally. Nothing ...,2,61,44,0,0,9 Sep 2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,2266260,122,,Oily,Light,,Not for me,This has a thick creamy consistency like hand ...,2,0,0,0,0,7 d ago
118,2266260,122,crossman2588,Oily,Olive,,Amazing,I am normally not into Sunday Riley at all but...,5,0,0,0,1,16 d ago
119,2266260,122,MarwaMo,Dry,Tan,,,I have mixed feelings about this cream. It is ...,3,0,0,0,0,14 d ago
120,2266260,122,laurencoliinsq,Combination,Fair,,,Very thick and moisturizing but broke me out t...,2,0,0,0,0,11 d ago


In [None]:
def scrape_execution(name_url_dictionary):
    """Scrape the reviews of many products and report which ones failed.

    For each product, ``get_product_reviews`` is called. A product counts as
    successfully scraped when at least 90% of the review count reported on
    its page (first ``total_reviews`` value) was actually collected.

    Parameters
    ----------
    name_url_dictionary : dict
        Mapping of product name -> product URL.

    Returns
    -------
    (pandas.DataFrame, dict)
        The concatenated review rows of all successful products (each
        product keeps its own row index), and a name -> URL dictionary of the
        products that failed. Products whose scrape returned no rows are
        skipped silently and appear in neither.
    """
    import pandas as pd

    # Frames are collected and concatenated once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    collected = []
    error_name_url = {}

    for name, url in name_url_dictionary.items():
        print(name, url)

        try:
            product_reviews = get_product_reviews(url)
            nothing_scraped = product_reviews.empty
        except Exception:
            print('FAILED. Error in webscrapping functions, Product: ', name, url)
            error_name_url[name] = url
            continue

        if nothing_scraped:
            continue

        try:
            # Positional access: the first row, whatever the index labels are.
            total = int(product_reviews['total_reviews'].iloc[0])
            reviews_got = len(product_reviews)
            p = reviews_got / total
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # Review count missing, non-numeric or zero: completeness cannot be checked.
            print('FAILED. Nothing Scraped, Product: ', name, url)
            error_name_url[name] = url
            continue

        if p >= 0.9:
            print('SUCCESS!')
            collected.append(product_reviews)
        else:
            print('FAILED. Total number of reviews:', total,'\n', 'Length of this product review df: ',reviews_got)
            error_name_url[name] = url

    reviews_df = pd.concat(collected) if collected else pd.DataFrame()
    return reviews_df, error_name_url

<h1>Code for Data Cleaning</h1>

In [123]:
def clean(df):
    """Clean a scraped reviews table and return the result as a new DataFrame.

    Steps, in order:
      1. drop exact duplicate rows and reset the index;
      2. drop rows whose ``review`` is null;
      3. convert ``total_reviews`` to int (``'1 revie'`` -> 1, NaN -> 0);
      4. drop every product (``item_id``) whose rows disagree on
         ``total_reviews``;
      5. convert ``free_product`` to 0/1.

    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``item_id``, ``total_reviews``, ``review``
        and ``free_product``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, with the same columns as the input. The index is
        the one assigned in step 1, so it has gaps where rows were removed.
    """
    import numpy as np
    import pandas as pd

    def _to_review_count(x):
        # '1 revie' is presumably a truncated "1 review" left by the scraper -- TODO confirm.
        if x == '1 revie':
            return 1
        x = float(x)
        return 0 if np.isnan(x) else int(x)

    free_product_codes = {
        'Received free product': 1,
        'Sephora employee': 0,
        'Sephora employee|Received free product': 1,
        '0': 0,
        '1': 1,
    }

    def convert_free_product(x):
        # Missing -> 0; known labels -> their code; anything else is passed through unchanged.
        if pd.isna(x):
            return 0
        return free_product_codes.get(x, x)

    # drop duplicated reviews and reset index (non-mutating: the caller's frame is not changed)
    df = df.drop_duplicates().reset_index(drop=True)

    # drop rows with no reviews; copy so the column assignments below act on an independent frame
    df = df[df['review'].notnull()].copy()

    # standardize total_reviews to int
    df['total_reviews'] = df['total_reviews'].apply(_to_review_count)

    # drop rows with same product id but different total review number
    per_item = df.groupby('item_id')['total_reviews']
    consistent = per_item.transform('max') == per_item.transform('min')
    df = df[consistent].copy()

    # convert free_product to 0 and 1
    df['free_product'] = df['free_product'].apply(convert_free_product)

    return df

In [136]:
import pandas as pd  # needed at notebook scope: the cells above only import pandas inside functions

# Load the scraped moisturizer reviews (first CSV column is the saved index) and report the raw row count.
pre = pd.read_csv('All_Moisturizers_Reviews.csv',index_col=0)
print('Dataframe Size Before Cleaning: ', len(pre))

Dataframe Size Before Cleaning:  237300


In [137]:
# Run the cleaning function on the loaded reviews and report the remaining row count.
df = clean(pre)
print('Dataframe Size After Cleaning: ', len(df))

Dataframe Size After Cleaning:  222025


In [141]:
#df.to_csv('Cleaned Moisturizers Reviews.csv')