In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Get all product URLs from all products page
def get_urls(page_start, page_end):
    all_products_url_part = []

    for page in range(page_start, page_end + 1):
        page_url = f'https://revisionskincare.com/collections/all-products?page={page}'
        dom = requests.get(page_url).text
        soup = BeautifulSoup(dom, 'html.parser')

        product_urls = soup.find_all('h3', class_="product-card-title typography-text typography-text--body")
        for product_url in product_urls:
            a_tag = product_url.find('a')
            if a_tag and 'href' in a_tag.attrs:
                href_value = a_tag['href']
                all_products_url_part.append(href_value)

    return all_products_url_part

# Get all product information from every product page
# Create a list to store all product information
def main(product_url):
    product_info = {
        'Company Name': 'Revision Skincare',
        'Product Title': '',
        'Price': '',
        'Product Ingredients': '',
        'Website URL': product_url,
        'Product Treat':'',
        'Skin Type':'',
        'Reviews Count':'',
        'Reviews Stars':'',
        'Before Image':'',
        'After Image':''
    }

    # Use the html.parser parser to convert the HTML text into a BeautifulSoup object soup
    dom = requests.get(product_url).text
    soup = BeautifulSoup(dom, 'html.parser')

    # Get product title
    title_tag = soup.find('head').find('title')
    product_info['Product Title'] = title_tag.text.strip()

    # Get product price
    price_span = soup.find('span', class_='price-item--regular')
    if price_span:
        product_info['Price'] = price_span.text.strip()
    else:
        product_info['Price'] = 'Price information not found.'

    # Get product ingredients
    modal_dialog_div = soup.find('modal-dialog', id='product-ingredients-modal-template--15772401238211__pdp-main')
    if modal_dialog_div:
        product_modal_div = modal_dialog_div.find('div', class_='product-modal-info typography-text typography-text--body')
        p_tags = product_modal_div.find_all('p')
        if p_tags:
            ingredients = []
            for p_tag in p_tags:
                text = p_tag.text.strip()
                info_list = [item.strip() for item in text.split(',')]
                ingredients.extend(info_list)
            product_info['Product Ingredients'] = ', '.join(ingredients)
        else:
            product_info['Product Ingredients'] = 'Product ingredients not found.'
    else:
        product_info['Product Ingredients'] = 'Product modal dialog not found.'

    # Get Product Treat
    treat_list = []
    treat_tag = soup.find('div', class_='product-panel typography-text typography-text--body', id='one-panel')
    if treat_tag:
        treat_tag_2 = treat_tag.find('ul')
        if treat_tag_2:
            for item in treat_tag_2.find_all('li'):
                treat_list.append(item.text.strip())
            product_info['Product Treat'] = ', '.join(treat_list)
        else:
            product_info['Product Treat'] = 'Product treat not found.'
    else:
        product_info['Product Treat'] = 'Product treat not found.'

    # Get Skin Type
    product_div = soup.find('div', class_='product-panel typography-text typography-text--body', id='one-panel')
    if product_div:
        all_info = product_div.get_text(strip=True)
        product_info['Skin Type'] = all_info
    else:
        product_info['Skin Type'] = 'Skin type information not found.'

    # Get Reviews Count
    product_reviews_div_parent = soup.find('div', class_="okeReviews-reviewsSummary-ratingCount")
    if product_reviews_div_parent:
        product_reviews_div = product_reviews_div_parent.find('span')
        if product_reviews_div:
            product_info['Reviews Count'] = product_reviews_div.text.strip()
        else:
            product_info['Reviews Count'] = "Reviews count not found."
    else:
        product_info['Reviews Count'] = "Reviews count not found."

    # Get Reviews Stars
    product_star_span = soup.find('span', class_="okeReviews-a11yText")
    if product_star_span:
        product_info['Reviews Stars'] = product_star_span.text.strip()
    else:
        product_info['Reviews Stars'] = "Reviews stars not found."

    # Get Before Image
    product_before_img = soup.find('img', class_="image-before slider-image")
    if product_before_img:
        src_url = product_before_img['src']
        product_info['Before Image'] = src_url
    else:
        product_info['Before Image'] = "Before image not found."

    # Get After Image
    product_after_img = soup.find('img', class_="image-after slider-image")
    if product_after_img:
        src_url_2 = product_after_img['src']
        product_info['After Image'] = src_url_2
    else:
        product_info['After Image'] = "After image not found."

    return product_info

# Get URL from all product page 1-5
if __name__ == '__main__':
    all_products_url_part = get_urls(1, 5)
    all_products_url = ['https://revisionskincare.com' + url for url in all_products_url_part]

    # Create a list to store all product information
    RS_product = []

    # Loop through each product URL and collect product information
    for product_url in all_products_url:
        product_info = main(product_url)
        RS_product.append(product_info)

    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(RS_product)
    print("DataFrame has been created successfully.")


DataFrame has been created successfully.


In [None]:
print(df)

In [None]:
# Data Cleaing
import numpy as np
df['Product Title'] = df['Product Title'].str.split('|').str[0]
df['Product Title'] = df['Product Title'].str.split('–').str[0]
df['Price'] = df['Price'].str.strip().str.replace('USD', '')
df['Product Ingredients'] = df['Product Ingredients'].str.replace(r'^.*Ingredients:', '', regex=True)
df['Product Ingredients'] = df['Product Ingredients'].apply(lambda x: x.title())
df['Product Ingredients'] = df['Product Ingredients'].str.replace(", Active Ingredients:", "").apply(lambda x: x.strip().title())
df['Product Ingredients'] = df['Product Ingredients'].apply(lambda x: x[2:] if x.startswith(", ") else x)
df['Skin Type'] = df['Skin Type'].str.replace(r'^.*benefits\?', '', regex=True).str.strip()
df['Skin Type'] = df['Skin Type'].str.replace(r'^.*Benefits\?', '', regex=True).str.strip()


df['Reviews Count'] = np.where(df['Reviews Count'] != 'Reviews count not found.', df['Reviews Count'].str.split().str[0], df['Reviews Count'])
df['Reviews Stars'] = np.where(df['Reviews Stars'] != 'Reviews stars not found.', df['Reviews Stars'].str.replace("Rated ", "").str.split(" out", expand=True)[0], df['Reviews Stars'])
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
print(df)


         Company Name                                     Product Title  \
0   Revision Skincare                            Intellishade® Original   
1   Revision Skincare                        C+ Correcting Complex 30%™   
2   Revision Skincare                                  D·E·J eye cream®   
3   Revision Skincare                        YouthFull Lip Replenisher™   
4   Revision Skincare                               Nectifirm® ADVANCED   
..                ...                                               ...   
62  Revision Skincare            Injection Perfection Full Size Regimen   
63  Revision Skincare              Pre Post Procedure Full Size Regimen   
64  Revision Skincare  Pre Post Procedure Limited Edition Trial Regimen   
65  Revision Skincare                           Gentle Cleansing Lotion   
66  Revision Skincare                         CMT Post-Procedure Cream™   

                           Price  \
0                         $84.00   
1                        $1

In [None]:
from google.colab import files
df.to_excel('RS_products.xlsx', index=False)
files.download('RS_products.xlsx')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import requests
from bs4 import BeautifulSoup

def main():
    # Print Company Name
    print('Company Name:')
    print('Revision Skincare')
    home = 'https://revisionskincare.com/'
    dom = requests.get('https://revisionskincare.com/products/dej-daily-boosting-serum').text
    soup = BeautifulSoup(dom, 'html.parser')

    # Get Product Title
    title_tag = soup.find('head').find('title')
    print('Product Title:')
    print(title_tag.text.strip())

    # Get Key Ingredients
    panel_div = soup.find('div', class_='product-panel', id='three-panel')
    if panel_div:
        strong_tags = panel_div.find_all('strong')
        key_ingredients = ', '.join([strong_tag.text.strip() for strong_tag in strong_tags])
        print('Key Ingredients:', key_ingredients)

if __name__ == '__main__':
    main()


Company Name:
Revision Skincare
Product Title:
D·E·J Daily Boosting Serum™ | Revision Skincare®
Key Ingredients: Patent-Pending Sunflower Sprout Extract Technology, Antioxidant Blend (THD Ascorbate (Vitamin C), Resveratrol and Red Seaweed), Iris Florentina Root Extract, D·E·J, Acetyl Hexapeptide-1, Postbiotic Blend: Saccharomyces Ferment Filtrate, Saccharide Isomerate and Bacillus Ferment, Hydrating Blend:, Jojoba Esters, Squalane and Sodium Hyaluronate (Hyaluronic Acid)


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Get all product URLs from all products page
def get_urls(page_start, page_end):
    all_products_url_part = []

    for page in range(page_start, page_end + 1):
        page_url = f'https://revisionskincare.com/collections/all-products?page={page}'
        dom = requests.get(page_url).text
        soup = BeautifulSoup(dom, 'html.parser')

        product_urls = soup.find_all('h3', class_="product-card-title typography-text typography-text--body")
        for product_url in product_urls:
            a_tag = product_url.find('a')
            if a_tag and 'href' in a_tag.attrs:
                href_value = a_tag['href']
                all_products_url_part.append(href_value)

    return all_products_url_part

# Get all product information from every product page
# Create a list to store all product information
def main(product_url):
    product_info = {
        'Company Name': 'Revision Skincare',
        'Product Title': '',
        'Key Ingredients': '',
        'Website URL': product_url
    }

    # Use the html.parser parser to convert the HTML text into a BeautifulSoup object soup
    dom = requests.get(product_url).text
    soup = BeautifulSoup(dom, 'html.parser')

    # Get Key Ingredients
    title_tag = soup.find('head').find('title')
    product_info['Product Title'] = title_tag.text.strip()

    panel_div = soup.find('div', class_='product-panel', id='three-panel')
    if panel_div:
        strong_tags = panel_div.find_all('strong')
        if strong_tags:
            key_ingredients = ', '.join([strong_tag.text.strip() for strong_tag in strong_tags])
            product_info['Key Ingredients'] = key_ingredients
        else:
            product_info['Key Ingredients'] = 'Key Ingredients not found'
    else:
        product_info['Key Ingredients'] = 'Key Ingredients not found'

    # Get Collections



    return product_info

# Get URL from all product page 1-5
if __name__ == '__main__':
    all_products_url_part = get_urls(1, 5)
    all_products_url = ['https://revisionskincare.com' + url for url in all_products_url_part]

    # Create a list to store all product information
    RS_product = []

    # Loop through each product URL and collect product information
    for product_url in all_products_url:
        product_info = main(product_url)
        RS_product.append(product_info)

    # Convert the list of dictionaries into a DataFrame
    df2 = pd.DataFrame(RS_product)
    print("DataFrame has been created successfully.")


DataFrame has been created successfully.


In [6]:
print(df2)

         Company Name                                      Product Title  \
0   Revision Skincare        Intellishade® Original | Revision Skincare®   
1   Revision Skincare    C+ Correcting Complex 30%™ | Revision Skincare®   
2   Revision Skincare              D·E·J eye cream® | Revision Skincare®   
3   Revision Skincare  YouthFull Lip Replenisher™ | Lip Plumper with ...   
4   Revision Skincare           Nectifirm® ADVANCED | Revision Skincare®   
..                ...                                                ...   
63  Revision Skincare  Revision Skincare Reusable Antimicrobial Finis...   
64  Revision Skincare  Injection Perfection Full Size Regimen | Revis...   
65  Revision Skincare  Pre Post Procedure Full Size Regimen | Revisio...   
66  Revision Skincare  Pre Post Procedure Limited Edition Trial Regim...   
67  Revision Skincare     CMT Post-Procedure Cream™ | Revision Skincare®   

                                      Key Ingredients  \
0   Blend of Three Peptides., 

In [None]:
# Remove '.' and 'D·E·J' from the 'Key Ingredients' column
df2['Key Ingredients'] = df2['Key Ingredients'].str.replace('.', '').str.replace('D·E·J', '').str.strip()
df2['Product Title'] = df2['Product Title'].str.split('|').str[0]
df2['Product Title'] = df2['Product Title'].str.split('–').str[0]
df2 = df2.applymap(lambda x: x.strip() if isinstance(x, str) else x)



  df2['Key Ingredients'] = df2['Key Ingredients'].str.replace('.', '').str.replace('D·E·J', '').str.strip()


In [8]:
from google.colab import files
df2.to_excel('RS_products.xlsx', index=False)
files.download('RS_products.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
print(df2)

         Company Name Product Title  \
0   Revision Skincare                 
1   Revision Skincare                 
2   Revision Skincare                 
3   Revision Skincare                 
4   Revision Skincare                 
..                ...           ...   
63  Revision Skincare                 
64  Revision Skincare                 
65  Revision Skincare                 
66  Revision Skincare                 
67  Revision Skincare                 

                                          Website URL  
0   https://revisionskincare.com/products/intellis...  
1   https://revisionskincare.com/products/c-correc...  
2   https://revisionskincare.com/products/d-e-j-ey...  
3   https://revisionskincare.com/products/youthful...  
4   https://revisionskincare.com/products/nectifir...  
..                                                ...  
63  https://revisionskincare.com/products/revision...  
64  https://revisionskincare.com/products/injectio...  
65  https://revisionskincar

In [7]:
import pandas as pd
df2[['Title1', 'Title2']] = df2['Product Title'].str.split('|', expand=True)
