# Directions
Run the first Python code cell below to collect the product URLs. The later cells depend on the `all_urls` list that it creates.












## Get Product URLs



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Brand listing page. The `product_list_limit=all` query parameter presumably
# asks the shop to put every product on a single page -- confirm if the
# number of collected URLs looks too small.
url = 'https://dermskincare.com/products-brands/skinbetter-science?product_list_limit=all'
# A timeout keeps the cell from hanging forever if the site stops responding.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a', class_='product photo product-item-photo')
# Skip anchors that have no href: formatting a missing href into a string
# would put the literal text "None" into the URL list, and the crawler
# would then try to fetch it.
hrefs_list = [link.get('href') for link in links if link.get('href')]
# `all_urls` is the name the later cells read.
all_urls = hrefs_list
print(all_urls)

['https://dermskincare.com/skinbetter-science-trio-luxe-moisture-treatment', 'https://dermskincare.com/skinbetter-science-sunbetter-sheer-spf-70-sunscreen-lotion', 'https://dermskincare.com/skinbetter-science-men-s-regimen', 'https://dermskincare.com/skinbetter-science-even-glow-regimen', 'https://dermskincare.com/skinbetter-science-clarity-regimen', 'https://dermskincare.com/skinbetter-science-a-team-duo-advanced-free-trial-size-alpharet-alto', 'https://dermskincare.com/skinbetter-science-a-team-duo-original-free-trial-size-alpharet-alto', 'https://dermskincare.com/anti-aging-kit', 'https://dermskincare.com/skinbetter-science-sunbetter-sheer-spf-56-sunscreen-stick', 'https://dermskincare.com/skinbetter-science-protect-and-correct-a-team-duo-face', 'https://dermskincare.com/skinbetter-science-refresh-cleanse-detox-hydrate-trio-kit', 'https://dermskincare.com/skinbetter-science-refresh-cleanse-and-detox-duo', 'https://dermskincare.com/skinbetter-science-alto-advanced-defense-and-repair-

In [None]:
# Sanity check: how many product URLs the listing scrape collected.
len(all_urls)

34

## Get Product Information

In [None]:
def _spec_cell_text(soup, label, default):
    """Return the text of the spec-table cell whose ``data-th`` is *label*.

    Falls back to *default* when the page has no such ``<td class="col data">``.
    """
    cell = soup.find('td', class_='col data', attrs={'data-th': label})
    return cell.get_text(strip=True) if cell else default


def crawl_product_info(url):
    """Download one product page and extract its details.

    Args:
        url: Address of the product page to scrape.

    Returns:
        A dict with the keys 'Product Title', 'Product Price',
        'Key Ingredients', 'Benefits', 'Skin Types', 'Size' and 'How to Use'.
        'Key Ingredients' is a list of strings. 'Benefits' is a list of
        strings when the page lists them, otherwise the string 'No Benefits'.
        Every other value is a string, with a 'No ...' placeholder standing
        in for a section the page does not contain.

    Network errors raised by ``requests.get`` (including the timeout)
    propagate to the caller.
    """
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    dom = requests.get(url, timeout=30).text
    soup = BeautifulSoup(dom, 'html.parser')

    # Product title. Guarded like every other field so that a page without
    # the expected heading yields a placeholder instead of an AttributeError.
    title_tag = soup.find('h1', class_="page-title")
    product_title = title_tag.text.strip() if title_tag else "No Title"

    # Product price: the first <span class="price"> on the page.
    price_span = soup.find('span', class_="price")
    price = price_span.text.strip() if price_span else "No Price"

    # Key ingredients: one entry per line of the ingredients tab, keeping
    # only the part before the first ':' (the ingredient name).
    target_tag = soup.find('div', class_="data item content", id="ingredients.tab")
    if target_tag:
        # Drop the <br> tags before extracting the text; the '\n' separator
        # passed to get_text() then marks the line boundaries.
        for br in target_tag.find_all('br'):
            br.decompose()
        ingredient_text = target_tag.get_text(separator='\n')
        # Filter on the stripped line so whitespace-only lines do not turn
        # into empty ingredient names.
        ingredients = [
            line.split(':')[0].strip()
            for line in ingredient_text.split('\n')
            if line.strip()
        ]
    else:
        ingredients = ["No Ingredients"]

    # Benefits: the <li> items inside the 'Benefits' cell of the spec table.
    benefits = 'No Benefits'
    benefits_td = soup.find('td', class_='col data', attrs={'data-th': 'Benefits'})
    if benefits_td:
        benefit_items = benefits_td.find_all('li')
        if benefit_items:
            benefits = [item.get_text(strip=True) for item in benefit_items]

    # Skin type and size are plain-text cells of the same spec table.
    skin_types = _spec_cell_text(soup, 'Skin Type', 'No Skin Type')
    size = _spec_cell_text(soup, 'Size', 'No Size')

    # How to use: the full text of the usage tab.
    use_div = soup.find('div', class_='data item content', id='usage.tab')
    usage = use_div.get_text(strip=True) if use_div else 'No Usage'

    return {
        'Product Title': product_title,
        'Product Price': price,
        'Key Ingredients': ingredients,
        'Benefits': benefits,
        'Skin Types': skin_types,
        'Size': size,
        'How to Use': usage,
    }

def main():
    """Crawl every URL in ``all_urls`` and collect the results in a DataFrame.

    Returns:
        A pandas DataFrame with one row per URL, in the order of ``all_urls``.
        Each row is built from the dict returned by ``crawl_product_info``.
    """
    records = [crawl_product_info(product_url) for product_url in all_urls]
    return pd.DataFrame(records)

# Run the crawl when this code is executed directly (as a script or as a
# notebook cell). `df` is bound at module level because later cells read it.
if __name__ == '__main__':
    df = main()

In [None]:
# Preview the raw scraped table before any cleaning.
print(df)

                                        Product Title Product Price  \
0   SkinBetter science Trio Luxe Moisture Treatmen...       $175.00   
1   SkinBetter Science sunbetter SHEER SPF 70 Suns...        $75.00   
2                    SkinBetter Science Men's Regimen       $290.00   
3                SkinBetter Science Even Glow Regimen       $420.00   
4                  SkinBetter Science Clarity Regimen       $360.00   
5              SkinBetter Science A-Team Duo Advanced       $335.00   
6              SkinBetter Science A-Team Duo Original       $310.00   
7                   SkinBetter Science Anti-Aging Kit       $555.00   
8   SkinBetter Science sunbetter SHEER SPF 56 Suns...        $55.00   
9                   SkinBetter Science A-Team Duo Kit       $175.00   
10                SkinBetter Science Refresh Trio Kit       $150.00   
11  SkinBetter Science Refresh Cleansing & Detox D...        $70.00   
12  SkinBetter Science Alto Advanced Defense and R...       $100.00   
13  Sk

## Data Cleaning

In [None]:
# Data Cleaning
import re


def _format_ingredients(value):
    """Render one 'Key Ingredients' entry as plain comma-separated text.

    The scraper stores the ingredients as a list of strings, so a
    string-only cleanup would never touch them. Lists are joined with ', '.
    Strings (for example a column that was already stringified) have their
    list punctuation removed. Anything else is returned unchanged.
    """
    if isinstance(value, list):
        return ', '.join(value)
    if isinstance(value, str):
        return value.replace('[', '').replace(']', '').replace("'", '').replace('"', '')
    return value


# Work on a copy so the raw scrape in `df` stays available for comparison.
df2 = df.copy()
# Drop the brand prefix in both capitalisations handled here.
df2['Product Title'] = df2['Product Title'].str.replace('SkinBetter science ', '', regex=False)
df2['Product Title'] = df2['Product Title'].str.replace('SkinBetter Science ', '', regex=False)
# Remove the parenthesised note from the title, then trim leftover whitespace.
df2['Product Title'] = df2['Product Title'].str.replace(r"\(.*\)", "", regex=True).str.strip()
# '$' is a regex metacharacter; regex=False makes the literal match explicit
# instead of relying on the pandas default, which differs between versions.
df2['Product Price'] = df2['Product Price'].str.replace('$', '', regex=False)
df2['Key Ingredients'] = df2['Key Ingredients'].apply(_format_ingredients)
# `df` was built by iterating `all_urls`, so the rows line up one-to-one.
df2['Product URLs'] = all_urls

In [None]:
# Preview the cleaned table.
print(df2)

                                    Product Title Product Price  \
0                    Trio Luxe Moisture Treatment        175.00   
1         sunbetter SHEER SPF 70 Sunscreen Lotion         75.00   
2                                   Men's Regimen        290.00   
3                               Even Glow Regimen        420.00   
4                                 Clarity Regimen        360.00   
5                             A-Team Duo Advanced        335.00   
6                             A-Team Duo Original        310.00   
7                                  Anti-Aging Kit        555.00   
8          sunbetter SHEER SPF 56 Sunscreen Stick         55.00   
9                                  A-Team Duo Kit        175.00   
10                               Refresh Trio Kit        150.00   
11              Refresh Cleansing & Detox Duo Kit         70.00   
12         Alto Advanced Defense and Repair Serum        100.00   
13         Alto Advanced Defense and Repair Serum        195.0

## Download Excel file

In [None]:
# Write the cleaned table to an Excel workbook, then ask Colab to download it.
# `google.colab` is imported here, so this cell needs a Colab runtime.
from google.colab import files

excel_name = 'Skinbetter_products.xlsx'
df2.to_excel(excel_name, index=False)
files.download(excel_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>