In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [None]:
# Extract product title
def get_title(soup) -> str:
    """Return the product title found on a parsed product page.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The first child of the title ``<h1>`` with surrounding whitespace
        removed, or ``''`` when the heading is missing or has no children.
    """
    try:
        # Query the page that was passed in rather than a notebook-level
        # global, so the result does not depend on leftover kernel state.
        title_element = soup.find('h1', attrs={'class':'product_title entry-title elementor-heading-title elementor-size-default'})
        product_title = title_element.contents[0] if title_element.contents else ''
        title_string = product_title.strip()
    except AttributeError:
        # e.g. find() returned None because the heading is absent
        title_string = ''
    return title_string

# Extract product description
def get_description(soup) -> str:
    """Return the product description found on a parsed product page.

    The text of every ``<p>`` in the description widget, followed by the text
    of every ``<li>``, is joined with single spaces.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The joined description with surrounding whitespace removed, or ``''``
        when the description widget or its container is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        description_parent = soup.find('div', attrs={'class':'elementor-element elementor-element-fcab107 elementor-widget elementor-widget-woocommerce-product-content'})
        description_container = description_parent.find('div', attrs={'class':'elementor-widget-container'})
        # Paragraphs first, then bullet points (not document order).
        description_nodes = description_container.find_all('p') + description_container.find_all('li')
        description_string = ' '.join(node.text for node in description_nodes).strip()
    except AttributeError:
        # e.g. one of the find() calls returned None
        description_string = ''
    return description_string

# Extract usage instructions
def get_instructions(soup) -> str:
    """Return the usage instructions found on a parsed product page.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the instructions widget container as produced by
        ``get_text(strip=True)``, or ``''`` when the widget is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        instructions_parent = soup.find('div', attrs={'class':'elementor-element elementor-element-dac7d87 elementor-widget elementor-widget-text-editor'})
        instructions_container = instructions_parent.find('div', attrs={'class':'elementor-widget-container'})
        instructions_string = instructions_container.get_text(strip=True)
    except AttributeError:
        # e.g. one of the find() calls returned None
        instructions_string = ''
    return instructions_string

# Extract product properties/effects
def get_properties(soup) -> str:
    """Return the product properties/effects as a comma-separated string.

    The first ``<p>`` of the short-description widget is read, the separators
    `` – ``, `` - `` and `` • `` are replaced by single spaces, the text is
    split on spaces and every resulting word is capitalised.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The capitalised words joined with ``', '``, or ``''`` when any part of
        the expected markup is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        properties_parent = soup.find('div', attrs={'class':'elementor-element elementor-element-25acc34 elementor-widget elementor-widget-woocommerce-product-short-description'})
        properties_container_1 = properties_parent.find('div', attrs={'class':'elementor-widget-container'})
        properties_container_2 = properties_container_1.find('div', attrs={'class':'woocommerce-product-details__short-description'})
        properties = properties_container_2.find('p')
        properties_string = properties.text.strip()
        # NOTE(review): splitting on single spaces also breaks multi-word
        # phrases into separate entries - confirm this is intended.
        words = properties_string.replace(' – ', ' ').replace(' - ', ' ').replace(' • ', ' ').split(' ')
        properties_string = ', '.join(word.capitalize() for word in words)
    except AttributeError:
        # e.g. one of the find() calls returned None
        properties_string = ''
    return properties_string

# Extract product ingredients
def get_ingredients(soup) -> str:
    """Return the product ingredients as a comma-separated string.

    The text of the first matching heading ``<span>`` is read, the
    ``'Ingredients:\\n '`` prefix and every ``'.'`` are removed, each
    ``', '``-separated item is capitalised, and ``' ((*)) '`` markers are
    dropped from the final string.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The cleaned ingredient list, or ``''`` when the span is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        ingredients = soup.find('span', attrs={'class':'elementor-heading-title elementor-size-default'}).text
        ingredients_string = ingredients.replace('Ingredients:\n ', '').replace('.', '').strip()
        words = ingredients_string.split(', ')
        ingredients_string = ', '.join(word.capitalize() for word in words)
        # str.replace returns a new string; the result must be assigned back,
        # otherwise the marker is never actually removed.
        ingredients_string = ingredients_string.replace(' ((*)) ', '')
    except AttributeError:
        # e.g. find() returned None because the span is absent
        ingredients_string = ''
    return ingredients_string

# Extract product's intended skin defect
def get_defect(soup) -> str:
    """Return the skin defect the product is intended for.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the heading ``<p>`` inside the defect widget with
        surrounding whitespace removed, or ``''`` when any part of the
        expected markup is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        defect_parent = soup.find('div', attrs={'class':'elementor-element elementor-element-3148c74 elementor-widget elementor-widget-heading'})
        defect_container = defect_parent.find('div', attrs={'class':'elementor-widget-container'})
        defect = defect_container.find('p', attrs={'class':'elementor-heading-title elementor-size-default'})
        defect_string = defect.text.strip()
    except AttributeError:
        # e.g. one of the find() calls returned None
        defect_string = ''
    return defect_string

# Extract product's intended skin type
def get_skin_type(soup) -> str:
    """Return the skin type the product is intended for.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the heading ``<p>`` inside the skin-type widget with
        surrounding whitespace removed, or ``''`` when any part of the
        expected markup is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        skin_type_parent = soup.find('div', attrs={'class':'elementor-element elementor-element-725a496 elementor-widget elementor-widget-heading'})
        skin_type_container = skin_type_parent.find('div', attrs={'class':'elementor-widget-container'})
        skin_type = skin_type_container.find('p', attrs={'class':'elementor-heading-title elementor-size-default'})
        skin_type_string = skin_type.text.strip()
    except AttributeError:
        # e.g. one of the find() calls returned None
        skin_type_string = ''
    return skin_type_string

# Extract product texture
def get_texture(soup) -> str:
    """Return the product texture, capitalised.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the heading ``<p>`` inside the texture widget, stripped
        and passed through ``str.capitalize`` (first character upper-case,
        the rest lower-case), or ``''`` when any part of the expected markup
        is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        texture_parent = soup.find('div', attrs={'class':'elementor-element elementor-element-a85633f elementor-widget elementor-widget-heading'})
        texture_container = texture_parent.find('div', attrs={'class':'elementor-widget-container'})
        texture = texture_container.find('p', attrs={'class':'elementor-heading-title elementor-size-default'})
        texture_string = texture.text.strip().capitalize()
    except AttributeError:
        # e.g. one of the find() calls returned None
        texture_string = ''
    return texture_string

# Extract product format
def get_format(soup) -> str:
    """Return the product format read from the attributes table.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the first ``<p>`` in the value cell of the format row with
        surrounding whitespace removed, or ``''`` when any part of the
        expected markup is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        format_parent = soup.find('tr', attrs={'class':'woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_formato'})
        format_container = format_parent.find('td', attrs={'class':'woocommerce-product-attributes-item__value'})
        # Named `format_element` so the builtin `format` is not shadowed.
        format_element = format_container.find('p')
        format_string = format_element.text.strip()
    except AttributeError:
        # e.g. one of the find() calls returned None
        format_string = ''
    return format_string

# Extract product price (EUR)
def get_price(soup) -> str:
    """Return the product price (EUR) as a string with a dot decimal separator.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the price ``<p>`` with the euro sign and non-breaking
        spaces removed, commas replaced by dots and surrounding whitespace
        stripped, or ``''`` when the price element is missing.
    """
    try:
        # NOTE(review): an earlier comment here suggested the class
        # 'gt-block price' for English pages - confirm which one applies.
        # Use the `soup` argument, not a notebook global.
        price = soup.find('p', attrs={'class':'price'}).text
        price_string = price.replace('€', '').replace('\xa0', '').replace(',', '.').strip()
    except AttributeError:
        # e.g. find() returned None because the price element is absent
        price_string = ''
    return price_string

# Extract product quantity (ml)
def get_quantity(soup) -> str:
    """Return the product quantity (ml) read from the attributes table.

    Args:
        soup: Parsed product page; any object exposing BeautifulSoup's
            ``find`` interface works.

    Returns:
        The text of the value cell of the quantity row with surrounding
        whitespace removed, or ``''`` when the row or cell is missing.
    """
    try:
        # Use the `soup` argument, not a notebook global.
        quantity_parent = soup.find('tr', attrs={'class':'woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_quantita-in-ml'})
        quantity = quantity_parent.find('td', attrs={'class':'woocommerce-product-attributes-item__value'})
        quantity_string = quantity.text.strip()
    except AttributeError:
        # e.g. one of the find() calls returned None
        quantity_string = ''
    return quantity_string

# Scrape English dataset

In [None]:
# define user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
           'Accept-Language': 'en'}

# catalogue entry point and per-request timeout in seconds
# (without a timeout, requests can wait on an unresponsive server indefinitely)
BASE_URL = 'https://www.dorabruschi.com/en/cosmetica/'
REQUEST_TIMEOUT = 30

# connect to the target URL
listing_page = requests.get(BASE_URL, headers=headers, timeout=REQUEST_TIMEOUT)

# parse byte content to html, store in soup object
soup = BeautifulSoup(listing_page.content, 'html.parser')

# fetch pagination links as list of tag objects
pages = soup.find_all('a', attrs={'class':'page-numbers'})
if pages:
    pages.pop() # remove 'next page' arrow (guarded: a single-page catalogue has no pagination links)

# first page first, then the paginated URLs; anchors without an href and
# repeated URLs are skipped so no page is requested twice
pages_list = list(dict.fromkeys(
    [BASE_URL] + [anchor.get('href') for anchor in pages if anchor.get('href')]
))

# one extractor per output column; the dict order defines the column order
extractors = {
    'title': get_title,
    'description': get_description,
    'usage_instructions': get_instructions,
    'properties': get_properties,
    'ingredients': get_ingredients,
    'intended_defect': get_defect,
    'skin_type': get_skin_type,
    'texture': get_texture,
    'format': get_format,
    'price': get_price,
    'quantity': get_quantity,
}

# define attributes dictionary
attributes = {column: [] for column in extractors}

# extract product attributes from each link of each page
for page_url in pages_list:
    # connect to the target URL
    page = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')

    # fetch product links as list of tag objects
    links = soup.find_all('a', attrs={'class':'woocommerce-LoopProduct-link woocommerce-loop-product__link'})

    # extract links from tag objects, skipping anchors without an href
    links_list = [link.get('href') for link in links if link.get('href')]

    for link in links_list:
        new_page = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        new_soup = BeautifulSoup(new_page.content, 'html.parser')

        # every column gets exactly one value per product, keeping the lists aligned
        for column, extract in extractors.items():
            attributes[column].append(extract(new_soup))

# create dataframe
dorabruschi_df = pd.DataFrame.from_dict(attributes)

In [None]:
# Drop the gift-kit bundle row, then display the remaining products.
is_regular_product = dorabruschi_df['title'].ne("Gift Kit Get 2 Pay 1")
dorabruschi_df = dorabruschi_df.loc[is_regular_product]
dorabruschi_df

Unnamed: 0,title,description,usage_instructions,properties,ingredients,intended_defect,skin_type,texture,format,price,quantity
0,ACE 10% multivitamin concentrate,This rapidly absorbed concentrated treatment e...,Apply a few drops of concentrate in the mornin...,"Anti-wrinkle, Antioxidant, Illuminating","Aqua [water], Glycerin, Tocopheryl acetate, Pr...",Wrinkle,All types of skin,Liquid,Dropper,46.00,30
1,Revitalizing multivitamin cream,"Cream with a velvety and light texture, design...",Apply in the morning and/or in the evening to ...,"Anti-wrinkle, Antioxidant, Illuminating","Aqua [water], Glycerin, Cetyl alcohol, Capryli...",Wrinkle,All types of skin,Velvety cream,airless,49.00,50
2,Smoothing renewing cream,"Cream with a velvety and light texture, it is ...",Apply in the evening to perfectly cleansed ski...,"Anti-wrinkle, Renewing, Illuminating","Aqua [water], Peg-6 stearate, Glycolic acid, C...",Wrinkle,All types of skin,Velvety cream,airless,45.00,50
4,Acne roll-on lotion,Moderately alcoholic invisible lotion with a p...,Apply with the appropriate roll-on directly on...,"Purifying, Astringent","Aqua [water], Alcohol, Glycerin, Salicylic aci...",Purifying,Acne tendency,Liquid,roll-on,22.00,10
5,Acne paste,Paste for a quick and effective treatment of p...,Apply 1-2 times a day on pimples and impuritie...,"Purifying, Anti-imperfections","Paraffinum liquidum [mineral oil], Zinc oxide,...",Purifying,Acne tendency,Thick paste,Tubo,26.00,30
6,Micellar water,Micellar water with a delicate formula enriche...,Wet a pad with a little product and pass gentl...,"Cleanses, Removes, Make-up, Hydrate","Aqua [water], Ethoxydiglycol, Caprylyl/capryl ...",Detergent,All types of skin,Liquid,Dosing bottle,19.00,200
7,Anti-wrinkle cream K,"Cream with a very rich texture, identical to t...",Apply in the evening to perfectly cleansed ski...,"Anti-wrinkle, Emollient","Aqua [water], Prunus amygdalus dulcis (sweet a...",Wrinkle,All types of skin,Thick cream,Jar,55.00,50
8,Smoothing foot balm,Foot balm carefully formulated to reduce the t...,Apply to particularly thickened or calloused a...,"Intensive, Emollient, Against, Calls, And, Corns","Aqua [water], Urea, Glycerin, Theobroma cocoa ...",Emollient,All types of skin,Emollient cream,Tubo,19.00,30
9,Restructuring hand balm,"""SOS"" shock treatment for dehydrated hands bas...","Apply in the evening, massaging until complete...","Nourishing, Repair","Aqua [water], Glycerin, Paraffinum liquidum [m...",nourisher,All types of skin,Velvety cream,Tubo,15.00,30
10,Delicate sebum-balancing cleansing base,Extremely delicate washing base based on softe...,Apply a small amount of product to a damp face...,"Cleanses, Rebalances","Aqua [water], Glycerin, Disodium cocoamphodiac...",Detergent,Acne tendency,Foaming gel,Flip top bottle,22.00,165


In [None]:
# Write the product table to disk twice, as CSV and as an Excel workbook,
# leaving the DataFrame index out of both files.
csv_path, xlsx_path = 'dorabruschi_products.csv', 'dorabruschi_products.xlsx'
dorabruschi_df.to_csv(csv_path, index=False)
dorabruschi_df.to_excel(xlsx_path, index=False, engine='openpyxl')

In [None]:
# Colab only: send both exported files to the browser as downloads.
from google.colab import files

for exported_file in ('dorabruschi_products.csv', 'dorabruschi_products.xlsx'):
    files.download(exported_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>