In [1]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from pytubefix.contrib.search import Search
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
from time import sleep
import numpy as np
import os

from itertools import islice

In [2]:
def get_category_links(url, base_url, class_name):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    cat_list = soup.find('div', class_ = class_name)
    links = [base_url+a.get('href') for a in cat_list.findAll('a')]
    cats_title = [a.text.strip() for a in cat_list.findAll('a')]
    return zip(links, cats_title)

def get_item_df(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #* Título
    title = soup.find('h1').get_text(strip=True) if soup.find('h1') else None
    #* Opiniones
    opinion = soup.find('span', class_='stamped-badge-caption')
    op_count = opinion.get('data-reviews') if opinion else np.nan
    op_rating = opinion.get('data-rating') if opinion else np.nan
    #* Precio
    price = soup.find('span', class_ = 'current-price theme-money').text.replace('€', '')
    #*Color
    color_options = []
    color_container = soup.find('div', class_='option-selector--swatch')
    if color_container:
        color_options = [span.get_text(strip=True) for span in color_container.find_all('span') if span.get_text(strip=True)]
    if len(color_options) == 0:
        color_options = np.nan
    #* Tallas
    size_options = []
    for tag in soup.find_all(['span', 'button']):
        text = tag.get_text(strip=True)
        # Filtra los tamaños esperados
        if text in ['XS', 'S', 'M', 'L', 'XL', 'XXL']:
            size_options.append(text)
    if len(size_options) == 0:
        size_options = np.nan

    #* Impacto
    try:
        impact = soup.find("summary", class_ = 'cc-accordion-item__title', string = 'Impacto ambiental').find_parent().text 
        co2 = re.search(r"(\d{1,3}(?:\.\d{3})*(?:,\d+)?)\s*kg de emisiones de CO2", impact)
        co2 = co2.group(1).replace(",", ".") if co2 else np.nan

        agua = re.search(r"(\d{1,3}(?:\.\d{3})*(?:,\d+)?)\s*litros de agua", impact)
        agua = agua.group(1).replace(",", ".") if agua else np.nan

        energia = re.search(r"(\d{1,3}(?:\.\d{3})*(?:,\d+)?)\s*kWh de energía", impact)
        energia = energia.group(1).replace(",", ".") if energia else np.nan
    except:
        co2, agua, energia = [np.nan]*3
    #! Producto total
    product_details = {
    'Nombre': title,
    'Opiniones': op_count,
    'Rating': op_rating,
    'Precio': price,
    'Opciones de color': color_options,
    'Tamaños': size_options,
    'CO2' : co2,
    'Agua': agua,
    "Energia": energia
    }
    df_producto = pd.DataFrame([product_details])
    return df_producto
def get_prod_links(url, base_url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    productos = soup.find('div', class_='filters-adjacent collection-listing')
    productos_links = [base_url+a.find('a').get('href') for a in productos.findAll('div', class_ = 'product-info')]
    return productos_links

In [5]:
cat = 'navigation__tier-1-container'
url_hombre = 'https://minimalismbrand.com/collections/ropa-minimalism'
base_minimalism = 'https://minimalismbrand.com'
subcat_cl = 'gallery gallery--height-fixed gallery--grid-4'
subsubcat_cl = 'collection-links-wrapper'

df_productos = pd.DataFrame()
categorias = get_category_links(url = base_minimalism, class_name=cat, base_url = base_minimalism)
# print(categorias)
for cat_link, cat_name in islice(categorias, 0,4):
    print("Scrapeando")
    print(cat_name)
    # print("Superlink:")
    # print(cat_link)
    try:
        subcats = get_category_links(url = cat_link, base_url=base_minimalism, class_name=subcat_cl)
    except:
        # print("No hay subcategorías")
        # subcats = [cat_link]
        # subcat_name = np.nan
        subcats = zip([cat_link], [np.nan])
    for subcat_link, subcat_name in subcats:
        # print(subcat_name)
        if subcat_name == 'Jerseis':
            if cat_name == 'Hombre':
                subcat_link = 'https://minimalismbrand.com/collections/sweater-men'
            elif cat_name == 'Mujer':
                subcat_link = 'https://minimalismbrand.com/collections/sweater-women'
        # print("Link principal")
        # print(link)
        try:
            subsubcats= get_category_links(url = subcat_link, base_url=base_minimalism, class_name=subsubcat_cl)
            # print("Sublinks:")
            # print(subsubcats)
            # print()
        except Exception as e:
            subsubcats = zip([subcat_link],[np.nan])
            # print(e)
            pass
        for subsubcat_link, subsubcat_name in subsubcats:
            # print(subsubcat_name)
            try:
                prod_links = get_prod_links(url = subsubcat_link, base_url=base_minimalism)
            except:
                prod_links = [subsubcat_link]
            for prod_link in prod_links:
                df_elemento = get_item_df(prod_link)
                df_elemento["categoria"] = cat_name
                df_elemento["subcategoria"] = subcat_name
                df_elemento["subsubcategoria"] = subsubcat_name
                df_productos = pd.concat([df_productos,df_elemento])
df_productos.reset_index(drop=True, inplace=True)

Scrapeando
Hombre
Scrapeando
Mujer
Scrapeando
Niño
Scrapeando
Mochilas


In [19]:
df_productos[df_productos.drop(columns=["Opciones de color","Tamaños", "categoria"]).duplicated(keep=False)].to_csv("datos/duplicados_mini.csv")

In [32]:
dupl_prod_id = df_productos[df_productos.drop(columns=["Opciones de color","Tamaños"]).duplicated()].index
df_productos.drop(index=dupl_prod_id, inplace=True)

In [None]:
df_productos.reset_index(drop=True, inplace=True)

In [42]:
unisex_mini_ids = df_productos[df_productos.drop(columns=["Opciones de color","Tamaños","categoria"]).duplicated(keep=False)].index
df_productos.loc[unisex_mini_ids, "categoria"] = "Unisex"

In [50]:
duplicated_unisex_id = df_productos[df_productos.drop(columns=["Opciones de color","Tamaños"]).duplicated()].index
df_productos.drop(index=duplicated_unisex_id, inplace=True)
df_productos.reset_index(drop=True, inplace=True)

In [60]:
df_productos.columns = [col.lower().replace(" ", '_') for col in df_productos.columns]
df_productos.head(3)

Unnamed: 0,nombre,opiniones,rating,precio,opciones_de_color,tamaños,co2,agua,energia,categoria,subcategoria,subsubcategoria
0,Camiseta algodón orgánico - Pack 3 uds,347,48,72.0,,,7.2,1.845,11.4,Hombre,Camisetas,Packs de camisetas
1,Camiseta algodón orgánico - Pack 5 uds,347,48,115.0,,,12.0,3.075,19.0,Hombre,Camisetas,Packs de camisetas
2,Camiseta algodón orgánico - Pack 7 uds,347,48,154.0,,,16.8,4.305,26.6,Hombre,Camisetas,Packs de camisetas


In [62]:
df_productos.to_excel("datos/minimalism.xlsx")

In [None]:
# df_productos.reset_index().drop(columns=["Opciones de color", "Tamaños"]).drop_duplicates()

In [64]:
driver = webdriver.Chrome()

driver.get(url = 'https://ecoalf.com/collections/sports-woman')
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(3)
women_html = driver.find_element(By.CSS_SELECTOR, "ul.grid.negative-margin.product-grid").get_attribute("innerHTML")
driver.quit()

In [65]:
driver = webdriver.Chrome()

driver.get(url = 'https://ecoalf.com/collections/sports-man')
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(3)
men_html = driver.find_element(By.CSS_SELECTOR, "ul.grid.negative-margin.product-grid").get_attribute("innerHTML")
driver.quit()


In [66]:
women_soup = BeautifulSoup(women_html, "html.parser")
men_soup = BeautifulSoup(men_html, "html.parser")

In [67]:
women_prods = ["https://ecoalf.com"+e.find('a').get('href') for e in women_soup.findAll('li')]
men_prods = ["https://ecoalf.com"+e.find('a').get('href') for e in men_soup.findAll('li')]

In [68]:
def get_product_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    litros_agua = np.nan
    kg_co2 = np.nan

    #* Impacto
    reports = soup.find_all('div', class_='accordion__content-report')
    # Variables para almacenar los resultados
    litros_agua = None
    co2_ahorro = None

    for report in reports:
        # Busca si el texto contiene 'litros de agua' o 'emisiones CO2' y extrae los valores
        texto = report.get_text()
        # print(texto)
        if 'litros de agua utilizados' in texto:
            # print(texto)
            agua = float(report.find('strong').text.strip().replace(',','.'))
        elif 'kg CO2 eq generados' in texto:
            # print(texto)
            co2 = float(report.find('strong').text.strip().replace(',','.'))  # Extrae el ahorro de CO2

    #* Nombre
    name = soup.find('h1', 'product__title h4').text.strip().capitalize()
    #* Precio
    price = soup.find('div', class_ = 'price__regular price-item price-item--regular').text.strip().replace('€','')
    #* Colores
    colors = [c.text.strip().capitalize() for c in soup.find('div', 'product__colors rte').findAll('a')] if soup.find('div', 'product__colors rte') else np.nan
    #* Tallas
    sizes = [s.get('data-get-size') for s in soup.find('fieldset', class_ = 'js product-form__input input--talla').findAll('label')] if soup.find('fieldset', class_ = 'js product-form__input input--talla') else np.nan

    product_info = {"Nombre" : name,
                    "Precio" : price,
                    "Colores" : colors,
                    "Tallas" : sizes}
    return pd.DataFrame([product_info])

df_ecoalf_men = pd.DataFrame()
df_ecoalf_women = pd.DataFrame()
try:
    for link in tqdm(men_prods):
        df_ecoalf_men = pd.concat([df_ecoalf_men,get_product_info(link)])
    for link in tqdm(women_prods):
        df_ecoalf_women = pd.concat([df_ecoalf_women,get_product_info(link)])
    df_ecoalf_men["Categoria"] = "Hombre"
    df_ecoalf_women["Categoria"] = "Mujer"
    df_ecoalf_men["Link"] = men_prods
    df_ecoalf_women["Link"] = women_prods
except Exception as e:
    print(link)
    print(e)

df_ecoalf = pd.concat([df_ecoalf_men, df_ecoalf_women]).reset_index(drop=True)

100%|██████████| 79/79 [00:59<00:00,  1.34it/s]
100%|██████████| 72/72 [00:53<00:00,  1.34it/s]


In [90]:
unisex_ids = df_ecoalf[df_ecoalf["Link"].duplicated(keep=False)].index
df_ecoalf.loc[unisex_ids, "Categoria"] = "Unisex"

In [91]:
duplicated_ids = df_ecoalf[df_ecoalf["Link"].duplicated()].index
df_ecoalf.drop(index=duplicated_ids, inplace=True)
df_ecoalf.reset_index(drop = True, inplace=True)

In [92]:
df_ecoalf.columns = [col.lower() for col in df_ecoalf.columns]
df_ecoalf.head(2)

Unnamed: 0,nombre,precio,colores,tallas,categoria,link
0,Top bombay naranja,5520,"[Black, Darkorange]","[S, M, L, XL]",Hombre,https://ecoalf.com/products/top-bombay-naranja
1,Pantalones cortos barcelona cactus,7920,"[Cactus, Black]","[S, M, L, XL]",Hombre,https://ecoalf.com/products/barcelonaalf-short...


In [99]:
df_ecoalf.to_excel("datos/ecoalf.xlsx")

In [153]:
response = requests.get('https://blaugab.com/shop/legging-interior-100-algodon-organico-hombre-1361?category=255#attr=5243')
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
def obtener_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    details = soup.find('div', {"id": "product_details"})
    name = details.find("h1", itemprop = "name").text
    price = details.find("span", itemprop = "price").text
    sizes = [o.text.strip().split()[0] for o in details.find('ul').findAll('option')] if details.find('ul') else np.nan
    subcat = soup.findAll('li', class_ = "breadcrumb-item")[1].text.strip()
    product_info = {"Nombre" : name,
                    "Precio" : price,
                    "Tallas" : sizes,
                    "Subcategoria" : subcat}
    
    return pd.DataFrame([product_info])

obtener_info('https://blaugab.com/es/shop/camiseta-termica-lana-merino-y-seda-mujer-121?category=213#attr=1032,75')

Unnamed: 0,Nombre,Precio,Tallas,Subcategoria
0,Camiseta térmica lana merino y seda Mujer,58.0,"[XS, S, M, L]",Ropa interior sostenible


In [None]:
lista_nombres=["mujer-201","hombre-253","bebe-296","sin-tintes-333","cuidado-hogar-314","nino-a-286"]
categorias = ["mujer","hombre","bebe","sin-tintes","cuidado-hogar","nino"]

lista_urls=[]
lista_categorias = []
for i in range(1,21):
    for nombre in lista_nombres:
        url=f"https://www.blaugab.com/es/shop/category/{nombre}/page/{i}"
        lista_urls.append(url)
        lista_categorias.append(nombre)



# Ejemplo de uso
categorias_paginas = {
    "mujer-201": 20,
    "hombre-253": 8,
    "bebe-296": 6,
    "nino-a-286":7,
    "sin-tintes-333": 3,
    "cuidado-hogar-314": 3
}
def filtrar_urls_por_categoria_y_pagina(urls, categorias_paginas, categorias_lista):
    urls_filtradas = []
    lista_categorias_filtrada = []
    i = -1
    for url in urls:
        i +=1
        # print(url)
        # Revisa si la URL contiene una categoría con un límite de páginas
        mantener_url = True
        for categoria, max_pagina in categorias_paginas.items():
            if categoria in url:
                # Extrae el número de la página desde el final de la URL
                numero_pagina = int(url.split('/')[-1].split('/')[-1].replace('page/', ''))
                if numero_pagina > max_pagina:
                    mantener_url = False
                    break
        if mantener_url:
            urls_filtradas.append(url)
            lista_categorias_filtrada.append(categorias_lista[i])
    return urls_filtradas, lista_categorias_filtrada

lista_urls_filtrada, lista_categorias_filtrada=filtrar_urls_por_categoria_y_pagina(lista_urls,categorias_paginas,lista_categorias)


lista_sopas=[]
for url in tqdm(lista_urls_filtrada):
    sopa=BeautifulSoup(requests.get(url).content)
    lista_sopas.append(sopa)



df_blaugab = pd.DataFrame()
i = -1
for soup in tqdm(lista_sopas):
    i += 1
    product_links=['https://www.blaugab.com/es' + e.find('a').get('href') for e in soup.findAll('h6', class_ = "o_wsale_products_item_title mb-2")]
    for link in tqdm(product_links):
        # try:
            df_categ = obtener_info(link)
            df_categ["Categoria"] = lista_categorias_filtrada[i]
            df_blaugab = pd.concat([df_blaugab, df_categ])
        # except Exception as e:
            # print(link)


In [50]:
cat_map = {
"mujer-201": "mujer",
"hombre-253": "hombre",
"bebe-296": "bebe",
"sin-tintes-333": "sin-tintes",
"cuidado-hogar-314": "cuidado-hogar",
"nino-a-286": "nino-a"
}

In [52]:
df_blaugab["Categoria"] = df_blaugab["Categoria"].map(cat_map)

In [55]:
df_blaugab.reset_index(drop=True, inplace=True)

In [57]:
df_blaugab.to_excel("datos/blaugab.xlsx")

In [56]:
df_blaugab.sample(5)

Unnamed: 0,Nombre,Precio,Tallas,Subcategoria,Categoria
151,Pantalón algodón orgánico SIBILA,89.99,"[S, M, L, XS]",Ropa casual,mujer
349,Braga sin costuras algodón orgánico Panty,19.95,"[36, 38, 40, 42]",Braga sin costuras algodón orgánico Panty,mujer
358,Chaqueta lana merino 100% lana virgen unisex,250.0,"[S, M, L]",Mujer,mujer
452,"Camiseta algodón orgánico, Tirantes Finos, Modern",22.95,"[36, 38, 40, 42, 44, 46]","Camiseta algodón orgánico, Tirantes Finos, Modern",mujer
234,"Body térmico bebé, lana merino y seda",26.05,"[50/56, 62/68, 74/80, 86/92, 98/104, 110/116]",Bebé,bebe
