In [1]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
from pytubefix.contrib.search import Search
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
from time import sleep
import numpy as np
import os

from itertools import islice

In [2]:
def get_category_links(url, base_url, class_name, timeout=30):
    """Collect the links listed inside a navigation/category container.

    Downloads ``url``, locates the first ``<div>`` whose class matches
    ``class_name`` and pairs each anchor's link with its visible text.

    Args:
        url: Page to download.
        base_url: Prefix prepended to every anchor's ``href``.
        class_name: Class string of the ``<div>`` that wraps the links.
        timeout: Seconds to wait for the HTTP response before giving up
            (without it ``requests`` can block indefinitely).

    Returns:
        A single-use ``zip`` iterator of ``(link, title)`` tuples.

    Raises:
        AttributeError: If the page has no ``<div>`` with ``class_name``.
            The type matches what the lookup on ``None`` used to raise, so
            callers that use the failure as a "no sub-levels" signal keep working.
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    cat_list = soup.find('div', class_=class_name)
    if cat_list is None:
        raise AttributeError(f"no <div class='{class_name}'> found in {url}")
    # Walk the anchors once and ignore those without an href, which would
    # otherwise break the string concatenation below.
    anchors = [a for a in cat_list.find_all('a') if a.get('href')]
    links = [base_url + a.get('href') for a in anchors]
    cats_title = [a.text.strip() for a in anchors]
    return zip(links, cats_title)

def _es_number(raw):
    """Convert a Spanish-formatted figure ('1.234,5') to a dot-decimal string ('1234.5')."""
    # The impact regex only admits dots as thousands separators, so they are
    # removed before the decimal comma is turned into a dot.
    return raw.replace(".", "").replace(",", ".")


def _impact_metric(text, unit):
    """Return the figure that precedes ``unit`` in ``text`` as a string, or NaN if absent."""
    match = re.search(r"(\d{1,3}(?:\.\d{3})*(?:,\d+)?)\s*" + unit, text)
    return _es_number(match.group(1)) if match else np.nan


def get_item_df(url, timeout=30):
    """Scrape one product page into a single-row DataFrame.

    Args:
        url: Product page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Nombre``,
        ``Opiniones``, ``Rating``, ``Precio``, ``Opciones de color``,
        ``Tamaños``, ``CO2``, ``Agua`` and ``Energia``. Any field that cannot
        be found on the page is ``NaN`` (``None`` for the title). The impact
        figures are returned as dot-decimal strings.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Title
    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading else None

    # Reviews
    opinion = soup.find('span', class_='stamped-badge-caption')
    op_count = opinion.get('data-reviews') if opinion else np.nan
    op_rating = opinion.get('data-rating') if opinion else np.nan

    # Price (a page without a price span yields NaN instead of crashing)
    price_tag = soup.find('span', class_='current-price theme-money')
    price = price_tag.text.replace('€', '') if price_tag else np.nan

    # Colours
    color_options = []
    color_container = soup.find('div', class_='option-selector--swatch')
    if color_container:
        for span in color_container.find_all('span'):
            label = span.get_text(strip=True)
            if label:
                color_options.append(label)
    if len(color_options) == 0:
        color_options = np.nan

    # Sizes: keep only tags whose whole text is one of the expected sizes
    size_options = []
    for tag in soup.find_all(['span', 'button']):
        text = tag.get_text(strip=True)
        if text in ['XS', 'S', 'M', 'L', 'XL', 'XXL']:
            size_options.append(text)
    if len(size_options) == 0:
        size_options = np.nan

    # Environmental impact: read from the accordion item titled 'Impacto ambiental'
    co2 = agua = energia = np.nan
    summary = soup.find("summary", class_='cc-accordion-item__title', string='Impacto ambiental')
    section = summary.find_parent() if summary else None
    if section is not None:
        impact = section.text
        co2 = _impact_metric(impact, "kg de emisiones de CO2")
        agua = _impact_metric(impact, "litros de agua")
        energia = _impact_metric(impact, "kWh de energía")

    # Full product record
    product_details = {
        'Nombre': title,
        'Opiniones': op_count,
        'Rating': op_rating,
        'Precio': price,
        'Opciones de color': color_options,
        'Tamaños': size_options,
        'CO2': co2,
        'Agua': agua,
        "Energia": energia
    }
    return pd.DataFrame([product_details])
def get_prod_links(url, base_url, timeout=30):
    """Return the product links found on a collection listing page.

    Args:
        url: Collection page to download.
        base_url: Prefix prepended to every product ``href``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of product URLs, one per ``product-info`` card that has a link.

    Raises:
        AttributeError: If the page has no
            ``<div class="filters-adjacent collection-listing">``. The type
            matches what the lookup on ``None`` used to raise.
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    productos = soup.find('div', class_='filters-adjacent collection-listing')
    if productos is None:
        raise AttributeError(f"no product listing found in {url}")
    productos_links = []
    for card in productos.find_all('div', class_='product-info'):
        anchor = card.find('a')
        href = anchor.get('href') if anchor else None
        # Skip cards without a usable link instead of failing the whole listing.
        if href:
            productos_links.append(base_url + href)
    return productos_links

In [5]:
cat = 'navigation__tier-1-container'
url_hombre = 'https://minimalismbrand.com/collections/ropa-minimalism'
base_minimalism = 'https://minimalismbrand.com'
subcat_cl = 'gallery gallery--height-fixed gallery--grid-4'
subsubcat_cl = 'collection-links-wrapper'

# One single-row frame per scraped product. They are concatenated once after
# the crawl instead of re-copying df_productos on every iteration.
product_frames = []
categorias = get_category_links(url=base_minimalism, class_name=cat, base_url=base_minimalism)
# Only the first four top-level navigation entries are crawled.
for cat_link, cat_name in islice(categorias, 0, 4):
    print("Scrapeando")
    print(cat_name)
    try:
        subcats = get_category_links(url=cat_link, base_url=base_minimalism, class_name=subcat_cl)
    except Exception:
        # Any failure here is treated as "no subcategories": the category page
        # itself becomes its only (unnamed) subcategory. Catching Exception
        # rather than everything lets a kernel interrupt stop the crawl.
        subcats = zip([cat_link], [np.nan])
    for subcat_link, subcat_name in subcats:
        # The 'Jerseis' tiles are redirected to hard-coded collection URLs.
        if subcat_name == 'Jerseis':
            if cat_name == 'Hombre':
                subcat_link = 'https://minimalismbrand.com/collections/sweater-men'
            elif cat_name == 'Mujer':
                subcat_link = 'https://minimalismbrand.com/collections/sweater-women'
        try:
            subsubcats = get_category_links(url=subcat_link, base_url=base_minimalism, class_name=subsubcat_cl)
        except Exception:
            # Same fallback one level down: no sub-subcategory links found.
            subsubcats = zip([subcat_link], [np.nan])
        for subsubcat_link, subsubcat_name in subsubcats:
            try:
                prod_links = get_prod_links(url=subsubcat_link, base_url=base_minimalism)
            except Exception:
                # No product listing on the page: try the page itself as a product.
                prod_links = [subsubcat_link]
            for prod_link in prod_links:
                try:
                    df_elemento = get_item_df(prod_link)
                except Exception as e:
                    # Report and skip a single bad page rather than losing the
                    # whole crawl collected so far.
                    print(prod_link)
                    print(e)
                    continue
                df_elemento["categoria"] = cat_name
                df_elemento["subcategoria"] = subcat_name
                df_elemento["subsubcategoria"] = subsubcat_name
                product_frames.append(df_elemento)
# pd.concat rejects an empty list, so fall back to an empty frame.
df_productos = pd.concat(product_frames, ignore_index=True) if product_frames else pd.DataFrame()

Scrapeando
Hombre
Scrapeando
Mujer
Scrapeando
Niño
Scrapeando
Mochilas


In [19]:
# Export every row that has a twin once the two option columns and the
# category are ignored, so the duplicates can be inspected by hand.
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("datos", exist_ok=True)
df_productos[df_productos.drop(columns=["Opciones de color","Tamaños", "categoria"]).duplicated(keep=False)].to_csv("datos/duplicados_mini.csv")

In [32]:
# Index labels of rows that repeat an earlier row on every column except the
# two option columns (the first occurrence of each group is kept).
dupl_prod_id = df_productos.loc[
    df_productos
    .drop(columns=["Opciones de color", "Tamaños"])
    .duplicated()
].index
df_productos.drop(labels=dupl_prod_id, axis="index", inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the duplicate removal; the old index is discarded.
df_productos = df_productos.reset_index(drop=True)

In [42]:
# Rows that are identical apart from the option columns and the category are
# the same product listed under several categories: relabel all of them.
unisex_mini_ids = df_productos.loc[
    df_productos
    .drop(columns=["Opciones de color", "Tamaños", "categoria"])
    .duplicated(keep=False)
].index
df_productos.loc[unisex_mini_ids, "categoria"] = "Unisex"

In [50]:
# Once the category has been unified, the repeated rows are exact duplicates
# (option columns aside): keep the first of each group and renumber the rows.
duplicated_unisex_id = df_productos.loc[
    df_productos
    .drop(columns=["Opciones de color", "Tamaños"])
    .duplicated()
].index
df_productos = (
    df_productos
    .drop(index=duplicated_unisex_id)
    .reset_index(drop=True)
)

In [None]:
# Show the column names as a plain list.
list(df_productos.columns)

Index(['Nombre', 'Opiniones', 'Rating', 'Precio', 'Opciones de color',
       'Tamaños', 'CO2', 'Agua', 'Energia', 'categoria', 'subcategoria',
       'subsubcategoria'],
      dtype='object')

In [None]:
# df_productos.reset_index().drop(columns=["Opciones de color", "Tamaños"]).drop_duplicates()

In [None]:
# Render the women's sports collection in a real browser and capture the
# product grid markup.
driver = webdriver.Chrome()
try:
    driver.get(url = 'https://ecoalf.com/collections/sports-woman')
    # Scroll to the bottom and pause before reading the grid.
    # NOTE(review): presumably this gives lazily loaded products time to appear — confirm.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(3)
    women_html = driver.find_element(By.CSS_SELECTOR, "ul.grid.negative-margin.product-grid").get_attribute("innerHTML")
finally:
    # Always close the browser, even when the page or the selector fails.
    driver.quit()

In [None]:
# Render the men's sports collection in a real browser and capture the
# product grid markup.
driver = webdriver.Chrome()
try:
    driver.get(url = 'https://ecoalf.com/collections/sports-man')
    # Scroll to the bottom and pause before reading the grid.
    # NOTE(review): presumably this gives lazily loaded products time to appear — confirm.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(3)
    men_html = driver.find_element(By.CSS_SELECTOR, "ul.grid.negative-margin.product-grid").get_attribute("innerHTML")
finally:
    # Always close the browser, even when the page or the selector fails.
    driver.quit()


In [None]:
# Parse the two captured product-grid fragments with the built-in HTML parser.
women_soup = BeautifulSoup(markup=women_html, features="html.parser")
men_soup = BeautifulSoup(markup=men_html, features="html.parser")

In [None]:
def _ecoalf_product_urls(grid_soup, base_url="https://ecoalf.com"):
    """Return the absolute URL of the first link inside every ``<li>`` of a parsed grid.

    List items that contain no anchor, or whose anchor has no ``href``, are
    skipped rather than raising.
    """
    urls = []
    for item in grid_soup.find_all('li'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor else None
        if href:
            urls.append(base_url + href)
    return urls


women_prods = _ecoalf_product_urls(women_soup)
men_prods = _ecoalf_product_urls(men_soup)

In [None]:
def _parse_decimal_comma(raw):
    """Parse a figure that may use a decimal comma; return NaN when it is not a number."""
    text = raw.strip()
    if ',' in text:
        # With a decimal comma present, any dots can only be thousands separators.
        text = text.replace('.', '').replace(',', '.')
    try:
        return float(text)
    except ValueError:
        return np.nan


def get_product_info(url, timeout=30):
    """Scrape one Ecoalf product page into a single-row DataFrame.

    Args:
        url: Product page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Nombre``,
        ``Precio``, ``Colores``, ``Tallas``, ``Agua`` and ``CO2``. The last
        two hold the figures read from the impact report blocks, which used to
        be parsed and then discarded. Any field missing from the page is ``NaN``.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Impact: each report block states its figure inside a <strong> tag.
    agua = np.nan
    co2 = np.nan
    for report in soup.find_all('div', class_='accordion__content-report'):
        figure = report.find('strong')
        if figure is None:
            continue
        texto = report.get_text()
        if 'litros de agua utilizados' in texto:
            agua = _parse_decimal_comma(figure.text)
        elif 'kg CO2 eq generados' in texto:
            co2 = _parse_decimal_comma(figure.text)

    # Name
    name_tag = soup.find('h1', 'product__title h4')
    name = name_tag.text.strip().capitalize() if name_tag else np.nan
    # Price
    price_tag = soup.find('div', class_='price__regular price-item price-item--regular')
    price = price_tag.text.strip().replace('€', '') if price_tag else np.nan
    # Colours
    color_box = soup.find('div', 'product__colors rte')
    colors = [c.text.strip().capitalize() for c in color_box.find_all('a')] if color_box else np.nan
    # Sizes
    size_box = soup.find('fieldset', class_='js product-form__input input--talla')
    sizes = [s.get('data-get-size') for s in size_box.find_all('label')] if size_box else np.nan

    product_info = {"Nombre": name,
                    "Precio": price,
                    "Colores": colors,
                    "Tallas": sizes,
                    "Agua": agua,
                    "CO2": co2}
    return pd.DataFrame([product_info])

def _scrape_ecoalf(links, categoria):
    """Scrape every product link and tag each row with its category and source link.

    The link and category are attached to each row as it is scraped, so the
    columns stay aligned even when some pages fail. A failing page is reported
    (link, then error) and skipped. Returns an empty frame that still has the
    ``Categoria`` and ``Link`` columns when nothing could be scraped.
    """
    frames = []
    for link in tqdm(links):
        try:
            df_prod = get_product_info(link)
        except Exception as e:
            print(link)
            print(e)
            continue
        df_prod["Categoria"] = categoria
        df_prod["Link"] = link
        frames.append(df_prod)
    if not frames:
        return pd.DataFrame(columns=["Categoria", "Link"])
    # Concatenate once instead of re-copying the frame on every iteration.
    return pd.concat(frames, ignore_index=True)


df_ecoalf_men = _scrape_ecoalf(men_prods, "Hombre")
df_ecoalf_women = _scrape_ecoalf(women_prods, "Mujer")

df_ecoalf = pd.concat([df_ecoalf_men, df_ecoalf_women]).reset_index(drop=True)

  0%|          | 0/79 [00:00<?, ?it/s]

100%|██████████| 79/79 [00:52<00:00,  1.51it/s]
100%|██████████| 93/93 [00:58<00:00,  1.59it/s]


In [None]:
# A link that appears more than once in df_ecoalf is relabelled as unisex on
# every row where it occurs.
unisex_ids = df_ecoalf.index[
    df_ecoalf["Link"].duplicated(keep=False).to_numpy()
]
df_ecoalf.loc[unisex_ids, "Categoria"] = "Unisex"

In [None]:
# Preview the product name next to its final category.
df_ecoalf.loc[:, ["Nombre", "Categoria"]]

Unnamed: 0,Nombre,Categoria
0,Top bombay naranja,Hombre
1,Pantalones cortos barcelona cactus,Hombre
2,Camiseta zurich negra,Hombre
3,Chaqueta boulders negra,Hombre
4,Top bombay negro,Hombre
...,...,...
167,Botella sports de acero inoxidable bronson azul,Unisex
168,Botella sports de acero inoxidable bronson blanca,Unisex
169,Sudadera unisex orlando verde,Unisex
170,Sudadera unisex madagascar azul marino,Unisex
