# 0.0 Imports

In [44]:
import requests
import pandas as pd
import numpy as np
import re 
from datetime import datetime

from bs4 import BeautifulSoup

# 1.0 Loading Data

In [45]:
# URL 
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
    
# Parameters
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Request to URL 
page = requests.get( url, headers=headers )

# Beautiful Soup object
soup = BeautifulSoup( page.text, 'html.parser' )


## 1.1 Data Collect 

In [46]:
# ==============================product data========================================

products = soup.find( 'ul', class_='products-listing small' )

product_list = products.find_all( 'article', class_='hm-product-item')

# product id
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# product category
product_category = [p.get( 'data-category' ) for p in product_list]

# product name
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]

# price
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]

data = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']
data.head()

# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S')

In [47]:
data.shape

(36, 5)

In [48]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2022-06-21 21:29:05
2,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-06-21 21:29:05
3,1004199004,men_jeans_skinny,Skinny Cropped Jeans,$ 29.99,2022-06-21 21:29:05
4,1024256002,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05


## 1.2 Data Colletion by Product

In [49]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# empty dataframe
df_details = pd.DataFrame() 

# unique columns for all products
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame( columns=cols )

for i in range ( len( data ) ):
    
    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id' ] + '.html'
    print ( url )

    page = requests.get( url, headers=headers )

    # Beautiful Soup object
    soup = BeautifulSoup( page.text, 'html.parser' )

    # ================================= color name =================================
    product_list = soup.find_all( 'a', class_='filter-option miniature active' ) + soup.find_all('a', class_='filter-option miniature')
    color_name = [p.get( 'data-color') for p in product_list]

    # product id 
    product_id = [p.get( 'data-articlecode' ) for p in product_list]

    df_color = pd.DataFrame( [product_id, color_name] ).T
    df_color.columns = ['product_id', 'color_name']

    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply( lambda x: x[:-3] )
    df_color['color_id'] = df_color['product_id'].apply( lambda x: x[-3:] )

    # ================================= composition =================================

    # HTML all data stored 
    product_composition_list = soup.find_all( 'div', class_='details-attributes-list-item' )
    product_composition = [list(filter(None, p.get_text().split( '\n' ) ) ) for p in product_composition_list]

    # rename columns DataFrame
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    # delete first row and substitution null values 
    df_composition = df_composition.iloc[1:].fillna( method='ffill' )
    
    # garantee the same number of columns
    df_composition = pd.concat( [ df_pattern, df_composition], axis=0 )

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3] )
    df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:] )
    
    aux = aux + df_composition.columns.tolist()


    # merge data color + data compostition
    data_sku = pd.merge( df_color, df_composition[['style_id', 'Fit', 'Composition', 'Product safety', 'Size']], how='left', on='style_id' )
    
    df_details = pd.concat( [df_details, data_sku], axis=0 )
    
# join showroom data + details
data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )

data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit', 'Composition', 'Product safety', 'Size']], 
                     how='left', on='style_id' )

data_raw.to_csv(r"C:\Users\ferki\repos\python_ds_ao_dev\datasets\data_raw_star_jeans.csv", index=False)

https://www2.hm.com/en_us/productpage.1024256001.html
https://www2.hm.com/en_us/productpage.0985159001.html
https://www2.hm.com/en_us/productpage.0875105024.html
https://www2.hm.com/en_us/productpage.1004199004.html
https://www2.hm.com/en_us/productpage.1024256002.html
https://www2.hm.com/en_us/productpage.0690449036.html
https://www2.hm.com/en_us/productpage.1024256007.html
https://www2.hm.com/en_us/productpage.1008549006.html
https://www2.hm.com/en_us/productpage.1024256003.html
https://www2.hm.com/en_us/productpage.1024256005.html
https://www2.hm.com/en_us/productpage.0985159007.html
https://www2.hm.com/en_us/productpage.0971061004.html
https://www2.hm.com/en_us/productpage.0690449056.html
https://www2.hm.com/en_us/productpage.1008549001.html
https://www2.hm.com/en_us/productpage.0971061005.html
https://www2.hm.com/en_us/productpage.0985159008.html
https://www2.hm.com/en_us/productpage.0690449022.html
https://www2.hm.com/en_us/productpage.0690449051.html
https://www2.hm.com/en_us/pr

In [50]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Product safety,Size
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05,1024256,1,Black,Slim fit,"Shell: Cotton 99%, Spandex 1%",,
1,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,
3,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,
4,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-06-21 21:29:05,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,


# 2.0 Data Cleaning

In [52]:
data = pd.read_csv(r"C:\Users\ferki\repos\python_ds_ao_dev\datasets\data_raw_star_jeans.csv")

# product id
data = data.dropna( subset=['product_id'] )
data['product_id'] = data['product_id'].astype( int )

# product name
data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ', '_').lower() )

# produtc price
data['product_price'] = data['product_price'].apply( lambda x: x.replace( '$', '') ).astype( float )

# scrapy datetime
data['scrapy_datetime'] = pd.to_datetime( data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S' )

# style id
data['style_id'] = data['style_id'].astype( int )

# color id
data['color_id'] = data['color_id'].astype( int )

# color name
data['color_name'] = data['color_name'].apply( lambda x: x.replace( ' ', '_').replace('/', '_').lower() 
                                              if pd.notnull( x ) else x )
# fit
data['Fit'] = data['Fit'].apply( lambda x: x.replace( ' ', '_').lower() if pd.notnull( x ) else x )

# size number    ####erro###   AttributeError: 'NoneType' object has no attribute 'group'
data['size_number'] = data['Size'].apply( lambda x: re.search( '\d{3}cm', x ).group(0) if pd.notnull( x ) else x)

# exclui a medida cm
data['size_number'] = data['Size'].apply( lambda x: re.search( '\d+', x ).group(0) if pd.notnull( x ) else x)

# size model 
data['size_model'] = data['Size'].str.extract( '(\d+/\\d+)' )

#remove column  ####remover depois de gerar as colunas size_number e size_model
#data = data.drop( columns=['Size', 'Product safety'], axis=1 )

# composition / remove columns
data = data[~data['Composition'].str.contains( 'Pocket lining:', na=False )]
data = data[~data['Composition'].str.contains( 'Lining:', na=False )]
data = data[~data['Composition'].str.contains( 'Shell:', na=False )]

# break composition by comma
df1 = data['Composition'].str.split( ',', expand=True )

data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Product safety,Size,size_model
851,1004199004,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-21 21:29:05,1004199,4,black,skinny_fit,"Cotton 99%, Spandex 1%",,,
852,1004199004,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-21 21:29:05,1004199,4,black,skinny_fit,"Cotton 99%, Spandex 1%",,,
853,1004199004,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-21 21:29:05,1004199,4,black,skinny_fit,"Cotton 99%, Spandex 1%",,,
854,1004199004,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-21 21:29:05,1004199,4,black,skinny_fit,"Cotton 99%, Spandex 1%",,,
855,1004199004,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-21 21:29:05,1004199,4,black,skinny_fit,"Cotton 99%, Spandex 1%",,,


In [53]:
# dataset gerou coluna vazia, que gera erro em size_number e size_model 
data['Size'].unique()

array([nan], dtype=object)