# 0.0 Imports

In [1]:
import requests
import pandas as pd
import numpy as np
import re 
from datetime import datetime

from bs4 import BeautifulSoup

# 1.0 Loading Data

In [2]:
# URL 
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
    
# Parameters
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Request to URL 
page = requests.get( url, headers=headers )

# Beautiful Soup object
soup = BeautifulSoup( page.text, 'html.parser' )


## 1.1 Data Collect 

In [3]:
# ==============================product data========================================

products = soup.find( 'ul', class_='products-listing small' )

product_list = products.find_all( 'article', class_='hm-product-item')

# product id
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# product category
product_category = [p.get( 'data-category' ) for p in product_list]

# product name
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]

# price
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]

data = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']
data.head()

# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S')

In [4]:
data.shape

(36, 5)

In [5]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-07-06 17:41:32
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2022-07-06 17:41:32
2,1008549002,men_jeans_regular,Regular Jeans,$ 19.99,2022-07-06 17:41:32
3,875105024,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-07-06 17:41:32
4,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-07-06 17:41:32


## 1.2 Data Colletion by Product

In [6]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# empty dataframe
df_details = pd.DataFrame() 

# unique columns for all products
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame( columns=cols )

for i in range ( len( data ) ):
    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id' ] + '.html'
    print ( url )

    page = requests.get( url, headers=headers )

    # Beautiful Soup object
    soup = BeautifulSoup( page.text, 'html.parser' )
    
    # ================================= color name =================================
    product_list = soup.find_all( 'a', class_='filter-option miniature active' ) + soup.find_all('a', class_='filter-option miniature')
    color_name = [p.get( 'data-color') for p in product_list]

    # product id 
    product_id = [p.get( 'data-articlecode' ) for p in product_list]

    df_color = pd.DataFrame( [product_id, color_name] ).T
    df_color.columns = ['product_id', 'color_name']

    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply( lambda x: x[:-3] )
    df_color['color_id'] = df_color['product_id'].apply( lambda x: x[-3:] )

    # ================================= composition =================================

    # HTML all data stored 
    product_composition_list = soup.find_all( 'div', class_='details-attributes-list-item' )
    product_composition = [list(filter(None, p.get_text().split( '\n' ) ) ) for p in product_composition_list]
    
    # Collect the size here                             ######list index out of range(monitoria)######
    size_text = soup.find_all('dd')[0].text
    size_text = size_text if 'cm' in size_text else pd.NA
    
    for i in range(len(product_composition)):
        if product_composition[i][0] == 'Size':
            product_composition[i] = ['Size', size_text]
            break
    else:
        product_composition.append(['Size', size_text])
    
    # rename columns DataFrame
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    # delete first row and substitution null values 
    df_composition = df_composition.iloc[1:].fillna( method='ffill' )
    
    # garantee the same number of columns
    df_composition = pd.concat( [ df_pattern, df_composition ], axis=0)

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3] )
    df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:] )
    df_composition['Product safety'] = pd.NA
    aux = aux + df_composition.columns.tolist()

    # merge data color + data compostition
    data_sku = pd.merge( df_color, df_composition[['style_id', 'Fit', 'Composition', 'Product safety', 'Size']], how='left', on='style_id' )
    
    df_details = pd.concat( [df_details, data_sku], axis=0 )
    
# join showroom data + details
data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )

data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit', 'Composition', 'Product safety', 'Size']], 
                     how='left', on='style_id' )
#data_raw
#data_raw.to_csv(r"C:\Users\ferki\repos\python_ds_ao_dev\datasets\data_raw_star_jeans.csv", index=False)

https://www2.hm.com/en_us/productpage.1024256001.html
https://www2.hm.com/en_us/productpage.0985159001.html
https://www2.hm.com/en_us/productpage.1008549002.html
https://www2.hm.com/en_us/productpage.0875105024.html
https://www2.hm.com/en_us/productpage.1008549001.html
https://www2.hm.com/en_us/productpage.1024256002.html
https://www2.hm.com/en_us/productpage.1008549006.html
https://www2.hm.com/en_us/productpage.0985159007.html
https://www2.hm.com/en_us/productpage.0971061004.html
https://www2.hm.com/en_us/productpage.0985159008.html
https://www2.hm.com/en_us/productpage.1008110001.html
https://www2.hm.com/en_us/productpage.0690449022.html
https://www2.hm.com/en_us/productpage.1024256007.html
https://www2.hm.com/en_us/productpage.1004199004.html
https://www2.hm.com/en_us/productpage.1024256004.html
https://www2.hm.com/en_us/productpage.0971061002.html
https://www2.hm.com/en_us/productpage.0690449036.html
https://www2.hm.com/en_us/productpage.1024256003.html
https://www2.hm.com/en_us/pr

In [7]:
data_raw.shape

(9610, 12)

In [8]:
data_raw.dtypes

product_id          object
product_category    object
product_name        object
product_price       object
scrapy_datetime     object
style_id            object
color_id            object
color_name          object
Fit                 object
Composition         object
Product safety      object
Size                object
dtype: object

In [9]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Product safety,Size
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-07-06 17:41:32,1024256,1,Black,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-07-06 17:41:32,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-07-06 17:41:32,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-07-06 17:41:32,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-07-06 17:41:32,1024256,1,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"


# 2.0 Data Cleaning

In [10]:
data = pd.read_csv(r"C:\Users\ferki\repos\python_ds_ao_dev\datasets\data_raw_star_jeans.csv")

# product id
data = data.dropna( subset=['product_id'] )
data['product_id'] = data['product_id'].astype( int )

# product name
data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ', '_').lower() )

# produtc price
data['product_price'] = data['product_price'].apply( lambda x: x.replace( '$', '') ).astype( float )

# scrapy datetime
data['scrapy_datetime'] = pd.to_datetime( data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S' )

# style id
data['style_id'] = data['style_id'].astype( int )

# color id
data['color_id'] = data['color_id'].astype( int )

# color name
data['color_name'] = data['color_name'].apply( lambda x: x.replace( ' ', '_').replace('/', '_').lower() 
                                               if pd.notnull( x ) else x )
# fit
data['Fit'] = data['Fit'].apply( lambda x: x.replace( ' ', '_').lower() if pd.notnull( x ) else x )

# size number   
data['size_number'] = data['Size'].apply( lambda x: re.search( '\d{3}cm', x ).group(0) if pd.notnull( x ) else x)

# exclui a medida cm
data['size_number'] = data['Size'].apply( lambda x: re.search( '\d+', x ).group(0) if pd.notnull( x ) else x)

# size model 
data['size_model'] = data['Size'].str.extract( '(\d+/\\d+)' )

# composition / remove columns
data = data[~data['Composition'].str.contains( 'Pocket lining:', na=False )]
data = data[~data['Composition'].str.contains( 'Lining:', na=False )]
data = data[~data['Composition'].str.contains( 'Shell:', na=False )]
data = data[~data['Composition'].str.contains( 'Pocket:', na=False )]

# # drop duplicates
data = data.drop_duplicates( subset=['product_id', 'product_category', 'product_name', 'product_price',
       'scrapy_datetime', 'style_id', 'color_id', 'color_name', 'Fit'], keep='last' )

# reset index 
data = data.reset_index( drop=True )

# break composition by comma
df1 = data['Composition'].str.split( ',', expand=True )

# cotton | polyester | spandex
df_ref = pd.DataFrame( index=np.arange( len( data ) ), columns=['cotton', 'polyester', 'spandex'] )

# cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'

df_ref = pd.concat( [df_ref, df_cotton], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last' )]
df_ref['cotton'] = df_ref['cotton'].fillna( 'Cotton 0%' )


# polyester
df_polyester = df1.loc[df1[1].str.contains( 'Polyester', na=True ), 1]
df_polyester.name = 'polyester'

# combine spandex from both columns 1 and 2 
#df_polyester = df_polyester.combine_first( df1[2] )


df_ref = pd.concat( [df_ref, df_polyester], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last')]
df_ref['polyester'] = df_ref['polyester'].fillna( 'Polyester 0%' )

# spandex
df_spandex = df1.loc[df1[1].str.contains( 'Spandex', na=True ), 1]
df_spandex.name = 'spandex'

# combine spandex from both columns 1 and 2 
df_spandex = df_spandex.combine_first( df1[2] )

df_ref = pd.concat( [df_ref, df_spandex], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last')]
df_ref['spandex'] = df_ref['spandex'].fillna( 'Spandex 0%' )

# final join
data = pd.concat( [data, df_ref], axis=1 )

# format composition data 
data['cotton'] = data['cotton'].apply( lambda x: int( re.search( '\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )
data['polyester'] = data['polyester'].apply( lambda x: int( re.search( '\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )
data['spandex'] = data['spandex'].apply( lambda x: int( re.search( '\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )

# drop columns  ##remove after generating the columns size_number e size_model
data = data.drop( columns=['Size', 'Product safety', 'Composition'], axis=1 )

# drop duplicates 
data.drop_duplicates()

data.shape

# data.head()

(126, 14)

In [11]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,size_number,size_model,cotton,polyester,spandex
0,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,light_denim_blue,skinny_fit,185,,0.99,0.0,0.01
1,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,black,skinny_fit,185,,0.99,0.0,0.01
2,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,denim_blue,skinny_fit,185,,0.99,0.0,0.01
3,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,white,skinny_fit,185,,0.99,0.0,0.01
4,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,light_gray,skinny_fit,185,,0.99,0.0,0.01


In [12]:
# data
data.to_csv(r"C:\Users\ferki\repos\python_ds_ao_dev\datasets\data_star_jeans.csv", index=False)

In [13]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,size_number,size_model,cotton,polyester,spandex
0,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,light_denim_blue,skinny_fit,185,,0.99,0.0,0.01
1,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,black,skinny_fit,185,,0.99,0.0,0.01
2,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,denim_blue,skinny_fit,185,,0.99,0.0,0.01
3,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,white,skinny_fit,185,,0.99,0.0,0.01
4,1004199002,men_jeans_skinny,skinny_cropped_jeans,29.99,2022-06-25 16:05:26,1004199,2,light_gray,skinny_fit,185,,0.99,0.0,0.01
