# 1. Imports

## 1.1 Libraries 

In [1]:
import re
import math
import sqlite3
import requests
import inflection

import numpy    as np
import pandas   as pd

from bs4        import BeautifulSoup
from datetime   import datetime
from sqlalchemy import create_engine

## 1.2 Loading H&M Data (Web Scraping)

In [2]:
# Parameters
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Page size used by the pagination arithmetic below
page_size = 36

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the notebook indefinitely
request_timeout = 30

# URL
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Request to URL
page = requests.get(url, headers=headers, timeout=request_timeout)
page.raise_for_status()  # stop on an HTTP error instead of parsing an error page

# Beautiful Soup Object
soup = BeautifulSoup(page.text, 'html.parser')

# Pagination: the 'load-more-heading' element exposes the product total in 'data-total'
load_more_heading = soup.find('h2', class_='load-more-heading')
if load_more_heading is None:
    raise ValueError("Element 'h2.load-more-heading' not found; the listing page layout may have changed")

total_item = load_more_heading.get('data-total')
page_number = math.ceil(int(total_item) / page_size)

# Ask for every product at once: page count rounded up, times the page size
url2 = url + '?page-size=' + str(page_number * page_size)
page = requests.get(url2, headers=headers, timeout=request_timeout)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Get the Product ID
products = soup.find('ul', class_='products-listing small')
if products is None:
    raise ValueError("Element 'ul.products-listing.small' not found; the listing page layout may have changed")

product_list = products.find_all('article', class_='hm-product-item')
product_id = [p.get('data-articlecode') for p in product_list]

# Building the frame from a dict names the column up front; the previous
# two-step form (unnamed frame, then assigning .columns) raised a length
# mismatch when the listing was empty because the frame had no columns.
data = pd.DataFrame({'product_id': product_id})

In [4]:
# Empty frame that the scraping loop grows with one block of rows per product colour
df_compositions = pd.DataFrame()

# Running list of every column name seen while scraping
aux = []

# Reference column set used to give every scraped composition frame the same columns
cols = [
    'Additional material information', 'Art. No.', 'Care instructions',
    'Composition', 'Concept', 'Description', 'Fit', 'Imported', 'Material',
    'Nice to know', 'Size',
    'messages.clothingStyle', 'messages.garmentLength', 'messages.waistRise',
]

# Zero-row frame that carries only the reference columns
df_pattern = pd.DataFrame(columns=cols)

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the scrape indefinitely
request_timeout = 30

# One frame per product colour; concatenated once after the loop instead of
# re-copying df_compositions on every iteration
composition_frames = []

for article_code in data['product_id']:

    ###################### API Request #####################################

    # URL
    url = 'https://www2.hm.com/en_us/productpage.' + article_code + '.html'
    print('Product:{}'.format(url))

    # Request to URL
    page = requests.get(url, headers=headers, timeout=request_timeout)

    # Beautiful Soup Object
    soup = BeautifulSoup(page.text, 'html.parser')

    ###################### Color Name #####################################

    # Colour swatches of the product (both the inactive and the active one)
    product_list = soup.find_all('a', class_=['filter-option miniature', 'filter-option miniature active'])

    # Color name
    color_name = [p.get('data-color') for p in product_list]

    # Product Id of each colour variant
    product_id = [p.get('data-articlecode') for p in product_list]

    # Color Dataframe: one row per colour variant
    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id', 'color_name']

    for color_code in df_color['product_id']:
        # URL of the colour variant's own product page
        url = 'https://www2.hm.com/en_us/productpage.' + color_code + '.html'

        # Request to URL
        page = requests.get(url, headers=headers, timeout=request_timeout)

        # Beautiful Soup Object
        soup = BeautifulSoup(page.text, 'html.parser')

        # Product Name
        product_name = soup.find_all('hm-product-name', id='js-product-name')
        product_name = product_name[0].get_text()

        # Product Price: first number found in the price element.
        # NOTE(review): the pattern needs at least two digits, so a
        # single-digit price without decimals would not match - confirm this
        # cannot occur on the site.
        product_price = soup.find_all('div', class_='primary-row product-item-price')
        product_price = re.findall(r'\d+\.?\d+', product_price[0].get_text())[0]

        # Composition: each details item becomes a list of its non-empty text lines
        product_composition_list = soup.find_all('div', class_='details-attributes-list-item')
        product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

        # Creating Dataframe
        df_composition = pd.DataFrame(product_composition).T
        df_composition.columns = df_composition.iloc[0]                    # first row holds the attribute names
        df_composition = df_composition.iloc[1:]                           # drop that header row
        df_composition = df_composition.ffill()                            # replace NA with the value above
        df_composition = pd.concat([df_pattern, df_composition], axis=0)   # guarantee the reference columns exist
        df_composition['product_price'] = product_price
        df_composition['product_name'] = product_name
        df_composition['Composition'] = df_composition['Composition'].str.replace('Shell: ', '', regex=True)
        df_composition = df_composition.rename(columns={'Art. No.': 'product_id'})

        # Keep new columns if they show up
        aux = aux + df_composition.columns.tolist()

        ################# Merge Color and Composition ##########################
        df_composition = pd.merge(
            df_composition[['product_id', 'Fit', 'Composition', 'product_name', 'product_price']],
            df_color,
            how='left',
            on='product_id',
        )

        # Discard the composition rows that describe pockets or lining
        df_composition = df_composition[~df_composition['Composition'].str.contains('Pocket', na=False)]
        df_composition = df_composition[~df_composition['Composition'].str.contains('Lining', na=False)]
        df_composition = df_composition.reset_index(drop=True)

        composition_frames.append(df_composition)

# All Products: append everything scraped above to the existing accumulator
df_compositions = pd.concat([df_compositions, *composition_frames], axis=0)

# Split every article code: the last three characters are the color id,
# everything before them is the style id
article_codes = df_compositions['product_id']
df_compositions['style_id'] = article_codes.map(lambda code: code[:-3])
df_compositions['color_id'] = article_codes.map(lambda code: code[-3:])

# Scrape timestamp, stored as formatted text
scrape_moment = datetime.now()
df_compositions['scrapy_datetime'] = scrape_moment.strftime('%Y-%m-%d %H:%M:%S')


Product:https://www2.hm.com/en_us/productpage.0985159001.html
Product:https://www2.hm.com/en_us/productpage.1024256001.html
Product:https://www2.hm.com/en_us/productpage.1008549001.html
Product:https://www2.hm.com/en_us/productpage.0690449059.html
Product:https://www2.hm.com/en_us/productpage.1024256002.html
Product:https://www2.hm.com/en_us/productpage.0979945001.html
Product:https://www2.hm.com/en_us/productpage.1008110001.html
Product:https://www2.hm.com/en_us/productpage.1024711006.html
Product:https://www2.hm.com/en_us/productpage.1008110006.html
Product:https://www2.hm.com/en_us/productpage.1071707001.html
Product:https://www2.hm.com/en_us/productpage.0993887008.html
Product:https://www2.hm.com/en_us/productpage.0690449022.html
Product:https://www2.hm.com/en_us/productpage.0985159005.html
Product:https://www2.hm.com/en_us/productpage.1024256003.html
Product:https://www2.hm.com/en_us/productpage.1008549006.html
Product:https://www2.hm.com/en_us/productpage.0979945002.html
Product:

In [5]:
# Display the scraped frame (product id, fit, composition, name, price, color, ids, timestamp)
df_compositions

Unnamed: 0,product_id,Fit,Composition,product_name,product_price,color_name,style_id,color_id,scrapy_datetime
0,0985159001,Skinny fit,"Cotton 99%, Spandex 1%",\n\nSkinny Jeans\n\n\n\n,24.99,Black,0985159,001,2022-09-29 18:07:25
0,0985159002,Skinny fit,"Cotton 99%, Spandex 1%",\n\nSkinny Jeans\n\n\n\n,24.99,Denim blue,0985159,002,2022-09-29 18:07:25
0,0985159003,Skinny fit,"Cotton 99%, Spandex 1%",\n\nSkinny Jeans\n\n\n\n,11.99,Dark gray,0985159,003,2022-09-29 18:07:25
0,0985159004,Skinny fit,"Cotton 99%, Spandex 1%",\n\nSkinny Jeans\n\n\n\n,12.99,Light denim blue,0985159,004,2022-09-29 18:07:25
0,0985159005,Skinny fit,"Cotton 99%, Spandex 1%",\n\nSkinny Jeans\n\n\n\n,24.99,Dark blue,0985159,005,2022-09-29 18:07:25
...,...,...,...,...,...,...,...,...,...
0,0985197003,Slim fit,"Cotton 99%, Spandex 1%",\n\nSlim Jeans\n\n\n\n,19.99,Denim blue,0985197,003,2022-09-29 18:07:25
0,0985197004,Slim fit,"Cotton 99%, Spandex 1%",\n\nSlim Jeans\n\n\n\n,19.99,Dark denim blue,0985197,004,2022-09-29 18:07:25
0,0985197005,Slim fit,"Cotton 99%, Spandex 1%",\n\nSlim Jeans\n\n\n\n,15.99,Dark denim blue,0985197,005,2022-09-29 18:07:25
0,0985197006,Slim fit,"Cotton 99%, Spandex 1%",\n\nSlim Jeans\n\n\n\n,19.99,Light denim blue,0985197,006,2022-09-29 18:07:25


In [6]:
# Remove fully duplicated rows; the defaults keep the first occurrence,
# return a new frame and leave the index labels untouched
df1 = df_compositions.drop_duplicates()

In [7]:
# Persist the de-duplicated raw scrape as CSV, without the index column
raw_csv_path = '../data/hm_data_raw.csv'
df1.to_csv(raw_csv_path, index=False)

# 2. Data Cleaning

## 2.1 Rename Columns

In [8]:
# Work on an independent (deep) copy so the cleaning steps leave df1 untouched
df2 = df1.copy(deep=True)

In [9]:
# Inspect the column labels before renaming them
df2.columns

Index(['product_id', 'Fit', 'Composition', 'product_name', 'product_price',
       'color_name', 'style_id', 'color_id', 'scrapy_datetime'],
      dtype='object')

In [10]:
# Rename Columns
# The old names are read from the frame itself rather than from a hard-coded
# list, so each new label is always the snake_case form of the column it is
# assigned to, even if the scraped column set or order changes.
cols_old = df2.columns.tolist()
snakecase = inflection.underscore
cols_new = [snakecase(name) for name in cols_old]
df2.columns = cols_new

## 2.2 Change Types and Formats

In [11]:
# Inspect the column dtypes before converting them
df2.dtypes

product_id         object
fit                object
composition        object
product_name       object
product_price      object
color_name         object
style_id           object
color_id           object
scrapy_datetime    object
dtype: object

In [12]:
# All text columns go through the vectorised .str accessor with literal
# (non-regex) replacement. Unlike a per-element Python lambda, .str leaves
# missing values as NaN instead of raising on them.

# Product Name: drop line breaks, then snake_case
df2['product_name'] = (
    df2['product_name']
    .str.replace('\n', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.lower()
)

# Product Price
df2['product_price'] = df2['product_price'].astype(float)

# Scrapy Datetime
df2['scrapy_datetime'] = pd.to_datetime(df2['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S')

# Color Name
df2['color_name'] = df2['color_name'].str.replace(' ', '_', regex=False).str.lower()

# Fit
df2['fit'] = df2['fit'].str.replace(' ', '_', regex=False).str.lower()

In [13]:
# Preview the first rows after the type and format changes
df2.head()

Unnamed: 0,product_id,fit,composition,product_name,product_price,color_name,style_id,color_id,scrapy_datetime
0,985159001,skinny_fit,"Cotton 99%, Spandex 1%",skinny_jeans,24.99,black,985159,1,2022-09-29 18:07:25
0,985159002,skinny_fit,"Cotton 99%, Spandex 1%",skinny_jeans,24.99,denim_blue,985159,2,2022-09-29 18:07:25
0,985159003,skinny_fit,"Cotton 99%, Spandex 1%",skinny_jeans,11.99,dark_gray,985159,3,2022-09-29 18:07:25
0,985159004,skinny_fit,"Cotton 99%, Spandex 1%",skinny_jeans,12.99,light_denim_blue,985159,4,2022-09-29 18:07:25
0,985159005,skinny_fit,"Cotton 99%, Spandex 1%",skinny_jeans,24.99,dark_blue,985159,5,2022-09-29 18:07:25


## 2.3 Cleaning Composition Column

In [14]:
def composition_share(composition, keyword):
    """Return the share of one material in a composition string, as a fraction.

    The string is split on commas and the parts are scanned in order. The
    first part that contains `keyword` and also contains a number supplies the
    result: its first run of digits divided by 100 (e.g. 'Cotton 99%' -> 0.99).

    Parameters
    ----------
    composition : str or missing value
        Comma-separated composition text such as 'Cotton 99%, Spandex 1%'.
    keyword : str
        Case-sensitive substring that identifies the material.

    Returns
    -------
    float
        The share as a fraction; 0.0 when `composition` is not a string or no
        part containing `keyword` carries a number.
    """
    if not isinstance(composition, str):
        return 0.0

    for part in composition.split(','):
        if keyword in part:
            digits = re.search(r'\d+', part)
            if digits:
                return int(digits.group(0)) / 100

    return 0.0


# Output column -> substring searched for in each composition part.
# 'ester' is the pattern the polyester column has always used.
material_keywords = {
    'cotton': 'Cotton',
    'polyester': 'ester',
    'spandex': 'Spandex',
    'lyocell': 'Lyocell',
    'rayon': 'Rayon',
}

# Composition text on a fresh 0..n-1 index, so it lines up with the final join
composition_text = df2['composition'].reset_index(drop=True)

# One float column per material. Every comma-separated part is searched, so a
# material is found regardless of its position in the composition text and no
# fixed number of parts is required.
df_ref = pd.DataFrame(
    {
        column: [composition_share(text, keyword) for text in composition_text]
        for column, keyword in material_keywords.items()
    },
    index=composition_text.index,
    dtype=float,
)

# Final Join
df2 = pd.concat([df2.reset_index(drop=True), df_ref], axis=1)

# Drop Columns
df2 = df2.drop(columns=['composition'])


In [15]:
# Show up to 20 random rows. Capping at len(df2) avoids the ValueError that
# sample() raises when asked for more rows than exist, and the fixed seed
# makes the displayed sample reproducible across runs.
df2.sample(n=min(20, len(df2)), random_state=42)

Unnamed: 0,product_id,fit,product_name,product_price,color_name,style_id,color_id,scrapy_datetime,cotton,polyester,spandex,lyocell,rayon
12,1024256005,slim_fit,slim_jeans,19.99,dark_blue,1024256,5,2022-09-29 18:07:25,0.99,0.0,0.01,0.0,0.0
55,1024711006,slim_fit,slim_jeans,29.99,dark_denim_blue,1024711,6,2022-09-29 18:07:25,0.98,0.0,0.02,0.0,0.0
97,875105009,relaxed_fit,relaxed_jeans,39.99,dark_denim_blue,875105,9,2022-09-29 18:07:25,1.0,0.0,0.0,0.0,0.0
152,1025726002,relaxed_fit,relaxed_jeans,11.99,light_denim_blue,1025726,2,2022-09-29 18:07:25,1.0,0.0,0.0,0.0,0.0
107,811993001,regular_fit,regular_jeans,12.99,black/washed_out,811993,1,2022-09-29 18:07:25,0.98,0.0,0.02,0.0,0.0
25,690449006,skinny_fit,skinny_jeans,7.99,black/washed,690449,6,2022-09-29 18:07:25,0.98,0.0,0.02,0.0,0.0
143,1027852002,relaxed_fit,relaxed_denim_joggers,29.99,dark_gray,1027852,2,2022-09-29 18:07:25,1.0,0.0,0.0,0.0,0.0
129,985197002,slim_fit,slim_jeans,6.99,midnight_blue,985197,2,2022-09-29 18:07:25,0.99,0.0,0.01,0.0,0.0
89,1004199002,skinny_fit,skinny_cropped_jeans,29.99,black,1004199,2,2022-09-29 18:07:25,0.99,0.0,0.01,0.0,0.0
132,985197005,slim_fit,slim_jeans,15.99,dark_denim_blue,985197,5,2022-09-29 18:07:25,0.99,0.0,0.01,0.0,0.0


# 3. Save Changes

## 3.1 Save .csv File

In [16]:
# Persist the cleaned frame as CSV, without the index column
cleaned_csv_path = '../data/hm_data_cleaned.csv'
df2.to_csv(cleaned_csv_path, index=False)

## 3.2 Create SQL Database 

In [17]:
# Inspect the final dtypes before writing the frame to the database
df2.dtypes

product_id                 object
fit                        object
product_name               object
product_price             float64
color_name                 object
style_id                   object
color_id                   object
scrapy_datetime    datetime64[ns]
cotton                    float64
polyester                 float64
spandex                   float64
lyocell                   float64
rayon                     float64
dtype: object

In [None]:
# Database Connection

# NOTE(review): absolute, machine-specific path - this cell only runs where
# this directory exists. The CSV files above are written relative to '../data/'.
endpoint_local = 'sqlite:////home/vitor/Repos/market-research/data/hm_db.sqlite'

engine = create_engine(endpoint_local)
# Opened here and closed in the last cell of the notebook
connection = engine.connect()

# Create Table (disabled)
# NOTE(review): the commented-out code below refers to `conn`, which this cell
# never defines (the connection object is named `connection`), and its schema
# lists `product_category`, which is not a column of df2.
# query_showroom_schema = """ 
#     CREATE TABLE showroom (
#         product_id                  TEXT,
#         product_name                TEXT,
#         product_category            TEXT,
#         product_price               REAL,
#         style_id                    INTEGER,
#         color_id                    INTEGER,
#         scrapy_datetime             TEXT,
#         color_name                  TEXT,
#         fit                         TEXT,
#         cotton                      REAL,
#         polyester                   REAL,
#         spandex                     REAL,
#         lyocell                     REAL,  
#         rayon                       REAL
#     )

# """

# cursor = conn.execute(query_showroom_schema)

# conn.commit()
# conn.close()

# Insert Data
# if_exists='append' adds the rows to an existing 'showroom' table, so
# re-running this cell inserts the same rows again.
df2.to_sql('showroom', con = engine, if_exists='append', index=False)

54

In [None]:
# Query the database: load every row of the showroom table into a DataFrame
query = """ 
    SELECT * FROM showroom

"""

df3 = pd.read_sql_query(sql=query, con=engine)
df3.head()


Unnamed: 0,product_id,product_name,product_category,product_price,style_id,color_id,scrapy_datetime,color_name,fit,cotton,polyester,spandex
0,938875007,slim_tapered_jeans,men_jeans_slim,39.99,938875,7,2022-09-23 09:43:57,black,slim_fit,0.99,0.0,0.01
1,993887007,hybrid_regular_denim_joggers,men_jeans_joggers,39.99,993887,7,2022-09-23 09:43:57,black/no_fade_black,regular_fit,0.8,0.19,0.01
2,993887007,hybrid_regular_denim_joggers,men_jeans_joggers,39.99,993887,7,2022-09-23 09:43:57,black/no_fade_black,regular_fit,0.77,0.21,0.02
3,993887007,hybrid_regular_denim_joggers,men_jeans_joggers,39.99,993887,7,2022-09-23 09:43:57,black/no_fade_black,regular_fit,0.78,0.21,0.01
4,1004199007,skinny_cropped_jeans,men_jeans_skinny,29.99,1004199,7,2022-09-23 09:43:57,denim_blue,skinny_fit,0.99,0.0,0.01


In [None]:
# Close the connection opened with engine.connect() in the database cell
connection.close()