# 1. Imports

## 1.1 Libraries 

In [1]:
import math
import requests

import numpy as np
import pandas as pd

from bs4        import BeautifulSoup
from datetime   import datetime

## 1.2 Loading H&M Data (Web Scraping)

### 1.2.1 Request API

In [2]:
# Men's jeans listing page. This first request is only used to read the total
# number of items, so that the full catalogue can be requested afterwards.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# A timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Pagination
# The total item count is read from the "load more" heading of the first page
# and rounded up to a whole number of pages of `items_per_page` items.
items_per_page = 36

load_more = soup.find('h2', class_='load-more-heading')
if load_more is None or load_more.get('data-total') is None:
    # Fail with an explicit message instead of an IndexError/TypeError further down.
    raise ValueError("Could not find the 'load-more-heading' element with a 'data-total' attribute on the listing page.")

total_item = load_more.get('data-total')
page_number = math.ceil(int(total_item) / items_per_page)

# Ask for a single page that is large enough to contain every item.
url2 = url + '?page-size=' + str(page_number * items_per_page)

# `headers` is the User-Agent dictionary defined together with the first request.
page = requests.get(url2, headers=headers, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

### 1.2.2 Collecting Information: Id, Category, Name, Price

In [4]:
# Product grid of the listing page and the <article> element of each product.
products = soup.find('ul', class_='products-listing small')
if products is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise ValueError("Could not find the 'products-listing small' element on the listing page.")

product_list = products.find_all('article', class_='hm-product-item')


def _child_text(parent, tag, css_class):
    """Return the text of the first `tag` with class `css_class` inside `parent`, or None if absent."""
    node = parent.find(tag, class_=css_class)
    return node.get_text() if node is not None else None


# Every field is read from the same <article>, so a product that lacks a name
# or a regular price yields None in its own row instead of shifting the values
# of all following products by one position.
product_id = [p.get('data-articlecode') for p in product_list]
product_category = [p.get('data-category') for p in product_list]
product_name = [_child_text(p, 'a', 'link') for p in product_list]
product_price = [_child_text(p, 'span', 'price regular') for p in product_list]

# Dataset
data = pd.DataFrame({
    'product_id': product_id,
    'product_name': product_name,
    'product_category': product_category,
    'product_price': product_price,
})

# Generate Style ID + Color ID: the last three characters of the article code
# are used as the color id, everything before them as the style id.
data['style_id'] = data['product_id'].str[:-3]
data['color_id'] = data['product_id'].str[-3:]

# Scrape datetime (local time at which this cell ran).
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [5]:
# Preview the first five rows of the listing dataset.
data.head(5)

Unnamed: 0,product_id,product_name,product_category,product_price,style_id,color_id,scrapy_datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,1024256,1,2022-09-20 12:15:59
1,1024256006,Slim Jeans,men_jeans_slim,$ 19.99,1024256,6,2022-09-20 12:15:59
2,993887007,Hybrid Regular Denim Joggers,men_jeans_joggers,$ 39.99,993887,7,2022-09-20 12:15:59
3,938875007,Slim Tapered Jeans,men_jeans_slim,$ 39.99,938875,7,2022-09-20 12:15:59
4,1004199007,Skinny Cropped Jeans,men_jeans_skinny,$ 29.99,1004199,7,2022-09-20 12:15:59


### 1.2.3 Collecting Information: Color and Composition (One Product Test)

In [6]:
# API Request
# Single product page used to prototype the color/composition extraction.
url = 'https://www2.hm.com/en_us/productpage.0690449056.html'

# `headers` is the User-Agent dictionary defined together with the first request.
# The timeout bounds the wait on a stalled connection; raise_for_status() stops
# on an HTTP error instead of parsing an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# Beautiful Soup Object
soup = BeautifulSoup(page.text, 'html.parser')

In [7]:
###################### Color Name #####################################

# Anchor tags of the color options shown on the product page.
swatch_classes = ['filter-option miniature', 'filter-option miniature active']
product_list = soup.find_all('a', class_=swatch_classes)

# Color name and article code of every option, collected in one pass.
color_name = []
product_id = []
for swatch in product_list:
    color_name.append(swatch.get('data-color'))
    product_id.append(swatch.get('data-articlecode'))

# Color Dataframe
df_color = pd.DataFrame({'product_id': product_id, 'color_name': color_name})

# Style ID is the article code without its last three characters; Color ID is
# those last three characters.
df_color['style_id'] = df_color['product_id'].map(lambda code: code[:-3])
df_color['color_id'] = df_color['product_id'].map(lambda code: code[-3:])

In [8]:
# Preview the first five color rows of the test product.
df_color.head(5)

Unnamed: 0,product_id,color_name,style_id,color_id
0,690449001,Light denim blue/trashed,690449,1
1,690449002,Denim blue,690449,2
2,690449006,Black/washed,690449,6
3,690449007,Light denim blue,690449,7
4,690449009,Black washed out,690449,9


In [9]:
###################### Composition #####################################

# Each details item is split on newlines with empty strings removed; the first
# element of each resulting list is used as the column name below.
product_composition_list = soup.find_all('div', class_='details-attributes-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

# Creating Dataframe
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]         # rename columns as the first row
df_composition = df_composition.iloc[1:]                # delete the first row
# Replace NA with the value above. `.ffill()` is used because
# `fillna(method='ffill')` is deprecated in recent pandas versions.
df_composition = df_composition.ffill()

# Generate Style ID + Color ID from the article number.
df_composition['style_id'] = df_composition['Art. No.'].str[:-3]
df_composition['color_id'] = df_composition['Art. No.'].str[-3:]


In [10]:
# List the attribute columns found on the test product's details section.
df_composition.columns

Index(['messages.garmentLength', 'messages.waistRise', 'Fit',
       'messages.clothingStyle', 'Composition', 'Care instructions',
       'Material', 'Description', 'Imported', 'Concept', 'Nice to know',
       'Art. No.', 'style_id', 'color_id'],
      dtype='object', name=0)

In [11]:

################# Merge Color and Composition ##########################

# Columns taken from the composition frame; 'style_id' is the join key.
composition_cols = [
    'messages.garmentLength',
    'messages.waistRise',
    'Fit',
    'messages.clothingStyle',
    'Composition',
    'Care instructions',
    'Material',
    'Description',
    'Imported',
    'Concept',
    'Nice to know',
    'Art. No.',
    'style_id',
]

# Left join: every color row is kept and receives the composition rows that
# share its style_id.
df_sku = df_color.merge(df_composition[composition_cols], how='left', on='style_id')

In [12]:
# List the columns of the merged color + composition frame.
df_sku.columns

Index(['product_id', 'color_name', 'style_id', 'color_id',
       'messages.garmentLength', 'messages.waistRise', 'Fit',
       'messages.clothingStyle', 'Composition', 'Care instructions',
       'Material', 'Description', 'Imported', 'Concept', 'Nice to know',
       'Art. No.'],
      dtype='object')

In [13]:
# Keep only the identifying columns plus Fit and Composition, then drop rows
# that became exact duplicates once the other columns were removed (the first
# occurrence and the original index labels are kept).
sku_cols = ['product_id', 'color_name', 'style_id', 'color_id', 'Fit', 'Composition']
df_sku = df_sku[sku_cols].drop_duplicates()
df_sku.head()

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,690449001,Light denim blue/trashed,690449,1,Skinny fit,"Cotton 98%, Spandex 2%"
1,690449001,Light denim blue/trashed,690449,1,Skinny fit,"Pocket lining: Polyester 65%, Cotton 35%"
6,690449002,Denim blue,690449,2,Skinny fit,"Cotton 98%, Spandex 2%"
7,690449002,Denim blue,690449,2,Skinny fit,"Pocket lining: Polyester 65%, Cotton 35%"
12,690449006,Black/washed,690449,6,Skinny fit,"Cotton 98%, Spandex 2%"


### 1.2.4 Collecting Information: Color and Composition (Multiple Products)

In [14]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Columns Pattern: an empty frame with a fixed set of attribute columns. It is
# concatenated with every page's frame so that all of these columns exist even
# when a page does not list some of the attributes.
cols = ['Additional material information',
        'Art. No.',
        'Care instructions',
        'Composition',
        'Concept',
        'Description',
        'Fit',
        'Imported',
        'Material',
        'Nice to know',
        'Size',
        'messages.clothingStyle',
        'messages.garmentLength',
        'messages.waistRise']

df_pattern = pd.DataFrame(columns=cols)

# Column names seen on every processed page (one entry per page and column).
aux = []

# One frame per product page, concatenated once after the loop. Growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
sku_frames = []

# Product ids whose page could not be fetched or had no usable details section.
skipped_ids = []

# Iterate over the ids directly; positional `data.loc[i, ...]` lookups only
# work while `data` still has its default 0..n-1 index.
for product_code in data['product_id']:

    ###################### API Request #####################################

    url = 'https://www2.hm.com/en_us/productpage.' + product_code + '.html'

    # A single unreachable or failing page is recorded and skipped, so it does
    # not discard the pages that were already scraped.
    try:
        page = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        skipped_ids.append(product_code)
        continue

    if not page.ok:
        skipped_ids.append(product_code)
        continue

    # Beautiful Soup Object
    soup = BeautifulSoup(page.text, 'html.parser')

    ###################### Color Name #####################################

    product_list = soup.find_all('a', class_=['filter-option miniature', 'filter-option miniature active'])

    # Color name:
    color_name = [p.get('data-color') for p in product_list]

    # Product Id:
    product_id = [p.get('data-articlecode') for p in product_list]

    # Color Dataframe
    df_color = pd.DataFrame({'product_id': product_id, 'color_name': color_name})

    # Generate Style ID + Color ID
    df_color['style_id'] = df_color['product_id'].str[:-3]
    df_color['color_id'] = df_color['product_id'].str[-3:]

    ###################### Composition #####################################

    product_composition_list = soup.find_all('div', class_='details-attributes-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

    # Without any details item there is no header row to promote below.
    if not product_composition:
        skipped_ids.append(product_code)
        continue

    # Creating Dataframe
    df_composition = pd.DataFrame(product_composition).T                # one column per details item
    df_composition.columns = df_composition.iloc[0]                     # rename columns as the first row
    df_composition = df_composition.iloc[1:]                            # delete the first row
    df_composition = df_composition.ffill()                             # replace NA with the value above

    # The style/color ids below are derived from the article number.
    if 'Art. No.' not in df_composition.columns:
        skipped_ids.append(product_code)
        continue

    df_composition = pd.concat([df_pattern, df_composition], axis=0)    # guarantee the pattern columns exist

    # Generate Style ID + Color ID
    df_composition['style_id'] = df_composition['Art. No.'].str[:-3]
    df_composition['color_id'] = df_composition['Art. No.'].str[-3:]

    aux.extend(df_composition.columns.tolist())

    ################# Merge Color and Composition ##########################
    # NOTE(review): the join key is style_id, so the Fit/Composition rows of this
    # page are attached to every color listed on it - confirm this is intended.
    df_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

    sku_frames.append(df_sku)

# Single concatenation; an empty frame is kept when nothing was collected.
df_details = pd.concat(sku_frames, axis=0) if sku_frames else pd.DataFrame()

if skipped_ids:
    print('Skipped ' + str(len(skipped_ids)) + ' product page(s): ' + ', '.join(str(s) for s in skipped_ids))

### 1.2.5 Merging All Information

In [15]:
# Remove exact duplicate rows from the collected details. The first occurrence
# of each row is kept and the index labels are left as they are.
df_details = df_details.drop_duplicates()

In [16]:
# Attach color name, fit and composition to the listing rows. The left join
# keeps every listing row, matched on product_id.
detail_cols = ['product_id', 'color_name', 'Fit', 'Composition']
df_raw = data.merge(df_details[detail_cols], how='left', on='product_id')

In [17]:
# Write the merged raw dataset to CSV without the DataFrame index.
raw_output_path = '../data/hm_data_raw.csv'
df_raw.to_csv(raw_output_path, index=False)