# O que é Granularidade?

Granularidade é o nível de detalhe no qual o dado é armazenado no banco de dados.<br>
( Jan L. Harrington, in Relational Database Design and Implementation)

In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv( 'products_hm.csv' )
# product id
data = data.dropna( subset=['product_id'] )
data['product_id'] = data['product_id'].astype( int )
# product name
data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ','_' ).lower() )
# product price
data['product_price'] = data['product_price'].apply( lambda x: x.replace( '$ ','' ) ).astype( float )
# # scrapy datetime
data['scrapy_datetime'] = pd.to_datetime( data['scrapy_datetime'],format='%Y-%m-%d %H:%M:%S' )
# # style id
data['style_id'] = data['style_id'].astype( int )
# # color id
data['color_id'] = data['color_id'].astype( int )
                   
# # color name
data['product_color'] = data['product_color'].apply( lambda x: x.replace( ' ', '_' ).replace( '/', '_' ).lower() if pd.notnull( x ) else x )
# # fit
data['Fit'] = data['Fit'].apply( lambda x: x.replace( ' ', '_' ).lower() if pd.notnull( x ) else x )
# # size number
data['size_number'] = data['Size'].apply( lambda x: re.search( '\d{2}\.\d', x ).group(0) if pd.notnull( x ) else x )
# data['size_number'] = data['size_number'].apply( lambda x: re.search( '\d+', x).group(0) if pd.notnull( x ) else x )

# # size model
data['size_model'] = data['Size'].str.extract( '\(Size (.*?)\)' )
data = data.drop( columns=['Size', 'Care instructions', 'Concept', 'Description',
       'Imported', 'Length', 'Material', 'More sustainable materials', 'Nice to know', 'Rise', 'Style'], axis=1 )

# composition
data = data[~data['Composition'].str.contains('Shell:', na=False)]
data = data[~data['Composition'].str.contains('Lining:', na=False)]
data = data[~data['Composition'].str.contains('Pocket lining:', na=False)]

# break composition by comma
df1 = data['Composition'].str.split( ',', expand=True )

# cotton | polyester | Elastomultiester | Spandex 
df_ref = pd.DataFrame( index=np.arange( len( data ) ), columns=['cotton','polyester', 'elastomultiester', 'spandex'] )

# cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat( [df_ref, df_cotton ], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last')]

# polyester
df_polyester = df1.loc[df1[1].str.contains( 'Polyester', na=True ), 1]
df_polyester.name = 'polyester'
df_ref = pd.concat( [df_ref, df_polyester], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]

# elastomultiester
df_elastomultiester = df1.loc[df1[1].str.contains( 'Elastomultiester', na=True ), 1]
df_elastomultiester.name = 'elastomultiester'
df_ref = pd.concat( [df_ref, df_elastomultiester], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]

# spandex
df_spandex = df1.loc[df1[1].str.contains( 'Spandex', na=True ), 1]
df_spandex.name = 'spandex'
df_ref = pd.concat( [df_ref, df_spandex], axis=1 )
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated( keep='last') ]

# final join
data = pd.concat( [data, df_ref], axis=1 )

# format composition data
data['cotton'] = data['cotton'].apply( lambda x: int( re.search( '\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )
data['polyester'] = data['polyester'].apply( lambda x: int( re.search( '\d+', x).group(0) ) / 100 if pd.notnull( x ) else x )
data['elastomultiester'] = data['elastomultiester'].apply( lambda x: int( re.search( '\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )
data['spandex'] = data['spandex'].apply( lambda x: int( re.search( '\d+', x ).group(0) ) / 100 if pd.notnull( x ) else x )

data.head()

Unnamed: 0,product_id,product_name,product_category,product_price,scrapy_datetime,style_id,color_id,product_color,Composition,Fit,size_number,size_model,cotton,polyester,elastomultiester,spandex
644,690449043.0,skinny_jeans,men_jeans_ripped,39.99,2022-05-23 20:56:47,690449.0,43.0,light_denim_blue_trashed,"Cotton 98%, Spandex 2%",skinny_fit,,,0.98,,,0.02
650,690449043.0,skinny_jeans,men_jeans_ripped,39.99,2022-05-23 20:56:47,690449.0,43.0,denim_blue,"Cotton 98%, Spandex 2%",skinny_fit,,,0.98,,,0.02
656,690449043.0,skinny_jeans,men_jeans_ripped,39.99,2022-05-23 20:56:47,690449.0,43.0,black_washed,"Cotton 98%, Spandex 2%",skinny_fit,,,0.98,,,0.02
662,690449043.0,skinny_jeans,men_jeans_ripped,39.99,2022-05-23 20:56:47,690449.0,43.0,light_denim_blue,"Cotton 98%, Spandex 2%",skinny_fit,,,0.98,,,0.02
668,690449043.0,skinny_jeans,men_jeans_ripped,39.99,2022-05-23 20:56:47,690449.0,43.0,black_washed_out,"Cotton 98%, Spandex 2%",skinny_fit,,,0.98,,,0.02
