# 2.1 Imports

In [1]:
# import libraries
from urllib.parse import unquote

import numpy as np
import pandas as pd

In [2]:
# ignore warnings
# NOTE(review): with no category/module argument this silences every warning
# for the rest of the session (pandas deprecation warnings included) —
# consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# category slugs; each one names a csv file read as data/<slug>.csv
categories = ['activewear', 'jackets', 'sweatshirts-hoodies']

# 2.2 'price' Cleaning

In [4]:
# pick a single category to try the cleaning steps on first;
# its csv is loaded from data/<category>.csv in the next cell
category = 'activewear'

In [5]:
# read the selected category's csv into a DataFrame
df = pd.read_csv('data/{}.csv'.format(category))

In [6]:
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...,392.0,BLACK CUT OUT PRINT,BIKINI
1,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...,392.0,BLACK CUT OUT PRINT,BRALETTE
2,ACTIVE ICON WOVEN SHORTS,HKD 413.00,image/ACTIVE ICON WOVEN SHORTS.png,https://www.calvinklein.com/hk/en/active-icon-...,413.0,RASPBERRY BLUSH,SHORTS
3,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...,483.0,CK BLK CK AOP,LEGGINGS
4,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...,903.0,BRIGHT WHITE,JACKET


In [7]:
# check data type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       12 non-null     object 
 1   price      12 non-null     object 
 2   img_file   12 non-null     object 
 3   url        12 non-null     object 
 4   num_price  12 non-null     float64
 5   color      12 non-null     object 
 6   apparel    12 non-null     object 
dtypes: float64(1), object(6)
memory usage: 800.0+ bytes


In [8]:
# Parse the numeric amount out of price strings such as 'HKD 1,393.00'.
# Strip whatever non-digit prefix precedes the number (instead of assuming it
# is exactly four characters long), drop the thousands separators, then cast.
# A string containing no number still fails in astype(float), so malformed
# prices are not silently hidden; missing prices stay NaN.
df['num_price'] = (
    df['price']
    .str.replace(r'^\D+', '', regex=True)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       12 non-null     object 
 1   price      12 non-null     object 
 2   img_file   12 non-null     object 
 3   url        12 non-null     object 
 4   num_price  12 non-null     float64
 5   color      12 non-null     object 
 6   apparel    12 non-null     object 
dtypes: float64(1), object(6)
memory usage: 800.0+ bytes


In [10]:
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...,392.0,BLACK CUT OUT PRINT,BIKINI
1,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...,392.0,BLACK CUT OUT PRINT,BRALETTE
2,ACTIVE ICON WOVEN SHORTS,HKD 413.00,image/ACTIVE ICON WOVEN SHORTS.png,https://www.calvinklein.com/hk/en/active-icon-...,413.0,RASPBERRY BLUSH,SHORTS
3,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...,483.0,CK BLK CK AOP,LEGGINGS
4,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...,903.0,BRIGHT WHITE,JACKET


# 2.3 Feature Engineering

# 2.3.1 Creating 'Apparel' feature

In [11]:
# observe the first product name
df['name'].iloc[0]

'CK ONE TANGA BIKINI'

In [12]:
# extract the 'apparel' part: the last whitespace-separated word of the name
# (e.g. 'CK ONE TANGA BIKINI' -> 'BIKINI').
# Splitting on any run of whitespace keeps a stray trailing or doubled space
# from producing an empty string, and the vectorized .str accessor leaves a
# missing name as NaN instead of raising.
df['apparel'] = df['name'].str.split().str[-1]

In [13]:
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...,392.0,BLACK CUT OUT PRINT,BIKINI
1,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...,392.0,BLACK CUT OUT PRINT,BRALETTE
2,ACTIVE ICON WOVEN SHORTS,HKD 413.00,image/ACTIVE ICON WOVEN SHORTS.png,https://www.calvinklein.com/hk/en/active-icon-...,413.0,RASPBERRY BLUSH,SHORTS
3,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...,483.0,CK BLK CK AOP,LEGGINGS
4,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...,903.0,BRIGHT WHITE,JACKET


# 2.3.2 Creating 'Color' feature

In [14]:
# observe the first url
df['url'].iloc[0]

'https://www.calvinklein.com/hk/en/ck-one-tanga-bikini-KW01435.html?dwvar_KW01435_color=BLACK%20CUT%20OUT%20PRINT'

In [15]:
# extract the 'color' part
# - capture the value that follows 'color=' in the url, stopping at the next
#   query parameter ('&') or fragment ('#') so nothing trailing leaks in
# - percent-decode it (e.g. '%20' -> ' ')
# - show '/' (decoded from '%2F') as a space
# urls without a 'color=' parameter yield NaN instead of raising IndexError

df['color'] = (
    df['url']
    .str.extract(r'color=([^&#]*)', expand=False)
    .map(lambda value: unquote(value).replace('/', ' '), na_action='ignore')
)

In [16]:
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...,392.0,BLACK CUT OUT PRINT,BIKINI
1,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...,392.0,BLACK CUT OUT PRINT,BRALETTE
2,ACTIVE ICON WOVEN SHORTS,HKD 413.00,image/ACTIVE ICON WOVEN SHORTS.png,https://www.calvinklein.com/hk/en/active-icon-...,413.0,RASPBERRY BLUSH,SHORTS
3,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...,483.0,CK BLK CK AOP,LEGGINGS
4,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...,903.0,BRIGHT WHITE,JACKET


In [17]:
# for every category, amend the csv file

def add_derived_columns(frame):
    """Return a copy of ``frame`` with the derived columns (re)computed.

    Reads the 'price', 'name' and 'url' columns and sets:

    * 'num_price' - the price as a float: any leading non-digit prefix
      (e.g. 'HKD ') and the thousands separators are removed before casting.
      A price string without a number raises ValueError in the cast.
    * 'apparel'   - the last whitespace-separated word of 'name'.
    * 'color'     - the percent-decoded value following 'color=' in 'url'
      (up to the next '&' or '#'), with '/' shown as a space; NaN when the
      url has no 'color=' parameter.

    Columns that already exist under these names are overwritten, so the
    function can be applied again to a file that was amended before.
    """
    num_price = (
        frame['price']
        .str.replace(r'^\D+', '', regex=True)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    apparel = frame['name'].str.split().str[-1]
    color = (
        frame['url']
        .str.extract(r'color=([^&#]*)', expand=False)
        .map(lambda value: unquote(value).replace('/', ' '), na_action='ignore')
    )
    return frame.assign(num_price=num_price, apparel=apparel, color=color)


for category in categories:

    csv_path = f'data/{category}.csv'

    # load data and add the derived columns
    df = add_derived_columns(pd.read_csv(csv_path))

    # save file (the input csv is overwritten in place)
    df.to_csv(csv_path, index=False)

In [18]:
# reload the 'activewear' csv (first entry of categories) and preview it

df = pd.read_csv('data/{}.csv'.format(categories[0]))
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...,392.0,BLACK CUT OUT PRINT,BIKINI
1,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...,392.0,BLACK CUT OUT PRINT,BRALETTE
2,ACTIVE ICON WOVEN SHORTS,HKD 413.00,image/ACTIVE ICON WOVEN SHORTS.png,https://www.calvinklein.com/hk/en/active-icon-...,413.0,RASPBERRY BLUSH,SHORTS
3,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...,483.0,CK BLK CK AOP,LEGGINGS
4,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...,903.0,BRIGHT WHITE,JACKET


In [19]:
# reload the 'jackets' csv (second entry of categories) and preview it

df = pd.read_csv('data/{}.csv'.format(categories[1]))
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,REFLECTION PACKABLE ZIP UP HOODIE,"HKD 1,393.00",image/REFLECTION PACKABLE ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-p...,1393.0,CRUSHED ORANGE,HOODIE
1,CHINESE NEW YEAR CAPSULE REVERSIBLE JACKET,"HKD 1,603.00",image/CHINESE NEW YEAR CAPSULE REVERSIBLE JACK...,https://www.calvinklein.com/hk/en/chinese-new-...,1603.0,YELLOW AOP,JACKET
2,SHORT HOODED WINDBREAKER,"HKD 1,253.00",image/SHORT HOODED WINDBREAKER.png,https://www.calvinklein.com/hk/en/short-hooded...,1253.0,ILLUMINATED BLK,WINDBREAKER
3,CHINESE NEW YEAR CAPSULE RELAXED DENIM TRUCKER...,"HKD 1,533.00",image/CHINESE NEW YEAR CAPSULE RELAXED DENIM T...,https://www.calvinklein.com/hk/en/chinese-new-...,1533.0,CC407 LT BLUE,JACKET
4,HYBRID DENIM JACKET,"HKD 2,093.00",image/HYBRID DENIM JACKET.png,https://www.calvinklein.com/hk/en/hybrid-denim...,2093.0,ACD632 LT BLUE,JACKET


In [20]:
# reload the 'sweatshirts-hoodies' csv (third entry of categories) and preview it

df = pd.read_csv('data/{}.csv'.format(categories[2]))
df.head()

Unnamed: 0,name,price,img_file,url,num_price,color,apparel
0,MODERN ESSENTIALS+ MICRO LOGO HOODIE,HKD 763.00,image/MODERN ESSENTIALS+ MICRO LOGO HOODIE.png,https://www.calvinklein.com/hk/en/modern-essen...,763.0,PALMA LILAC,HOODIE
1,REFLECTION HOODED CAPE,HKD 903.00,image/REFLECTION HOODED CAPE.png,https://www.calvinklein.com/hk/en/reflection-h...,903.0,PEACOAT,CAPE
2,REFLECTION INSTITUTIONAL ZIP UP HOODIE,HKD 903.00,image/REFLECTION INSTITUTIONAL ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-i...,903.0,BRIGHT WHITE,HOODIE
3,PREMIUM MIX MEDIA HOODIE,"HKD 1,183.00",image/PREMIUM MIX MEDIA HOODIE.png,https://www.calvinklein.com/hk/en/premium-mix-...,1183.0,CK BLACK,HOODIE
4,PREMIUM FLORAL PRINT SWEATSHIRT,HKD 903.00,image/PREMIUM FLORAL PRINT SWEATSHIRT.png,https://www.calvinklein.com/hk/en/premium-flor...,903.0,VIVID ORANGE,SWEATSHIRT


In [21]:
# END