## **Import Data**

In [4]:
# Refer to https://www.youtube.com/watch?v=57N1g8k2Hwc for setup
# Install kaggle
!pip install -q kaggle

In [5]:
# upload API json file (CAUTION! sensitive data)
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"gloriahwoang","key":"1cd29c7226242b39c9e9c65d290721b5"}'}

In [6]:
# create the kaggle folder and cp json file into it
!mkdir -p ~/.kaggle
!mv *json ~/.kaggle
!ls ~/.kaggle

kaggle.json


In [7]:
# change file permission
!chmod 600 ~/.kaggle/*json

In [8]:
# download HM dataset
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations

Downloading h-and-m-personalized-fashion-recommendations.zip to /content
100% 28.7G/28.7G [04:01<00:00, 159MB/s]
100% 28.7G/28.7G [04:01<00:00, 127MB/s]


In [9]:
!mkdir -p hm_data

In [10]:
# extract only csv files
!unzip -p h-and-m-personalized-fashion-recommendations customers.csv > ./hm_data/customers.csv
!unzip -p h-and-m-personalized-fashion-recommendations articles.csv > ./hm_data/articles.csv
!unzip -p h-and-m-personalized-fashion-recommendations transactions_train.csv > ./hm_data/transactions_train.csv

In [11]:
!rm *zip

In [12]:
!ls hm_data

articles.csv  customers.csv  transactions_train.csv


In [13]:
!ls

hm_data  sample_data


In [150]:
import numpy as np
import datetime
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [151]:
mydir = "./hm_data"
articles = pd.read_csv(mydir+"/articles.csv")
customers = pd.read_csv(mydir+"/customers.csv")
transactions = pd.read_csv(mydir+"/transactions_train.csv")

# **Choosing Features in Articles and Customers**




In [152]:
# Check for NaN values
customers.isna().sum()
# Note that age has 15861 nan values.

customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
postal_code                    0
dtype: int64

In [153]:
# Select our columns
customers = customers[['customer_id', 'age', 'FN', 'Active']]
articles = articles[['article_id', 'product_code', 'index_group_name', 'product_group_name', 'perceived_colour_master_name', 'perceived_colour_value_name', 'graphical_appearance_name']]

In [154]:
# Fill the NaN values in FA and Active with 0s
customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)
customers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,customer_id,age,FN,Active
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,49.0,0.0,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,25.0,0.0,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,0.0,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,54.0,0.0,0.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,52.0,1.0,1.0
...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,24.0,0.0,0.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,21.0,0.0,0.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,21.0,1.0,1.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,18.0,1.0,1.0


# **【Filter 1】Customer: bought 2-8 items -> 466175**

In [155]:
# group transactions by customer id
transactions_byid_n = transactions.groupby('customer_id').count()

In [156]:
# Select the customer id's of people who purchased 2-8 items
uids_include0 = np.asarray(transactions_byid_n.query('article_id <8 and article_id>1').index)
len(uids_include0)

474049

In [157]:
# create a dataframe of the customer ids
df_cus_id_filter = pd.DataFrame(uids_include0)
df_cus_id_filter.rename(columns={0:'customer_id'}, inplace = True)
df_cus_id_filter

Unnamed: 0,customer_id
0,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...
1,000064249685c11552da43ef22a5030f35a147f723d5b0...
2,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...
4,00008469a21b50b3d147c97135e25b4201a8c58997f787...
...,...
474044,ffff25c78688e1c34e48a4e34b9a953bde663cf937e715...
474045,ffff2d1849db66617499febae392fb5e9335ebf160de0e...
474046,ffff7d65748db4d52e48b74c8f83ccb0029fc3bbafa511...
474047,ffffa28cd7ab5d1cbbbfe7b582b1c419270cc0539f3dae...


In [158]:
# left merge on the filtered customer ids with our original customer id data set
customers = df_cus_id_filter.merge(customers, 
                                     on = "customer_id", 
                                     how = "left")
customers

# THIS IS THE FILTERED CUSTOMERS DATAFRAME

Unnamed: 0,customer_id,age,FN,Active
0,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,54.0,0.0,0.0
1,000064249685c11552da43ef22a5030f35a147f723d5b0...,,0.0,0.0
2,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,20.0,0.0,0.0
3,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,20.0,0.0,0.0
4,00008469a21b50b3d147c97135e25b4201a8c58997f787...,20.0,0.0,0.0
...,...,...,...,...
474044,ffff25c78688e1c34e48a4e34b9a953bde663cf937e715...,24.0,0.0,0.0
474045,ffff2d1849db66617499febae392fb5e9335ebf160de0e...,32.0,0.0,0.0
474046,ffff7d65748db4d52e48b74c8f83ccb0029fc3bbafa511...,20.0,1.0,1.0
474047,ffffa28cd7ab5d1cbbbfe7b582b1c419270cc0539f3dae...,22.0,1.0,1.0


In [159]:
len(customers)

474049

In [160]:
customers.isna().sum()

customer_id       0
age            7874
FN                0
Active            0
dtype: int64

In [161]:
customers = customers.dropna()

In [162]:
len(customers)

466175

# **【Filter 2】Item: more than 50 purchases -> 56696**

In [163]:
# group transactions by article id
transactions_byprod_n = transactions.groupby('article_id').count()

In [164]:
# Select the customer id's of items that were purchased over 50 in count
iids_include0 = np.asarray(transactions_byprod_n.query('customer_id >50').index)
len(iids_include0)

56696

In [165]:
# create a dataframe of the customer ids
df_cus_id_filter_articles = pd.DataFrame(iids_include0)
df_cus_id_filter_articles.rename(columns={0:'article_id'}, inplace = True)
df_cus_id_filter_articles

Unnamed: 0,article_id
0,108775015
1,108775044
2,108775051
3,110065001
4,110065002
...,...
56691,947060001
56692,947509001
56693,949198001
56694,949551001


In [166]:
# left merge on the filtered customer ids with our original customer id data set
articles = df_cus_id_filter_articles.merge(articles, 
                                     on = "article_id", 
                                     how = "left")
articles

# THIS IS THE FILTERED ARTICLES DATAFRAME

Unnamed: 0,article_id,product_code,index_group_name,product_group_name,perceived_colour_master_name,perceived_colour_value_name,graphical_appearance_name
0,108775015,108775,Ladieswear,Garment Upper body,Black,Dark,Solid
1,108775044,108775,Ladieswear,Garment Upper body,White,Light,Solid
2,108775051,108775,Ladieswear,Garment Upper body,White,Dusty Light,Stripe
3,110065001,110065,Ladieswear,Underwear,Black,Dark,Solid
4,110065002,110065,Ladieswear,Underwear,White,Light,Solid
...,...,...,...,...,...,...,...
56691,947060001,947060,Menswear,Nightwear,Blue,Dark,All over pattern
56692,947509001,947509,Ladieswear,Accessories,Brown,Medium Dusty,Solid
56693,949198001,949198,Ladieswear,Garment Lower body,Black,Dark,Solid
56694,949551001,949551,Divided,Garment Upper body,Black,Dark,Solid


In [167]:
len(articles)

56696

# **【Filter 3】Transactions -> 534856**

In [168]:
len(transactions)

31788324

In [169]:
# Left merge to extract transactions from our filtered customers only
transactions = customers.merge(transactions, 
                                     on = "customer_id", 
                                     how = "left")

# Left merge to extract transactions from our filtered products only
transactions = articles.merge(transactions, 
                                     on = "article_id", 
                                     how = "left")

In [170]:
len(transactions)

1787831

# **Articles Data Cleaning**

In [171]:
# product_group_name - categorizing small distributions as others
articles['product_group_name'].mask(articles['product_group_name'] == 'Items', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Unknown', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Underwear/nightwear', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Cosmetic', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Interior textile', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Bags', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Furniture', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Garment and Shoe care', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Stationery', "Others", inplace=True)
articles['product_group_name'].mask(articles['product_group_name'] == 'Fun', "Others", inplace=True)

In [172]:
# perceived_colour_master_name - categorizing small distributions as others
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Lilac Purple', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Mole', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Orange', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Metal', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Brown', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Khaki green', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Yellow', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Green', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Turquoise', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'undefined', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Unknown', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Yellowish Green', "Others", inplace=True)
articles['perceived_colour_master_name'].mask(articles['perceived_colour_master_name'] == 'Bluish Green', "Others", inplace=True)

In [173]:
# perceived_colour_value_name - categorizing small distributions as others
articles['perceived_colour_value_name'].mask(articles['perceived_colour_value_name'] == 'Undefined', "Others", inplace=True)
articles['perceived_colour_value_name'].mask(articles['perceived_colour_value_name'] == 'Unknown', "Others", inplace=True)

In [174]:
# graphical_appearance_name - categorizing small distributions as others
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Transparent', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Metallic', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Dot', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Other structure', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Contrast', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Treatment', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Glittering/Metallic', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Mixed solid/pattern', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Other pattern', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Neps', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Placement print', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Application/3D', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Chambray', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Front print', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Jacquard', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Check', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Unknown', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Colour blocking', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Argyle', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Lace', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Slub', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Mesh', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Embroidery', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Sequin', "Others", inplace=True)
articles['graphical_appearance_name'].mask(articles['graphical_appearance_name'] == 'Hologram', "Others", inplace=True)


## **Index Group Name**

In [175]:
from sklearn.preprocessing import OneHotEncoder

#creating instance of one-hot-encoder
encoder_ign = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_index_group_name = pd.DataFrame(encoder_ign.fit_transform(articles[['index_group_name']]).toarray())
encoder_index_group_name.columns = encoder_ign.get_feature_names()




In [176]:
#merge one-hot encoded columns back with original DataFrame
articles_final = articles.join(encoder_index_group_name)
articles_final

Unnamed: 0,article_id,product_code,index_group_name,product_group_name,perceived_colour_master_name,perceived_colour_value_name,graphical_appearance_name,x0_Baby/Children,x0_Divided,x0_Ladieswear,x0_Menswear,x0_Sport
0,108775015,108775,Ladieswear,Garment Upper body,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0
1,108775044,108775,Ladieswear,Garment Upper body,White,Light,Solid,0.0,0.0,1.0,0.0,0.0
2,108775051,108775,Ladieswear,Garment Upper body,White,Dusty Light,Stripe,0.0,0.0,1.0,0.0,0.0
3,110065001,110065,Ladieswear,Underwear,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0
4,110065002,110065,Ladieswear,Underwear,White,Light,Solid,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,Menswear,Nightwear,Blue,Dark,All over pattern,0.0,0.0,0.0,1.0,0.0
56692,947509001,947509,Ladieswear,Accessories,Others,Medium Dusty,Solid,0.0,0.0,1.0,0.0,0.0
56693,949198001,949198,Ladieswear,Garment Lower body,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0
56694,949551001,949551,Divided,Garment Upper body,Black,Dark,Solid,0.0,1.0,0.0,0.0,0.0


In [177]:
#drop 'index_group_name' column
articles_final.drop('index_group_name', axis=1, inplace=True)

#rename columns
articles_final.rename(columns={'x0_Baby/Children':'Index: Baby/Children',
                               'x0_Divided':'Index: Divided',
                               'x0_Ladieswear':'Index: Ladieswear',
                               'x0_Menswear': 'Index: Menswear',
                               'x0_Sport':'Index: Sport'},
                      inplace = True)
                    

articles_final

Unnamed: 0,article_id,product_code,product_group_name,perceived_colour_master_name,perceived_colour_value_name,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport
0,108775015,108775,Garment Upper body,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0
1,108775044,108775,Garment Upper body,White,Light,Solid,0.0,0.0,1.0,0.0,0.0
2,108775051,108775,Garment Upper body,White,Dusty Light,Stripe,0.0,0.0,1.0,0.0,0.0
3,110065001,110065,Underwear,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0
4,110065002,110065,Underwear,White,Light,Solid,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,Nightwear,Blue,Dark,All over pattern,0.0,0.0,0.0,1.0,0.0
56692,947509001,947509,Accessories,Others,Medium Dusty,Solid,0.0,0.0,1.0,0.0,0.0
56693,949198001,949198,Garment Lower body,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0
56694,949551001,949551,Garment Upper body,Black,Dark,Solid,0.0,1.0,0.0,0.0,0.0


## **Product Group Name**

In [178]:
#creating instance of one-hot-encoder
encoder_pgn = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_product_group_name = pd.DataFrame(encoder_pgn.fit_transform(articles_final[['product_group_name']]).toarray())
encoder_product_group_name.columns = encoder_pgn.get_feature_names()

#merge one-hot encoded columns back with original DataFrame
articles_final = articles_final.join(encoder_product_group_name)

#drop 'product_group_name' column
articles_final.drop('product_group_name', axis=1, inplace=True)

articles_final



Unnamed: 0,article_id,product_code,perceived_colour_master_name,perceived_colour_value_name,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,x0_Accessories,x0_Garment Full body,x0_Garment Lower body,x0_Garment Upper body,x0_Nightwear,x0_Others,x0_Shoes,x0_Socks & Tights,x0_Swimwear,x0_Underwear
0,108775015,108775,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,108775044,108775,White,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,108775051,108775,White,Dusty Light,Stripe,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110065001,110065,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,110065002,110065,White,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,Blue,Dark,All over pattern,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,Others,Medium Dusty,Solid,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56693,949198001,949198,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56694,949551001,949551,Black,Dark,Solid,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
articles_final.columns

Index(['article_id', 'product_code', 'perceived_colour_master_name',
       'perceived_colour_value_name', 'graphical_appearance_name',
       'Index: Baby/Children', 'Index: Divided', 'Index: Ladieswear',
       'Index: Menswear', 'Index: Sport', 'x0_Accessories',
       'x0_Garment Full body', 'x0_Garment Lower body',
       'x0_Garment Upper body', 'x0_Nightwear', 'x0_Others', 'x0_Shoes',
       'x0_Socks & Tights', 'x0_Swimwear', 'x0_Underwear'],
      dtype='object')

In [180]:
#rename columns

articles_final.rename(columns={'x0_Accessories':'Product: Accessories',
                               'x0_Garment Full body':'Product: Garment Full body',
                               'x0_Garment Lower body':'Product: Garment Lower body',
                               'x0_Garment Upper body': 'Product: Garment Upper body',
                               'x0_Nightwear':'Product: Nightwear',
                               'x0_Others':'Product: Others',
                               'x0_Shoes':'Product: Shoes',
                               'x0_Socks & Tights':'Product: Socks & Tights',
                               'x0_Swimwear':'Product: Swimwear',
                               'x0_Underwear':'Product: Underwear',
                               },
                      inplace = True)

articles_final

Unnamed: 0,article_id,product_code,perceived_colour_master_name,perceived_colour_value_name,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,Product: Garment Full body,Product: Garment Lower body,Product: Garment Upper body,Product: Nightwear,Product: Others,Product: Shoes,Product: Socks & Tights,Product: Swimwear,Product: Underwear
0,108775015,108775,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,108775044,108775,White,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,108775051,108775,White,Dusty Light,Stripe,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110065001,110065,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,110065002,110065,White,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,Blue,Dark,All over pattern,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,Others,Medium Dusty,Solid,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56693,949198001,949198,Black,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56694,949551001,949551,Black,Dark,Solid,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Perceived Colour Master Name**

In [181]:
#creating instance of one-hot-encoder
encoder_pcmn = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_colour_master_name = pd.DataFrame(encoder_pcmn.fit_transform(articles_final[['perceived_colour_master_name']]).toarray())
encoder_colour_master_name.columns = encoder_pcmn.get_feature_names()

#merge one-hot encoded columns back with original DataFrame
articles_final = articles_final.join(encoder_colour_master_name)

#drop 'product_group_name' column
articles_final.drop('perceived_colour_master_name', axis=1, inplace=True)

articles_final



Unnamed: 0,article_id,product_code,perceived_colour_value_name,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,...,Product: Swimwear,Product: Underwear,x0_Beige,x0_Black,x0_Blue,x0_Grey,x0_Others,x0_Pink,x0_Red,x0_White
0,108775015,108775,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,108775044,108775,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,108775051,108775,Dusty Light,Stripe,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,110065001,110065,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110065002,110065,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,Dark,All over pattern,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,Medium Dusty,Solid,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
56693,949198001,949198,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
56694,949551001,949551,Dark,Solid,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
articles_final.columns

Index(['article_id', 'product_code', 'perceived_colour_value_name',
       'graphical_appearance_name', 'Index: Baby/Children', 'Index: Divided',
       'Index: Ladieswear', 'Index: Menswear', 'Index: Sport',
       'Product: Accessories', 'Product: Garment Full body',
       'Product: Garment Lower body', 'Product: Garment Upper body',
       'Product: Nightwear', 'Product: Others', 'Product: Shoes',
       'Product: Socks & Tights', 'Product: Swimwear', 'Product: Underwear',
       'x0_Beige', 'x0_Black', 'x0_Blue', 'x0_Grey', 'x0_Others', 'x0_Pink',
       'x0_Red', 'x0_White'],
      dtype='object')

In [183]:
#rename columns

articles_final.rename(columns={'x0_Beige':'Color Master: Baby/Beige',
                               'x0_Black':'Color Master: Black',
                               'x0_Blue':'Color Master: Blue',
                               'x0_Grey': 'Color Master: Grey',
                               'x0_Others':'Color Master: Others',
                               'x0_Pink':'Color Master: Pink',
                               'x0_Red':'Color Master: Red',
                               'x0_White':'Color Master: White',
                               },
                      inplace = True)

articles_final

Unnamed: 0,article_id,product_code,perceived_colour_value_name,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,...,Product: Swimwear,Product: Underwear,Color Master: Baby/Beige,Color Master: Black,Color Master: Blue,Color Master: Grey,Color Master: Others,Color Master: Pink,Color Master: Red,Color Master: White
0,108775015,108775,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,108775044,108775,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,108775051,108775,Dusty Light,Stripe,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,110065001,110065,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110065002,110065,Light,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,Dark,All over pattern,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,Medium Dusty,Solid,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
56693,949198001,949198,Dark,Solid,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
56694,949551001,949551,Dark,Solid,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Perceived Colour Value Name**

In [184]:
#creating instance of one-hot-encoder
encoder_pcvn = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_colour_value_name = pd.DataFrame(encoder_pcvn.fit_transform(articles_final[['perceived_colour_value_name']]).toarray())
encoder_colour_value_name.columns = encoder_pcvn.get_feature_names()

#merge one-hot encoded columns back with original DataFrame
articles_final = articles_final.join(encoder_colour_value_name)

#drop 'product_group_name' column
articles_final.drop('perceived_colour_value_name', axis=1, inplace=True)

articles_final



Unnamed: 0,article_id,product_code,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,Product: Garment Full body,...,Color Master: Pink,Color Master: Red,Color Master: White,x0_Bright,x0_Dark,x0_Dusty Light,x0_Light,x0_Medium,x0_Medium Dusty,x0_Others
0,108775015,108775,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,108775044,108775,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,108775051,108775,Stripe,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,110065001,110065,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,110065002,110065,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,All over pattern,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,Solid,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
56693,949198001,949198,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56694,949551001,949551,Solid,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [185]:
articles_final.columns

Index(['article_id', 'product_code', 'graphical_appearance_name',
       'Index: Baby/Children', 'Index: Divided', 'Index: Ladieswear',
       'Index: Menswear', 'Index: Sport', 'Product: Accessories',
       'Product: Garment Full body', 'Product: Garment Lower body',
       'Product: Garment Upper body', 'Product: Nightwear', 'Product: Others',
       'Product: Shoes', 'Product: Socks & Tights', 'Product: Swimwear',
       'Product: Underwear', 'Color Master: Baby/Beige', 'Color Master: Black',
       'Color Master: Blue', 'Color Master: Grey', 'Color Master: Others',
       'Color Master: Pink', 'Color Master: Red', 'Color Master: White',
       'x0_Bright', 'x0_Dark', 'x0_Dusty Light', 'x0_Light', 'x0_Medium',
       'x0_Medium Dusty', 'x0_Others'],
      dtype='object')

In [186]:
#rename columns

articles_final.rename(columns={'x0_Bright':'Color Value: Bright',
                               'x0_Dark':'Color Value: Dark',
                               'x0_Dusty Light':'Color Value: Dusty Light',
                               'x0_Light': 'Color Value: Light',
                               'x0_Medium':'Color Value: Medium',
                               'x0_Medium Dusty':'Color Value: Medium Dusty',
                               'x0_Others':'Color Value: Others'},
                      inplace = True)

articles_final

Unnamed: 0,article_id,product_code,graphical_appearance_name,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,Product: Garment Full body,...,Color Master: Pink,Color Master: Red,Color Master: White,Color Value: Bright,Color Value: Dark,Color Value: Dusty Light,Color Value: Light,Color Value: Medium,Color Value: Medium Dusty,Color Value: Others
0,108775015,108775,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,108775044,108775,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,108775051,108775,Stripe,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,110065001,110065,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,110065002,110065,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,All over pattern,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,Solid,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
56693,949198001,949198,Solid,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56694,949551001,949551,Solid,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## **Graphical Appearance Name**

In [187]:
#creating instance of one-hot-encoder
encoder_gan = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_graph_appear_name = pd.DataFrame(encoder_gan.fit_transform(articles_final[['graphical_appearance_name']]).toarray())
encoder_graph_appear_name.columns = encoder_gan.get_feature_names()

#merge one-hot encoded columns back with original DataFrame
articles_final = articles_final.join(encoder_graph_appear_name)

#drop 'product_group_name' column
articles_final.drop('graphical_appearance_name', axis=1, inplace=True)

articles_final



Unnamed: 0,article_id,product_code,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,Product: Garment Full body,Product: Garment Lower body,...,Color Value: Light,Color Value: Medium,Color Value: Medium Dusty,Color Value: Others,x0_All over pattern,x0_Denim,x0_Melange,x0_Others,x0_Solid,x0_Stripe
0,108775015,108775,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,108775044,108775,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,108775051,108775,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,110065001,110065,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,110065002,110065,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
56693,949198001,949198,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
56694,949551001,949551,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [188]:
articles_final.columns

Index(['article_id', 'product_code', 'Index: Baby/Children', 'Index: Divided',
       'Index: Ladieswear', 'Index: Menswear', 'Index: Sport',
       'Product: Accessories', 'Product: Garment Full body',
       'Product: Garment Lower body', 'Product: Garment Upper body',
       'Product: Nightwear', 'Product: Others', 'Product: Shoes',
       'Product: Socks & Tights', 'Product: Swimwear', 'Product: Underwear',
       'Color Master: Baby/Beige', 'Color Master: Black', 'Color Master: Blue',
       'Color Master: Grey', 'Color Master: Others', 'Color Master: Pink',
       'Color Master: Red', 'Color Master: White', 'Color Value: Bright',
       'Color Value: Dark', 'Color Value: Dusty Light', 'Color Value: Light',
       'Color Value: Medium', 'Color Value: Medium Dusty',
       'Color Value: Others', 'x0_All over pattern', 'x0_Denim', 'x0_Melange',
       'x0_Others', 'x0_Solid', 'x0_Stripe'],
      dtype='object')

In [189]:
#rename columns

articles_final.rename(columns={'x0_All over pattern':'Graphical: All over pattern',
                               'x0_Denim':'Graphical: Denim',
                               'x0_Melange':'Graphical: Melange',
                               'x0_Others': 'Graphical: Others',
                               'x0_Solid':'Graphical: Solid',
                               'x0_Stripe':'Graphical: Stripe'},
                      inplace = True)

articles_final

Unnamed: 0,article_id,product_code,Index: Baby/Children,Index: Divided,Index: Ladieswear,Index: Menswear,Index: Sport,Product: Accessories,Product: Garment Full body,Product: Garment Lower body,...,Color Value: Light,Color Value: Medium,Color Value: Medium Dusty,Color Value: Others,Graphical: All over pattern,Graphical: Denim,Graphical: Melange,Graphical: Others,Graphical: Solid,Graphical: Stripe
0,108775015,108775,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,108775044,108775,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,108775051,108775,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,110065001,110065,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,110065002,110065,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56691,947060001,947060,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
56692,947509001,947509,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
56693,949198001,949198,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
56694,949551001,949551,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# **Split Train, Valid, Test Set**

In [149]:
from sklearn.model_selection import train_test_split

In [None]:
#train, test_pre = train_test_split(transactions2_sub, test_size=0.2, random_state=seed)
#valid, test = train_test_split(test_pre, test_size=0.5, random_state=seed)