In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
##loading the articles dataset
articles=pd.read_csv('articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
##checking columns in articles
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [4]:
##checking a few categorical columns from the dataframe
subset=articles[['article_id','product_group_name','colour_group_name','department_name',
                 'index_name','index_group_name','section_name','garment_group_name']]
subset.head()

Unnamed: 0,article_id,product_group_name,colour_group_name,department_name,index_name,index_group_name,section_name,garment_group_name
0,108775015,Garment Upper body,Black,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic
1,108775044,Garment Upper body,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic
2,108775051,Garment Upper body,Off White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic
3,110065001,Underwear,Black,Clean Lingerie,Lingeries/Tights,Ladieswear,Womens Lingerie,"Under-, Nightwear"
4,110065002,Underwear,White,Clean Lingerie,Lingeries/Tights,Ladieswear,Womens Lingerie,"Under-, Nightwear"


In [5]:
##checking categorical values and distributions in each of the columns in subset
target_cols=['article_id']
for col in subset.columns:
    print (f'column: {col}')
    ##check number of unique categories in column
    num_categories=subset[col].nunique()
    print(f'num_categories:{num_categories}')
    ##for the purpose of sampling, we should look for columns that do not have too many or too less unique categories
    ##too many unique categories will lead to sparse connections in the graph and may hinder a solution
    ##too less categories will make the resulting dataset large, and the solution may end up becoming too generalised
    if num_categories>=5 and num_categories<=10:
        target_cols.append(col)
        print (subset[col].value_counts())
    print('\n')
    print('-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-')
    print('\n')

print(target_cols)

column: article_id
num_categories:105542


-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-


column: product_group_name
num_categories:19


-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-


column: colour_group_name
num_categories:50


-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-


column: department_name
num_categories:250


-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-


column: index_name
num_categories:10
Ladieswear                        26001
Divided                           15149
Menswear                          12553
Children Sizes 92-140             12007
Children Sizes 134-170             9214
Baby Sizes 50-98                   8875
Ladies Accessories                 6961
Lingeries/Tights                   6775
Children Accessories, Swimwear     4615
Sport                              3392
Name: index_name, dtype: int64


-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-X-


column: index_group_name
num_categories:5
Ladieswear       39737
Baby/Children    34711
Divided          15149
Menswear         12553
Sport          

In [6]:
##reducing subset to only the target columns
subset=subset[target_cols]

In [7]:
subset.head()

Unnamed: 0,article_id,index_name,index_group_name
0,108775015,Ladieswear,Ladieswear
1,108775044,Ladieswear,Ladieswear
2,108775051,Ladieswear,Ladieswear
3,110065001,Lingeries/Tights,Ladieswear
4,110065002,Lingeries/Tights,Ladieswear


In [8]:
##importing the transactions dataset and merging it with the subset
transactions=pd.read_csv('transactions_train.csv',usecols=['t_dat','article_id','customer_id'])
subset=pd.merge(transactions,subset,on='article_id')
del(transactions)

In [9]:
##checking proportions of categorical features and their pairs in transactions data

In [10]:
subset['index_name'].value_counts(normalize=True)

Ladieswear                        0.410789
Divided                           0.224556
Lingeries/Tights                  0.175607
Ladies Accessories                0.055830
Menswear                          0.055714
Sport                             0.039210
Children Sizes 134-170            0.015839
Children Sizes 92-140             0.011027
Baby Sizes 50-98                  0.007995
Children Accessories, Swimwear    0.003435
Name: index_name, dtype: float64

In [11]:
subset['index_group_name'].value_counts(normalize=True)

Ladieswear       0.642225
Divided          0.224556
Menswear         0.055714
Sport            0.039210
Baby/Children    0.038295
Name: index_group_name, dtype: float64

In [12]:
subset[['index_group_name','index_name']].value_counts(normalize=True)

index_group_name  index_name                    
Ladieswear        Ladieswear                        0.410789
Divided           Divided                           0.224556
Ladieswear        Lingeries/Tights                  0.175607
                  Ladies Accessories                0.055830
Menswear          Menswear                          0.055714
Sport             Sport                             0.039210
Baby/Children     Children Sizes 134-170            0.015839
                  Children Sizes 92-140             0.011027
                  Baby Sizes 50-98                  0.007995
                  Children Accessories, Swimwear    0.003435
dtype: float64

In [13]:
##the 'index_name' feature is essentially a more specific catergorization of 'index_group_name'
##the three least freuqent categories in 'index_group_name' make for reasonably sized datasets for us to start working on
cat_list=['Menswear','Sport','Baby/Children']

In [14]:
##finding indices for the three aforementioned categories
cat_dict={} ##dict for storing categories and corresponding indices
for cat in cat_list:
    indices=subset[subset['index_group_name']==cat].index
    cat_dict[cat]=indices
del(cat_list,subset)
cat_dict

{'Menswear': Int64Index([   27777,    27778,    27779,    27780,    27781,    27782,
                27783,    27784,    27785,    27786,
             ...
             31788181, 31788182, 31788228, 31788234, 31788256, 31788315,
             31788316, 31788317, 31788318, 31788319],
            dtype='int64', length=1771053),
 'Sport': Int64Index([   49931,    49932,    49933,    49934,    49935,    49936,
                49937,    49938,    49939,    49940,
             ...
             31788079, 31788080, 31788081, 31788082, 31788083, 31788084,
             31788085, 31788086, 31788269, 31788304],
            dtype='int64', length=1246408),
 'Baby/Children': Int64Index([   32159,    32160,    32161,    32162,    32163,    32164,
                32165,    32166,    32167,    32168,
             ...
             31788301, 31788302, 31788307, 31788308, 31788310, 31788314,
             31788320, 31788321, 31788322, 31788323],
            dtype='int64', length=1217349)}

In [15]:
cat_dict['Children']=cat_dict['Baby/Children'] ##'/' indicates directory and causes error when saving
del cat_dict['Baby/Children']

In [16]:
##creating data subsets corresponding to each of the three categories
transactions=pd.read_csv('transactions_train.csv')
articles=pd.read_csv('articles.csv')
customers=pd.read_csv('customers.csv',usecols=['customer_id','club_member_status','fashion_news_frequency','age','postal_code'])

for cat in cat_dict.keys():
    ##reducing the transactions dataset to the required size
    main=transactions.iloc[cat_dict[cat]]
    ##merging main with customers
    main=pd.merge(main,customers,on='customer_id')
    ##merging main with articles
    main=pd.merge(main,articles,on='article_id')
    ##resetting index
    main.reset_index(inplace=True)
    main.drop('index',axis=1,inplace=True)
    ##exporting to csv
    main.to_csv(f'{cat}_category_name.csv',index_label=False)
    
del(transactions,articles,customers)

In [None]:
cat_dict.keys()

In [None]:
'Menswear'.split('/')