In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Input CSV, given relative to the notebook's working directory.
DATA_PATH = 'data/data_after_EDA.csv'

# Allow up to 60 columns in rendered frames so wide tables are not truncated.
pd.options.display.max_columns = 60

In [None]:
# Read the dataset; `doc_date` is parsed to datetimes while loading.
data = pd.read_csv(
    DATA_PATH,
    sep=',',
    parse_dates=['doc_date'],
    low_memory=False,
)

In [4]:
# Preview the first five rows to check that the file loaded as expected.
data.head(5)

Unnamed: 0,bill_country,setting_currency_id,shop_basket_id,doc_date,exchange_currency_rate,original_currency_code,basket_total_price_with_vat,count_basket_items,basket_count_products,basket_type,item_quantity,item_type,item_unit_price_with_vat,item_unit_price_without_vat,item_total_discount_with_vat,product_id,product_code,catalog_category_id,catalog_brand_id,product_name,product_status,reviews_count,reviews_average_score_price,reviews_average_score_quality,reviews_average_score_properties,reviews_average_score_overall,reviews_average_score,is_in_stock,is_ended,is_new,is_boosted,product_purchase_price,eshop_stock_count,is_fifo,product_name_parameterize,category,tree_path,category_name_parameterized,category_status,catalog_segment_id,categories_ancestor_ids,categories_descendant_ids,category_full_name_path,default_warranty_period,brand_name,brand_parameterized,segment_name,segment_parameterized
0,BG,1,1136409,2020-04-26,1.9558,BGN,345.0,18,18,standard,1,standard,5.34,4.45,0.28,39848,225542,179.0,145,MC-2,active,83,95.192308,93.653846,93.653846,94.423077,94.277108,True,False,False,False,2.205,-2.0,False,mc-2,Kapodastre pre gitaru s kovovými strunami,001:001:019:001,kapodastre-pre-gitaru-s-kovovymi-strunami,active,1.0,"{3,4,178}",{},"{Music,Guitars,Capos,""Capo for acoustic guitar""}",24.0,Musedo,musedo,Music,music
1,BG,1,1136409,2020-04-26,1.9558,BGN,345.0,18,18,standard,1,standard,2.91,2.43,0.15,140119,305503,762.0,109,TGC026 1m,active,1,0.0,0.0,0.0,0.0,100.0,False,False,False,False,1.385,0.0,False,tgc026-1m,Hotové nástrojové káble,001:005:010:001:001,hotove-nastrojove-kable,active,1.0,"{3,607,760,761}",{},"{Music,""Studio / PA"",Cables,""Complete Cables"",...",24.0,Lewitz,lewitz,Music,music
2,BG,1,1136409,2020-04-26,1.9558,BGN,345.0,18,18,standard,1,standard,0.49,0.4,0.03,34663,220357,153.0,13,351 Shape Premium Pick Medium White Moto,active,46,92.258065,93.225806,94.516129,94.193548,94.804348,True,False,False,False,0.164984,303.0,False,351-shape-premium-pick-medium-white-moto,Medium trsátka,001:001:015:002,medium-trsatka,active,1.0,"{3,4,151}",{},"{Music,Guitars,""Guitar Picks"",""Medium Picks""}",24.0,Fender,fender,Music,music
3,BG,1,1136409,2020-04-26,1.9558,BGN,345.0,18,18,standard,1,standard,4.86,4.05,0.26,63176,248871,192.0,320,536501 Foot Rest Blue,active,10,92.5,87.5,92.5,90.0,92.5,True,False,False,False,3.243455,55.0,False,536501-foot-rest-blue,Gitarové podnožky,001:001:024,gitarove-podnozky,active,1.0,"{3,4}",{},"{Music,Guitars,""Guitar Foot Rest""}",24.0,GEWA,gewa,Music,music
4,BG,1,1136409,2020-04-26,1.9558,BGN,345.0,18,18,standard,1,standard,0.49,0.4,0.03,51524,237219,152.0,6,BPK-72-AT Celluloid Pick Thin Abalone,active,0,0.0,0.0,0.0,0.0,0.0,True,True,False,False,0.13888,0.0,False,bpk-72-at-celluloid-pick-thin-abalone,Light trsátka,001:001:015:001,light-trsatka,active,1.0,"{3,4,151}",{},"{Music,Guitars,""Guitar Picks"",""Light Picks""}",24.0,Boss,boss,Music,music


In [5]:
# Number of missing values in each column.
data.isna().sum()

bill_country                        0
setting_currency_id                 0
shop_basket_id                      0
doc_date                            0
exchange_currency_rate              0
original_currency_code              0
basket_total_price_with_vat         0
count_basket_items                  0
basket_count_products               0
basket_type                         0
item_quantity                       0
item_type                           0
item_unit_price_with_vat            0
item_unit_price_without_vat         0
item_total_discount_with_vat        0
product_id                          0
product_code                        0
catalog_category_id                 0
catalog_brand_id                    0
product_name                        0
product_status                      0
reviews_count                       0
reviews_average_score_price         0
reviews_average_score_quality       0
reviews_average_score_properties    0
reviews_average_score_overall       0
reviews_aver

# 1. Change non-numeric values to numbers

Machine learning models usually work only with numeric values (integers or floats) - that's why we need to change other formats to numbers. 

In [6]:
# List the dtype of every column to see which ones are not numeric yet.
data.dtypes

bill_country                                object
setting_currency_id                          int64
shop_basket_id                               int64
doc_date                            datetime64[ns]
exchange_currency_rate                     float64
original_currency_code                      object
basket_total_price_with_vat                float64
count_basket_items                           int64
basket_count_products                        int64
basket_type                                 object
item_quantity                                int64
item_type                                   object
item_unit_price_with_vat                   float64
item_unit_price_without_vat                float64
item_total_discount_with_vat               float64
product_id                                   int64
product_code                                 int64
catalog_category_id                        float64
catalog_brand_id                             int64
product_name                   

Let's start by breaking the date down into four separate columns: day of the month, day of the week, month and year. We still keep the original datetime column, because it gives easier access to the full date than rebuilding it from the separate columns.

In [7]:
# Split the document date into calendar components using the vectorized `.dt`
# accessor rather than iterating over every row in Python.
# The original `doc_date` column is left in place.
data['doc_day'] = data['doc_date'].dt.day          # day of the month
data['doc_month'] = data['doc_date'].dt.month
data['doc_year'] = data['doc_date'].dt.year
data['doc_weekday'] = data['doc_date'].dt.weekday  # Monday=0 ... Sunday=6

**The next part is to find columns that already have a natural numeric representation - e.g. product_name_parameterize is not a necessary column, as we have product_id (numeric product identification).**

In [136]:
# Shorten the identifier column names so each one reads `<entity>_id`:
# the `catalog_` prefix is removed, and the currency/basket ids lose their
# extra qualifier in the same spirit.
data.rename(
    columns={
        'catalog_category_id': 'category_id',
        'catalog_segment_id': 'segment_id',
        'catalog_brand_id': 'brand_id',
        'setting_currency_id': 'currency_id',
        'shop_basket_id': 'basket_id',
    },
    inplace=True,
)

In [106]:
def leave_only_id_column(df : pd.DataFrame, id_column : str, other_columns : list, inplace : bool = False) -> 'pd.DataFrame | None':
    """
    Drop descriptive columns that are fully determined by an id column.

    The number of distinct values in `id_column` is compared with the number of
    distinct combinations of `other_columns` + `id_column` found in `df`. When
    the two counts are equal, every id corresponds to exactly one combination of
    the other columns, so those columns are dropped. The counts and their
    difference are printed in both cases.

    Args
        df - pandas DataFrame containing desired columns
        id_column - name of the identifier column; this column is always kept
        other_columns - list of column names that are compared with id_column and possibly dropped
        inplace - If False, return a copy. Otherwise, do operation inplace and return None.
    Returns
        pd.DataFrame - copy of df without other_columns when inplace is False and no
        mismatch was found; None when inplace is True, or when a mismatch was found
        (nothing is dropped in that case).
    Raises
        KeyError - if id_column or any of other_columns is not a column of df
    """
    # Count the ids on the frame that was passed in (not on a notebook-level
    # global), so both counts below describe the same DataFrame.
    id_col_len = df[id_column].nunique(dropna=False)

    unique_combinations = len(df[other_columns + [id_column]].drop_duplicates().index)
    # Every name is followed by ', ' (including the last one) to keep the report text unchanged.
    other_cols_string = ''.join(name + ', ' for name in other_columns)

    print(f"{id_col_len} - Unique {id_column} amount.")
    print(f"{unique_combinations} - Amount of unique combinations of {id_column} and {other_cols_string}")

    missmatches_amount = abs(id_col_len - unique_combinations)
    print(f"{missmatches_amount} - How many missmatches between {id_column} and other columns.")

    if missmatches_amount != 0:
        print('There were missmatches, not dropping any columns.')
        return None

    if inplace:
        df.drop(columns=other_columns, inplace=True)
        return None
    return df.drop(columns=other_columns)

Because each product id represents exactly one product, we can drop the product name as well as the parameterized product name. \
We can drop product_code as well for the same reason - product_id and product_code identify the same products, only in different encodings.

In [108]:
# Compare product_name and product_code with product_id; they are dropped in place when redundant.
leave_only_id_column(
    df=data,
    id_column='product_id',
    other_columns=['product_name', 'product_code'],
    inplace=True,
)
print('\n')

# Same check for the parameterized product name.
leave_only_id_column(
    df=data,
    id_column='product_id',
    other_columns=['product_name_parameterize'],
    inplace=True,
)

KeyError: "['product_name', 'product_code'] not in index"

For product_name_parameterize there are 7 more unique combinations than there are unique product ids. If we look deeper into it, we can see that this is only because `-set` is appended to the end of the parameterized name. This is a deprecated way of selling sets, since there is now a column (item_type) to differentiate between sets and standard items. That's why we can drop product_name_parameterize as well.

In [110]:
# Number of distinct parameterized names per product_id (ten largest counts).
print(
    data[['product_id', 'product_name_parameterize']]
    .drop_duplicates()['product_id']
    .value_counts()
    .head(10)
)
# Distinct parameterized names recorded for one of the ids that has two of them.
data.loc[data['product_id'] == 147573, ['product_name_parameterize']].drop_duplicates()

147631    2
147609    2
107679    2
147573    2
103795    2
91810     2
203372    2
104009    1
168354    1
278795    1
Name: product_id, dtype: int64

In [134]:
# Remove the parameterized product name by hand (the automatic check above reported mismatches).
data.drop(columns='product_name_parameterize', inplace=True)

Unnamed: 0,product_name_parameterize
45055,47rs-1-38-nylon-jazz-6-pack
1215752,47rs-1-38-nylon-jazz-6-pack-set


Similarly to products, there are many other redundant descriptive columns in the data; we can take care of all of them in the same way.

In [137]:
# Category name columns are dropped in place when category_id fully determines them.
leave_only_id_column(
    df=data,
    id_column='category_id',
    other_columns=['category', 'category_name_parameterized'],
    inplace=True,
)

2455 - Unique category_id amount.
2455 - Amount of unique combinations of category_id and category, category_name_parameterized, 
0 - How many missmatches between category_id and other columns.


In [140]:
# Brand name columns are dropped in place when brand_id fully determines them.
leave_only_id_column(
    df=data,
    id_column='brand_id',
    other_columns=['brand_name', 'brand_parameterized'],
    inplace=True,
)

5336 - Unique brand_id amount.
5336 - Amount of unique combinations of brand_id and brand_name, brand_parameterized, 
0 - How many missmatches between brand_id and other columns.


In [142]:
# The currency code is dropped in place when currency_id fully determines it.
leave_only_id_column(
    df=data,
    id_column='currency_id',
    other_columns=['original_currency_code'],
    inplace=True,
)

10 - Unique currency_id amount.
10 - Amount of unique combinations of currency_id and original_currency_code, 
0 - How many missmatches between currency_id and other columns.


In [145]:
# Segment name columns are dropped in place when segment_id fully determines them.
leave_only_id_column(
    df=data,
    id_column='segment_id',
    other_columns=['segment_parameterized', 'segment_name'],
    inplace=True,
)

15 - Unique segment_id amount.
15 - Amount of unique combinations of segment_id and segment_parameterized, segment_name, 
0 - How many missmatches between segment_id and other columns.


In [148]:
# The full category name path is dropped in place when tree_path fully determines it.
leave_only_id_column(
    df=data,
    id_column='tree_path',
    other_columns=['category_full_name_path'],
    inplace=True,
)

2455 - Unique tree_path amount.
2455 - Amount of unique combinations of tree_path and category_full_name_path, 
0 - How many missmatches between tree_path and other columns.


In [None]:
# Check whether the ancestor/descendant id columns are also fully determined by tree_path.
# 'category_full_name_path' is not listed here: the preceding cell already drops it in
# place, so naming it again raises KeyError on a top-to-bottom run.
# NOTE(review): inplace is not set and the result is not assigned, so `data` itself is
# left unchanged by this call - confirm whether that is intended.
leave_only_id_column(data, 'tree_path', ['categories_descendant_ids', 'categories_ancestor_ids'])

In [149]:
# Show the frame after the redundant descriptive columns were removed.
data

Unnamed: 0,bill_country,currency_id,basket_id,doc_date,exchange_currency_rate,basket_total_price_with_vat,count_basket_items,basket_count_products,basket_type,item_quantity,item_type,item_unit_price_with_vat,item_unit_price_without_vat,item_total_discount_with_vat,product_id,category_id,brand_id,product_status,reviews_count,reviews_average_score_price,reviews_average_score_quality,reviews_average_score_properties,reviews_average_score_overall,reviews_average_score,is_in_stock,is_ended,is_new,is_boosted,product_purchase_price,eshop_stock_count,is_fifo,tree_path,category_status,segment_id,categories_ancestor_ids,categories_descendant_ids,default_warranty_period,doc_day,doc_month,doc_year,doc_weekday
0,BG,1,1136409,2020-04-26,1.9558,345.00,18,18,standard,1,standard,5.34,4.45,0.28,39848,179.0,145,active,83,95.192308,93.653846,93.653846,94.423077,94.277108,True,False,False,False,2.205000,-2.0,False,001:001:019:001,active,1.0,"{3,4,178}",{},24.0,26,4,2020,6
1,BG,1,1136409,2020-04-26,1.9558,345.00,18,18,standard,1,standard,2.91,2.43,0.15,140119,762.0,109,active,1,0.000000,0.000000,0.000000,0.000000,100.000000,False,False,False,False,1.385000,0.0,False,001:005:010:001:001,active,1.0,"{3,607,760,761}",{},24.0,26,4,2020,6
2,BG,1,1136409,2020-04-26,1.9558,345.00,18,18,standard,1,standard,0.49,0.40,0.03,34663,153.0,13,active,46,92.258065,93.225806,94.516129,94.193548,94.804348,True,False,False,False,0.164984,303.0,False,001:001:015:002,active,1.0,"{3,4,151}",{},24.0,26,4,2020,6
3,BG,1,1136409,2020-04-26,1.9558,345.00,18,18,standard,1,standard,4.86,4.05,0.26,63176,192.0,320,active,10,92.500000,87.500000,92.500000,90.000000,92.500000,True,False,False,False,3.243455,55.0,False,001:001:024,active,1.0,"{3,4}",{},24.0,26,4,2020,6
4,BG,1,1136409,2020-04-26,1.9558,345.00,18,18,standard,1,standard,0.49,0.40,0.03,51524,152.0,6,active,0,0.000000,0.000000,0.000000,0.000000,0.000000,True,True,False,False,0.138880,0.0,False,001:001:015:001,active,1.0,"{3,4,151}",{},24.0,26,4,2020,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3005826,PL,12,2633784,2021-08-26,4.5779,119.24,4,5,standard,1,standard,80.39,65.35,0.00,234242,2733.0,6219,active,11,0.000000,0.000000,0.000000,0.000000,90.000000,True,False,False,False,50.031875,31.0,False,014:001:005,active,16.0,"{3526,2728}",{},24.0,26,8,2021,3
3005827,CZ,4,3416860,2021-08-26,25.5310,79.82,3,3,standard,1,standard,29.77,24.60,9.01,200523,1332.0,1070,ended,0,0.000000,0.000000,0.000000,0.000000,0.000000,False,True,False,False,16.250000,0.0,False,004:040:002:002,active,4.0,"{1327,1328,1330}",{},24.0,26,8,2021,3
3005828,CZ,4,3416860,2021-08-26,25.5310,79.82,3,3,standard,1,standard,18.80,15.54,6.27,267505,2289.0,1070,active,0,0.000000,0.000000,0.000000,0.000000,0.000000,False,False,False,False,11.900000,0.0,False,005:050:009,active,9.0,"{1758,1762}",{},24.0,26,8,2021,3
3005829,HU,10,3420193,2021-08-26,348.7600,25.78,1,1,standard,1,standard,23.51,18.51,13.48,250170,1270.0,1278,active,0,0.000000,0.000000,0.000000,0.000000,0.000000,True,False,False,False,14.043333,4.0,False,003:038:002:001,active,3.0,"{1182,1267,1269}",{},24.0,26,8,2021,3


In [146]:
# Work list of columns that still have to be reviewed for type conversion:
# start from every column and strike out the ones that are already dealt with.
TODO_TYPE_CHANGE = list(data.columns)
for handled_column in (
    'doc_date',
    'doc_month',
    'doc_day',
    'doc_year',
    'doc_weekday',
    'product_id',
    'category_id',
):
    TODO_TYPE_CHANGE.remove(handled_column)
TODO_TYPE_CHANGE

['bill_country',
 'currency_id',
 'basket_id',
 'exchange_currency_rate',
 'basket_total_price_with_vat',
 'count_basket_items',
 'basket_count_products',
 'basket_type',
 'item_quantity',
 'item_type',
 'item_unit_price_with_vat',
 'item_unit_price_without_vat',
 'item_total_discount_with_vat',
 'brand_id',
 'product_status',
 'reviews_count',
 'reviews_average_score_price',
 'reviews_average_score_quality',
 'reviews_average_score_properties',
 'reviews_average_score_overall',
 'reviews_average_score',
 'is_in_stock',
 'is_ended',
 'is_new',
 'is_boosted',
 'product_purchase_price',
 'eshop_stock_count',
 'is_fifo',
 'tree_path',
 'category_status',
 'segment_id',
 'categories_ancestor_ids',
 'categories_descendant_ids',
 'category_full_name_path',
 'default_warranty_period']

In [7]:
# Look at the raw country codes before they are encoded.
data.bill_country

0          BG
1          BG
2          BG
3          BG
4          BG
           ..
3005826    PL
3005827    CZ
3005828    CZ
3005829    HU
3005830    SK
Name: bill_country, Length: 3005831, dtype: object

In [13]:
# Ordinal encoder configured for the two listed string columns.
# NOTE(review): in this notebook's top-to-bottom order 'original_currency_code' has
# already been dropped by the currency_id cell above - confirm whether this cell is
# meant to run before that drop, or whether the column should be removed from `cols`.
ce_ordinal = ce.OrdinalEncoder(cols=['bill_country', 'original_currency_code'])
# Kept for reference: print the learned mapping of the first column / encode the frame.
#print(f"Encoding used for \n{ce_ordinal.fit(data).mapping[0]['mapping']}") 
#ce_ordinal.fit_transform(data)

In [14]:
# Fit the encoder on the data; the cell output shows the fitted encoder with its per-column mappings.
ce_ordinal.fit(data)

OrdinalEncoder(cols=['bill_country', 'original_currency_code'],
               mapping=[{'col': 'bill_country', 'data_type': dtype('O'),
                         'mapping': BG      1
CZ      2
AT      3
DK      4
PL      5
HU      6
SI      7
SK      8
RO      9
IE     10
PT     11
HR     12
GB     13
DE     14
FR     15
EL     16
IT     17
NL     18
LV     19
ES     20
FI     21
BE     22
SE     23
EE     24
GR     25
LT     26
LU     27
CH     28
TR     29
UA     30
RS     31
BA     32
NaN    -2
dtype: int64},
                        {'col': 'original_currency_code',
                         'data_type': dtype('O'),
                         'mapping': BGN     1
CZK     2
EUR     3
DKK     4
PLN     5
HUF     6
RON     7
HRK     8
GBP     9
SEK    10
NaN    -2
dtype: int64}])