In [1]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
import seaborn as sns
sns.set(rc={'figure.figsize':(20,8)})
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
pd.set_option('display.max_rows', 120)

## Reading / Displaying the Data 

In [2]:
df_sales = pd.read_csv('../data/clean_step2.csv')

In [3]:
df_sales.head()

Unnamed: 0,order_id,code,quantity,price,pis_cofins,icms,tax_substitution,category,liquid_cost,order_status,...,perc_icms,perc_tax_substitution,unit_revenue,unit_price,unit_liquid_price,unit_perc_pis_cofins,unit_perc_icms,unit_perc_tax_substitution,unit_liquid_cost,unit_markup
0,bcb59c839e78b2601374cbad9239ca7b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,0.0,19.6,537.4868,978.9,1080.1933,9.25,0.0,19.6,542.7065,1.990382
1,88eb0ac86af1a521c0831298d22dea8b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,0.0,19.6,537.4868,978.9,1080.1933,9.25,0.0,19.6,542.7065,1.990382
2,1c175bc61b9b659bbf011b2e5e3dcec6,e6762ba2ffbca07ab6cee7551caeaad5,1,976.05,90.2846,0.0,192.3325,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,0.0,19.71,535.3914,976.05,1078.0979,9.25,0.0,19.71,542.7065,1.986521
3,a8ad36828898fa3f6efeb5bd19c076f2,e6762ba2ffbca07ab6cee7551caeaad5,1,1089.1,100.7418,185.147,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,17.0,0.0,260.5047,1089.1,803.2112,9.25,17.0,0.0,542.7065,1.48001
4,9cbfaac6e04ea6ed454b843c94f2c29c,e6762ba2ffbca07ab6cee7551caeaad5,1,949.0,87.7825,170.82,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,18.0,0.0,147.691,949.0,690.3975,9.25,18.0,0.0,542.7065,1.272138


In [4]:
df_sales.columns

Index(['order_id', 'code', 'quantity', 'price', 'pis_cofins', 'icms',
       'tax_substitution', 'category', 'liquid_cost', 'order_status',
       'capture_date', 'process_date', 'process_status', 'source_channel',
       'liquid_price', 'revenue', 'markup', 'week_number', 'month',
       'perc_pis_cofins', 'perc_icms', 'perc_tax_substitution', 'unit_revenue',
       'unit_price', 'unit_liquid_price', 'unit_perc_pis_cofins',
       'unit_perc_icms', 'unit_perc_tax_substitution', 'unit_liquid_cost',
       'unit_markup'],
      dtype='object')

In [5]:
df_sales['unit_liquid_cost_perc'] = df_sales['unit_liquid_cost'] / df_sales['unit_liquid_price']

## Adding some features and grouping it by product code



#### Previous analysis insights / hipothesis
* Seasonality plays a great influence (month, week number), so lets add some more fields following the same tought

In [6]:
df_sales['capture_date'] = pd.to_datetime(df_sales['capture_date'], errors='coerce')
df_sales['weekend'] = df_sales.capture_date.dt.dayofweek.apply(lambda x: 1 if x in [0,6] else 0)
df_sales['week_of_month'] = df_sales.capture_date.dt.day.apply(lambda x: (x % 4) + 1)
df_sales['quarter'] = df_sales.capture_date.dt.month.apply(lambda x: (x % 4) + 1)

In [7]:
df_sales.head()

Unnamed: 0,order_id,code,quantity,price,pis_cofins,icms,tax_substitution,category,liquid_cost,order_status,...,unit_liquid_price,unit_perc_pis_cofins,unit_perc_icms,unit_perc_tax_substitution,unit_liquid_cost,unit_markup,unit_liquid_cost_perc,weekend,week_of_month,quarter
0,bcb59c839e78b2601374cbad9239ca7b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,1080.1933,9.25,0.0,19.6,542.7065,1.990382,0.502416,0,4,3
1,88eb0ac86af1a521c0831298d22dea8b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,1080.1933,9.25,0.0,19.6,542.7065,1.990382,0.502416,1,1,3
2,1c175bc61b9b659bbf011b2e5e3dcec6,e6762ba2ffbca07ab6cee7551caeaad5,1,976.05,90.2846,0.0,192.3325,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,1078.0979,9.25,0.0,19.71,542.7065,1.986521,0.503393,1,2,3
3,a8ad36828898fa3f6efeb5bd19c076f2,e6762ba2ffbca07ab6cee7551caeaad5,1,1089.1,100.7418,185.147,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,803.2112,9.25,17.0,0.0,542.7065,1.48001,0.675671,1,2,3
4,9cbfaac6e04ea6ed454b843c94f2c29c,e6762ba2ffbca07ab6cee7551caeaad5,1,949.0,87.7825,170.82,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,690.3975,9.25,18.0,0.0,542.7065,1.272138,0.786078,0,3,3


### Keeping most relevant fields 
* The object is to understand the price behaviour, offer and demand. Given that we are removing some total fields and keeping those focused on units and season.

In [8]:
df_sales.columns

Index(['order_id', 'code', 'quantity', 'price', 'pis_cofins', 'icms',
       'tax_substitution', 'category', 'liquid_cost', 'order_status',
       'capture_date', 'process_date', 'process_status', 'source_channel',
       'liquid_price', 'revenue', 'markup', 'week_number', 'month',
       'perc_pis_cofins', 'perc_icms', 'perc_tax_substitution', 'unit_revenue',
       'unit_price', 'unit_liquid_price', 'unit_perc_pis_cofins',
       'unit_perc_icms', 'unit_perc_tax_substitution', 'unit_liquid_cost',
       'unit_markup', 'unit_liquid_cost_perc', 'weekend', 'week_of_month',
       'quarter'],
      dtype='object')

In [10]:
# df_sales = df_sales[base_cols]

In [11]:
df_sales.head()

Unnamed: 0,order_id,code,quantity,price,pis_cofins,icms,tax_substitution,category,liquid_cost,order_status,...,unit_liquid_price,unit_perc_pis_cofins,unit_perc_icms,unit_perc_tax_substitution,unit_liquid_cost,unit_markup,unit_liquid_cost_perc,weekend,week_of_month,quarter
0,bcb59c839e78b2601374cbad9239ca7b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,1080.1933,9.25,0.0,19.6,542.7065,1.990382,0.502416,0,4,3
1,88eb0ac86af1a521c0831298d22dea8b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,1080.1933,9.25,0.0,19.6,542.7065,1.990382,0.502416,1,1,3
2,1c175bc61b9b659bbf011b2e5e3dcec6,e6762ba2ffbca07ab6cee7551caeaad5,1,976.05,90.2846,0.0,192.3325,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,1078.0979,9.25,0.0,19.71,542.7065,1.986521,0.503393,1,2,3
3,a8ad36828898fa3f6efeb5bd19c076f2,e6762ba2ffbca07ab6cee7551caeaad5,1,1089.1,100.7418,185.147,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,803.2112,9.25,17.0,0.0,542.7065,1.48001,0.675671,1,2,3
4,9cbfaac6e04ea6ed454b843c94f2c29c,e6762ba2ffbca07ab6cee7551caeaad5,1,949.0,87.7825,170.82,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,...,690.3975,9.25,18.0,0.0,542.7065,1.272138,0.786078,0,3,3


## Verify the distributions

In [12]:
def summary_dist(target_col, base_col='order_id', method='count'):
    if method == 'count':
        df_summary = df_sales[[target_col,base_col]].groupby(target_col).agg({base_col:pd.Series.nunique}).reset_index()
    else:
        df_summary = df_sales[[target_col,base_col]].groupby(target_col).agg({base_col:pd.Series.sum}).reset_index()
    df_summary[base_col] = df_summary[base_col] / df_summary[base_col].sum()  * 100
    return df_summary.sort_values(base_col).tail(25)

In [13]:
df_week_of_month = summary_dist('week_of_month').sort_values('week_of_month')
df_week_of_month

Unnamed: 0,week_of_month,order_id
0,1,22.273294
1,2,27.748175
2,3,26.815517
3,4,23.163015


* Sales tend to concentrate on the middle of the months?

In [14]:
df_weekend = summary_dist('weekend')
df_weekend

Unnamed: 0,weekend,order_id
1,1,25.347195
0,0,74.652805


* Considering 2 / 7  the proportion non biased would be around 28.57% on weekends. 

### Simple Checks

In [15]:
df_sales[['code', 'category']].groupby('code').nunique().reset_index().sort_values('category').tail(10)

Unnamed: 0,code,category
41,411e1404e183718207628232e91ce5a9,1
40,40d98a2375332cb635d4cb28ab68e087,1
39,40bddb00475d65eddb68e9aeb6fab0de,1
38,3da22f1b88a20ea8efc3d83fcb872e21,1
37,3d3d13446c52ecaaed5d0bf55a933d4d,1
36,3d21b63892749e921e3ff5818753bd67,1
35,3bc993e0f0c636e9aaaefa0356bdecc0,1
34,3b4407288e2983a514a241c9b84b7094,1
32,390943ce05959ac98c702d250c2ebb54,1
129,ffdad3ddbaf6c76c9bba1b48c51e03c6,1


* There is only a category per product so we can keep a category field for it. 

In [16]:
df_sales['cat_code'] = df_sales.category.astype('category').cat.codes

In [17]:
source_channel = df_sales[['code', 'source_channel']].groupby('code').nunique().reset_index().sort_values('source_channel')
source_channel.tail(10)

Unnamed: 0,code,source_channel
25,32ceebf3efea1d04ace4183d20d4da5b,12
26,3454ea52396a4cfd3fc37414d30c7b9c,12
129,ffdad3ddbaf6c76c9bba1b48c51e03c6,12
77,727673fa3e457bc596532b3eb26b23a0,12
116,d57911cca4b08f7b46417d952c0ca1dc,12
119,dd1935ffd0ee2b6ec159ba7867d11e57,12
79,760693745e10b0c5e68c42214c729b0d,13
44,4534ea61b50410b3b6243e02b40c8cd1,13
61,5b7a30a9e6a43b170ad4d9e00d8d9359,13
23,2e35421c34fb588ba40a0c57b3971d24,14


 * one sale could come from any source, so there is a need of quantify the ammount of sales per each of the all channels. 

## Sumarizing behaviours 

In [18]:
df_products = df_sales[['code', 'cat_code']].drop_duplicates().set_index('code')
df_products

Unnamed: 0_level_0,cat_code
code,Unnamed: 1_level_1
e6762ba2ffbca07ab6cee7551caeaad5,1
d408e1b5e841dde4e15a4cfa182e3812,5
29424aaf6e27a8dbe4b7273a0a39131d,5
723f73c85e91fc31d147dfade389d4f9,0
54209126056016c7c391c0c8fd8e6eff,0
...,...
b272ba3f4adb1dd16eaac1b53940629e,5
727673fa3e457bc596532b3eb26b23a0,0
d5bc9e14d090330cd07e6ccbcb3c3e4e,0
b9d929195dcd4e6a36e5e65891746b5e,0


In [19]:
df_sales.columns

Index(['order_id', 'code', 'quantity', 'price', 'pis_cofins', 'icms',
       'tax_substitution', 'category', 'liquid_cost', 'order_status',
       'capture_date', 'process_date', 'process_status', 'source_channel',
       'liquid_price', 'revenue', 'markup', 'week_number', 'month',
       'perc_pis_cofins', 'perc_icms', 'perc_tax_substitution', 'unit_revenue',
       'unit_price', 'unit_liquid_price', 'unit_perc_pis_cofins',
       'unit_perc_icms', 'unit_perc_tax_substitution', 'unit_liquid_cost',
       'unit_markup', 'unit_liquid_cost_perc', 'weekend', 'week_of_month',
       'quarter', 'cat_code'],
      dtype='object')

In [20]:
catfields = ['source_channel', 'week_number', 'month', 'perc_pis_cofins', 'perc_icms','weekend', 'week_of_month', 'quarter']

In [21]:
def product_summary_building_ctabs(df, target_col, values_col='quantity', method = 'sum'):
    if method == 'mean':
        df_result = pd.crosstab(
            df['code'], df[target_col], 
            values=df[values_col], aggfunc=pd.Series.mean
        ).reset_index()
    else: 
        df_result = pd.crosstab(
            df['code'], df[target_col], 
            values=df[values_col], aggfunc=pd.Series.sum, normalize='index'
        ).reset_index()
                
    df_result.columns = ['code'] + [f'{target_col}_{values_col}_{col}' for col in df_result.columns if col not in ['code', target_col]]
    
    return df_result.fillna(0)

In [22]:
df_product_summary_catfields =  df_products.drop(columns=['cat_code'])
df_product_summary_catfields.head()

e6762ba2ffbca07ab6cee7551caeaad5
d408e1b5e841dde4e15a4cfa182e3812
29424aaf6e27a8dbe4b7273a0a39131d
723f73c85e91fc31d147dfade389d4f9
54209126056016c7c391c0c8fd8e6eff


In [23]:
df_product_summary_catfields =  df_products.drop(columns=['cat_code'])
for catfield in catfields: 
    df = product_summary_building_ctabs(df_sales, catfield).set_index('code')
    df_product_summary_catfields = df_product_summary_catfields.join(df, on='code', how='left')

for catfield in catfields: 
    df = product_summary_building_ctabs(df_sales, catfield, values_col='markup', method='mean').set_index('code')
    df_product_summary_catfields = df_product_summary_catfields.join(df, on='code', how='left')
    

In [24]:
df_product_summary_catfields.head(10)

Unnamed: 0_level_0,source_channel_quantity_152bf0ce464047b9499ccb9e5b9b77a8,source_channel_quantity_2934a86a91bfa55d7f20b4f08a441fac,source_channel_quantity_3ab2427543039f8c9f17d06f6f65a3a7,source_channel_quantity_5a97b8efd901c1d28ff86522b95babb9,source_channel_quantity_67c19e107de33cab7ea9a9db8bc9ccd2,source_channel_quantity_7261d300057219056592010c7bdaf5ee,source_channel_quantity_98defd6ee70dfb1dea416cecdf391f58,source_channel_quantity_9d3e0fcbc1f16d80a76026e8f1c26002,source_channel_quantity_a578e71c3216f513a84ec6a46084fd3a,source_channel_quantity_af082bb0c2fa1414655017d464aa0262,...,weekend_markup_0,weekend_markup_1,week_of_month_markup_1,week_of_month_markup_2,week_of_month_markup_3,week_of_month_markup_4,quarter_markup_1,quarter_markup_2,quarter_markup_3,quarter_markup_4
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
e6762ba2ffbca07ab6cee7551caeaad5,0.04878,0.0,0.0,0.0,0.0,0.02439,0.04878,0.146341,0.146341,0.0,...,1.535302,1.69429,1.60301,1.554748,1.55717,1.590829,1.63485,1.48959,1.64284,1.399889
d408e1b5e841dde4e15a4cfa182e3812,0.013333,0.0,0.0,0.026667,0.0,0.0,0.04,0.04,0.08,0.0,...,1.652618,1.636989,1.599132,1.604393,1.824823,1.598775,1.807785,1.560464,1.499223,1.66381
29424aaf6e27a8dbe4b7273a0a39131d,0.034091,0.0,0.0,0.0,0.0,0.0,0.017045,0.051136,0.210227,0.0,...,2.009754,1.845988,1.96123,2.02153,1.872368,1.987136,1.874995,1.844339,2.04842,2.035363
723f73c85e91fc31d147dfade389d4f9,0.034,0.0,0.0,0.002,0.0,0.034,0.044,0.078,0.116,0.004,...,1.309626,1.321108,1.293836,1.31363,1.325223,1.316149,1.31599,1.339832,1.306918,1.298785
54209126056016c7c391c0c8fd8e6eff,0.013746,0.0,0.0,0.0,0.0,0.0,0.024055,0.079038,0.09622,0.006873,...,1.764779,1.797213,1.800712,1.764965,1.717045,1.804965,1.856912,1.748652,1.657127,1.741267
c443d252c048280160fc427766d9f1f4,0.014388,0.0,0.0,0.0,0.0,0.007194,0.007194,0.079137,0.201439,0.007194,...,1.676817,1.905919,1.566389,1.89853,1.719929,1.706112,1.696369,1.65704,1.595575,1.946876
374e1947dcb8f4848f4ada6f04921edd,0.02071,0.0,0.002959,0.005917,0.0,0.0,0.017751,0.088757,0.139053,0.014793,...,1.756433,1.70893,1.745246,1.69647,1.74976,1.811664,1.764512,1.817734,1.814887,1.639851
60424117a2618c7184687046fa5693c4,0.0,0.0,0.0,0.0,0.0,0.014706,0.022059,0.036765,0.264706,0.029412,...,1.696336,1.891378,2.114008,1.677661,1.68169,1.611387,1.792955,1.642197,1.886327,1.698261
193628b6634713730d3c506f2da0ff58,0.014205,0.0,0.003472,0.00221,0.000947,0.010417,0.01673,0.026515,0.063763,0.004104,...,1.801185,1.782698,1.80798,1.839741,1.764707,1.777768,1.739147,1.790449,1.919633,1.800982
d4592ab52cb9cd5af0510943a4c8e28c,0.012807,0.0,0.001067,0.0,0.0,0.0,0.013874,0.029883,0.15048,0.006403,...,1.711065,1.599903,1.666721,1.719521,1.663021,1.681035,1.670472,1.61546,1.812766,1.716636


In [25]:
for catfield in catfields: 
    print(catfield)
    print([col for col in df_product_summary_catfields.columns if catfield in col])
    print('')

source_channel
['source_channel_quantity_152bf0ce464047b9499ccb9e5b9b77a8', 'source_channel_quantity_2934a86a91bfa55d7f20b4f08a441fac', 'source_channel_quantity_3ab2427543039f8c9f17d06f6f65a3a7', 'source_channel_quantity_5a97b8efd901c1d28ff86522b95babb9', 'source_channel_quantity_67c19e107de33cab7ea9a9db8bc9ccd2', 'source_channel_quantity_7261d300057219056592010c7bdaf5ee', 'source_channel_quantity_98defd6ee70dfb1dea416cecdf391f58', 'source_channel_quantity_9d3e0fcbc1f16d80a76026e8f1c26002', 'source_channel_quantity_a578e71c3216f513a84ec6a46084fd3a', 'source_channel_quantity_af082bb0c2fa1414655017d464aa0262', 'source_channel_quantity_b76eb9b8fc0f17098812da9117d3e500', 'source_channel_quantity_e9b49f9086ba813ca3f0b321710fef16', 'source_channel_quantity_ea2912716be1999ab62d5b9dfa4f58f9', 'source_channel_quantity_fc7020775a7cdf161ab5267985c54601', 'source_channel_markup_152bf0ce464047b9499ccb9e5b9b77a8', 'source_channel_markup_2934a86a91bfa55d7f20b4f08a441fac', 'source_channel_markup_3ab24

## Numeric summaries

In [26]:
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

## https://stackoverflow.com/questions/19894939/calculate-arbitrary-percentile-on-pandas-groupby

In [27]:

def numeric_summary(df, col):
    df_result = df[['code', col]].groupby('code').agg({
        col:['mean', 'std', 'max', 'min', percentile(50)]
    }).reset_index()
    df_result.columns = ['_'.join(tup).rstrip('_') for tup in df_result.columns.values]
    df_result = df_result.set_index('code')    
    return df_result
df = numeric_summary(df_sales, 'unit_markup')

df

Unnamed: 0_level_0,unit_markup_mean,unit_markup_std,unit_markup_max,unit_markup_min,unit_markup_percentile_50
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0671c2b9132a3f5215a4212ce0691694,1.562087,0.355623,2.656474,1.087155,1.434718
09f544ec2a74c89abeec7b0590fc2d11,1.639085,0.344274,2.528779,1.120780,1.561558
0ad316f6b5cb5e81ebff73ae2490ccfe,1.650388,0.348519,2.480452,1.106237,1.553343
0bbe09e34a11e8e31cf49d6f8df2992d,1.470783,0.246788,2.167081,1.096599,1.385264
0dca7ec6ba9b6e8f17f04f713a6be727,1.545965,0.429780,3.038978,0.943613,1.487013
...,...,...,...,...,...
f08984b2adcbf33ba61fe13fcfa5b957,1.726241,0.396015,2.932773,0.841628,1.632953
f5f92c2a12f182115c45288a6ef28e94,1.656365,0.338703,2.340672,1.090410,1.769399
f9a023f31c8087fd0c169b3bedd351d1,1.597636,0.331462,2.254752,0.981598,1.587459
fd84644da59504bd9e9dcb4b6db63bea,1.507804,0.349028,2.292459,0.950428,1.398620


In [28]:
numeric_cols = ['unit_liquid_cost_perc',
       'unit_markup']
df_product_summary_numeric =  df_products.drop(columns=['cat_code'])

for col in numeric_cols: 
    df = numeric_summary(df_sales, col)
    df_product_summary_numeric = df_product_summary_numeric.join(df, on='code', how='left')


In [29]:
df_product_summary_numeric.head()

Unnamed: 0_level_0,unit_liquid_cost_perc_mean,unit_liquid_cost_perc_std,unit_liquid_cost_perc_max,unit_liquid_cost_perc_min,unit_liquid_cost_perc_percentile_50,unit_markup_mean,unit_markup_std,unit_markup_max,unit_markup_min,unit_markup_percentile_50
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
e6762ba2ffbca07ab6cee7551caeaad5,0.664485,0.139084,0.87342,0.495834,0.713632,1.57408,0.339841,2.016804,1.144924,1.401282
d408e1b5e841dde4e15a4cfa182e3812,0.667039,0.168905,0.904901,0.198069,0.739112,1.441776,0.275338,1.999516,1.105093,1.334629
29424aaf6e27a8dbe4b7273a0a39131d,0.54761,0.128811,0.741435,0.209398,0.573237,1.79187,0.38938,2.791206,1.294306,1.574516
723f73c85e91fc31d147dfade389d4f9,0.76788,0.06362,1.014705,0.353106,0.768454,1.307263,0.099146,1.637204,0.985509,1.301313
54209126056016c7c391c0c8fd8e6eff,0.586322,0.113344,0.897872,0.263493,0.544282,1.749869,0.313052,2.351887,1.113744,1.798905


In [30]:
df_products_final = df_products.join(df_product_summary_catfields, on='code', how='left')
df_products_final = df_products_final.join(df_product_summary_numeric, on='code', how='left')
df_products_final.head()

Unnamed: 0_level_0,cat_code,source_channel_quantity_152bf0ce464047b9499ccb9e5b9b77a8,source_channel_quantity_2934a86a91bfa55d7f20b4f08a441fac,source_channel_quantity_3ab2427543039f8c9f17d06f6f65a3a7,source_channel_quantity_5a97b8efd901c1d28ff86522b95babb9,source_channel_quantity_67c19e107de33cab7ea9a9db8bc9ccd2,source_channel_quantity_7261d300057219056592010c7bdaf5ee,source_channel_quantity_98defd6ee70dfb1dea416cecdf391f58,source_channel_quantity_9d3e0fcbc1f16d80a76026e8f1c26002,source_channel_quantity_a578e71c3216f513a84ec6a46084fd3a,...,unit_liquid_cost_perc_mean,unit_liquid_cost_perc_std,unit_liquid_cost_perc_max,unit_liquid_cost_perc_min,unit_liquid_cost_perc_percentile_50,unit_markup_mean,unit_markup_std,unit_markup_max,unit_markup_min,unit_markup_percentile_50
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
e6762ba2ffbca07ab6cee7551caeaad5,1,0.04878,0.0,0.0,0.0,0.0,0.02439,0.04878,0.146341,0.146341,...,0.664485,0.139084,0.87342,0.495834,0.713632,1.57408,0.339841,2.016804,1.144924,1.401282
d408e1b5e841dde4e15a4cfa182e3812,5,0.013333,0.0,0.0,0.026667,0.0,0.0,0.04,0.04,0.08,...,0.667039,0.168905,0.904901,0.198069,0.739112,1.441776,0.275338,1.999516,1.105093,1.334629
29424aaf6e27a8dbe4b7273a0a39131d,5,0.034091,0.0,0.0,0.0,0.0,0.0,0.017045,0.051136,0.210227,...,0.54761,0.128811,0.741435,0.209398,0.573237,1.79187,0.38938,2.791206,1.294306,1.574516
723f73c85e91fc31d147dfade389d4f9,0,0.034,0.0,0.0,0.002,0.0,0.034,0.044,0.078,0.116,...,0.76788,0.06362,1.014705,0.353106,0.768454,1.307263,0.099146,1.637204,0.985509,1.301313
54209126056016c7c391c0c8fd8e6eff,0,0.013746,0.0,0.0,0.0,0.0,0.0,0.024055,0.079038,0.09622,...,0.586322,0.113344,0.897872,0.263493,0.544282,1.749869,0.313052,2.351887,1.113744,1.798905


In [31]:
df_products_final.to_csv('../data/products_features.csv')