In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
sns.set()
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
pd.set_option('display.max_rows', 120)

## Reading / Displaying the Data 

In [2]:
df_sales = pd.read_csv('../data/clean_step1.csv')

In [3]:
df_sales.head()

Unnamed: 0,order_id,code,quantity,price,pis_cofins,icms,tax_substitution,category,liquid_cost,order_status,capture_date,process_date,process_status,source_channel,revenue,markup
0,bcb59c839e78b2601374cbad9239ca7b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,2016-06-11,2016-06-11,processado,b76eb9b8fc0f17098812da9117d3e500,436.1935,0.803737
1,4e91ee6b95895771dc9ee524e910a902,e6762ba2ffbca07ab6cee7551caeaad5,1,1036.29,95.8568,176.1693,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,em rota de entrega,2016-06-11,2016-06-11,processado,b76eb9b8fc0f17098812da9117d3e500,493.5835,0.909485
2,88eb0ac86af1a521c0831298d22dea8b,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,0.0,191.8416,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,2016-06-12,2016-06-12,processado,b76eb9b8fc0f17098812da9117d3e500,436.1935,0.803737
3,dee418152a36314b4aee6ce9cf94fcbf,e6762ba2ffbca07ab6cee7551caeaad5,1,978.9,90.5483,176.202,0.0,4ece547755cba9e7fc14125bc895f31b,542.7065,cancelado,2016-06-13,0000-00-00,captado,b76eb9b8fc0f17098812da9117d3e500,436.1935,0.803737
4,1c175bc61b9b659bbf011b2e5e3dcec6,e6762ba2ffbca07ab6cee7551caeaad5,1,976.05,90.2846,0.0,192.3325,4ece547755cba9e7fc14125bc895f31b,542.7065,entrega total,2016-06-13,2016-06-13,processado,b76eb9b8fc0f17098812da9117d3e500,433.3435,0.798486


In [4]:
df_sales.describe(percentiles=[.0001,.25, .50, .75, .90, .99, .9999])

Unnamed: 0,quantity,price,pis_cofins,icms,tax_substitution,liquid_cost,revenue,markup
count,179132.0,179132.0,179132.0,179132.0,179132.0,179132.0,179132.0,179132.0
mean,1.051498,233.924636,19.470616,25.033077,17.872476,136.036464,97.888171,0.803014
std,0.411821,161.542792,15.544559,30.451995,28.125847,83.60437,108.859098,0.758131
min,1.0,1.03,0.0,0.0,0.0,4.1141,-118.4765,-0.991381
0.01%,1.0,6.91,0.0,0.0,0.0,4.1141,-4.6149,-0.024336
25%,1.0,149.91,10.63795,0.0,0.0,78.8621,55.2935,0.542692
50%,1.0,194.4,17.5195,21.492,0.0,117.082,78.70585,0.693326
75%,1.0,309.26,28.157,38.88,30.403,205.8997,114.0443,0.892731
90%,1.0,359.91,33.2445,61.092,68.8683,213.4382,155.9618,1.104013
99%,2.0,847.6208,71.7236,136.75005,110.844769,496.9297,431.6738,3.193174


## Verify the distributions

In [5]:
def summary_dist(target_col, base_col='order_id', method='count'):
    if method == 'count':
        df_summary = df_sales[[target_col,base_col]].groupby(target_col).agg({base_col:pd.Series.nunique}).reset_index()
    else:
        df_summary = df_sales[[target_col,base_col]].groupby(target_col).agg({base_col:pd.Series.sum}).reset_index()
    df_summary[base_col] = df_summary[base_col] / df_summary[base_col].sum()  * 100
    return df_summary.sort_values(base_col).tail(25)

In [6]:
df_orders_product = summary_dist('code')
df_orders_product

Unnamed: 0,code,order_id
37,3d21b63892749e921e3ff5818753bd67,0.734096
90,a2018dae10d736a66eea5a0a349ef9ee,0.79327
74,6c82ad0e791258434fd42c51409b0239,0.796619
104,c254dc11afbcc091678f0ab49a02e7ad,0.820066
47,4557c7e5af70efd2e3ca2befd59ccdc3,0.829556
40,40bddb00475d65eddb68e9aeb6fab0de,0.966327
122,e13f7f001fe2b1af072a3d50d3058284,1.05118
82,7da116bd1d42f3475803402e710253cf,1.16897
5,0f38be2df6854b4374f06cae1bc38482,1.177902
64,5e39201e582b1bb89cae7f650e4330c8,1.240426


In [7]:
df_orders_product.tail(10).sum()

code        b08b7321c4db8f45a1a97a79d1e44dd8760693745e10b0...
order_id                                              51.1857
dtype: object

* It is interesting to see that 50% of the orders are from 10 Products 

In [8]:
df_product_quantity = summary_dist('code', base_col='quantity', method='sum')
df_product_quantity

Unnamed: 0,code,quantity
83,7e3713530b46887cff58a2e2ac433ac5,0.76132
74,6c82ad0e791258434fd42c51409b0239,0.76716
90,a2018dae10d736a66eea5a0a349ef9ee,0.785211
47,4557c7e5af70efd2e3ca2befd59ccdc3,0.804324
104,c254dc11afbcc091678f0ab49a02e7ad,0.924309
40,40bddb00475d65eddb68e9aeb6fab0de,0.945545
122,e13f7f001fe2b1af072a3d50d3058284,1.050664
82,7da116bd1d42f3475803402e710253cf,1.140919
5,0f38be2df6854b4374f06cae1bc38482,1.149944
64,5e39201e582b1bb89cae7f650e4330c8,1.351689


In [9]:
df_product_quantity.tail(10).sum()

code        b08b7321c4db8f45a1a97a79d1e44dd8760693745e10b0...
quantity                                              50.9076
dtype: object

* It is interesting to see that also 50% of  the quantity are from 10 Products 

In [10]:
df_product_revenue = summary_dist('code', base_col='revenue', method='sum')
df_product_revenue

Unnamed: 0,code,revenue
88,9e5dd3c1d252136c4351b84589dae2b5,0.843922
39,3da22f1b88a20ea8efc3d83fcb872e21,0.915465
51,4ceedf57303e127d31a164c7ae5791d8,0.926345
120,dd1935ffd0ee2b6ec159ba7867d11e57,0.965574
82,7da116bd1d42f3475803402e710253cf,1.139674
47,4557c7e5af70efd2e3ca2befd59ccdc3,1.162653
5,0f38be2df6854b4374f06cae1bc38482,1.173876
83,7e3713530b46887cff58a2e2ac433ac5,1.320499
12,1c234775cae774823f38abe6721e61a4,1.327633
94,abf2d3cb446492ee7897087db9a0b2a0,1.365793


In [11]:
df_product_revenue.tail(10).sum()

code       55447a73ff140176f4210347854c71f1fd84644da59504...
revenue                                              52.9208
dtype: object

* Now when we have that 3 products represent 30% of the revenue ?

## Deep dive on products

In [15]:
top_5_orders = df_orders_product.tail(5).code.values
top_5_product_revenue = df_product_revenue.tail(5).code.values
top_5_product_quantity = df_product_quantity.tail(5).code.values

selected_products = list(set(list(top_5_orders)+ list(top_5_product_revenue)+ list(top_5_product_quantity)))
selected_products

['0671c2b9132a3f5215a4212ce0691694',
 '4534ea61b50410b3b6243e02b40c8cd1',
 '760693745e10b0c5e68c42214c729b0d',
 '2e35421c34fb588ba40a0c57b3971d24',
 '3454ea52396a4cfd3fc37414d30c7b9c',
 '32ceebf3efea1d04ace4183d20d4da5b']

In [16]:
df_filtered_sales = df_sales[df_sales.code.isin(selected_products)]
df_filtered_sales.groupby('code').describe(percentiles=[.0001,.25, .50, .75, .90, .99, .9999]).T

Unnamed: 0,code,0671c2b9132a3f5215a4212ce0691694,2e35421c34fb588ba40a0c57b3971d24,32ceebf3efea1d04ace4183d20d4da5b,3454ea52396a4cfd3fc37414d30c7b9c,4534ea61b50410b3b6243e02b40c8cd1,760693745e10b0c5e68c42214c729b0d
quantity,count,6496.0,20944.0,8889.0,10672.0,17223.0,4865.0
quantity,mean,1.020166,1.037194,1.02205,1.044603,1.089009,1.034738
quantity,std,0.1923711,0.2485719,0.1872627,0.4689566,0.5301583,0.2537261
quantity,min,1.0,1.0,1.0,1.0,1.0,1.0
quantity,0.01%,1.0,1.0,1.0,1.0,1.0,1.0
quantity,25%,1.0,1.0,1.0,1.0,1.0,1.0
quantity,50%,1.0,1.0,1.0,1.0,1.0,1.0
quantity,75%,1.0,1.0,1.0,1.0,1.0,1.0
quantity,90%,1.0,1.0,1.0,1.0,1.0,1.0
quantity,99%,2.0,2.0,2.0,2.0,4.0,2.0


* The products are selling in orders with 1 quantity 90% of the time.
* The price deviation on 3454ea52396a4cfd3fc37414d30c7b9c and 4534ea61b50410b3b6243e02b40c8cd1 when compared to its means have a higher value than its pairs. 
* The product 4534ea61b50410b3b6243e02b40c8cd1 doesn't have to pay pis/cofins. 
* 75% of the sales for 760693745e10b0c5e68c42214c729b0d doens't do tax substitution.
* The product costs doesn't change?
* There were some sales with negative revenue (losses) 4534ea61b50410b3b6243e02b40c8cd1 and 2e35421c34fb588ba40a0c57b3971d24. 
