In [4]:
# Let's start by eliminating risk. The following are things that are worth looking at:
# - products with small unit profits pose a risk as external costs could increase
# - product with small percent margins pose a risk as they are costly to obtain if they don't sell
# - products with small number of unique customers could pose a risk (normalize by months sold)
# Let's call any product with both of these characteristics "Risky" and any product without either 
# of these characteristics is fine.
# The argument here when discussing stocking is to be cautious (e.g. little or no need to overstock, 
# little values in pursuing bulk pricing) When discussing purchases the recognition of these items as 
# Risky MAY help convince purveryors to lower their price.


import pandas as pd

# We will load in the data from the dataset
df = pd.read_csv("../data/sales_dataset.csv")
df.head(5)

Unnamed: 0,order_date,order_number,customer_id,customer_name,zipcode,city,state_id,state_name,product_code,product_name,category,unit_price,unit_profit,quantity,revenue,profit,discount_percentage,discount_amount,lead_time_weeks
0,2019-11-13,Ord-3255406,cust-00004673,Ariana Nixon,32162,The Villages,FL,Florida,Nut-48306,Macadamia Style 5 Raw 11.34kg Bag,Nuts,26.26,9.19,3,78.77,27.57,0.0,0.0,2
1,2016-04-06,Ord-6058810,cust-00038703,Mohammad Esparza,80123,Littleton,CO,Colorado,Nut-48306,Macadamia Style 5 Raw 11.34kg Bag,Nuts,26.26,9.19,1,26.26,9.19,0.0,0.0,2
2,2019-08-15,Ord-1857958,cust-00106675,Jada Porter,55433,Minneapolis,MN,Minnesota,Nut-48306,Macadamia Style 5 Raw 11.34kg Bag,Nuts,26.26,9.19,4,105.02,36.76,0.0,0.0,2
3,2019-10-01,Ord-8598538,cust-00188381,Ari Mcgrath,36201,Anniston,AL,Alabama,Nut-48306,Macadamia Style 5 Raw 11.34kg Bag,Nuts,26.26,9.19,3,78.77,27.57,0.0,0.0,2
4,2016-07-24,Ord-3416282,cust-00213804,Kaitlynn Watkins,46143,Greenwood,IN,Indiana,Nut-48306,Macadamia Style 5 Raw 11.34kg Bag,Nuts,26.26,9.19,4,105.02,36.76,0.0,0.0,2


In [5]:
# Touch of data cleaning
pd.options.display.max_rows = 100

# Confirm that the product_code:product_name relationship is 1:1
product_names = pd.unique(df.product_name.values)
product_names.sort()
print(len(product_names))

product_codes = pd.unique(df.product_code.values)
product_codes.sort()
print(len(product_codes))

95
95


In [6]:
# Let's check out the unit profits first
pd.set_option('display.max_rows', 10)
unit_profits = df.filter(["product_name", "unit_profit"]).groupby('product_name').mean("unit_profit")
unit_profits.sort_values('unit_profit')

# Let's somewhat arbitrarily call unit profit 5$ based on what we see here OR 95th percentile

Unnamed: 0_level_0,unit_profit
product_name,Unnamed: 1_level_1
Cashew LWP (Large White Pieces) 20kg Bag,4.17
Vanilla Essence 25kg Drum SPIVANIL,4.36
Sultanas Medium Choice 12.5kg Bag,5.54
Cranberries 11.34kg Bag,6.29
Pistachio In Shell Californian 21/25 11.34kg Bag,6.82
...,...
Apricot Turkish 12.5kg Bag,18.44
High Oleic Sunflower Oil *Procured on firm Order Confirmation 200kg Drum OIL02,19.33
Mango Strips 12.5kg Bag,20.58
Strawberry Dried 12.5kg Bag,20.79


In [8]:
# Let's check out the percentage profits now
percentage_profits = df.filter(["product_name", "unit_profit", "unit_price"])
percentage_profits['percentage_profit'] = percentage_profits['unit_profit'] / percentage_profits['unit_price'] * 100
percentage_profits = percentage_profits.groupby('product_name').mean('percentage_profit')
percentage_profits.filter().sort_values('percentage_profit')
# Percentage profit is pretty stable! Let's say anything below 34.5% should be flagged OR 95th percentile

TypeError: Must pass either `items`, `like`, or `regex`

In [9]:
# Next, there are three types of good that jump out as having value to assess given our 
# ecommerce model:
# - seasonal/static: does a product sell more in a particular month, or is it flat?
# - regional/universal: does a product sell more in a given state (normalized by percentage
#       of purchasers from that state)
# - novelty/staple: does a product sell once to a customer or does it continue to sell?
# - magic/normal: does purchasing this product increase the likelihood of repeat business
# Of these categories, regional/universal may have the most actionable insight in light of
# our recent work with warehouse selection.

# Let's first figure out what percentage of total purchases come from which state
num_orders = df.shape[0]
state_orders = df.filter(['state_id', 'order_number']).groupby('state_id').count()
state_orders['fraction_of_all_orders'] = state_orders['order_number']/num_orders

# Now lets get the total number of rows for each product in each state
product_state = df.filter(['product_name', 'state_id'])
observed = pd.crosstab(product_state['product_name'], product_state['state_id']) 
observed.head()

# We are testing for regional preference, so the null assertion will be that purchasers in each
# state have no preference. The number of purchasers still matters however, so we will normalize
# by our fraction_of_all_orders table from above, rather than evenly distributing orders across states
product = 'Almond Fine Meal 11.34kg Bag'
total_product_orders = observed.loc[[product]].sum(axis=1)[product]


expected_dict = {state_id: [total_product_orders * state_orders['fraction_of_all_orders'][state_id]] for state_id in observed.columns}
expected = pd.DataFrame(expected_dict, index=pd.Index([product], name='product_name'))


In [10]:
from scipy import stats

chisq, p = stats.chisquare(f_obs=observed.loc[[product]], f_exp=expected)

  relative_diff = (np.abs(f_obs_sum - f_exp_sum) /


ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
[       inf 0.23742467 0.48122    0.19493838 0.13465843 0.14149942
 0.10862917 0.04929959 0.06741813 0.22644567 0.25225303 0.20094167
 0.09933063 0.007525   0.08128713 0.01557211 1.24363333 0.43128333
 0.02259643 0.25919156 0.24633864 0.4094681  0.14291667 0.35793571
 0.15135667 0.08019065 0.397875   0.29996391        inf 0.24252268
 0.19228591 0.00783529 0.84414    0.0736361  0.07856623 0.03982001
 0.24791429 0.08243    0.12385263 0.12673676 3.3255     0.53644325
        inf 0.05318589 0.04001988 0.07048668 0.05957699        inf
 0.56079617 0.3715     0.07712193 0.41472731]