In [1]:
import statsmodels.api as sm 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action="ignore")
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the modelling dataset from parquet and show its (rows, columns) shape.
model_data_path = "../data/output/df_model.parquet"
df_rolled = pd.read_parquet(model_data_path)
df_rolled.shape

(125776, 31)

In [3]:
# Count the distinct (PPG, retailer) pairs present in the data.
n_combos = len(df_rolled[['PPG', 'Retailer_Name']].drop_duplicates())
print("Number of unique retailer PPG combinations", n_combos)

Number of unique retailer PPG combinations 743


In [4]:
# Keep only the rows whose No_Total_Sales_Flag is 0.
# .copy() makes df_rolled an independent frame, so the columns that later cells add
# to it are written to a real DataFrame rather than to a filtered selection.
df_rolled = df_rolled.loc[df_rolled['No_Total_Sales_Flag'] == 0].copy()
print(df_rolled.shape)

(70475, 31)


In [5]:
# Re-count the distinct (PPG, retailer) pairs that remain in df_rolled.
n_combos_remaining = len(df_rolled[['PPG', 'Retailer_Name']].drop_duplicates())
print("Number of unique retailer PPG combinations", n_combos_remaining)

Number of unique retailer PPG combinations 585


In [6]:
# Grouping keys: models are fitted separately for each (PPG, retailer) pair.
model_levels = ["PPG", "Retailer_Name"]

In [7]:
# Natural logs of volume and price, used as the log-log model's dependent variable
# and price regressor.
for log_col, source_col in (('log_sales', 'Total_Volume'), ('log_price', 'Avg_Price_Per_KG')):
    df_rolled[log_col] = np.log(df_rolled[source_col])

In [8]:
# Dependent variable, regressors, and the combined list of model columns.
var_dep = 'log_sales'
vars_ind = ['log_price', 'Distribution_wtd', 'Category_Seasonality', 'Category_Trend']
vars_all = [var_dep, *vars_ind]
print(vars_all)

# Missing-value count for each model column.
df_rolled[vars_all].isna().sum()

['log_sales', 'log_price', 'Distribution_wtd', 'Category_Seasonality', 'Category_Trend']


log_sales                  0
log_price                  0
Distribution_wtd        1846
Category_Seasonality       0
Category_Trend             0
dtype: int64

In [9]:
# Fill gaps in the distribution measures with the mean of the same PPG/retailer group
# (a stop-gap imputation, judged acceptable for now).
for dist_col in ('Distribution_numeric', 'Distribution_wtd'):
    group_mean = df_rolled.groupby(model_levels)[dist_col].transform('mean')
    df_rolled[dist_col] = df_rolled[dist_col].fillna(group_mean)

In [10]:
# Discard rows where either distribution measure is still missing after imputation.
df_rolled = df_rolled.dropna(subset=['Distribution_numeric', 'Distribution_wtd'])
df_rolled.shape

(70104, 33)

In [11]:
def regress(data, yvar, xvars):
    """Fit an OLS regression on one group of rows and return its coefficients.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to fit on; must contain the columns named by ``yvar`` and ``xvars``.
    yvar : str
        Name of the dependent-variable column.
    xvars : list of str
        Names of the regressor columns.

    Returns
    -------
    pandas.Series
        Fitted parameters labelled by regressor name, with the constant term
        under ``'intercept'`` as the last entry.
    """
    Y = data[yvar]
    # assign() builds a new design matrix, so the constant column is never written
    # into a selection taken from the caller's frame.
    X = data[xvars].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

# One regression per PPG/retailer group; each output row holds that group's coefficients.
results_summary_coeffs = (
    df_rolled
    .groupby(model_levels, as_index=False)
    .apply(regress, yvar=var_dep, xvars=vars_ind)
)
results_summary_coeffs.head()

Unnamed: 0,PPG,Retailer_Name,log_price,Distribution_wtd,Category_Seasonality,Category_Trend,intercept
0,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer A,2.469018,0.080496,-0.097917,-0.057558,0.95909
1,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer B,-2.670311,0.029349,0.025018,-0.05685,6.036856
2,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer C,-2.578562,0.061256,0.01125,0.005565,6.801467
3,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer A,-1.877735,0.056422,-0.026015,-0.060092,6.019304
4,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer B,-3.529319,0.023102,0.046967,-0.019693,7.836755


In [12]:
def regress(data, yvar, xvars):
    """Fit an OLS regression on one group of rows and return its R-squared.

    Note: this rebinds the name ``regress``; from this cell onward it returns the
    goodness of fit instead of the coefficient vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to fit on; must contain the columns named by ``yvar`` and ``xvars``.
    yvar : str
        Name of the dependent-variable column.
    xvars : list of str
        Names of the regressor columns.

    Returns
    -------
    float
        R-squared of the fitted model (as a fraction, not a percentage).
    """
    Y = data[yvar]
    # assign() builds a new design matrix, so the constant column is never written
    # into a selection taken from the caller's frame.
    X = data[xvars].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.rsquared

# R-squared per PPG/retailer group, using the `regress` defined just above in this cell.
results_summary_score = (
    df_rolled
    .groupby(model_levels, as_index=False)
    .apply(regress, yvar=var_dep, xvars=vars_ind)
)
results_summary_score.columns = ['PPG', 'Retailer_Name', 'RSquared']
# Express R-squared as a percentage.
results_summary_score['RSquared'] = results_summary_score['RSquared'] * 100
results_summary_score.head()

Unnamed: 0,PPG,Retailer_Name,RSquared
0,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer A,59.358799
1,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer B,87.075018
2,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer C,71.562287
3,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer A,97.760265
4,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer B,89.204141


In [13]:
def calculate_mape(pred,actual):
    """Return the mean absolute percentage error of `pred` against `actual`.

    Both inputs are converted to NumPy arrays, so any pandas index is ignored and
    values are compared position by position. The result is on a 0-100 scale.
    """
    observed = np.array(actual)
    forecast = np.array(pred)
    relative_error = (observed - forecast) / observed
    return np.abs(relative_error).mean() * 100

In [14]:
# In-sample MAPE of the per-group OLS fit, one row per PPG/retailer combination.
# The key is joined with "*" because a later cell splits it back on that character.
df_rolled['PPG_Retailer_Combo'] = df_rolled['PPG'] +  "*" + df_rolled['Retailer_Name']

# groupby hands over each combination's rows in one pass (sort=False keeps
# first-appearance order) instead of re-filtering the whole frame per combination.
mape_records = []
for each_combo, curr_df in df_rolled.groupby('PPG_Retailer_Combo', sort=False):
    Y = curr_df[var_dep]
    # assign() returns a new design matrix rather than writing into a slice of df_rolled.
    X = curr_df[vars_ind].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    # fittedvalues are the in-sample predictions for the rows the model was fitted on.
    # NOTE(review): the error is measured on var_dep (log_sales), i.e. on the log scale;
    # confirm this is intended rather than an error on Total_Volume.
    curr_mape = calculate_mape(result.fittedvalues, Y)
    mape_records.append({'PPG_Retailer_Combo': each_combo, 'MAPE': curr_mape})

# Build the summary once from the collected records instead of growing it row by row.
results_summary_mape = pd.DataFrame(mape_records, columns=['PPG_Retailer_Combo', 'MAPE'])


In [15]:
# Split the "*"-joined key back into its PPG and retailer parts, then drop the key.
combo_parts = results_summary_mape['PPG_Retailer_Combo'].str.split("*",expand=True)
results_summary_mape[['PPG', 'Retailer_Name']] = combo_parts
results_summary_mape = results_summary_mape.drop(columns='PPG_Retailer_Combo')
results_summary_mape.head()

Unnamed: 0,MAPE,PPG,Retailer_Name
0,1.017005,BRAND D SUBBRAND D.1.1 ADULT PAK GRONT_KP_RND ...,Retailer A
1,1.407051,BRAND D SUBBRAND D.1.1 ADULT PAK GRONT_KP_RND ...,Retailer B
2,0.585851,BRAND D SUBBRAND D.1.1 ADULT PAK GRONT_KP_RND ...,Retailer C
3,40.552355,BRAND D SUBBRAND D.1.3 KITTEN PAK GRANN_KP_MLK...,Retailer B
4,36.299911,BRAND D SUBBRAND D.1.3 KITTEN PAK GRANN_KP_MLK...,Retailer C


In [16]:
# Combine coefficients, R-squared and MAPE into one row per PPG/retailer pair.
# Inner joins on the two shared key columns: pairs missing from any table are dropped.
results_combined = (
    results_summary_coeffs
    .merge(results_summary_score, on=['PPG', 'Retailer_Name'])
    .merge(results_summary_mape, on=['PPG', 'Retailer_Name'])
)
results_combined.head()

Unnamed: 0,PPG,Retailer_Name,log_price,Distribution_wtd,Category_Seasonality,Category_Trend,intercept,RSquared,MAPE
0,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer A,2.469018,0.080496,-0.097917,-0.057558,0.95909,59.358799,35.015876
1,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer B,-2.670311,0.029349,0.025018,-0.05685,6.036856,87.075018,2.07886
2,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer C,-2.578562,0.061256,0.01125,0.005565,6.801467,71.562287,1.026281
3,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer A,-1.877735,0.056422,-0.026015,-0.060092,6.019304,97.760265,2.442645
4,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer B,-3.529319,0.023102,0.046967,-0.019693,7.836755,89.204141,2.384168


In [17]:
# Keep an unfiltered copy; later cells merge it back onto the row-level data.
results_combined_orig = results_combined.copy(deep=True)
print(results_combined.shape)
# Retain models whose price coefficient lies in [-10, 0] and whose R-squared and
# MAPE are each at most 100.
keep_rows = (
    results_combined['log_price'].between(-10, 0)
    & (results_combined['RSquared'] <= 100)
    & (results_combined['MAPE'] <= 100)
)
results_combined = results_combined.loc[keep_rows]
print(results_combined.shape)

(532, 9)
(350, 9)


In [18]:
# R-squared against MAPE for the retained models, coloured by the price coefficient.
fig = px.scatter(
    results_combined,
    x='MAPE',
    y='RSquared',
    color='log_price',
    hover_data=['PPG', 'Retailer_Name'],
    width=800,
    height=800,
)
fig.show()

In [19]:
# Show the column names of the unfiltered model summary before they are renamed.
results_combined_orig.columns

Index(['PPG', 'Retailer_Name', 'log_price', 'Distribution_wtd',
       'Category_Seasonality', 'Category_Trend', 'intercept', 'RSquared',
       'MAPE'],
      dtype='object')

In [20]:
# Prefix the fitted-parameter columns with 'coef_' so they can sit next to the
# same-named data columns once merged onto the row-level frame.
coef_columns = ['log_price', 'Distribution_wtd', 'Category_Seasonality', 'Category_Trend', 'intercept']
results_combined_orig = results_combined_orig.rename(
    columns={name: 'coef_' + name for name in coef_columns}
)
results_combined_orig.head()

Unnamed: 0,PPG,Retailer_Name,coef_log_price,coef_Distribution_wtd,coef_Category_Seasonality,coef_Category_Trend,coef_intercept,RSquared,MAPE
0,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer A,2.469018,0.080496,-0.097917,-0.057558,0.95909,59.358799,35.015876
1,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer B,-2.670311,0.029349,0.025018,-0.05685,6.036856,87.075018,2.07886
2,BRAND A SUBBRAND A.1 ADULT ZAK GROENTE_KIP_RIJ...,Retailer C,-2.578562,0.061256,0.01125,0.005565,6.801467,71.562287,1.026281
3,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer A,-1.877735,0.056422,-0.026015,-0.060092,6.019304,97.760265,2.442645
4,BRAND A SUBBRAND A.2 ADULT ZAK GRONT_RND 1400_G,Retailer B,-3.529319,0.023102,0.046967,-0.019693,7.836755,89.204141,2.384168


In [21]:
# Attach each PPG/retailer pair's coefficients and fit metrics to every data row.
# A left join keeps all rows of df_rolled, matched or not.
df_waterfall_calc = pd.merge(
    df_rolled,
    results_combined_orig,
    on=['Retailer_Name', 'PPG'],
    how='left',
)
print(df_waterfall_calc.shape)

(70104, 41)


In [22]:
# Per-row contribution of each model term.
# NOTE(review): the price term is exponentiated while the remaining terms are plain
# coefficient x value products; confirm that combining them downstream is intended.
df_waterfall_calc['contri_price'] = np.exp(df_waterfall_calc['coef_log_price'] * df_waterfall_calc['log_price'])
for driver in ('Distribution_wtd', 'Category_Seasonality', 'Category_Trend'):
    df_waterfall_calc['contri_' + driver] = df_waterfall_calc['coef_' + driver] * df_waterfall_calc[driver]
df_waterfall_calc['contri_intercept'] = df_waterfall_calc['coef_intercept']

In [23]:
# Sum the individual contributions, left to right, into 'predicted_sales'.
# NOTE(review): this adds an exponentiated price term to un-exponentiated terms;
# compare against Total_Volume before relying on it.
contribution_columns = [
    'contri_price',
    'contri_Distribution_wtd',
    'contri_Category_Seasonality',
    'contri_Category_Trend',
    'contri_intercept',
]
running_total = df_waterfall_calc[contribution_columns[0]]
for contribution_column in contribution_columns[1:]:
    running_total = running_total + df_waterfall_calc[contribution_column]
df_waterfall_calc['predicted_sales'] = running_total
        

In [24]:
# Each contribution as a percentage of 'predicted_sales', rounded to two decimals.
share_sources = {
    'perc_contri_price': 'contri_price',
    'perc_contri_Distribution_wtd': 'contri_Distribution_wtd',
    'perc_contri_Category_Seasonality': 'contri_Category_Seasonality',
    'perc_contri_Category_Trend': 'contri_Category_Trend',
    'perc_contri_Baseline': 'contri_intercept',
}
for share_col, contri_col in share_sources.items():
    share = df_waterfall_calc[contri_col] / df_waterfall_calc['predicted_sales'] * 100
    df_waterfall_calc[share_col] = np.round(share, 2)


In [25]:
# Side-by-side view of actual volume, the computed prediction and selected shares.
preview_columns = [
    'Total_Volume',
    'predicted_sales',
    'contri_price',
    'perc_contri_price',
    'perc_contri_Distribution_wtd',
    'perc_contri_Baseline',
]
df_waterfall_calc[preview_columns]

Unnamed: 0,Total_Volume,predicted_sales,contri_price,perc_contri_price,perc_contri_Distribution_wtd,perc_contri_Baseline
0,19818.0,11.853610,0.189442,1.60,42.46,56.63
1,3312.0,9.591620,0.322152,3.36,36.27,60.93
2,16431.0,10.661958,0.908553,8.52,11.19,80.72
3,780.0,971.472969,971.460738,100.00,0.43,-0.54
4,3216.0,8.952779,1.795947,20.06,3.54,35.65
...,...,...,...,...,...,...
70099,129.7,8.995722,0.018835,0.21,45.24,53.04
70100,85.4,8.179287,5.412750,66.18,40.96,-7.49
70101,160.8,10.635576,0.004457,0.04,22.78,74.78
70102,41.4,14.716581,0.000035,0.00,0.42,98.53


In [26]:
print(df_waterfall_calc.shape)
# Rows from well-fitting models: R-squared above 90 and MAPE below 5.
good_fit = (df_waterfall_calc['RSquared'] > 90) & (df_waterfall_calc['MAPE'] < 5)
# Disabled filter, kept for reference:
# filtered_waterfall = filtered_waterfall.loc[(filtered_waterfall['log_price']<=0)&(filtered_waterfall['log_price']>=-3)]
# Rows whose price share lies in the 30-55 band (bounds included).
price_share_in_band = df_waterfall_calc['perc_contri_price'].between(30, 55)
filtered_waterfall = df_waterfall_calc.loc[good_fit & price_share_in_band]

filtered_waterfall.shape

(70104, 52)


(645, 52)

In [27]:
# Distinct PPGs that survive the filter, taken from the per-pair aggregation.
price_share_by_combo = filtered_waterfall.groupby(
    ['PPG', 'Retailer_Name'], as_index=False
).agg({'perc_contri_price': 'mean'})
price_share_by_combo['PPG'].unique()

array(['BRAND B SUBBRAND B.1 ADULT POUCH RND_RD_KP_WRT_LM 1200_G_12_ST',
       'BRAND B SUBBRAND B.2 ADULT ZAK GROENTE_KIP 1500_G',
       'BRAND B SUBBRAND B.3 SENIOR BLIK GROENTE_KIP 400_G',
       'BRAND D SUBBRAND D.3 ADULT POUCH RND_GV_KP_KLK_KN_LV_ND_LM 1200_G_12_ST',
       'BRAND F SUBBRAND F.1 ADULT BLIK RND_KLK_ND_ZLM_LV_KP 1020_G_12_ST',
       'BRAND H SUBBRAND H.1 PUPPY POUCH KIP_RST_LM_GV_RND 1200_G_12_ST',
       'BRAND N SUBBRAND N.1 ADULT PAK RUND_ZLM_GRNT 1000_G',
       'BRAND O SUBBRAND G.3 KITTEN POUCH KIP_KN_RND_LM 1200_G_12_ST'],
      dtype=object)

In [28]:
# Mean price share for each surviving PPG/retailer pair.
filtered_waterfall.groupby(['PPG', 'Retailer_Name'], as_index=False)['perc_contri_price'].mean()

Unnamed: 0,PPG,Retailer_Name,perc_contri_price
0,BRAND B SUBBRAND B.1 ADULT POUCH RND_RD_KP_WRT...,Retailer C,53.959608
1,BRAND B SUBBRAND B.2 ADULT ZAK GROENTE_KIP 1500_G,Retailer B,43.97
2,BRAND B SUBBRAND B.3 SENIOR BLIK GROENTE_KIP 4...,Retailer B,43.933333
3,BRAND D SUBBRAND D.3 ADULT POUCH RND_GV_KP_KLK...,Retailer C,42.89129
4,BRAND F SUBBRAND F.1 ADULT BLIK RND_KLK_ND_ZLM...,Retailer C,54.317083
5,BRAND H SUBBRAND H.1 PUPPY POUCH KIP_RST_LM_GV...,Retailer C,31.00775
6,BRAND N SUBBRAND N.1 ADULT PAK RUND_ZLM_GRNT 1...,Retailer C,34.24609
7,BRAND O SUBBRAND G.3 KITTEN POUCH KIP_KN_RND_L...,Retailer C,31.03


In [30]:
# Pick one PPG/retailer pair and read the waterfall inputs from its first filtered row.
PPG = 'BRAND B SUBBRAND B.1 ADULT POUCH RND_RD_KP_WRT_LM 1200_G_12_ST'
RETAILER = 'Retailer C'
col_list = (
    'perc_contri_Baseline',
    'perc_contri_price',
    'perc_contri_Distribution_wtd',
    'perc_contri_Category_Seasonality',
    'perc_contri_Category_Trend',
    'predicted_sales',
)
selected_rows = filtered_waterfall.loc[
    (filtered_waterfall['PPG'] == PPG) & (filtered_waterfall['Retailer_Name'] == RETAILER)
]
value_list = [selected_rows[each_col].iloc[0] for each_col in col_list]

In [31]:


# Waterfall of the percentage contributions for the selected PPG/retailer pair.
# The last entry of value_list is 'predicted_sales', which is not a percentage, and
# Plotly computes the height of a "total" bar from the preceding bars. The final bar
# is therefore labelled with the sum of the contributions, so that its text matches
# what is drawn instead of showing predicted_sales with a percent sign.
contribution_values = value_list[:-1]
total_contribution = sum(contribution_values)
bar_labels = [f'{value:.2f}%' for value in contribution_values] + [f'{total_contribution:.2f}%']

fig = go.Figure(go.Waterfall(
    name = "%Contribution to Sales Volume", orientation = "v",
    measure = ["absolute", "relative", "relative", "relative", "relative", "total"],
    x = ["Baseline", "Price", "Distribution", "Category Seasonality", "Category Trend", "Final Volume"],
    textposition = "outside",
    text = bar_labels,
    y = value_list,
    connector = {"line":{"color":"rgb(63, 63, 63)"}},
))

fig.update_layout(
        title = "PPG || " + PPG + "||" + RETAILER,
        showlegend = True,height=600
)

fig.show()

In [39]:
import plotly.io as pio
# Export the waterfall figure to ../plots/waterfall.jpg at 1200x600.
pio.write_image(fig, file='../plots/waterfall.jpg', width=1200, height=600)