In [7]:
# Libraries
import pandas as pd
import numpy as np
import hvplot.pandas

In [87]:
# Ingest data
df = pd.read_csv("./examples/oj.csv")

#### Data Exploration

In [84]:
df.head()

Unnamed: 0,sales,price,brand,feat
0,8256.0,3.87,tropicana,0
1,6144.0,3.87,tropicana,0
2,3840.0,3.87,tropicana,0
3,8000.0,3.87,tropicana,0
4,8896.0,3.87,tropicana,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28947 entries, 0 to 28946
Data columns (total 4 columns):
sales    28947 non-null float64
price    28947 non-null float64
brand    28947 non-null object
feat     28947 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 904.7+ KB


In [5]:
df["brand"].unique()

array(['tropicana', 'minute.maid', 'dominicks'], dtype=object)

In [88]:
df["brand"] = df["brand"].astype("category")
df["log_sales"] = np.log(df["sales"])
df["log_price"] = np.log(df["price"])
df.hvplot.box("log_price", by="brand", color="brand", legend=False).opts(cmap="tab10")

In [58]:
df.hvplot.scatter(x="log_price", y="log_sales", color="brand", legend=True).opts(cmap="tab10", alpha=0.2)

> Dominick’s is the budget option, Tropicana is the luxury option, and Minute Maid lives between.

Taddy, Matt. Business Data Science: Combining Machine Learning and Economics to Optimize, Automate, and Accelerate Business Decisions . McGraw-Hill Education. Kindle Edition. 

#### Why Logarithms?
> Whenever you are working with linear (i.e., additive) models, it is crucial that you try to work in the space where you expect to find linearity. For variables that change multiplicatively with other factors, this is usually the log scale.

Taddy, Matt. Business Data Science: Combining Machine Learning and Economics to Optimize, Automate, and Accelerate Business Decisions . McGraw-Hill Education. Kindle Edition. 

#### Generalized Linear Model No Interaction

In [91]:
# We use use stasmodels and not SK-Learn because the former is developed for inferential statistics; the 
# latter is developed for machine learning prediction. 
import statsmodels.formula.api as smf
# Build the model using the formula api; the C() command treats the brand as a dummy variable automatically. 
smf.glm(formula="log_sales ~ log_price + C(brand)", data=df).fit().summary()


0,1,2,3
Dep. Variable:,log_sales,No. Observations:,28947.0
Model:,GLM,Df Residuals:,28943.0
Model Family:,Gaussian,Df Model:,3.0
Link Function:,identity,Scale:,0.62968
Method:,IRLS,Log-Likelihood:,-34377.0
Date:,"Sat, 16 Nov 2019",Deviance:,18225.0
Time:,16:37:17,Pearson chi2:,18200.0
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,10.8288,0.015,745.041,0.000,10.800,10.857
C(brand)[T.minute.maid],0.8702,0.013,67.320,0.000,0.845,0.896
C(brand)[T.tropicana],1.5299,0.016,93.808,0.000,1.498,1.562
log_price,-3.1387,0.023,-136.888,0.000,-3.184,-3.094


#### Generalized Linear Model, Brand-Price Interaction

In [92]:
# By including this interaction effect we are creating a slope and intercept for each brand; not a single model for all.
smf.glm(formula="log_sales ~ log_price * C(brand)", data=df).fit().summary()

0,1,2,3
Dep. Variable:,log_sales,No. Observations:,28947.0
Model:,GLM,Df Residuals:,28941.0
Model Family:,Gaussian,Df Model:,5.0
Link Function:,identity,Scale:,0.62588
Method:,IRLS,Log-Likelihood:,-34289.0
Date:,"Sat, 16 Nov 2019",Deviance:,18114.0
Time:,16:43:28,Pearson chi2:,18100.0
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,10.9547,0.021,529.136,0.000,10.914,10.995
C(brand)[T.minute.maid],0.8883,0.042,21.376,0.000,0.807,0.970
C(brand)[T.tropicana],0.9624,0.046,20.719,0.000,0.871,1.053
log_price,-3.3775,0.036,-93.322,0.000,-3.448,-3.307
log_price:C(brand)[T.minute.maid],0.0568,0.057,0.991,0.322,-0.055,0.169
log_price:C(brand)[T.tropicana],0.6658,0.054,12.439,0.000,0.561,0.771


#### Calculating the Brand Price Elasticities

> The reference category is dominicks; this brand is absorbed into both the intercept and the main slope term on log price. You find the elasticities for the other brands by adding the log(price):brand interaction terms to this main slope.

Taddy, Matt. Business Data Science: Combining Machine Learning and Economics to Optimize, Automate, and Accelerate Business Decisions . McGraw-Hill Education. Kindle Edition. 

In [96]:
# Calculating the elasticities from the price/brand interactions
dom_elas = -3.37
min_elas = -3.37 + 0.0568
trop_elas = -3.37 + 0.6658
print(
    "The elasticity for Dominicks is {}, Minute Maid is {}, and Tropicana is {}. Tropicana customers are less sensitive than the others.".format(
        dom_elas, min_elas, trop_elas
    )
)

The elasticity for Dominicks is -3.37, Minute Maid is -3.3132, and Tropicana is -2.7042. Tropicana customers are less sensitive than the others.
