In [2]:
import os
import numpy as np
import pandas as pd

### Settings & Data Load

In [70]:
data_root = "./UnbiasedLearningCausal/data"
raw_dir = f"{data_root}/raw"
preprocess_dir = f"{data_root}/preprocessed"
product_dir = f"{preprocess_dir}/dunn_mailer_10_10_1_1"

rate_prior = 0.9
capping = 0.01

promo_df = pd.read_csv(f"{raw_dir}/causal_data.csv")
print(f"Values in 'mailer' column: \n{promo_df['mailer'].unique()}")
promo_df[["PRODUCT_ID", "WEEK_NO", "mailer"]].sample(5)

Values in 'mailer' column: 
['A' '0' 'H' 'D' 'F' 'J' 'C' 'L' 'X' 'P' 'Z']


Unnamed: 0,PRODUCT_ID,WEEK_NO,mailer
23140708,1979767,35,C
35446177,13133577,93,0
18294114,1303452,15,A
18978324,1367655,91,A
27356607,5665551,27,A


* Weekly mailer in causal_data.csv is used as treatment variable.
* Mailer '0' means not on ad and others mean on ad.

In [138]:
trans_df = pd.read_csv(f"{raw_dir}/transaction_data.csv")
trans_df[["household_key", "PRODUCT_ID", "QUANTITY", "WEEK_NO"]].sample(5)

Unnamed: 0,household_key,BASKET_ID,DAY,PRODUCT_ID,QUANTITY,SALES_VALUE,STORE_ID,RETAIL_DISC,TRANS_TIME,WEEK_NO,COUPON_DISC,COUPON_MATCH_DISC
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2595727,1598,42305362535,711,92130,1,0.99,3228,0.00,1520,102,0.0,0.0
2595728,1598,42305362535,711,114102,1,8.89,3228,0.00,1520,102,0.0,0.0
2595729,1598,42305362535,711,133449,1,6.99,3228,0.00,1520,102,0.0,0.0
2595730,1598,42305362535,711,6923644,1,4.50,3228,-0.49,1520,102,0.0,0.0


In [74]:
product_df = pd.read_csv(f"{product_dir}/cnt_logs.csv")
num_data = product_df.shape[0]
num_users = np.max(product_df.loc[:, "idx_user"].values) + 1
num_items = np.max(product_df.loc[:, "idx_item"].values) + 1
print(f"# of obs   : {num_data}")
print(f"# of users : {num_users}")
print(f"# of items : {num_items}")

# of obs   : 25947990
# of users : 2290
# of items : 11331


In [137]:
product_df[product_df["num_outcome"]!=0].sample(5)

Unnamed: 0,idx_user,idx_item,num_visit,num_treatment,num_outcome,num_treated_outcome
22928354,2023,5741,72,2,1,0
9295268,820,3848,71,10,2,0
2194665,193,7782,61,14,3,0
14746773,1301,5142,26,8,1,1
1071760,94,6646,33,0,1,0


* idx_user / idx_item : the observed outcome pair user-item
* num_visit : the number of visit of the user, i.e., $\sum_{t} V_{ut} = \sum_{t} \mathbb{I}_{\{\sum_{i} Y_{uit} > 0\}}$
* num_treatment : the number of recommendations, i.e., $\sum_{t} Z_{uit}$
* num_outcome : the number of purchases, i.e., $\sum_{t} Y_{uit}$
* num_treated_outcome : the number of purchases with recommendation, i.e., $\sum_{t}Y_{uit}^{1} = \sum_{t} Z_{uit} Y_{uit}$

In [35]:
product_df.loc[:, 'num_control'] = product_df.loc[:, 'num_visit'] - product_df.loc[:, 'num_treatment']
product_df.loc[:, 'num_control_outcome'] = product_df.loc[:, 'num_outcome'] - product_df.loc[:, 'num_treated_outcome']
product_df.head(5)

Unnamed: 0,idx_user,idx_item,num_visit,num_treatment,num_outcome,num_treated_outcome,num_control,num_control_outcome
0,0,0,64,5,0,0,59,0
1,0,1,64,6,0,0,58,0
2,0,2,64,9,0,0,55,0
3,0,3,64,3,0,0,61,0
4,0,4,64,9,0,0,55,0


* num_control : the number of no recommendations, i.e., $\sum_{t}(1 - Z_{uit}) = \sum_{t} V_{ut} - \sum_{t} Z_{uit}$
* num_control_outcome : the number of purchases without recommendation, i.e., $\sum_{t} Y_{uit}^{0} = \sum_{t} (1 - Z_{uit}) Y_{uit} = \sum_{t} Y_{uit} - \sum_{t} Z_{uit} Y_{uit}$

### 1. Modeling purchase probabilities

#### (1) Calculating $\mathbb{P}(Y_{ui}^1 = 1)$, $\mathbb{P}(Y_{ui}^0 = 1)$, and $\mathbb{P}(Y_{ui} = 1)$

In [54]:
prob_df = product_df.copy()
prob_df.loc[:, 'prob_outcome_treated'] = product_df.loc[:, 'num_treated_outcome'] / product_df.loc[:, 'num_treatment']
prob_df.loc[:, 'prob_outcome_control'] = product_df.loc[:, 'num_control_outcome'] / product_df.loc[:, 'num_control']
prob_df.loc[:, 'prob_outcome'] = product_df.loc[:, 'num_outcome'] / product_df.loc[:, 'num_visit']
rand_indices = pd.Series(prob_df.index).sample(5).tolist()
prob_df.loc[rand_indices, ["idx_user", "idx_item", "prob_outcome_treated", "prob_outcome_treated", "prob_outcome"]].head()

Unnamed: 0,idx_user,idx_item,prob_outcome_treated,prob_outcome_treated.1,prob_outcome
10785123,951,9342,0.0,0.0,0.0
6481707,572,375,0.0,0.0,0.0
15213336,1342,7134,0.0,0.0,0.0
12099329,1067,9152,0.0,0.0,0.0
20544033,1813,930,0.0,0.0,0.0


#### (2) Averaing over all users

In [37]:
df_mean = product_df.loc[:, ["idx_item", 'num_treated_outcome', 'num_control_outcome',
                                'num_treatment', 'num_control', 'num_outcome', 'num_visit']]
df_mean = df_mean.groupby("idx_item", as_index=False).mean()
df_mean.head(5)

Unnamed: 0,idx_item,num_treated_outcome,num_control_outcome,num_treatment,num_control,num_outcome,num_visit
0,0,0.011354,0.034498,3.671616,44.304803,0.045852,47.976419
1,1,0.004367,0.015284,4.422271,43.554148,0.019651,47.976419
2,2,0.020524,0.10131,5.272489,42.70393,0.121834,47.976419
3,3,0.001747,0.00917,2.982969,44.99345,0.010917,47.976419
4,4,0.10131,0.070306,7.282533,40.693886,0.171616,47.976419


#### (3) Accounting for uncertainty using above average values

In [56]:
df_mean = df_mean.rename(columns={'num_treated_outcome': 'num_treated_outcome_mean',
                                    'num_control_outcome': 'num_control_outcome_mean',
                                    'num_treatment': 'num_treatment_mean',
                                    'num_control': 'num_control_mean',
                                    'num_outcome': 'num_outcome_mean',
                                    'num_visit': 'num_visit_mean'})

merged_df = pd.merge(product_df, df_mean, on=["idx_item"], how='left')

merged_df.loc[:, 'prob_outcome_treated'] = \
            (merged_df.loc[:, 'num_treated_outcome'] + rate_prior * merged_df.loc[:, 'num_treated_outcome_mean']) / \
            (merged_df.loc[:, 'num_treatment'] + rate_prior * merged_df.loc[:, 'num_treatment_mean'])

merged_df.loc[:, 'prob_outcome_control'] = \
    (merged_df.loc[:, 'num_control_outcome'] + rate_prior * merged_df.loc[:,'num_control_outcome_mean']) / \
    (merged_df.loc[:, 'num_control'] + rate_prior * merged_df.loc[:, 'num_control_mean'])

merged_df.loc[:, 'prob_outcome'] = \
    (merged_df.loc[:, 'num_outcome'] + rate_prior * merged_df.loc[:, 'num_outcome_mean']) / \
    (merged_df.loc[:, 'num_visit'] + rate_prior * merged_df.loc[:, 'num_visit_mean'])

merged_df.loc[rand_indices, ["idx_user", "idx_item", "prob_outcome_treated", "prob_outcome_treated", "prob_outcome"]].head()

Unnamed: 0,idx_user,idx_item,prob_outcome_treated,prob_outcome_treated.1,prob_outcome
10785123,951,9342,0.000184,0.000184,0.000112
6481707,572,375,0.000865,0.000865,0.000419
15213336,1342,7134,0.001519,0.001519,0.001216
12099329,1067,9152,0.000484,0.000484,0.000196
20544033,1813,930,0.004509,0.004509,0.000495


### 2. Modeling propensities

#### (1) Based on original dataset

In [64]:
# def assign_propensity(self, capping = 0.01, mode='original', scale_factor=1.0, num_rec=100, df_train=None):
capping = 0.000001
merged_df.loc[:, "propensity"] = \
    (merged_df.loc[:, 'num_treatment'] + rate_prior * merged_df.loc[:,'num_treatment_mean']) / \
    (merged_df.loc[:, 'num_visit'] + rate_prior * merged_df.loc[:, 'num_visit_mean'])
merged_df.loc[:, "pred"] = 0.0

merged_df.loc[merged_df.loc[:, "propensity"] < capping, "propensity"] = capping
merged_df.loc[merged_df.loc[:, "propensity"] > 1 - capping, "propensity"] = 1 - capping

merged_df.loc[rand_indices, ["idx_user", "idx_item", "propensity", "pred"]].head()

Unnamed: 0,idx_user,idx_item,propensity,pred
10785123,951,9342,0.101502,0.0
6481707,572,375,0.117945,0.0
15213336,1342,7134,0.367718,0.0
12099329,1067,9152,0.223423,0.0
20544033,1813,930,0.087,0.0


#### (2) Based on personalized recommender simulation