In [14]:
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Data provenance: the CSV is the Criteo uplift data set published on Kaggle
# (https://www.kaggle.com/datasets/arashnic/uplift-modeling). It is expected to
# already be present in the directory configured below.

# Directory holding the data files. Set the UPLIFT_DATA_DIR environment variable
# to run the notebook on another machine; the original location is kept as the
# fallback so existing runs behave exactly as before. An unset or empty variable
# both fall back to the default.
datapath = os.environ.get('UPLIFT_DATA_DIR') or (
    'C:/Users/WoolsteJ/code/causal_modeling/uplift-modeling/data/'
)

# File names are built with `datapath + '<file name>'`, so the value must end
# with a path separator.
if not datapath.endswith(('/', '\\')):
    datapath += '/'

The dataset was created by The Criteo AI Lab. The dataset consists of 13M rows, each one representing a user with 12 features, a treatment indicator and 2 binary labels (visits and conversions). Positive labels mean the user visited/converted on the advertiser website during the test period (2 weeks). The global treatment ratio is 84.6%. It is usual that advertisers keep only a small control population as it costs them in potential revenue.

Following is a detailed description of the features:

- f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11: feature values (dense, float)
- treatment: treatment group (1 = treated, 0 = control)
- conversion: whether a conversion occurred for this user (binary, label)
- visit: whether a visit occurred for this user (binary, label)
- exposure: treatment effect, whether the user has been effectively exposed (binary)

https://www.kaggle.com/datasets/arashnic/uplift-modeling

In [4]:
# Read the Criteo uplift CSV from the configured data directory into a DataFrame
csv_file = datapath + 'criteo-uplift-v2.1.csv'
df = pd.read_csv(csv_file)

In [5]:
# Number of rows loaded
len(df)

13979592

In [8]:
# Preview the first 20 rows to check the columns and value ranges
df.head(n=20)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure
0,12.616365,10.059654,8.976429,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
1,12.616365,10.059654,9.002689,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
2,12.616365,10.059654,8.964775,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
3,12.616365,10.059654,9.002801,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
4,12.616365,10.059654,9.037999,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
5,12.616365,10.059654,8.904507,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
6,12.616365,10.059654,8.78334,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
7,12.616365,10.059654,8.964528,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
8,12.616365,10.059654,9.037809,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0
9,12.616365,10.059654,9.015128,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0


# Naive OLS

In [15]:
# Naive estimate of the treatment effect using OLS.
# Regressors: the treatment indicator (the variable of interest) followed by
# the twelve control features f0 ... f11.
features = ['treatment'] + [f'f{idx}' for idx in range(12)]
print(features)

# Design matrix with a constant column added, and the outcome to be predicted
X = sm.add_constant(df[features])
y = df['conversion']

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

['treatment', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11']


In [16]:
# Fit ordinary least squares with statsmodels on the complete data (X, y);
# the train/test split is not used for this model.
ols_spec = sm.OLS(endog=y, exog=X)
model = ols_spec.fit()

# Build the regression summary table for display
summary = model.summary()

In [17]:
# Show the OLS regression summary as this cell's output
summary

0,1,2,3
Dep. Variable:,conversion,R-squared:,0.119
Model:,OLS,Adj. R-squared:,0.119
Method:,Least Squares,F-statistic:,145200.0
Date:,"Fri, 31 Jan 2025",Prob (F-statistic):,0.0
Time:,09:35:25,Log-Likelihood:,21871000.0
No. Observations:,13979592,AIC:,-43740000.0
Df Residuals:,13979578,BIC:,-43740000.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.2676,0.003,-79.792,0.000,-0.274,-0.261
treatment,0.0009,3.79e-05,24.160,0.000,0.001,0.001
f0,-2.392e-05,3.78e-06,-6.320,0.000,-3.13e-05,-1.65e-05
f1,0.0042,0.000,22.168,0.000,0.004,0.005
f2,-0.0013,6.85e-05,-18.695,0.000,-0.001,-0.001
f3,-0.0008,1.51e-05,-55.257,0.000,-0.001,-0.001
f4,0.0396,8.11e-05,488.368,0.000,0.039,0.040
f5,-0.0022,6.73e-05,-32.030,0.000,-0.002,-0.002
f6,-0.0001,4.42e-06,-29.773,0.000,-0.000,-0.000

0,1,2,3
Omnibus:,27247049.164,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51285065251.706
Skew:,15.775,Prob(JB):,0.0
Kurtosis:,298.043,Cond. No.,8140.0


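Based on this simple OLS model using the full data set, receiving the treatment (the `treatment` indicator) is associated with an average increase of 0.0009 in the probability of conversion (about 0.09 percentage points), holding the features f0-f11 constant.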

# X - Learner
The X-learner is a multi-step technique aimed at estimating the conditional average treatment effect (CATE) or individual treatment effect (ITE) directly.
This is accomplished by splitting the data into two groups and estimating a model for each group with the same set of features.
 1. treated group (We'll call this ML_t)
 2. control group (We'll call this ML_c)
Generate predictions for each group using these models (Pred_t, and Pred_c)


Estimate counterfactual outcomes for each group to understand the effect of treatment for each observation by using the other group's model to generate predictions
 - For observations that received the treatment, we use the untreated model (ML_c) to predict what their outcome would have been without the treatment (cfact_t)
 - For observations that did not receive the treatment, we use the treated model (ML_t) to predict what their outcome would have been with the treatment (cfact_c)

 We then calculate the difference (D) between the observed and counterfactual predictions
 - For the treated group we subtract the counterfactual from the predicted value (Pred_t - cfact_t) = D_t
 - For the control group we subtract the predicted value from the counterfactual (cfact_c - Pred_c) = D_c

 Finally, we train 2 new models that use the same set of features (X) to predict these differences, so that each model estimates the treatment effect directly
 - For the treated group: tau_t = f(X), fitted with D_t as the target
 - For the control group: tau_c = f(X), fitted with D_c as the target

For new customers, we generate predictions using both models and combine them with a weighted average to obtain the predicted individual treatment effect.
- For greater accuracy the weights can be propensity scores (the probability of being treated given a customer's characteristics). This accounts for scenarios where there are few treated individuals or few untreated individuals.


In [None]:
# Use the testing data to split the group into treated and control groups
