In [2]:
# Calculate the Intent-to-treat effect

from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

from pathlib import Path
PROJECT_ROOT = Path().resolve().parents[0]   # or parents[1] if notebook lives deeper
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"

data = pd.read_csv(RAW_DIR / "criteo-uplift-v2.1.csv")

In [3]:
# Calculate ITT

model = OLS(data['conversion'], add_constant(data['treatment'])).fit(cov_type = "HC1")
print(model.summary())
itt_effect = model.params['treatment']
itt_se = model.bse['treatment']
print(f"Intent-to-Treat Effect: {itt_effect}")
print(f"Intent-to-Treat SE: {itt_se}")

                            OLS Regression Results                            
Dep. Variable:             conversion   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1123.
Date:                Sun, 23 Nov 2025   Prob (F-statistic):          3.27e-246
Time:                        20:53:16   Log-Likelihood:             2.0986e+07
No. Observations:            13979592   AIC:                        -4.197e+07
Df Residuals:                13979590   BIC:                        -4.197e+07
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0019   3.04e-05     63.804      0.0

Since not everyone assigned to treatment actually receives treatment, we will use assignment to treatment as IV to back out the Local Average Treatment Effect (LATE)

In [6]:
from linearmodels.iv import IV2SLS
import pandas as pd

# 2SLS: conversion ~ exposure_hat, instrumented by treatment
res = IV2SLS.from_formula(
    "conversion ~ 1 + [exposure ~ treatment]",
    data=data
).fit(cov_type="robust")  # heteroskedasticity-robust SE

print(res.summary)
print(data[data['treatment'] == 0]['conversion'].mean() ) 




                          IV-2SLS Estimation Summary                          
Dep. Variable:             conversion   R-squared:                      0.0238
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0238
No. Observations:            13979592   F-statistic:                    1129.6
Date:                Sun, Nov 23 2025   P-value (F-stat)                0.0000
Time:                        20:54:22   Distribution:                  chi2(1)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.0019  3.037e-05     63.804     0.0000      0.0019      0.0020
exposure       0.0320     0.0010     33.610     0.00

Among compliers, the probability of conversion increases by approximately 3.2 percentage points. Given that the baseline conversion rate in the control group is 0.19, this suggests a substantial uplift effect for those who actually receive the treatment. Exposure increases conversion by ~1630% relative to baseline. 

In [None]:
# Calculate ITT for visits

model_visit = OLS(data['visit'], add_constant(data['treatment'])).fit(cov_type = "HC1")
print(model_visit.summary())


                            OLS Regression Results                            
Dep. Variable:                  visit   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     4996.
Date:                Sun, 23 Nov 2025   Prob (F-statistic):               0.00
Time:                        20:54:42   Log-Likelihood:             1.8756e+06
No. Observations:            13979592   AIC:                        -3.751e+06
Df Residuals:                13979590   BIC:                        -3.751e+06
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0382      0.000    288.594      0.0

In [8]:
# 2SLS: conversion ~ exposure_hat, instrumented by treatment
res_visit = IV2SLS.from_formula(
    "visit ~ 1 + [exposure ~ treatment]",
    data=data
).fit(cov_type="robust")  # heteroskedasticity-robust SE

print(res_visit.summary)
print(data[data['treatment'] == 0]['visit'].mean() ) 


                          IV-2SLS Estimation Summary                          
Dep. Variable:                  visit   R-squared:                      0.0897
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0897
No. Observations:            13979592   F-statistic:                    5090.6
Date:                Sun, Nov 23 2025   P-value (F-stat)                0.0000
Time:                        20:56:18   Distribution:                  chi2(1)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.0382     0.0001     288.59     0.0000      0.0379      0.0385
exposure       0.2870     0.0040     71.349     0.00

In [None]:
Among compliers, the probability of visit increases by approximately 28 percentage points. Given that the baseline conversion rate in the control group is 3.8%, this suggests a substantial uplift effect for those who actually receive the treatment. Exposure increases visit by ~736% relative to baseline. 

In [None]:
# We don't want to do regresion adjustment here because our SEs are already really small given the large N. 
# But let's see what happens if we do it anyway.
# Note that f variables are assumed to be pre-treatment variables. Criteo documentation does not confirm whether this is true.
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
f_vars = [f"f{i}" for i in range(10)]
X = data[["treatment"] + f_vars]
X = sm.add_constant(X)     # add intercept
y = data["conversion"]
model_adj = sm.OLS(y, X).fit(cov_type="HC1")
print(model_adj.params['treatment'])
print(model_adj.bse['treatment'])

# Does not seem to be much of a difference. They might not be pre-treatment variables!



0.0008829151087941316
3.273213274328386e-05
