## Notebook Purpose:

1. The primary goal of this notebook is to validate the results produced by the platform.
2. We pick one experiment, `exp_big_exp_run`, and validate the following:
    - DiD Results
3. This validation checks whether the platform's results can be trusted.



In [None]:
# Stop a Spark session left over from a previous run of this notebook.
# On a fresh kernel `spark` does not exist yet, so the NameError is ignored
# on purpose; otherwise "Restart & Run All" would fail on the first cell.
try:
    spark.stop()
except NameError:
    pass

In [3]:

import os

from pyspark.sql import SparkSession

# MinIO / S3 connection settings shared by the Hadoop S3A filesystem and by
# Iceberg's S3FileIO. Each value can be overridden through an environment
# variable so real credentials never have to be committed with the notebook;
# the fallbacks are the values this notebook previously hard-coded.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://10.0.0.80:9100")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Build (or reuse) the Spark session used by every query in this notebook.
spark = (
    SparkSession.builder
    .appName("Laptop_ROG_CUPED_validation")
    .master("spark://10.0.0.80:7077")

    # -------- Executors --------
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "10")
    .config("spark.executor.memory", "18g")
    .config("spark.executor.memoryOverhead", "4g")

    # -------- Driver --------
    .config("spark.driver.memory", "10g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.driver.host", "10.0.0.80")
    .config("spark.driver.bindAddress", "0.0.0.0")

    # -------- AQE + shuffle --------
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.shuffle.partitions", "288")
    .config("spark.sql.files.maxPartitionBytes", "256m")

    # Don't set spark.local.dir here; use SPARK_LOCAL_DIRS on the worker.

    # -------- MinIO / S3A (must match the Iceberg S3 settings below) --------
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

    # -------- Iceberg catalog --------
    # The catalog type is `hadoop` (warehouse-path based); no REST catalog
    # URI is configured.
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hadoop")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://iceberg-warehouse/warehouse/")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.defaultCatalog", "iceberg")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")

    # Iceberg's own S3 settings (again, pointing to MinIO)
    .config("spark.sql.catalog.iceberg.s3.endpoint", MINIO_ENDPOINT)
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.s3.access-key-id", MINIO_ACCESS_KEY)
    .config("spark.sql.catalog.iceberg.s3.secret-access-key", MINIO_SECRET_KEY)

    # Iceberg runtime + AWS bundle jars shipped with the session.
    .config(
        "spark.jars",
        "/opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.6.0.jar,"
        "/opt/spark/jars/iceberg-aws-bundle-1.6.0.jar")
    .getOrCreate()
)

26/02/19 04:17:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [14]:
# Build the temporary view `did_results`: one row per
# (experiment_id, user_id, variant) for experiment 'exp_big_exp_run', with
#   post = max `revenue` value seen for the user
#   pre  = max `pre_revenue` value seen for the user
# How the SQL behaves:
#   * t1 de-duplicates exposure rows; t2 keeps only the two revenue metrics.
#   * The inner join drops exposed users that have no row for either metric.
#   * `else 0` inside max() means a user with no row for a metric (or only
#     non-positive values) gets 0 for it.
# NOTE(review): a user exposed to more than one variant would appear once per
# variant, because t1 is distinct on (experiment_id, user_id, variant) --
# confirm exposures are unique per user.
v_sql = """
create or replace temporary view did_results as

with t1 as(
select distinct experiment_id, user_id, variant
from iceberg.exp.exposures
where experiment_id = 'exp_big_exp_run'
),

t2 as (
select experiment_id, user_id, metric_name, value
from iceberg.exp.outcomes
where metric_name in ('revenue','pre_revenue')
and experiment_id = 'exp_big_exp_run'
)

select t1.experiment_id, t2.user_id, t1.variant,
max(case when metric_name = 'revenue' then value else 0 end) as post,
max(case when metric_name = 'pre_revenue' then value else 0 end) as pre

from t1
inner join t2
on t1.experiment_id =  t2.experiment_id
and t1.user_id =  t2.user_id

group by  t1.experiment_id, t2.user_id, t1.variant


"""

# The CREATE VIEW statement returns no rows, so .show() only prints an empty
# frame; the call is what actually registers the view.
spark.sql(v_sql).show(truncate=False)

++
||
++
++



In [13]:
# Sanity check: list the distinct metric names stored in `outcomes` for this
# experiment, to confirm that `revenue` and `pre_revenue` are present.
v_sql = """
select distinct experiment_id,metric_name from iceberg.exp.outcomes
where experiment_id = 'exp_big_exp_run'
"""

spark.sql(v_sql).show()

+---------------+-----------+
|  experiment_id|metric_name|
+---------------+-----------+
|exp_big_exp_run|pre_revenue|
|exp_big_exp_run| conversion|
|exp_big_exp_run|    revenue|
+---------------+-----------+



In [15]:
# Spot-check: show up to 20 rows of `did_results` whose post-period value is
# non-zero, to confirm the view carries real revenue values for some users.
v_sql = """
select * from did_results 
where post <> 0
limit 20
"""

spark.sql(v_sql).show()

26/02/19 04:27:42 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
[Stage 36:>                                                         (0 + 1) / 1]

+---------------+---------+---------+------------------+------------------+
|  experiment_id|  user_id|  variant|              post|               pre|
+---------------+---------+---------+------------------+------------------+
|exp_big_exp_run|u_0000028|  control| 4.660211329628044| 7.942785605843696|
|exp_big_exp_run|u_0000044|  control|15.045331295098066| 17.61707980444741|
|exp_big_exp_run|u_0000045|  control|10.761676001676506|14.615228707203336|
|exp_big_exp_run|u_0000046|treatment|12.961100944601199|12.344331046091192|
|exp_big_exp_run|u_0000059|  control|  7.10428510891265|15.075387570944553|
|exp_big_exp_run|u_0000062|treatment| 9.219429144458251|16.087307296706186|
|exp_big_exp_run|u_0000079|treatment|12.455239950701042| 12.94523091436157|
|exp_big_exp_run|u_0000082|treatment| 9.623341036448572|  9.81912435744377|
|exp_big_exp_run|u_0000083|treatment|13.554762100421534|13.503712096797475|
|exp_big_exp_run|u_0000084|  control|14.657482410076426|15.376523068107865|
|exp_big_exp

                                                                                

In [37]:
# Manual difference-in-differences from the four cell means:
#   DiD = (treatment_post - treatment_pre) - (control_post - control_pre)
v_sql = """
select variant , avg(pre) pre_mean,
avg(post) as post_mean
from did_results
group by variant


"""

# Run the query in this cell so `pre_post_means` does not depend on state
# left behind by an earlier kernel session (required for Restart & Run All).
pre_post_means = spark.sql(v_sql).toPandas()

# One row per variant (the query groups by variant), so .loc[...] returns the
# row holding that variant's pre/post means.
means_by_variant = pre_post_means.set_index("variant")
treatment = means_by_variant.loc["treatment"]
control = means_by_variant.loc["control"]

did = (treatment["post_mean"] - treatment["pre_mean"]
       - control["post_mean"] + control["pre_mean"])
print(f"Difference in Differences Estimate:  {did}")


Difference in Differences Estimate:  0.6196836438164492


In [21]:
# Build the 2x2 table of cell means behind the DiD estimate:
#   group       = 1 for variant 'treatment', 0 for any other variant
#   period      = 1 for the post column, 0 for the pre column
#   measure     = avg(post) or avg(pre) within that group
#   interaction = group * period (1 only for the treatment/post cell)
v_sql = """
with t1 as (select case when variant = 'treatment' then 1 else 0 end as group, 1 as period,
avg(post) as measure
from did_results
group by case when variant = 'treatment' then 1 else 0 end
union all
select case when variant = 'treatment' then 1 else 0 end as group, 0 as period,
avg(pre) as measure
from did_results
group by case when variant = 'treatment' then 1 else 0 end)

select t1.*, group*period as interaction
from t1


"""

# Four rows only, so collecting to pandas is cheap.
matrix = spark.sql(v_sql).toPandas()

matrix


26/02/19 04:34:02 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
26/02/19 04:34:02 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
                                                                                

Unnamed: 0,group,period,measure,interaction
0,1,1,1.877119,1
1,0,1,1.25252,0
2,1,0,12.504177,0
3,0,0,12.499261,0


In [25]:
# Peek at the first ten rows of the `did_results` view.
preview_sql = "select * from did_results limit 10"
spark.sql(preview_sql).show()

26/02/19 04:44:26 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
[Stage 41:>                                                         (0 + 1) / 1]

+---------------+---------+---------+----+------------------+
|  experiment_id|  user_id|  variant|post|               pre|
+---------------+---------+---------+----+------------------+
|exp_big_exp_run|u_0000001|treatment| 0.0|16.311533793776313|
|exp_big_exp_run|u_0000002|treatment| 0.0| 11.32790644644474|
|exp_big_exp_run|u_0000003|  control| 0.0|14.174728490227805|
|exp_big_exp_run|u_0000004|treatment| 0.0|10.847600353402715|
|exp_big_exp_run|u_0000005|  control| 0.0| 8.275909435378757|
|exp_big_exp_run|u_0000006|  control| 0.0|12.443259526513518|
|exp_big_exp_run|u_0000007|treatment| 0.0|15.835280615716401|
|exp_big_exp_run|u_0000008|treatment| 0.0| 7.090048635555965|
|exp_big_exp_run|u_0000010|  control| 0.0|  9.99508481757065|
|exp_big_exp_run|u_0000011|treatment| 0.0|16.328538904577922|
+---------------+---------+---------+----+------------------+



                                                                                

In [26]:
# Reshape `did_results` to long format for the regression: every row of the
# view yields two rows, one for the post value (period = 1) and one for the
# pre value (period = 0).
#   group   = 1 for variant 'treatment', 0 for any other variant
#   measure = the post or pre value for that user
v_sql = """
select user_id, case when variant = 'treatment' then 1 else 0 end as group,
1 as period, post as measure
from did_results 
union all
select user_id, case when variant = 'treatment' then 1 else 0 end as group,
0 as period, pre as measure
from did_results 

"""
# Collect the full user-level result to the driver as a pandas DataFrame.
raw_df = spark.sql(v_sql).toPandas()

26/02/19 04:49:11 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
26/02/19 04:49:11 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
                                                                                

In [27]:
# `raw_df` is a pandas DataFrame here, so .count() reports the number of
# non-null values per column (not a single row count).
raw_df.count()

user_id    1699422
group      1699422
period     1699422
measure    1699422
dtype: int64

In [28]:
import statsmodels.formula.api as smf

# DiD as a regression: the coefficient on group:period is the DiD estimate.
did_formula = "measure ~ group + period + group:period"

# Each user contributes a pre row and a post row, so standard errors are
# clustered on user_id.
cluster_options = {"groups": raw_df["user_id"]}

did_ols = smf.ols(did_formula, data=raw_df)
model = did_ols.fit(cov_type="cluster", cov_kwds=cluster_options)

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                measure   R-squared:                       0.688
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                 1.288e+06
Date:                Thu, 19 Feb 2026   Prob (F-statistic):               0.00
Time:                        04:49:53   Log-Likelihood:            -4.6277e+06
No. Observations:             1699422   AIC:                         9.255e+06
Df Residuals:                 1699418   BIC:                         9.256e+06
Df Model:                           3                                         
Covariance Type:              cluster                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       12.4993      0.005   2719.518   

## Conclusion:

As shown below, the manually calculated values match the platform values: the DiD estimate, standard error, and 95% confidence interval agree to the displayed precision. This supports the conclusion that the platform results can be trusted.

**Manual Calculation:**

` Difference in Differences Estimate:  0.6196836438164492 `

``` 
OLS Regression Results                            
==============================================================================
Dep. Variable:                measure   R-squared:                       0.688
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                 1.288e+06
Date:                Thu, 19 Feb 2026   Prob (F-statistic):               0.00
Time:                        04:49:53   Log-Likelihood:            -4.6277e+06
No. Observations:             1699422   AIC:                         9.255e+06
Df Residuals:                 1699418   BIC:                         9.256e+06
Df Model:                           3                                         
Covariance Type:              cluster                                         
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       12.4993      0.005   2719.518      0.000      12.490      12.508
group            0.0049      0.007      0.756      0.450      -0.008       0.018
period         -11.2467      0.008  -1496.408      0.000     -11.261     -11.232
group:period     0.6197      0.011     54.833      0.000       0.598       0.642
==============================================================================
Omnibus:                   664246.152   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2663178.806
Skew:                           1.949   Prob(JB):                         0.00
Kurtosis:                       7.735   Cond. No.                         6.86
==============================================================================

Notes:
[1] Standard Errors are robust to cluster correlation (cluster)
```



**Platform results:**


metric_pre: pre_revenue metric_post: revenue metric_name (stored): revenue

window: 2026-01-01 00:00:00 → 2026-01-29 00:00:00

Cell means
- control_pre: 12.4993
- control_post: 1.25252
- treatment_pre: 12.5042
- treatment_post: 1.87712


Estimate
- did: 0.619684
- se: 0.011303999892893988
- p-value: 0.0
- ci95: [0.5975278040263842, 0.6418394836065285]