## Notebook Purpose:

1. The primary goal of this notebook is to validate the results produced by the platform.
2. We are picking one experiment = `exp_big_exp_run` and we are going to validate following:
    - DiD Pretrends Assumption
3. This validation will ensure if the platform is trustworthy or not.



In [None]:
spark.stop()

In [2]:

from pyspark.sql import SparkSession
spark = (
        SparkSession.builder
        .appName("Laptop_ROG_CUPED_validation")
        .master("spark://10.0.0.80:7077")

        .config("spark.executor.instances", "2")
        .config("spark.executor.cores", "10")
        .config("spark.executor.memory", "18g")
        .config("spark.executor.memoryOverhead", "4g")

        .config("spark.driver.memory", "10g")
        .config("spark.driver.maxResultSize", "2g")
        .config("spark.driver.host", "10.0.0.80")
        .config("spark.driver.bindAddress", "0.0.0.0")

        # AQE + shuffle
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.shuffle.partitions", "288")
        .config("spark.sql.files.maxPartitionBytes", "256m")

        # Donâ€™t set spark.local.dir here; use SPARK_LOCAL_DIRS on the worker
        # .config("spark.local.dir", "...")  <-- remove
            # -------- MinIO / S3A (must match Iceberg) --------
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.endpoint", "http://10.0.0.80:9100")  # <-- changed
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider",
                "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

        # -------- Iceberg REST catalog --------
        .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.iceberg.type", "hadoop")
        # .config("spark.sql.catalog.iceberg.uri", "http://10.0.0.59:8181")
        .config("spark.sql.catalog.iceberg.warehouse", "s3a://iceberg-warehouse/warehouse/")
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
        .config("spark.sql.defaultCatalog", "iceberg")
        .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")

        # Iceberg's own S3 settings (again, pointing to MinIO)
        .config("spark.sql.catalog.iceberg.s3.endpoint", "http://10.0.0.80:9100")
        .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
        .config("spark.sql.catalog.iceberg.s3.access-key-id", "minioadmin")
        .config("spark.sql.catalog.iceberg.s3.secret-access-key", "minioadmin")
        .config(
            "spark.jars",
            "/opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.6.0.jar,"
            "/opt/spark/jars/iceberg-aws-bundle-1.6.0.jar")
        .getOrCreate()
    )

26/02/19 06:33:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/19 06:33:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [39]:
v_sql = """
CREATE OR REPLACE TEMP VIEW did_pretrends AS
WITH exposures AS (
  SELECT
    experiment_id, user_id, variant, exposure_ts
  FROM iceberg.exp.exposures
  WHERE experiment_id = 'exp_big_exp_run'
),
outcomes AS (
  SELECT
    experiment_id, user_id, metric_name, value, ts AS outcome_ts
  FROM iceberg.exp.outcomes
  WHERE experiment_id = 'exp_big_exp_run'
    AND metric_name = 'pre_revenue'
)
SELECT
  e.experiment_id,
  e.user_id,
  e.variant,
  o.metric_name,
  o.value,
  DATEDIFF(o.outcome_ts, e.exposure_ts) AS k_days
FROM exposures e
JOIN outcomes o
  ON e.experiment_id = o.experiment_id
 AND e.user_id       = o.user_id
WHERE DATEDIFF(o.outcome_ts, e.exposure_ts) < 0;


"""
spark.sql(v_sql).show(truncate=False)

++
||
++
++



In [40]:
v_sql = """
with t1 as (
SELECT
  k_days,
  variant,
  AVG(value) AS mean_val
FROM did_pretrends
GROUP BY k_days, variant
)

select k_days, 
(max(case when variant = 'treatment' then mean_val else null end) -
max(case when variant = 'control' then mean_val else null end)) as delta

from t1
group by k_days
having max(case when variant = 'treatment' then mean_val else null end)  is not null
and max(case when variant = 'control' then mean_val else null end) is not null


"""

df_trends = spark.sql(v_sql).toPandas()

26/02/19 07:05:20 WARN DataSourceV2Strategy: Can't translate true to source filter, unsupported expression
                                                                                

In [38]:
import statsmodels.formula.api as smf

model = smf.ols(
    "delta ~ k_days",
    data=df_trends
).fit(cov_type="HC3")
print(model.summary())




ValueError: zero-size array to reduction operation maximum which has no identity

In [41]:
import statsmodels.formula.api as smf

model = smf.ols(
    "delta ~ k_days",
    data=df_trends
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  delta   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.055
Method:                 Least Squares   F-statistic:                     3.306
Date:                Thu, 19 Feb 2026   Prob (F-statistic):             0.0767
Time:                        07:05:30   Log-Likelihood:                 74.261
No. Observations:                  41   AIC:                            -144.5
Df Residuals:                      39   BIC:                            -141.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0144      0.013     -1.116      0.2

In [26]:
import statsmodels.formula.api as smf

model = smf.ols(
    "delta ~ k_days",
    data=df_trends
).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                  delta   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.055
Method:                 Least Squares   F-statistic:                     3.306
Date:                Thu, 19 Feb 2026   Prob (F-statistic):             0.0767
Time:                        07:00:27   Log-Likelihood:                 74.261
No. Observations:                  41   AIC:                            -144.5
Df Residuals:                      39   BIC:                            -141.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0144      0.013     -1.116      0.2