# [Wip] Cate Example

The objective os this notebook is to build a causal inference model based on the paper by Athey, Imbens 2015 using dataset from cunning 2021.

Specifically we want to:
1. Explain the dataset;
2. Compute the Real ATE;
3. Compute The propensity score & transform Y (used in both next two stages);
4. Estimate the ATE from biased data;
5. Estimate the Cate from biased data. 

In [12]:
!which python

/Users/vini/Dev-Files/Poetry/virtualenvs/ds-toolbox-HgP-t3cq-py3.8/bin/python


In [13]:
# Session Setup
import pandas as pd
from pyspark.sql import functions as F
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, DecisionTreeRegressor, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import FloatType

from ds_toolbox.utils import start_local_spark

pd.set_option('display.max_columns', 500)
spark = start_local_spark(max_mem=1, n_cores=1)

# Functions
def read_data(file): 
    return pd.read_stata("https://raw.github.com/scunning1975/mixtape/master/" + file)

get_p1 = F.udf(lambda value: value[1].item(), FloatType())

## 1) The dataset

Describe the datasets here

In [14]:
# DataFrame Experimental
dfs_experimental = spark.createDataFrame(read_data('nsw_mixtape.dta'))

# DataFrame with Bias selection
dfs_biased = dfs_experimental.union(spark.createDataFrame(read_data('cps_mixtape.dta')))\
    .withColumn('age2', F.col('age')**2)\
    .withColumn('age3', F.col('age')**3)\
    .withColumn('educ2', F.col('educ')**2)\
    .withColumn('educ_re74', F.col('educ')*F.col('re74'))\
    .withColumn('u74', F.when(F.col('re74')==0, 1).otherwise(0))\
    .withColumn('u75', F.when(F.col('re75')==0, 1).otherwise(0))


dfs_biased.show(5)

+--------------------+-----+----+----+-----+----+----+--------+----+----+-----------------+------+-------+-----+---------+---+---+
|             data_id|treat| age|educ|black|hisp|marr|nodegree|re74|re75|             re78|  age2|   age3|educ2|educ_re74|u74|u75|
+--------------------+-----+----+----+-----+----+----+--------+----+----+-----------------+------+-------+-----+---------+---+---+
|Dehejia-Wahba Sample|  1.0|37.0|11.0|  1.0| 0.0| 1.0|     1.0| 0.0| 0.0|  9930.0458984375|1369.0|50653.0|121.0|      0.0|  1|  1|
|Dehejia-Wahba Sample|  1.0|22.0| 9.0|  0.0| 1.0| 0.0|     1.0| 0.0| 0.0| 3595.89404296875| 484.0|10648.0| 81.0|      0.0|  1|  1|
|Dehejia-Wahba Sample|  1.0|30.0|12.0|  1.0| 0.0| 0.0|     0.0| 0.0| 0.0|   24909.44921875| 900.0|27000.0|144.0|      0.0|  1|  1|
|Dehejia-Wahba Sample|  1.0|27.0|11.0|  1.0| 0.0| 0.0|     1.0| 0.0| 0.0| 7506.14599609375| 729.0|19683.0|121.0|      0.0|  1|  1|
|Dehejia-Wahba Sample|  1.0|33.0| 8.0|  1.0| 0.0| 0.0|     1.0| 0.0| 0.0|289.789886

## 2) The Real ATE
Since this was as random experiment we can easily compute the real average treatment effect to use as a benchmark.

In [18]:
mean0 = dfs_experimental.filter(F.col('treat')==0).select(F.avg('re78')).collect()[0][0]
mean1 = dfs_experimental.filter(F.col('treat')==1).select(F.avg('re78')).collect()[0][0]
print(f'The Real ATE = {round(mean1, 2)} - {round(mean0, 2)} = {round(mean1-mean0, 2)}')

print('---------')
mean0 = dfs_biased.filter(F.col('treat')==0).select(F.avg('re78')).collect()[0][0]
mean1 = dfs_biased.filter(F.col('treat')==1).select(F.avg('re78')).collect()[0][0]
print(f'The Bias ATE = {round(mean1, 2)} - {round(mean0, 2)} = {round(mean1-mean0, 2)}')

The Real ATE = 6349.14 - 4554.8 = 1794.34
---------
The Bias ATE = 6349.14 - 14682.01 = -8332.87


## 3) Computing Propensity Score & Transform Y

In [19]:
features=['age', 'age2', 'age3', 'educ', 'educ2', 'marr', 'nodegree', 'black', 'hisp', 're74', 're75', 'u74', 'u75', 'educ_re74']
assembler = VectorAssembler(inputCols=features, outputCol='features')
pipeline = Pipeline(stages = [assembler, LogisticRegression(labelCol='treat', fitIntercept=True)])
fitted_classifier = pipeline.fit(dfs_biased)


dfs_predicted_with_ps_th = fitted_classifier.transform(dfs_biased)\
    .withColumn('ps', get_p1(F.col('probability')))\
    .withColumn(
        'th',
        F.col('re78')*(F.col('treat')*F.col('ps'))/(F.col('ps')*(1-F.col('ps')))
    )

dfs_predicted_with_ps_th.sample(fraction=0.05).orderBy(F.col('treat').desc()).toPandas().drop(columns=['features']).head()

Unnamed: 0,data_id,treat,age,educ,black,hisp,marr,nodegree,re74,re75,re78,age2,age3,educ2,educ_re74,u74,u75,rawPrediction,probability,prediction,ps,th
0,Dehejia-Wahba Sample,1.0,23.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,529.0,12167.0,121.0,0.0,1,1,"[0.6276138681249268, -0.6276138681249268]","[0.6519482173540334, 0.3480517826459666]",0.0,0.348052,0.0
1,Dehejia-Wahba Sample,1.0,20.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0,3972.540039,400.0,8000.0,121.0,0.0,1,1,"[0.9882236807497957, -0.9882236807497957]","[0.7287369232969796, 0.2712630767030204]",0.0,0.271263,5451.267532
2,Dehejia-Wahba Sample,1.0,25.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0,485.229797,625.0,15625.0,121.0,0.0,1,1,"[0.4402154675084553, -0.4402154675084553]","[0.6083103711428268, 0.3916896288571732]",0.0,0.39169,797.668169
3,Dehejia-Wahba Sample,1.0,43.0,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1849.0,79507.0,81.0,0.0,1,1,"[0.8906931022971243, -0.8906931022971243]","[0.7090331838382283, 0.29096681616177167]",0.0,0.290967,0.0
4,Dehejia-Wahba Sample,1.0,19.0,10.0,1.0,0.0,0.0,1.0,0.0,385.274109,8124.714844,361.0,6859.0,100.0,0.0,1,0,"[0.719669028722274, -0.719669028722274]","[0.6725341307727755, 0.32746586922722454]",0.0,0.327466,12080.747442


## 4) Estimating the CATE and ATE from Biased Data

Steps
* a) Compute the propensity score os treatment;
* b) Transform the Y variable into Y*;
* c) Predict Y* (normal prediction procedures);
* d) Evaluate step c.

In [None]:
target = 'th'
train, test = dfs_predicted_with_ps_th.select('features', 'treat', 're78', 'th').randomSplit([0.8, 0.2], seed=12345)

spark_classifiers = {
    'logistic_regression': LinearRegression(labelCol=target),
    'decision_tree': DecisionTreeRegressor(labelCol=target),
    'random_forest': RandomForestRegressor(labelCol=target),
    'gradient_boosting': GBTRegressor(labelCol=target)
}


df_evaluate = pd.DataFrame()

for classifier_name, classifier in spark_classifiers.items():
    print(f'{classifier_name}: Starting')
    pipeline = Pipeline(stages = [classifier])
    # Fit no Modelo de Predict nos test
    print(f'{classifier_name}: Fitting Pipeline')
    fitted_classifier = pipeline.fit(train)
    print(f'{classifier_name}: Making Predictions on test data')
    prediction_on_test = fitted_classifier.transform(test)

    df_temp = pd.DataFrame({
        'model':[classifier_name],
        'msqe': [prediction_on_test.withColumn('sqe', -(F.col('th')-F.col('prediction'))**2).select(F.sum('sqe')).collect()[0][0]],
        'ate': [prediction_on_test.filter(F.col('treat')==1).select(F.avg('prediction')).collect()[0][0]-prediction_on_test.filter(F.col('treat')==0).select(F.avg('prediction')).collect()[0][0]]
    })
    df_evaluate = df_evaluate.append(df_temp)
    

In [None]:
df_evaluate.sort_values('msqe', ascending=False)

## Rascunho

In [None]:
mean0 = dfs_biased.filter(F.col('treat')==0).select(F.avg('re78')).collect()[0][0]
mean1 = dfs_biased.filter(F.col('treat')==1).select(F.avg('re78')).collect()[0][0]

print(f'ATE = {round(mean1, 2)} - {round(mean0, 2)} = {round(mean1-mean0, 2)}')