# Propensity Score Matching for Estimating the Average Treatment Effect (ATE)

For a full explanation of propensity score methods, see the [Matching and Subclassification chapter of *Causal Inference: The Mixtape*](https://mixtape.scunning.com/matching-and-subclassification.html#propensity-score-methods).

## Session Setup

In [5]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
from pyspark_ds_toolbox.ml.data_prep.features_vector import get_features_vector

import pyspark_ds_toolbox.causal_inference.ps_matching as ps

In [2]:
# Build (or reuse, via getOrCreate) a local Spark session for this notebook.
# 'local[1]' runs Spark in-process with a single worker thread.
spark = (
    SparkSession.builder
    .appName('Spark-Toolbox')
    .master('local[1]')
    .config('spark.executor.memory', '3G')
    .config('spark.driver.memory', '3G')
    .config('spark.memory.offHeap.enabled', 'true')
    .config('spark.memory.offHeap.size', '3G')
    .getOrCreate()
)

# Keep the notebook output readable: only ERROR-level Spark logs are shown.
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/18 14:26:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/18 14:26:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Reading the Dataset

In [6]:
# Load the raw data with pandas. `reset_index(level=0)` turns the row index
# into an `index` column, which is used later as the unit identifier; the
# `data_id` column is dropped before handing the frame to Spark.
# The pandas frame gets its own name and is never mutated in place, so this
# cell can be re-run safely and `df` always refers to the Spark DataFrame.
df_pandas = (
    pd.read_csv('../tests/data/df_causal_inference.csv')
    .reset_index(level=0)
    .drop(columns=['data_id'])
)

# Convert to Spark and derive the extra covariates used by the propensity model:
#   age2 / age3 / educ2 : polynomial terms of age and educ
#   educ_re74           : interaction between educ and re74
#   u74 / u75           : 1 when re74 / re75 equals zero, 0 otherwise
df = (
    spark.createDataFrame(df_pandas)
    .withColumn('age2', F.col('age')**2)
    .withColumn('age3', F.col('age')**3)
    .withColumn('educ2', F.col('educ')**2)
    .withColumn('educ_re74', F.col('educ')*F.col('re74'))
    .withColumn('u74', F.when(F.col('re74')==0, 1).otherwise(0))
    .withColumn('u75', F.when(F.col('re75')==0, 1).otherwise(0))
)

# Covariates passed to the propensity score model.
features = [
    'age', 'age2', 'age3',
    'educ', 'educ2',
    'marr', 'nodegree', 'black', 'hisp',
    're74', 're75', 'u74', 'u75',
    'educ_re74',
]

# Assemble the covariates into a single `features` vector column.
df_assembled = get_features_vector(df=df, num_features=features)
df_assembled.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+-----+----+----+-----+----+----+--------+----+----+--------+------+-------+-----+---------+---+---+--------------------+
|index|treat| age|educ|black|hisp|marr|nodegree|re74|re75|    re78|  age2|   age3|educ2|educ_re74|u74|u75|            features|
+-----+-----+----+----+-----+----+----+--------+----+----+--------+------+-------+-----+---------+---+---+--------------------+
|    0|  1.0|37.0|11.0|  1.0| 0.0| 1.0|     1.0| 0.0| 0.0|9930.046|1369.0|50653.0|121.0|      0.0|  1|  1|[37.0,1369.0,5065...|
|    1|  1.0|22.0| 9.0|  0.0| 1.0| 0.0|     1.0| 0.0| 0.0|3595.894| 484.0|10648.0| 81.0|      0.0|  1|  1|[22.0,484.0,10648...|
|    2|  1.0|30.0|12.0|  1.0| 0.0| 0.0|     0.0| 0.0| 0.0|24909.45| 900.0|27000.0|144.0|      0.0|  1|  1|(14,[0,1,2,3,4,7,...|
|    3|  1.0|27.0|11.0|  1.0| 0.0| 0.0|     1.0| 0.0| 0.0|7506.146| 729.0|19683.0|121.0|      0.0|  1|  1|[27.0,729.0,19683...|
|    4|  1.0|33.0| 8.0|  1.0| 0.0| 0.0|     1.0| 0.0| 0.0|289.7899|1089.0|35937.0| 64.0|      0.0|  1|  

                                                                                

## Estimating the Propensity Scores

In [7]:
# Estimate the propensity scores from the assembled feature vector.
# Two objects come back:
#   df_ps   : one row per unit with its id, propensity score `ps`,
#             treatment flag and outcome
#   df_eval : evaluation metrics for each candidate model that was fitted
# NOTE(review): train_size=0.8 presumably is the fraction of rows used for
# training, with the rest held out for evaluation -- confirm in the toolbox docs.
df_ps, df_eval = ps.compute_propensity_score(
    df=df_assembled,
    id='index',
    treat='treat',
    y='re78',
    featuresCol='features',
    train_size=0.8,
)

logistic_regression: Starting
logistic_regression: Fitting Pipeline


                                                                                

logistic_regression: Making Predictions on test data


                                                                                

decision_tree: Starting
decision_tree: Fitting Pipeline


                                                                                

decision_tree: Making Predictions on test data
random_forest: Starting
random_forest: Fitting Pipeline
random_forest: Making Predictions on test data
gradient_boosting: Starting
gradient_boosting: Fitting Pipeline
gradient_boosting: Making Predictions on test data


In [8]:
# Preview the first rows of the propensity score table.
df_ps.show(n=5)

+-----+-----------+-----+--------+
|index|         ps|treat|    re78|
+-----+-----------+-----+--------+
|    0|  0.1703515|  1.0|9930.046|
|    1|0.121535905|  1.0|3595.894|
|    2| 0.36807796|  1.0|24909.45|
|    3| 0.43302533|  1.0|7506.146|
|    4|  0.5246924|  1.0|289.7899|
+-----+-----------+-----+--------+
only showing top 5 rows



In [10]:
# Compare the candidate models on their evaluation metrics.
df_eval.head(n=5)

Unnamed: 0,model,ks_max,at_decile,precision,recall
0,logistic_regression,0.7954507,1.0,0.072072,0.888889
0,decision_tree,1.110223e-16,10.0,0.008098,1.0
0,random_forest,0.706078,3.0,0.027027,1.0
0,gradient_boosting,0.8327902,1.0,0.075075,0.925926


### Calculating the ATE

In [12]:
# Estimate the causal effect of `treat` on `re78` from the propensity scores
# computed above; the result is a single number.
ate = ps.estimate_causal_effect(
    df_ps=df_ps,
    treat='treat',
    y='re78',
    ps='ps',
)

ate

                                                                                

1718.586581036139