# Shap Values

In [1]:
!which python

/Users/vini/Dev-Files/Poetry/virtualenvs/pyspark-ds-toolbox-H0pw_EKR-py3.8/bin/python


In [3]:
# #https://github.com/manuel-calzolari/shapicant
import pandas as pd
# from pyspark.sql.window import Window
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# from pyspark.sql.types import FloatType, StructField, StructType, StringType
# from pyspark.ml.linalg import VectorUDT

import pyspark.ml.feature as FF
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.regression import GBTRegressor


In [5]:
from pyspark_ds_toolbox.ml.data_prep import get_features_vector
from pyspark_ds_toolbox.ml.eval import get_p1, estimate_individual_shapley_values

In [6]:
spark = SparkSession.builder\
                .appName('Ml-Pipes') \
                .master('local[1]') \
                .config('spark.executor.memory', '3G') \
                .config('spark.driver.memory', '3G') \
                .config('spark.memory.offHeap.enabled', 'true') \
                .config('spark.memory.offHeap.size', '3G') \
                .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

21/12/04 14:37:57 WARN Utils: Your hostname, matrix.local resolves to a loopback address: 127.0.0.1; using 172.20.10.3 instead (on interface en0)
21/12/04 14:37:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/12/04 14:37:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
def read_data(file): 
    return pd.read_stata("https://raw.github.com/scunning1975/mixtape/master/" + file)

df = read_data('nsw_mixtape.dta')
df = pd.concat((df, read_data('cps_mixtape.dta')))
df.reset_index(level=0, inplace=True)

df = spark.createDataFrame(df.drop(columns=['data_id']))\
    .withColumn('age2', F.col('age')**2)\
    .withColumn('age3', F.col('age')**3)\
    .withColumn('educ2', F.col('educ')**2)\
    .withColumn('educ_re74', F.col('educ')*F.col('re74'))\
    .withColumn('u74', F.when(F.col('re74')==0, 1).otherwise(0))\
    .withColumn('u75', F.when(F.col('re75')==0, 1).otherwise(0))

features=['age', 'age2', 'age3', 'educ', 'educ2', 'marr', 'nodegree', 'black', 'hisp', 're74', 're75', 'u74', 'u75', 'educ_re74']
df_assembled = get_features_vector(df=df, num_features=features)

# assembler = FF.VectorAssembler(inputCols=features, outputCol='features')
# pipeline = Pipeline(stages = [assembler])
# df_assembled = pipeline.fit(df).transform(df)

In [16]:
train_size=0.8
train, test = df_assembled.randomSplit([train_size, (1-train_size)], seed=12345)

row_of_interest = df_assembled.filter(F.col('index')==3).first()

## Using in a Regression Problem

In [17]:
# Regression
model_regressor = GBTRegressor(labelCol='re78')
p_regression = Pipeline(stages=[model_regressor]).fit(train)

In [19]:
sdf_shap_regression = estimate_individual_shapley_values(
    spark=spark,
    df = df_assembled,
    id_col='index',
    model = p_regression,
    problem_type='regression',
    row_of_interest = row_of_interest,
    feature_names = features,
    features_col='features',
    print_shap_values=False
)

sdf_shap_regression.show(5)



+-----+-------+------------------+
|index|feature|              shap|
+-----+-------+------------------+
|    3|    age| 935.1710310392384|
|    3|   age2|131.16309833128466|
|    3|   age3|122.37308194603968|
|    3|   educ| -450.768881698199|
|    3|  educ2|114.58859573810376|
+-----+-------+------------------+
only showing top 5 rows



In [20]:

print('Estimated re78 from shap values decomposition:')
print(df_assembled.select('re78').toPandas().re78.mean() + sdf_shap_regression.select(F.sum('shap')).collect()[0][0])

print('Observed re78:')
v = df_assembled.filter(F.col('index')==3).select('re78').collect()[0][0]
print(v)

Estimated re78 from shap values decomposition:
10959.500462485266
Observed re78:
7506.14599609375


## Using in a Classification Problem

In [22]:
# Regression
model_classifier = GBTClassifier(labelCol='treat')
p_classification = Pipeline(stages=[model_classifier]).fit(train)

In [23]:
sdf_shap_classification = estimate_individual_shapley_values(
    spark=spark,
    df = df_assembled,
    id_col='index',
    model = p_classification,
    problem_type='classification',
    row_of_interest = row_of_interest,
    feature_names = features,
    features_col='features',
    print_shap_values=False
)

sdf_shap_classification.show(5)



+-----+-------+--------------------+
|index|feature|                shap|
+-----+-------+--------------------+
|    3|    age|-0.00763664486855...|
|    3|   age2|-0.00403245308362...|
|    3|   age3|-0.00430382868062...|
|    3|   educ|0.006123442326710661|
|    3|  educ2|-0.00407094088591659|
+-----+-------+--------------------+
only showing top 5 rows



In [25]:

print('Estimated treat prob from shap values decomposition:')
# print(df_assembled.select('p1').toPandas().p1.mean() + sdf_shap_classification.select(F.sum('shap')).collect()[0][0])
sdf_shap_classification.select(F.sum('shap')).collect()[0][0]

# print('Observed re78:')
# v = df_assembled.filter(F.col('index')==3).select('re78').collect()[0][0]
# print(v)

Estimated treat prob from shap values decomposition:


0.28614638016471244

In [10]:
df_predicted = df_assembled.withColumn('p1', get_p1(F.col('probability')))

print(df_predicted.select('p1').toPandas().p1.mean() + a.select(F.sum('shap')).collect()[0][0])

v = df_predicted.filter(F.col('index')==3).select('p1').collect()[0][0]
print(v)

AnalysisException: cannot resolve '`probability`' given input columns: [age, age2, age3, black, educ, educ2, educ_re74, features, hisp, index, marr, nodegree, re74, re75, re78, treat, u74, u75];
'Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, features#138, <lambda>('probability) AS p1#15227]
+- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, features#138]
   +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, num#118, UDF(struct(num, num#118)) AS features#138]
      +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, UDF(struct(age, age#2, age2, age2#22, age3, age3#35, educ, educ#3, educ2, educ2#49, marr, marr#6, nodegree, nodegree#7, black, black#4, hisp, hisp#5, re74, re74#8, re75, re75#9, u74_double_VectorAssembler_870952b30f34, cast(u74#80 as double), ... 4 more fields)) AS num#118]
         +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, CASE WHEN (re75#9 = cast(0 as double)) THEN 1 ELSE 0 END AS u75#97]
            +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, CASE WHEN (re74#8 = cast(0 as double)) THEN 1 ELSE 0 END AS u74#80]
               +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, (educ#3 * re74#8) AS educ_re74#64]
                  +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, POWER(educ#3, cast(2 as double)) AS educ2#49]
                     +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, POWER(age#2, cast(3 as double)) AS age3#35]
                        +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, POWER(age#2, cast(2 as double)) AS age2#22]
                           +- LogicalRDD [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10], false


In [None]:

# Classification
model_classifier = GBTClassifier(labelCol='treat')
p_classifier = Pipeline(stages=[model_classifier]).fit(train)

AnalysisException: cannot resolve '`probability`' given input columns: [age, age2, age3, black, educ, educ2, educ_re74, features, hisp, index, marr, nodegree, re74, re75, re78, treat, u74, u75];
'Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, features#138, <lambda>('probability) AS p1#15227]
+- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, features#138]
   +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, num#118, UDF(struct(num, num#118)) AS features#138]
      +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, u75#97, UDF(struct(age, age#2, age2, age2#22, age3, age3#35, educ, educ#3, educ2, educ2#49, marr, marr#6, nodegree, nodegree#7, black, black#4, hisp, hisp#5, re74, re74#8, re75, re75#9, u74_double_VectorAssembler_870952b30f34, cast(u74#80 as double), ... 4 more fields)) AS num#118]
         +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, u74#80, CASE WHEN (re75#9 = cast(0 as double)) THEN 1 ELSE 0 END AS u75#97]
            +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, educ_re74#64, CASE WHEN (re74#8 = cast(0 as double)) THEN 1 ELSE 0 END AS u74#80]
               +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, educ2#49, (educ#3 * re74#8) AS educ_re74#64]
                  +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, age3#35, POWER(educ#3, cast(2 as double)) AS educ2#49]
                     +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, age2#22, POWER(age#2, cast(3 as double)) AS age3#35]
                        +- Project [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10, POWER(age#2, cast(2 as double)) AS age2#22]
                           +- LogicalRDD [index#0L, treat#1, age#2, educ#3, black#4, hisp#5, marr#6, nodegree#7, re74#8, re75#9, re78#10], false


In [20]:
df_predicted.select('re78').toPandas().re78.mean() + a.select(F.sum('shap')).collect()[0][0]

18014.104953822396

In [None]:
print(df_predicted.select('probability').toPandas().probability.mean() + a.select(F.sum('shap')).collect()[0][0])

print(f'{v}')



0.04829068469926582
0.04364077374339104




In [None]:
print(df_predicted.select('probability').toPandas().probability.mean() + a.select(F.sum('shap')).collect()[0][0])

print(f'{v}')



0.052278824448785434
0.043646443635225296


