In [None]:
#!/usr/bin/env python
"""xgboost shap and lime"""

In [None]:
# parameter
# Every artifact name is derived from MODEL and N_SAMPLES. The %%bash cell
# below redefines the same values for the shell; keep both in sync by hand.
MODEL = "onTravelV6C"
N_SAMPLES = 500
TRAIN_DATA_FILE = "train_" + MODEL + ".txt"
FEATURE_MAP_FILE = "feature_map_" + MODEL + ".json"
MODEL_FILE = MODEL + ".bin"
# SAMPLE_FILE used to be assigned twice; the first value
# ("sample_train_<MODEL>.txt") was overwritten before any use, so only the
# effective assignment is kept.
SAMPLE_FILE = "sample_" + str(N_SAMPLES) + "_" + TRAIN_DATA_FILE

In [None]:
%%bash
# prepare
# Fetch the training data, the feature map and the model into the working
# directory, then draw a random sample of the training data. Every step is
# skipped when its output file already exists.

# parameter (mirrors the Python parameter cell; keep both in sync)
MODEL="onTravelV6C"
N_SAMPLES=500
TRAIN_DATA_FILE="train_${MODEL}.txt"
FEATURE_MAP_FILE="feature_map_${MODEL}.json"
MODEL_FILE="${MODEL}.bin"
SAMPLE_FILE="sample_${N_SAMPLES}_${TRAIN_DATA_FILE}"

# train data file
if [[ ! -f "${TRAIN_DATA_FILE}" ]]; then
    echo "Train Data File Not Exist"
    echo "Copy File Begin"
    cp "/mfw_data/algo/wanglei/spark_offline/train_data/onTravel/${TRAIN_DATA_FILE}" ./
    echo "Copy File End"
fi

# feature map data file
if [[ ! -f "${FEATURE_MAP_FILE}" ]]; then
    echo "Feature Map File Not Exist"
    echo "Get File Begin"
    # Write to a temporary file first. Redirecting straight into the target
    # creates it even when hadoop fails, and the -f test above would then skip
    # the download on every later run, leaving an empty feature map behind.
    if hadoop fs -text "/user/wanglei3/featureMap/onTravel/${MODEL}/part-00000.snappy" > "${FEATURE_MAP_FILE}.tmp"; then
        mv "${FEATURE_MAP_FILE}.tmp" "${FEATURE_MAP_FILE}"
    else
        rm -f "${FEATURE_MAP_FILE}.tmp"
        echo "Get File Failed" >&2
    fi
    echo "Get File End"
fi

# xgboost model file
if [[ ! -f "${MODEL_FILE}" ]]; then
    echo "Model File Not Exist"
    echo "Copy File Begin"
    # Copy directly to the target name instead of copying and then renaming.
    cp "/opt/tomcat/webapps/model/${MODEL}" "./${MODEL_FILE}"
    echo "Copy File End"
fi

# random sampling
if [[ ! -f "${SAMPLE_FILE}" ]]; then
    echo "Sample File Not Exist"
    echo "Sampling Begin"
    # shuf -n emits at most N_SAMPLES lines (fewer if the input is shorter).
    shuf -n "${N_SAMPLES}" "${TRAIN_DATA_FILE}" -o "${SAMPLE_FILE}"
    echo "Sampling End"
fi

ls

In [None]:
# ipython core option
# With ast_node_interactivity set to "all", IPython displays the value of every
# top-level expression statement in a cell instead of only the last one. Later
# cells rely on this to show a DataFrame in the middle of a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# package
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn")
import shap
import lime
import json
import re

In [None]:
# feature map
# Load the feature-map entries and collect the feature name of every entry
# except the first one, which is skipped (presumably the slot that the
# all-zero "repair" column stands in for later - TODO confirm).
with open(FEATURE_MAP_FILE) as fp:
    feature_map = json.load(fp)
cols = []
for position, fm in enumerate(feature_map):
    if position == 0:
        continue
    print(fm)
    # The name is the text between the first and the last tab of the entry.
    matched = re.search(r"\t(.*)\t", fm)
    if matched is None:
        # Fail with the offending entry instead of an opaque AttributeError.
        raise ValueError("malformed feature map entry (expected two tabs): %r" % (fm,))
    cols.append(matched.group(1))

In [None]:
# load libsvm format file
# X is the sparse feature matrix (one column per name in cols), y the labels.
X, y = load_svmlight_file(SAMPLE_FILE, n_features=len(cols))
# Quick sanity check on the first row: its dense shape and its label.
first_row = X[0]
print(first_row.todense().shape)
print(y[0])

In [None]:
# create dataframe
# Dense frame laid out as: "repair" (all zeros), the feature columns, "label".
df = pd.DataFrame(X.toarray(), columns=cols)
# Size the zero column from the frame itself: the sample file can hold fewer
# than N_SAMPLES rows (shuf -n returns at most that many lines), in which case
# np.zeros(N_SAMPLES) would fail with a length mismatch.
df.insert(0, "repair", 0.0)
df["label"] = y
df.head()

In [None]:
# Truthy: fit a new booster on the sampled data; falsy: load the saved booster from MODEL_FILE.
IS_TRAIN = False

In [None]:
# train xgboost model
# When IS_TRAIN is truthy a booster is fitted on the sampled frame and stored in
# `bst_xgb`; otherwise the saved booster is loaded from MODEL_FILE into `bst`.
# Only one of the two names exists after this cell runs.
if IS_TRAIN:
# --- disabled alternative 1: scikit-learn gradient boosting ---
#     from sklearn.ensemble import GradientBoostingClassifier
#     param = {
#         "loss": "deviance",
#         "learning_rate": 0.1,
#         "max_depth": 7,
#         "subsample": 0.8,
#         "n_estimators": 300
#     }
#     sk_gbt = GradientBoostingClassifier(**param)
#     sk_gbt.fit(df[["repair"]+cols], df["label"])

# --- disabled alternative 2: xgboost scikit-learn wrapper ---
#     param = {
#         "objective": "binary:logistic",
#         "learning_rate": 0.1,
#         "max_depth": 7,
#         "min_child_weight": 1,
#         "gamma": 0,
#         "subsample": 0.8,
#         "colsample_bytree": 0.8,
#         "scale_pos_weight": 1,
#         "n_estimators": 300,
#     }
#     sk_xgb = xgb.XGBClassifier(**param)
#     sk_xgb.fit(df[["repair"]+cols], df["label"])

    # Active path: native xgboost training API.
    # NOTE(review): newer xgboost releases replaced "silent" with "verbosity" -
    # confirm against the installed version before changing it.
    param = {
        "objective": "binary:logistic",
        "eta": 0.1,
        "max_depth": 7,
        "min_child_weight": 1,
        "gamma": 0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "scale_pos_weight": 1,
        "silent": True
    }
    num_boost_round = 300
    # Features are the "repair" column followed by the feature-map columns.
    dtrain = xgb.DMatrix(df[["repair"]+cols], label=df["label"])
    bst_xgb = xgb.train(param, dtrain, num_boost_round=num_boost_round)
else:
    bst = xgb.Booster(model_file=MODEL_FILE)

In [None]:
# Bind `model` to whichever booster the training cell produced. That cell
# branches on `if IS_TRAIN:`, so the same truthiness test is used here;
# `IS_TRAIN == True` would pick the wrong (undefined) name for values such as
# a non-empty string.
model = bst_xgb if IS_TRAIN else bst

In [None]:
# margin or probability
# Selects the output space used for both the SHAP explainer and the "pred"
# column: "margin" (predict with output_margin=True) or "probability".
MODEL_OUTPUT = "probability"

In [None]:
# shap
# Build the SHAP tree explainer for the output space chosen by MODEL_OUTPUT.
feature_frame = df[["repair"]+cols]
if MODEL_OUTPUT == "margin":
    # margin explanation
    shap_explainer = shap.TreeExplainer(model)
elif MODEL_OUTPUT == "probability":
    # probability explanation: needs a background dataset, capped in size
    BACKGROUND_DATASET_SIZE = 1000
    if len(feature_frame) <= BACKGROUND_DATASET_SIZE:
        background_dataset = feature_frame
    else:
        background_dataset = feature_frame.sample(BACKGROUND_DATASET_SIZE)
    # NOTE(review): feature_dependence="independent" is the spelling used by
    # older shap releases; newer ones expect feature_perturbation - confirm
    # against the installed version before changing it.
    shap_explainer = shap.TreeExplainer(model, background_dataset.values, model_output="probability", feature_dependence="independent")
else:
    # Previously an unknown value left shap_explainer undefined (or stale from
    # an earlier run) and the failure only surfaced in a later cell.
    raise ValueError("MODEL_OUTPUT must be 'margin' or 'probability', got %r" % (MODEL_OUTPUT,))


In [None]:
# SHAP values for every row of the feature frame, plus the explainer's
# expected value, which the later cells use as the base of each explanation.
shap_values = shap_explainer.shap_values(df[["repair"]+cols])
y_base = shap_explainer.expected_value
print("shap_values: ", shap_values.shape)
print("y_base: ", y_base)

In [None]:
# Store the model output for every row in df["pred"], in the same output space
# (margin or probability) that the SHAP explainer was built for.
features_dmatrix = xgb.DMatrix(df[["repair"]+cols], label=df["label"])
if MODEL_OUTPUT == "margin":
    # margin explanation
    df["pred"] = model.predict(features_dmatrix, output_margin=True)
if MODEL_OUTPUT == "probability":
    # probability explanation
    df["pred"] = model.predict(features_dmatrix, output_margin=False)
print("pred mean: ", df["pred"].mean())
df.head()

In [None]:
# Force plot over all samples. The SHAP JavaScript bundle must be loaded before
# the first force plot; it used to be loaded only in a later cell, so on a
# fresh Restart & Run All this plot had no JS to render with.
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values, df[["repair"]+cols])

In [None]:
# Summary plot of the SHAP values as a bar chart (plot_type="bar").
shap.summary_plot(shap_values, df[["repair"]+cols], plot_type="bar")

In [None]:
# Summary plot of the SHAP values with the library's default plot type.
shap.summary_plot(shap_values, df[["repair"]+cols])

In [None]:
# SHAP interaction values are computed only when the explainer was built for
# the margin output; in probability mode this cell does nothing.
if MODEL_OUTPUT == "margin":
    shap_interaction_values = shap_explainer.shap_interaction_values(df[["repair"]+cols])
    # max_display=4 limits how many features the summary plot shows.
    shap.summary_plot(shap_interaction_values, df[["repair"]+cols], max_display=4)

In [None]:
# j = np.random.randint(N_SAMPLES)

In [None]:
# Pick one random sample predicted as negative and list its SHAP values.
# In margin space the decision boundary is 0 rather than 0.5 (sigmoid(0) == 0.5;
# assumes a binary logistic model - TODO confirm for the loaded model).
threshold = 0.0 if MODEL_OUTPUT == "margin" else 0.5
# Use positional row numbers so that .iloc and shap_values stay aligned even if
# df's index is not 0..n-1.
negative_rows = np.flatnonzero((df["pred"] <= threshold).values)
if negative_rows.size == 0:
    # np.random.choice on an empty set fails with an unhelpful message.
    raise ValueError("no sample with pred <= %s to explain" % threshold)
i = np.random.choice(negative_rows)
print("negative sample")
player_explainer = pd.DataFrame({
    'feature': ["repair"]+cols,
    'feature_value': df[["repair"]+cols].iloc[i].values,
    'shap_value': shap_values[i],
})
player_explainer
# The base value plus the sample's SHAP values should reproduce its prediction.
print("y_base + sum_of_shap_values: %.2f" % (y_base + player_explainer["shap_value"].sum()))
print("y_pred: %.2f" % (df["pred"].iloc[i]))

In [None]:
# Load the SHAP JavaScript bundle, then draw the force plot for the sample at
# position i chosen in the previous cell.
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values[i], df[["repair"]+cols].iloc[i])

In [None]:

# Pick one random sample predicted as positive and list its SHAP values.
# In margin space the decision boundary is 0 rather than 0.5 (sigmoid(0) == 0.5;
# assumes a binary logistic model - TODO confirm for the loaded model).
threshold = 0.0 if MODEL_OUTPUT == "margin" else 0.5
# Use positional row numbers so that .iloc and shap_values stay aligned even if
# df's index is not 0..n-1.
positive_rows = np.flatnonzero((df["pred"] >= threshold).values)
if positive_rows.size == 0:
    # np.random.choice on an empty set fails with an unhelpful message.
    raise ValueError("no sample with pred >= %s to explain" % threshold)
j = np.random.choice(positive_rows)
print("positive sample")
player_explainer = pd.DataFrame({
    'feature': ["repair"]+cols,
    'feature_value': df[["repair"]+cols].iloc[j].values,
    'shap_value': shap_values[j],
})
player_explainer
# The base value plus the sample's SHAP values should reproduce its prediction.
print("y_base + sum_of_shap_values: %.2f" % (y_base + player_explainer["shap_value"].sum()))
print("y_pred: %.2f" % (df["pred"].iloc[j]))

In [None]:
# Load the SHAP JavaScript bundle, then draw the force plot for the sample at
# position j chosen in the previous cell.
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values[j], df[["repair"]+cols].iloc[j])

In [None]:
# Dependence plots for one feature: first without an interaction feature, then
# with INTERACTION as the interaction feature.
FEATURE="doubleFlow_article_ctr_30_v1"
INTERACTION="doubleFlow_user_view_30"
for interaction_index in (None, INTERACTION):
    shap.dependence_plot(FEATURE, shap_values, df[["repair"]+cols], interaction_index=interaction_index, show=False)

In [None]:
# lime
# Tabular LIME explainer built on the same feature matrix and column order
# ("repair" first, then the feature-map columns) used for the SHAP cells.
feature_names = ["repair"]+cols
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    df[feature_names].values,
    feature_names=feature_names,
    class_names=["0", "1"],
    verbose=True,
)

In [None]:
# Clear the booster's stored feature names; predict_fn builds its DMatrix from
# plain arrays that carry no column names (presumably this avoids a
# feature-name mismatch on predict - TODO confirm).
model.feature_names = None
def predict_fn(x):
    """Return class scores for LIME as an (n_samples, 2) array.

    Column 0 is ``1 - p`` and column 1 is ``p``, where ``p`` is the booster's
    default prediction for each row of ``x``.
    """
    preds = model.predict(xgb.DMatrix(x))
    # Vectorised stack instead of a Python-level loop over every row; LIME
    # calls this with many perturbed rows per explanation.
    return np.column_stack((1 - preds, preds))

In [None]:
# Pick one random sample predicted as negative for the LIME explanation.
# In margin space the decision boundary is 0 rather than 0.5 (sigmoid(0) == 0.5;
# assumes a binary logistic model - TODO confirm for the loaded model).
threshold = 0.0 if MODEL_OUTPUT == "margin" else 0.5
# Use positional row numbers so that .iloc, .values and shap_values stay
# aligned even if df's index is not 0..n-1.
negative_rows = np.flatnonzero((df["pred"] <= threshold).values)
if negative_rows.size == 0:
    # np.random.choice on an empty set fails with an unhelpful message.
    raise ValueError("no sample with pred <= %s to explain" % threshold)
i = np.random.choice(negative_rows)
print("negative sample")
player_explainer = pd.DataFrame({
    'feature': ["repair"]+cols,
    'feature_value': df[["repair"]+cols].iloc[i].values,
    'shap_value': shap_values[i],
})
player_explainer

In [None]:
# Explain the sample at position i with LIME, limited to 5 features, and render
# the explanation (including the feature table) in the notebook.
exp = lime_explainer.explain_instance(df[["repair"]+cols].values[i], predict_fn, num_features=5)
exp.show_in_notebook(show_table=True)

In [None]:
# Show the explanation as a list and as a matplotlib figure.
exp.as_list()
fig = exp.as_pyplot_figure()
# Figure.show() needs a GUI backend and only emits a warning under the inline
# backend; plt.show() renders the figure in the notebook.
plt.show()

In [None]:
# Pick one random sample predicted as positive for the LIME explanation.
# In margin space the decision boundary is 0 rather than 0.5 (sigmoid(0) == 0.5;
# assumes a binary logistic model - TODO confirm for the loaded model).
threshold = 0.0 if MODEL_OUTPUT == "margin" else 0.5
# Use positional row numbers so that .iloc, .values and shap_values stay
# aligned even if df's index is not 0..n-1.
positive_rows = np.flatnonzero((df["pred"] >= threshold).values)
if positive_rows.size == 0:
    # np.random.choice on an empty set fails with an unhelpful message.
    raise ValueError("no sample with pred >= %s to explain" % threshold)
j = np.random.choice(positive_rows)
print("positive sample")
player_explainer = pd.DataFrame({
    'feature': ["repair"]+cols,
    'feature_value': df[["repair"]+cols].iloc[j].values,
    'shap_value': shap_values[j],
})
player_explainer

In [None]:
# Explain the sample at position j with LIME, limited to 5 features, and render
# the explanation (including the feature table) in the notebook.
exp = lime_explainer.explain_instance(df[["repair"]+cols].values[j], predict_fn, num_features=5)
exp.show_in_notebook(show_table=True)

In [None]:
# Show the explanation as a list and as a matplotlib figure.
exp.as_list()
fig = exp.as_pyplot_figure()
# Figure.show() needs a GUI backend and only emits a warning under the inline
# backend; plt.show() renders the figure in the notebook.
plt.show()

In [None]:
# Export this notebook as a Python script. The notebook file name is hard-coded
# and must match the file on disk.
!jupyter nbconvert --to python xgboost-shap-and-lime.ipynb

In [None]:
# Export this notebook as HTML. The notebook file name is hard-coded and must
# match the file on disk.
!jupyter nbconvert --to html xgboost-shap-and-lime.ipynb

In [None]:
# !jupyter nbconvert --to pdf xgboost-shap-and-lime.ipynb

In [None]:
# List the working directory to check which files the cells above produced.
!ls