In [21]:
# Author: F.T Weng
# Purpose: Example of using XGBoost to calculate Shapley values (China | 7 days | Indicator)
# Email: xmftweng@163.com
# Import necessary packages
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# including deprecation notices from the libraries imported below; narrow the
# filter (category/module) if those messages matter.
warnings.filterwarnings("ignore")

# Third-party requirements, in case they are missing from the kernel's environment:
# !pip install shap
# !pip install xgboost
# !pip install pandas
# !pip install scikit-learn

In [23]:
import math
import os

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

In [24]:
# Path of the input CSV, relative to the notebook's working directory
path = '../dataset/sevenVol.csv'
lookback_window = 7  # number of consecutive past rows (time steps) used as features for one sample

In [25]:
# Build the feature matrix and the labels. Only the effect (Shapley) values are calculated,
# so the data are not divided into training and test sets.
# If a split is needed, the train_test_split() method can be used.
def get_data(path, lookback_window):
    """
    Build a sliding-window feature matrix and its labels from the China columns of a CSV file.

    Each sample concatenates the ``lookback_window`` rows that immediately precede row ``i``
    (oldest row first, six indicator values per row); its label is ``china_vol`` at row ``i``.

    :param path: Path of the CSV file; it must contain the columns listed in ``china_name``
    :param lookback_window: Number of consecutive past rows (time steps) used as features
        for one sample; must be at least 1 and smaller than the number of rows in the file
    :return: ``(X, Y)`` as float arrays, ``X`` of shape
        (n_rows - lookback_window, lookback_window * 6) and ``Y`` of shape
        (n_rows - lookback_window, 1)
    :raises ValueError: If ``lookback_window`` is smaller than 1, if the file has too few
        rows to build a single sample, or if an indicator column is not numeric
    """
    if lookback_window < 1:
        raise ValueError(
            "lookback_window must be >= 1, got {}".format(lookback_window))

    df = pd.read_csv(path, low_memory=False)

    china_name = ['date','china_vol','Credit','Equity valuation','Safe assets','Funding','Volatility']
    china_data = df.loc[:, china_name]

    # Drop 'date' BEFORE converting to NumPy: converting the mixed frame as a whole
    # yields an object-dtype array, and that dtype would leak into X and Y.
    input_data = china_data.iloc[:, 1:].to_numpy(dtype=float)

    n_rows, n_features = input_data.shape
    if n_rows <= lookback_window:
        raise ValueError(
            "need more than {} rows to build one sample, but {!r} has only {}".format(
                lookback_window, path, n_rows))

    label = input_data[:, 0]  # china_vol is both the first feature and the target

    # Window ending just before row i -> features; row i itself -> label.
    x = np.stack([input_data[i - lookback_window:i]
                  for i in range(lookback_window, n_rows)])
    y = label[lookback_window:].copy()

    # Flatten each (lookback_window, n_features) window into one feature row.
    X = x.reshape(len(x), lookback_window * n_features)
    Y = y.reshape(-1, 1)  # column vector

    return X, Y

In [26]:
# Build the lagged feature matrix and its labels, then show both shapes (one per line).
X, Y = get_data(path, lookback_window)
print(X.shape, Y.shape, sep='\n')

(1713, 42)
(1713, 1)


In [27]:
# Build the model: a gradient-boosted regressor fitted on the full data set.
# NOTE(review): the original remark says other models (SVR, SGD, RF, ...) can be used
# here as well; presumably the SHAP explainer created afterwards must then be chosen
# to match the model type - confirm before swapping.
xgb_params = {'objective': 'reg:squarederror'}
train_matrix = xgb.DMatrix(X, label=Y)
model = xgb.train(xgb_params, train_matrix, num_boost_round=100)

In [28]:
# Explain the model's predictions using SHAP's tree explainer.
explainer = shap.TreeExplainer(model)
# Computed on the same X the model was trained on (no held-out set in this notebook).
shap_values = explainer.shap_values(X)
print(np.shape(shap_values))  # recorded output: (1713, 42), i.e. the same shape as X

(1713, 42)


In [29]:
explainer.expected_value  # base value reported by the explainer (a single scalar in the recorded output)

0.009562194

In [16]:
# Save the Shapley values to CSV: one row per explained sample, labelled with its date.
df = pd.read_csv(path, low_memory=False)
# The first `lookback_window` rows only serve as history, so sample k belongs to
# row k + lookback_window of the file (relies on read_csv's default 0..n-1 index).
date = df.loc[lookback_window:, ['date']]
SHAP = shap_values

# Column names follow the flattened feature order: lags from the oldest
# (T<lookback_window>) to the most recent (T1) and, within each lag, the six
# indicators in their input order. Deriving them from `lookback_window` keeps this
# cell working when the window length in the config cell is changed.
indicator_names = ['vol', 'Credit', 'EquityValuation', 'SafeAssets', 'Funding', 'Volatility']
shap_columns = ['{}T{}'.format(name, lag)
                for lag in range(lookback_window, 0, -1)
                for name in indicator_names]
assert np.shape(SHAP)[1] == len(shap_columns), (
    'expected {} SHAP columns for lookback_window={}, got {}'.format(
        len(shap_columns), lookback_window, np.shape(SHAP)[1]))

all_data = np.hstack((date.values, SHAP))
my_shape = pd.DataFrame(all_data)
my_shape.columns = ['date'] + shap_columns

# Create the result directory if it is missing so a fresh checkout can run end to end.
out_path = '../result/China7Vol_indicator.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
my_shape.to_csv(out_path)