# SHAP Plots

### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import shap
import pandas as pd
import numpy as np
import joblib
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable between runs of this notebook.
np.random.seed(0)

### Fit model and get SHAP values

In [None]:
def shap_plots(data, fold_type, explainer_type, model_type, cols=None):
    """Show a SHAP summary (beeswarm) plot and a SHAP bar plot for one dump.

    Args:
        data: Mapping loaded from a SHAP dump. The keys read here are
            ``'model'``, ``'df_train_target'``, ``'test'`` and ``'train_os'``.
            ``'test'`` must be a DataFrame (``.sample`` and ``.columns`` are
            used); ``'train_os'`` is handed to the explainer as background
            data.
        fold_type: Label of the fold being plotted. Not used by the plots
            (the titles that used it are disabled); kept for callers.
        explainer_type: ``'TreeExplainer'`` or ``'Explainer'``.
        model_type: Model family name. ``'Random Forests'`` makes the
            beeswarm plot use ``shap_vals[1]`` instead of ``shap_vals``.
        cols: Optional feature names forwarded to ``shap.summary_plot`` as
            ``feature_names``.

    Raises:
        ValueError: If ``explainer_type`` is not one of the supported names.

    Side effects:
        Writes ``'Species'`` and ``'Target'`` columns into
        ``data['df_train_target']`` in place, prints progress messages and
        displays two matplotlib figures.
    """
    # Fail fast, before mutating anything, instead of hitting a NameError on
    # the unbound `explainer` further down.
    if explainer_type not in ('TreeExplainer', 'Explainer'):
        raise ValueError(
            f"Unknown explainer_type {explainer_type!r}; "
            "expected 'TreeExplainer' or 'Explainer'."
        )

    model = data['model']
    df_train_target = data['df_train_target']
    test = data['test']
    train_os = data['train_os']

    # (TrialIDs, Species code, Target) for each group of trials. Species codes
    # 0..3 correspond to banfora, kisumu, ngoussu and vk7 respectively.
    trial_groups = (
        ([0, 1, 2, 3], 0, 1),
        ([4, 5, 6, 7, 8], 1, 0),
        ([9, 10, 11, 12], 2, 1),
        ([13, 14, 15, 16], 3, 0),
    )
    for trial_ids, species_code, target in trial_groups:
        rows = df_train_target['TrialID'].isin(trial_ids)
        df_train_target.loc[rows, 'Species'] = species_code
        df_train_target.loc[rows, 'Target'] = target

    print('Initalising SHAP...')
    if explainer_type == 'TreeExplainer':
        explainer = shap.TreeExplainer(model, data=train_os, model_output='probability')
    else:
        explainer = shap.Explainer(model, train_os)

    print('Computing SHAP values...')
    # Explain at most 5000 rows. Smaller test sets are used whole and in their
    # original order (previously achieved by swallowing the sampling error
    # with a bare `except`, which also hid unrelated failures).
    max_rows = 5000
    if len(test) >= max_rows:
        test = test.sample(max_rows, random_state=1)

    shap_vals = explainer.shap_values(test)

    print('Generating plots...')
    n_features = len(test.columns)

    plt.figure(figsize=(12,15), dpi=300)
    # NOTE(review): for 'Random Forests' entry 1 of `shap_vals` is plotted -
    # presumably the class-1 array of a per-class list; confirm this matches
    # the return type of the installed shap version.
    beeswarm_vals = shap_vals[1] if model_type == 'Random Forests' else shap_vals
    shap.summary_plot(beeswarm_vals, test, show=False, plot_size=[12,15], max_display=n_features, alpha=0.7, feature_names=cols)
    plt.show()

    plt.figure(figsize=(12,15), dpi=300)
    shap.summary_plot(shap_vals, test, plot_type="bar", show=False, plot_size=[12,15], max_display=n_features, feature_names=cols)
    plt.show()
    
def shap_scatter(data, fold_type, explainer_type, model_type):
    """Compute SHAP values for a sample of the test set, for scatter plots.

    Args:
        data: Mapping loaded from a SHAP dump. The keys read here are
            ``'model'``, ``'df_train_target'``, ``'test'`` and ``'train_os'``.
            ``'test'`` must be a DataFrame; ``'train_os'`` is handed to the
            explainer as background data.
        fold_type: Unused; kept so the signature matches the other helpers.
        explainer_type: ``'TreeExplainer'`` or ``'Explainer'``.
        model_type: Unused; kept so the signature matches the other helpers.

    Returns:
        Tuple ``(shap_vals, test)``: the object returned by calling the
        explainer on the sampled rows, and the sampled DataFrame itself.

    Raises:
        ValueError: If ``explainer_type`` is not one of the supported names.

    Side effects:
        Writes ``'Species'`` and ``'Target'`` columns into
        ``data['df_train_target']`` in place and prints progress messages.
    """
    # Fail fast, before mutating anything, instead of hitting a NameError on
    # the unbound `explainer` further down.
    if explainer_type not in ('TreeExplainer', 'Explainer'):
        raise ValueError(
            f"Unknown explainer_type {explainer_type!r}; "
            "expected 'TreeExplainer' or 'Explainer'."
        )

    model = data['model']
    df_train_target = data['df_train_target']
    test = data['test']
    train_os = data['train_os']

    # (TrialIDs, Species code, Target) for each group of trials.
    trial_groups = (
        ([0, 1, 2, 3], 0, 1),
        ([4, 5, 6, 7, 8], 1, 0),
        ([9, 10, 11, 12], 2, 1),
        ([13, 14, 15, 16], 3, 0),
    )
    for trial_ids, species_code, target in trial_groups:
        rows = df_train_target['TrialID'].isin(trial_ids)
        df_train_target.loc[rows, 'Species'] = species_code
        df_train_target.loc[rows, 'Target'] = target

    print('Initalising SHAP...')
    if explainer_type == 'TreeExplainer':
        explainer = shap.TreeExplainer(model, data=train_os, model_output='probability')
    else:
        explainer = shap.Explainer(model, train_os)

    print('Computing SHAP values...')
    # Cap the sample size at the number of available rows so test sets with
    # fewer than 5000 rows no longer make `DataFrame.sample` raise.
    test = test.sample(min(5000, len(test)), random_state=0)

    shap_vals = explainer(test)

    # No plot is drawn here; the caller plots from the returned values.
    print('Generating plots...')
    return shap_vals, test


def plot_heatmap(data, fold_type, explainer_type, model_type):
    """Show a SHAP heatmap for a sample of the test set of one dump.

    Args:
        data: Mapping loaded from a SHAP dump. The keys read here are
            ``'model'``, ``'test'`` and ``'train_os'``. ``'test'`` must be a
            DataFrame; ``'train_os'`` is handed to the explainer as
            background data.
        fold_type: Label of the fold, used in the figure title.
        explainer_type: ``'TreeExplainer'`` or ``'Explainer'``.
        model_type: Model family name, used in the figure title.
            ``'Random Forests'`` selects the second output of the explanation
            before plotting.

    Raises:
        ValueError: If ``explainer_type`` is not one of the supported names.

    Side effects:
        Prints progress messages and displays one matplotlib figure.
    """
    # Fail fast instead of hitting a NameError on the unbound `explainer`.
    if explainer_type not in ('TreeExplainer', 'Explainer'):
        raise ValueError(
            f"Unknown explainer_type {explainer_type!r}; "
            "expected 'TreeExplainer' or 'Explainer'."
        )

    model = data['model']
    test = data['test']
    train_os = data['train_os']

    print('Initalising SHAP...')
    if explainer_type == 'TreeExplainer':
        explainer = shap.TreeExplainer(model, data=train_os, model_output='probability')
    else:
        explainer = shap.Explainer(model, train_os)

    print('Computing SHAP values...')
    # Cap the sample size at the number of available rows so test sets with
    # fewer than 5000 rows no longer make `DataFrame.sample` raise.
    test = test.sample(min(5000, len(test)), random_state=0)

    shap_vals = explainer(test)

    print('Generating plots...')
    plt.figure(figsize=(12,15), dpi=300)
    if model_type == 'Random Forests':
        # Calling the explainer returns one Explanation whose first axis is
        # the sample, so `shap_vals[1]` would pick the second *row*. Slice the
        # last axis to get output 1 for every sample instead.
        # NOTE(review): assumes a (samples, features, outputs) explanation for
        # the random forest - confirm against the installed shap version.
        shap_vals = shap_vals[:, :, 1]
        order = shap_vals.sum(1)
    else:
        order = shap_vals.mean(1)
    shap.plots.heatmap(shap_vals, instance_order=order, show=False, max_display=20, plot_width=12)
    plt.title(f'HEATMAP - {model_type} - {fold_type}')
    plt.show()


#### LOGISTIC MODEL

In [None]:
# Root folder of the tuned logistic-regression results; the SHAP dumps loaded
# by the following cells live under its data/ subfolder.
results_path = 'E:/IR_VS_IS/tuned model/logistic-regression-mutual/'
data_path = results_path + 'data/'

In [None]:
# Load logistic-regression SHAP dump #12 and plot it, labelled 'Best'.
# joblib.load unpickles the file, so only load dumps from a trusted source.
# NOTE(review): which dump is best/worst is hard-coded - confirm against the tuning results.
index = 12
shap_data = joblib.load(data_path+f'shap/logistic_shap_dump_{index}.dat')
shap_plots(shap_data, 'Best', 'Explainer', 'Logistic Regression')

In [None]:
# Load logistic-regression SHAP dump #5 and plot it, labelled 'Worst'.
index = 5
shap_data = joblib.load(data_path+f'shap/logistic_shap_dump_{index}.dat')
shap_plots(shap_data, 'Worst', 'Explainer', 'Logistic Regression')

#### RANDOM FORESTS

In [None]:
# Root folder of the tuned random-forests results; rebinding data_path makes
# the following cells read the random-forests SHAP dumps.
results_path = 'E:/IR_VS_IS/tuned model/random-forests-mutual/'
data_path = results_path + 'data/'

In [None]:
# Load random-forests SHAP dump #2 and plot it, labelled 'Best'.
# 'Random Forests' makes shap_plots draw the beeswarm plot from shap_vals[1].
index = 2
shap_data = joblib.load(data_path+f'shap/random_forests_shap_dump_{index}.dat')
shap_plots(shap_data, 'Best', 'TreeExplainer', 'Random Forests')

In [None]:
# Load random-forests SHAP dump #17 and plot it, labelled 'Worst'.
index = 17
shap_data = joblib.load(data_path+f'shap/random_forests_shap_dump_{index}.dat')
shap_plots(shap_data, 'Worst', 'TreeExplainer', 'Random Forests')

#### XGBOOST

In [None]:
# Root folder of the tuned XGBoost results; rebinding data_path makes the
# following cells read the XGBoost SHAP dumps.
results_path = 'E:/IR_VS_IS/tuned model/xgboost/'
data_path = results_path + 'data/'

In [None]:
# Load XGBoost SHAP dump #15 and plot it, labelled 'Best'.
index = 15
shap_data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')
shap_plots(shap_data, 'Best', 'TreeExplainer', 'XGBoost')

In [None]:
# Load XGBoost SHAP dump #15 and plot it, labelled 'Best'.
# NOTE(review): this cell repeats the previous one exactly (same index, same
# 'Best' label) - possibly a 'Worst' fold was intended here; confirm.
index = 15
shap_data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')
shap_plots(shap_data, 'Best', 'TreeExplainer', 'XGBoost')

In [None]:
# SHAP heatmap computed on a sample of the explainer's background data
# (train_os) rather than on the test set, for XGBoost dump #15.
index = 15
data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')

# Unpack the dump into notebook globals. Only `model` and `train_os` are used
# in this cell; the other names are left bound for interactive use.
model = data['model']
df_train = data['df_train']
features = data['features']
test = data['test']
train_os = data['train_os']
mask = data['mask']

# The explainer is built on the full train_os before it is down-sampled below.
explainer = shap.TreeExplainer(model, data=train_os, model_output='probability')

print('Computing SHAP values...')
# Rebinds train_os to a 5000-row sample; raises if it has fewer rows.
train_os = train_os.sample(5000, random_state=0)

shap_vals = explainer(train_os)

print('Generating plots...')
plt.figure(figsize=(12,15), dpi=300)

# Instances are ordered by their mean SHAP value across features.
shap.plots.heatmap(shap_vals, instance_order=shap_vals.mean(1), show=False, max_display=20, plot_width=12)
plt.title(f'HEATMAP - XGBoost - train')
plt.show()

In [None]:
# Test-set SHAP heatmap for XGBoost dump #3, labelled 'Best'.
# NOTE(review): this reads from the 'shap-1/' subfolder and uses index 3,
# whereas the other XGBoost cells read 'shap/' with index 15 - confirm that
# this is the intended dump.
index = 3
shap_data = joblib.load(data_path+f'shap-1/xgboost_shap_dump_{index}.dat')
plot_heatmap(shap_data, 'Best', 'TreeExplainer', 'XGBoost')

In [None]:
# Mean absolute SHAP interaction values for XGBoost dump #15, drawn as an
# annotated feature-by-feature heatmap.
# joblib.load unpickles the file, so only load dumps from a trusted source.
index = 15
data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')

# Unpack the dump into notebook globals. Only `model` and `test` are used in
# this cell; the other names are left bound for interactive use.
model = data['model']
df_train = data['df_train']
features = data['features']
test = data['test']
train_os = data['train_os']
mask = data['mask']

explainer = shap.TreeExplainer(model)
# Cap the sample size at the number of available rows so a test set with
# fewer than 5000 rows does not make `DataFrame.sample` raise.
test = test.sample(min(5000, len(test)), random_state=0)

shap_interaction = explainer.shap_interaction_values(test)
# Average the absolute interaction values over the sampled rows (axis 0).
mean_shap = np.abs(shap_interaction).mean(0)
df = pd.DataFrame(mean_shap, index=test.columns, columns=test.columns)

# Double every off-diagonal entry and leave the diagonal untouched. The mask
# is positional: the previous `df.values == np.diagonal(df)` test compared
# each cell with the diagonal value of its column, so it depended on the
# values rather than on where the cell sits.
on_diagonal = np.eye(len(df), dtype=bool)
df = df.where(on_diagonal, df * 2)

fig = plt.figure(figsize=(45, 30), edgecolor='r', dpi=100)
ax = fig.add_subplot()
sns.heatmap(df.round(decimals=3), cmap='coolwarm', annot=True, fmt='.6g', cbar=False, ax=ax)
ax.tick_params(axis='x', labelsize=15, rotation=90)
ax.tick_params(axis='y', labelsize=15)

plt.title("SHAP interaction values", fontsize=60)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Reload XGBoost dump #15 and unpack it into notebook globals for the
# evaluation cells that follow.
index = 15
data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')

model = data['model']
df_train = data['df_train']
df_train_target = data['df_train_target']
features = data['features']
test = data['test']
train_os = data['train_os']
mask = data['mask']



In [None]:
# Keep the rows of df_train_target where `mask` is False.
# NOTE(review): presumably these are the held-out rows matching `test` - confirm.
test_targets = df_train_target[~mask]

In [None]:
from statistics import mode

def get_track_prediction(y_true, scores, preds, groups):
    """Aggregate per-row predictions into one prediction per group.

    All four inputs are aligned by position and may be pandas Series, NumPy
    arrays or plain sequences. They are converted to arrays up front so that
    row selection is always positional; indexing a Series that has a
    non-default index with positional indices would otherwise be
    label-based.

    Args:
        y_true: True label of every row.
        scores: Score of every row.
        preds: Predicted label of every row.
        groups: Group identifier of every row.

    Returns:
        Tuple ``(track_true, track_preds, avg_scores)`` of three lists with
        one entry per distinct group, in order of first appearance:
        the most common true label, the majority-vote prediction (1 when the
        mean of ``preds`` is at least 0.5, else 0) and the mean score.
    """
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)
    preds = np.asarray(preds)
    groups = np.asarray(groups)

    track_true = []
    track_preds = []
    avg_scores = []
    # pd.unique keeps the order of first appearance, like Series.unique().
    for group_id in pd.unique(groups):
        members = groups == group_id
        track_true.append(mode(y_true[members]))
        avg_scores.append(np.mean(scores[members]))
        # Ties (mean exactly 0.5) are resolved in favour of class 1.
        track_preds.append(1 if np.mean(preds[members]) >= 0.5 else 0)

    return track_true, track_preds, avg_scores

# Row-level predictions: hard labels, and column 1 of predict_proba as score.
labels = model.predict(test)
scores = model.predict_proba(test)[:,1]

# Aggregate the row-level results per 'TrackGroup' value.
track_true, track_preds, avg_scores = get_track_prediction(
    test_targets['Target'], scores, labels, test_targets['TrackGroup'])

# Row-level ("segment") metrics followed by the group-level ("track") metric.
# NOTE(review): assumes the rows of `test` and `test_targets` are in the same order - confirm.
print('segment bal acc: ', metrics.balanced_accuracy_score(test_targets['Target'], labels))
print('segment roc auc: ', metrics.roc_auc_score(test_targets['Target'], scores))
print('track  bal acc: ', metrics.balanced_accuracy_score(track_true, track_preds))

In [None]:
# Save one SHAP scatter plot per feature for XGBoost dump #15, with the
# x-axis taken from the scaler's inverse transform of the test rows.
# joblib.load unpickles the file, so only load dumps from a trusted source.
index = 15
shap_data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')
shap_vals, test = shap_scatter(shap_data, 'Best', 'TreeExplainer', 'XGBoost')

df_train = shap_data['df_train']
features = shap_data['features']
test_targets = shap_data['test_targets']
# Targets of the sampled test rows. Not used by the plots below; kept bound
# for interactive inspection.
targets = test_targets.loc[test.index.values]
targets = targets['Target']

# Fit a scaler on the rows of df_train that are not in the dump's test set,
# then use it to inverse-transform the sampled test rows.
# NOTE(review): this assumes `test` was standardised with a scaler fitted on
# exactly these rows - confirm against the training pipeline.
train = df_train.iloc[~df_train.index.isin(shap_data['test'].index)]
scaler = StandardScaler().fit(train[features])
df = pd.DataFrame(scaler.inverse_transform(test), columns=features, index=test.index)

path_to_save = 'E:/IR_VS_IS/tuned model/xgboost/shap/probability/scatter-plots/'

# Plot SHAP values against the inverse-transformed feature values.
shap_vals.data = df.values

for feature in df.columns.values:
    # Hand shap the axes to draw on. A bare plt.figure() followed by
    # shap.plots.scatter() without `ax` is expected to make shap open a
    # second figure, leaving the 12x8 one unused and never closed.
    fig, ax = plt.subplots(figsize=(12,8), dpi=300)
    shap.plots.scatter(shap_vals[:,feature], alpha=0.7, show=False, ax=ax)
    ax.grid(True)
    ax.set_title(f'SHAP Scatter plot of {feature}')
    fig.savefig(path_to_save+f'{feature}.png', bbox_inches='tight')
    plt.close(fig)

In [None]:

# Five-panel figure (A-E) of SHAP scatter plots for the 'Y Velocity' summary
# features, laid out on a 2x3 grid whose last cell is hidden.
fig, axes = plt.subplots(2,3, figsize=(15,10), dpi=300, gridspec_kw=dict(hspace=0.3, wspace=0.3))

# One row per panel: (axes, panel title, feature name, x-axis label).
panels = [
    (axes[0][0], '(A)', 'Y Velocity (3rd quartile)', 'Vertical Velocity (3rd quartile)'),
    (axes[0][1], '(B)', 'Y Velocity (1st quartile)', 'Vertical Velocity (1st quartile)'),
    (axes[0][2], '(C)', 'Y Velocity (std)', 'Vertical Velocity (standard deviation)'),
    (axes[1][0], '(D)', 'Y Velocity (kurtosis)', 'Vertical Velocity (kurtosis)'),
    (axes[1][1], '(E)', 'Y Velocity (skewness)', 'Vertical Velocity (skewness)'),
]

for panel_ax, panel_title, feature, x_label in panels:
    shap.plots.scatter(shap_vals[:,feature], alpha=0.7, show=False, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('SHAP Value')
    panel_ax.set_xlabel(x_label)
    panel_ax.grid(True)

# Hide the unused sixth cell and place the two bottom panels by hand
# (tight_layout is not used).
axes[1][2].set_visible(False)
axes[1][0].set_position([0.24,0.125,0.228,0.343])
axes[1][1].set_position([0.55,0.125,0.228,0.343])

plt.show()


### SAME CLASS

In [None]:
def shap_plots(data, fold_type, explainer_type, model_type, cols=None):
    """Show titled SHAP summary (beeswarm) and bar plots for one dump.

    NOTE: this definition rebinds the name ``shap_plots``; once this cell has
    run it replaces the ``shap_plots`` defined earlier in the notebook. Unlike
    that one, it builds the tree explainer without background data, explains
    at most 500 rows, uses ``shap_vals[0]`` for ``'Random Forests'`` and does
    not modify ``data``.

    Args:
        data: Mapping loaded from a SHAP dump. The keys read here are
            ``'model'``, ``'test'`` and ``'train_os'``. ``'test'`` must be a
            DataFrame; ``'train_os'`` is only used as background data for
            ``explainer_type='Explainer'``.
        fold_type: Label of the fold, used in the figure titles.
        explainer_type: ``'TreeExplainer'`` or ``'Explainer'``.
        model_type: Model family name, used in the figure titles.
            ``'Random Forests'`` makes the beeswarm plot use ``shap_vals[0]``.
        cols: Optional feature names forwarded to ``shap.summary_plot`` as
            ``feature_names``; accepted so that calls written for the earlier
            five-argument definition keep working.

    Raises:
        ValueError: If ``explainer_type`` is not one of the supported names.

    Side effects:
        Prints progress messages and displays two matplotlib figures.
    """
    # Fail fast instead of hitting a NameError on the unbound `explainer`.
    if explainer_type not in ('TreeExplainer', 'Explainer'):
        raise ValueError(
            f"Unknown explainer_type {explainer_type!r}; "
            "expected 'TreeExplainer' or 'Explainer'."
        )

    model = data['model']
    test = data['test']
    train_os = data['train_os']

    print('Initalising SHAP...')
    if explainer_type == 'TreeExplainer':
        explainer = shap.TreeExplainer(model)
    else:
        explainer = shap.Explainer(model, train_os)

    print('Computing SHAP values...')
    # Cap the sample size at the number of available rows so test sets with
    # fewer than 500 rows no longer make `DataFrame.sample` raise.
    test = test.sample(min(500, len(test)), random_state=0)

    shap_vals = explainer.shap_values(test)

    print('Generating plots...')
    n_features = len(test.columns)

    plt.figure(figsize=(12,15), dpi=300)
    # NOTE(review): for 'Random Forests' entry 0 of `shap_vals` is plotted -
    # presumably the class-0 array of a per-class list; confirm this matches
    # the return type of the installed shap version.
    beeswarm_vals = shap_vals[0] if model_type == 'Random Forests' else shap_vals
    shap.summary_plot(beeswarm_vals, test, show=False, plot_size=[12,15], max_display=n_features, alpha=0.7, feature_names=cols)
    plt.title(f'SUMMARY PLOT - {model_type} - {fold_type}')
    plt.show()

    plt.figure(figsize=(12,15), dpi=300)
    shap.summary_plot(shap_vals, test, plot_type="bar", show=False, plot_size=[12,15], max_display=n_features, feature_names=cols)
    plt.title(f'BAR PLOT - {model_type} - {fold_type}')
    plt.show()


In [None]:
# Point data_path at the banfora-vs-vk7 results, then load XGBoost SHAP
# dump #0 from there and plot it, labelled 'Best'.
# joblib.load unpickles the file, so only load dumps from a trusted source.
data_path = 'E:/IR_VS_IS/tuned model/banfora-vs-vk7/data/'
index = 0
shap_data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')
shap_plots(shap_data, 'Best', 'TreeExplainer', 'XGBoost')

In [None]:
# Load XGBoost SHAP dump #1 from the current data_path and plot it, labelled 'Worst'.
index = 1
shap_data = joblib.load(data_path+f'shap/xgboost_shap_dump_{index}.dat')
shap_plots(shap_data, 'Worst', 'TreeExplainer', 'XGBoost')