## Breast Cancer Wisconsin (Diagnostic) Data Set
Predict whether the cancer is benign or malignant

In [1]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  #for plotting
import featuretools as ft  # featuretools for automated feature engineering

from sklearn.model_selection import KFold 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from featuretools import selection
from warnings import simplefilter  # import warnings filter

pd.options.mode.chained_assignment = None  #hide any pandas warnings
simplefilter(action='ignore', category=FutureWarning)  # ignore all future warnings
np.random.seed(123) #ensure reproducibility

### Download dataset from [Kaggle](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/downloads/data.csv/2)

In [2]:
# Read breast cancer data set as Pandas dataframe
df_raw = pd.read_csv("data.csv")
df_raw.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


### Column Description (details can be found at [Kaggle](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data#data.csv))
diagnosis: The diagnosis of breast tissues (M = malignant, B = benign)  
radius: Distances from center to points on the perimeter  
texture: Standard deviation of gray-scale values  
perimeter: Mean size of the core tumor  
smoothness: Local variation in radius lengths  
compactness: perimeter^2 / area - 1.0  
concavity: Sverity of concave portions of the contour  
concave points: Number of concave portions of the contour   
fractal_dimension: "coastline approximation" - 1  

### Clean up dataframe

In [3]:
# rearrange columns; remove coluum "Unnamed: 32"; set id as index
fixed_columns = [df_raw.columns[0]]+list(df_raw.columns[2:-1])+[df_raw.columns[1]]
df_data = df_raw[fixed_columns]

In [4]:
# Convert (M=malignant, B=benign) to (1,0)
df_data.loc[df_data.index[df_data['diagnosis']=='B'],'diagnosis'] = 0
df_data.loc[df_data.index[df_data['diagnosis']=='M'],'diagnosis'] = 1

In [5]:
df_data.head()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


### Compare the performance of machine leanring models

#### 1. Setup function for K-fold cross validation

In [6]:
def cross_validatoin(fold, model, X, y):
    """
    Perform K-fold cross validation
    compare the sensitivity, specificity, accuracy, and F1-score of input models
    """
    sensitivity=[]
    specificity=[]
    accuracy=[]
    F1scores=[]
    
    kf = KFold(n_splits=fold,shuffle=True) 
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index] 
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        cm = confusion_matrix(y_test,y_pred)
        sensitivity.append(cm[0,0]/(cm[0,0]+cm[1,0]))
        specificity.append(cm[1,1]/(cm[1,1]+cm[0,1]))
        accuracy.append(accuracy_score(y_test,y_pred))
        F1scores.append(f1_score(y_test, y_pred, pos_label=1))
        
    return [np.mean(sensitivity),np.mean(specificity),np.mean(accuracy),np.mean(F1scores)]

In [7]:
# Set the number of fold for cross validation
fold = 10

#### 2. Apply and compare machine learning models

In [8]:
# Prepare input matrix for machine learning models
feature_names = df_data.columns.tolist()[1:-1]
X = df_data[feature_names].values
y = df_data['diagnosis'].values

In [9]:
# Create dictionary to collect results
d_Model_eva = {} 

# Logistic regression
d_Model_eva['Logistic Regression'] = cross_validatoin(fold, LogisticRegression(), X, y)

# Decision tree
d_Model_eva['Decision Tree'] = cross_validatoin(fold, DecisionTreeClassifier(), X, y)

# Random forest
d_Model_eva['Random Forest'] = cross_validatoin(fold, RandomForestClassifier(), X, y)

#### 3. Determine the best model for each metric

In [10]:
# Create output dataframe
df_eva = pd.DataFrame(d_Model_eva, index=['Sensitivity','Specificity','Accuracy','F1-score'])
df_eva.round(3).T.sort_values('F1-score',ascending=False)

Unnamed: 0,Sensitivity,Specificity,Accuracy,F1-score
Random Forest,0.961,0.957,0.96,0.946
Logistic Regression,0.952,0.949,0.951,0.93
Decision Tree,0.939,0.875,0.914,0.88


### Automated Feature Engineering

In [11]:
# Create new entityset
es = ft.EntitySet(id = 'breastcancer')

df_data_ft = df_data.loc[:,['id']+feature_names]

# Create an entity from the breast cancer dataframe
es = es.entity_from_dataframe(entity_id = 'breastcancer', dataframe = df_data_ft, index='id')

In [12]:
# Generate new features
df_feature_matrix, feature_defs = ft.dfs(entityset = es, target_entity = 'breastcancer',
                                      trans_primitives = ['multiply_numeric'])
df_feature_matrix.head()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,perimeter_se * smoothness_se,fractal_dimension_se * smoothness_worst,compactness_se * concave points_se,concave points_worst * perimeter_mean,perimeter_se * symmetry_mean,fractal_dimension_se * perimeter_se,concavity_se * smoothness_mean,concave points_worst * smoothness_se,concave points_mean * concavity_se,perimeter_worst * symmetry_se
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8670,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,...,0.019307,0.00038,0.000162,15.39738,0.597451,0.007614,0.003072,0.000945,0.002275,1.744853
8913,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,...,0.005275,0.0002,7.9e-05,4.394217,0.149076,0.00232,0.001149,0.000254,0.000193,1.415006
8915,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,...,0.011576,0.00037,0.000251,14.447767,0.407931,0.006111,0.001381,0.000794,0.00074,1.660502
9047,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,...,0.00288,0.000231,9.4e-05,6.977138,0.173066,0.001967,0.001593,0.000242,0.000386,1.677203
85715,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,...,0.012391,0.000651,0.000284,17.952624,0.403682,0.006911,0.003364,0.001364,0.002132,1.791804


### Remove Highly Correlated Columns
Drop Highly Correlated Features, the code is adapted from [work](https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/) by Chris Albon.

In [13]:
# Define the threshold for removing correlated variables
threshold = 0.99

In [14]:
# Get correlation of each variables
corr_matrix = df_feature_matrix.corr().abs()
# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,perimeter_se * smoothness_se,fractal_dimension_se * smoothness_worst,compactness_se * concave points_se,concave points_worst * perimeter_mean,perimeter_se * symmetry_mean,fractal_dimension_se * perimeter_se,concavity_se * smoothness_mean,concave points_worst * smoothness_se,concave points_mean * concavity_se,perimeter_worst * symmetry_se
radius_mean,,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,0.311631,...,0.416859,0.019347,0.201345,0.880862,0.634165,0.378405,0.215278,0.527274,0.530063,0.528315
texture_mean,,,0.329533,0.321086,0.023389,0.236702,0.302418,0.293464,0.071401,0.076437,...,0.17648,0.067127,0.148579,0.328898,0.259537,0.180979,0.133244,0.240879,0.225694,0.211794
perimeter_mean,,,,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,0.261477,...,0.438818,0.022864,0.239349,0.901607,0.660856,0.41388,0.252412,0.561825,0.567687,0.552282
area_mean,,,,,0.177028,0.498502,0.685983,0.823269,0.151293,0.28311,...,0.481598,0.003005,0.210014,0.882454,0.679317,0.416843,0.230119,0.530919,0.549782,0.542419
smoothness_mean,,,,,,0.659123,0.521984,0.553695,0.557775,0.584792,...,0.353225,0.484033,0.262755,0.418047,0.377407,0.319769,0.375239,0.594911,0.422419,0.309396


In [15]:
# Select columns with correlations above threshold
col_to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print('Columns with correlation > {}:\n-{}'.format(threshold,"\n-".join(col_to_drop)))

Columns with correlation > 0.99:
-perimeter_mean
-perimeter_worst
-concavity_worst * perimeter_mean
-concave points_mean * radius_worst
-concavity_worst * radius_worst
-perimeter_mean * radius_se
-area_worst * perimeter_mean
-compactness_mean * perimeter_worst
-area_mean * radius_mean
-area_worst * perimeter_se
-perimeter_worst * radius_worst
-perimeter_mean * radius_mean
-area_mean * perimeter_worst
-compactness_se * radius_mean
-area_se * radius_worst
-compactness_se * perimeter_worst
-concave points_mean * perimeter_worst
-fractal_dimension_mean * perimeter_worst
-compactness_mean * radius_worst
-perimeter_mean * smoothness_mean
-perimeter_mean * texture_worst
-concavity_se * radius_worst
-concavity_mean * texture_worst
-fractal_dimension_mean * perimeter_mean
-concavity_se * perimeter_worst
-fractal_dimension_worst * perimeter_worst
-perimeter_worst * texture_mean
-radius_mean * smoothness_se
-radius_worst * smoothness_worst
-radius_mean * radius_se
-compactness_mean * radius_mean


In [16]:
df_feature_matrix_dropcorr = df_feature_matrix.drop(columns = col_to_drop)
df_feature_matrix_dropcorr.head()

Unnamed: 0_level_0,radius_mean,texture_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,...,concavity_worst * smoothness_mean,concavity_se * symmetry_mean,perimeter_se * smoothness_se,fractal_dimension_se * smoothness_worst,compactness_se * concave points_se,perimeter_se * symmetry_mean,fractal_dimension_se * perimeter_se,concavity_se * smoothness_mean,concave points_worst * smoothness_se,concave points_mean * concavity_se
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8670,15.46,19.48,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,...,0.041398,0.005432,0.019307,0.00038,0.000162,0.597451,0.007614,0.003072,0.000945,0.002275
8913,12.89,13.12,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,...,0.008249,0.002209,0.005275,0.0002,7.9e-05,0.149076,0.00232,0.001149,0.000254,0.000193
8915,14.96,19.1,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,...,0.016222,0.002886,0.011576,0.00037,0.000251,0.407931,0.006111,0.001381,0.000794,0.00074
9047,12.94,16.17,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,...,0.017881,0.002799,0.00288,0.000231,9.4e-05,0.173066,0.001967,0.001593,0.000242,0.000386
85715,13.17,18.66,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,...,0.057969,0.006182,0.012391,0.000651,0.000284,0.403682,0.006911,0.003364,0.001364,0.002132


In [17]:
# Remove columns with missing values
df_feature_matrix_dropcorr.replace([np.inf, -np.inf], np.nan, inplace=True)
col_without_nan = df_feature_matrix_dropcorr.columns[~df_feature_matrix_dropcorr.isna().any()]
df_feature_matrix_dropcorr_dropnan = df_feature_matrix_dropcorr[col_without_nan]

# Remove columns with too little information (less than ten distinct values)
selection.remove_low_information_features(df_feature_matrix_dropcorr_dropnan)

df_feature_matrix_dropcorr_dropnan.head()

Unnamed: 0_level_0,radius_mean,texture_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,...,concavity_worst * smoothness_mean,concavity_se * symmetry_mean,perimeter_se * smoothness_se,fractal_dimension_se * smoothness_worst,compactness_se * concave points_se,perimeter_se * symmetry_mean,fractal_dimension_se * perimeter_se,concavity_se * smoothness_mean,concave points_worst * smoothness_se,concave points_mean * concavity_se
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8670,15.46,19.48,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,...,0.041398,0.005432,0.019307,0.00038,0.000162,0.597451,0.007614,0.003072,0.000945,0.002275
8913,12.89,13.12,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,...,0.008249,0.002209,0.005275,0.0002,7.9e-05,0.149076,0.00232,0.001149,0.000254,0.000193
8915,14.96,19.1,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,...,0.016222,0.002886,0.011576,0.00037,0.000251,0.407931,0.006111,0.001381,0.000794,0.00074
9047,12.94,16.17,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,...,0.017881,0.002799,0.00288,0.000231,9.4e-05,0.173066,0.001967,0.001593,0.000242,0.000386
85715,13.17,18.66,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,...,0.057969,0.006182,0.012391,0.000651,0.000284,0.403682,0.006911,0.003364,0.001364,0.002132


### Add diagnosis column back to feature matrix

In [18]:
df_outcomes = df_data.loc[:,['id','diagnosis']]
df_outcomes.set_index('id',inplace=True)

df_feature_matrix_outcomes = pd.merge(df_feature_matrix_dropcorr_dropnan,df_outcomes,
                                      left_index=True, right_index=True)
df_feature_matrix_outcomes.head()

Unnamed: 0_level_0,radius_mean,texture_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,...,concavity_se * symmetry_mean,perimeter_se * smoothness_se,fractal_dimension_se * smoothness_worst,compactness_se * concave points_se,perimeter_se * symmetry_mean,fractal_dimension_se * perimeter_se,concavity_se * smoothness_mean,concave points_worst * smoothness_se,concave points_mean * concavity_se,diagnosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8670,15.46,19.48,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,...,0.005432,0.019307,0.00038,0.000162,0.597451,0.007614,0.003072,0.000945,0.002275,1
8913,12.89,13.12,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,...,0.002209,0.005275,0.0002,7.9e-05,0.149076,0.00232,0.001149,0.000254,0.000193,0
8915,14.96,19.1,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,...,0.002886,0.011576,0.00037,0.000251,0.407931,0.006111,0.001381,0.000794,0.00074,0
9047,12.94,16.17,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,...,0.002799,0.00288,0.000231,9.4e-05,0.173066,0.001967,0.001593,0.000242,0.000386,0
85715,13.17,18.66,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,...,0.006182,0.012391,0.000651,0.000284,0.403682,0.006911,0.003364,0.001364,0.002132,1


In [19]:
# Prepare input matrix for machine learning models
feature_names_ft = df_feature_matrix_outcomes.columns.tolist()[1:-1]
X_ft = df_feature_matrix_outcomes[feature_names_ft].values
y_ft = df_feature_matrix_outcomes['diagnosis'].values

In [20]:
# Logistic regression
d_Model_eva['Logistic Regression (Feature Engineering)'] = cross_validatoin(fold, LogisticRegression(), X_ft, y_ft)

# Decision tree
d_Model_eva['Decision Tree (Feature Engineering)'] = cross_validatoin(fold, DecisionTreeClassifier(), X_ft, y_ft)

# Random forest
d_Model_eva['Random Forest (Feature Engineering)'] = cross_validatoin(fold, RandomForestClassifier(), X_ft, y_ft)

In [21]:
# Create output dataframe
df_eva = pd.DataFrame(d_Model_eva, index=['Sensitivity','Specificity','Accuracy','F1-score'])
df_eva.round(3).T.sort_values('F1-score',ascending=False)

Unnamed: 0,Sensitivity,Specificity,Accuracy,F1-score
Logistic Regression (Feature Engineering),0.96,0.962,0.961,0.947
Random Forest,0.961,0.957,0.96,0.946
Random Forest (Feature Engineering),0.956,0.955,0.954,0.938
Logistic Regression,0.952,0.949,0.951,0.93
Decision Tree (Feature Engineering),0.955,0.937,0.947,0.926
Decision Tree,0.939,0.875,0.914,0.88


In [22]:
# Print out conclusion
print('{}-fold cross validation shows:'.format(fold))
for index, row in df_eva.iterrows():
    print("- {} has the best {} score = {:.3f}.".format(df_eva.loc[index,:].idxmax(axis=1), index,
                                         df_eva.loc[index, df_eva.loc[index,:].idxmax(axis=1)]))

10-fold cross validation shows:
- Random Forest has the best Sensitivity score = 0.961.
- Logistic Regression (Feature Engineering) has the best Specificity score = 0.962.
- Logistic Regression (Feature Engineering) has the best Accuracy score = 0.961.
- Logistic Regression (Feature Engineering) has the best F1-score score = 0.947.


## Evaluate the impact of each feature with SHAP Values  
Link to [SHAP (SHapley Additive exPlanations)](https://www.kaggle.com/dansbecker/shap-values)

In [None]:
explainer = shap.TreeExplainer(RandomForestClassifier().fit(X_ft, y_ft)) # Create object that can calculate shap values
shap_values = explainer.shap_values(X) 

shap.summary_plot(shap_values[1], X, feature_names=feature_names_ft,plot_type="bar")

In [None]:
shap.summary_plot(shap_values[1],X_ft,feature_names=feature_names_ft)

In [None]:
# Create dataframe of feature importance according to SHAP
shap_sum = np.abs(shap_values[1]).mean(axis=0)
df_importance = pd.DataFrame([feature_names, shap_sum.tolist()]).T
df_importance.columns = ['Features','SHAP_importance']
df_importance = df_importance.sort_values('SHAP_importance', ascending=False)
df_importance.head()                             