In [1]:
import pandas as pd
import numpy as np
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.io as pio
import os 

In [2]:
# Switch plotly into notebook mode so the iplot() calls in later cells render inline.
init_notebook_mode(connected=True)

In [3]:
# Every artefact of the run lives in one directory and shares one file-name
# prefix; keeping both here means pointing the notebook at another run is a
# one-line change instead of eight.
PROCESSED_DIR = '/opt/notebooks/demo/celery/processed'
ARTIFACT_PREFIX = 'renamed_user_template'


def _load_artifact(suffix):
    """Unpickle ``<PROCESSED_DIR>/<ARTIFACT_PREFIX><suffix>.pkl`` and return it.

    NOTE(review): unpickling can execute arbitrary code embedded in the file,
    so these paths must only ever point at artefacts produced by a trusted
    pipeline.
    """
    return pd.read_pickle('{}/{}{}.pkl'.format(PROCESSED_DIR, ARTIFACT_PREFIX, suffix))


# Maps model column names (`Feature`) to display names (`Variable`).
var_roles = _load_artifact('')
# Its `Index` column is passed to catboost as the categorical feature list.
cat_mappings = _load_artifact('_catsMap')
# Not referenced again in the cells visible here.
tuned_params = _load_artifact('_params')
# Its `Label` column supplies the class names for the classification report.
y_label_mapping = _load_artifact('_labelsMap')
# Scored test rows with ID, Y, Pred_Y and PredProb_<class> columns.
scored_test_pool = _load_artifact('_test_pred')
# Not referenced again in the cells visible here.
m_test = _load_artifact('_mTest')
# Held-out features and target.
x_test = _load_artifact('_xTest')
y_test = _load_artifact('_yTest')

In [4]:
from catboost import Pool, CatBoostClassifier

In [5]:
# Start from an empty classifier and populate it from the serialized dump.
# The load_model() call stays last so the cell still displays the model repr.
fitted_model_path = '/opt/notebooks/demo/celery/processed/renamed_user_template_fitted.dump'
tuned_model = CatBoostClassifier()
tuned_model.load_model(fitted_model_path)

<catboost.core.CatBoostClassifier at 0x7f8dca742a58>

In [6]:
from catboost import Pool
# Bundle the held-out features, the flattened target and the categorical
# feature list into one catboost Pool reused by the prediction, ROC,
# interaction and SHAP cells.
categorical_features = cat_mappings.Index.to_list()
flat_labels = np.ravel(y_test)
test_pool = Pool(data=x_test, label=flat_labels, cat_features=categorical_features)

In [7]:
# Class predictions for the test pool; reused by the classification report cell.
predictions = tuned_model.predict(test_pool)

In [8]:
import sklearn.metrics 
from catboost.utils import get_roc_curve
# ROC operating points for the tuned model on the test pool, then the area
# under that curve by trapezoidal integration.
roc_points = get_roc_curve(tuned_model, test_pool)
fpr, tpr, thresholds = roc_points
auc = sklearn.metrics.auc(x=fpr, y=tpr)

In [9]:
# ROC curve for the test pool (decision threshold shown on hover) drawn
# against the dashed chance diagonal.
lw = 2

threshold_hover = ['Threshold: ' + str(np.round(cutoff, 4)) for cutoff in thresholds]

trace1 = go.Scatter(
    x=fpr,
    y=tpr,
    text=threshold_hover,
    mode='lines',
    hoverinfo='text+x+y',
    line={'color': 'darkorange', 'width': lw},
    name='ROC curve (area = %0.2f)' % auc,
)

# Reference line: a classifier that guesses at random.
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line={'color': 'navy', 'width': lw, 'dash': 'dash'},
    showlegend=False,
)

layout = go.Layout(
    title='Area Under the Receiver Operating Characteristics',
    xaxis={'title': 'False Positive Rate'},
    yaxis={'title': 'True Positive Rate', 'hoverformat': '.2f'},
)

fig = go.Figure(data=[trace1, trace2], layout=layout)

iplot(fig)

In [10]:
# Grouped bars of precision / recall / F1 per report row, with each class's
# support drawn against a secondary count axis on the right.
prfs_report = sklearn.metrics.classification_report(
    np.ravel(y_test),
    predictions,
    target_names=y_label_mapping.Label.to_list(),
    output_dict=True,
)

# Flatten {label: {metric: value}} into long-format rows. Entries whose value
# is not a dict (for example a bare overall-accuracy float) carry no
# per-metric breakdown and would crash on .items(), so they are skipped.
prfs_values = [
    [label, metric, value]
    for label, metrics in prfs_report.items()
    if isinstance(metrics, dict)
    for metric, value in metrics.items()
]
prfs_df = pd.DataFrame(prfs_values, columns=['Label', 'Metric', 'Value'])

# Aggregate rows get metric bars but no support bar.
average_rows = ['micro avg', 'macro avg', 'weighted avg']
report_labels = [label for label, metrics in prfs_report.items() if isinstance(metrics, dict)]
class_labels = [label for label in report_labels if label not in average_rows]

# Palette: one hue slot per report row plus one per class. Only the first
# len(report_labels) slots are looked up, and a class reuses its own hue for
# its support bar.
N = len(report_labels) + len(class_labels)
c = ['hsl('+str(h)+',75%'+',75%)' for h in np.linspace(0, 360, N)]
label_colors = dict(zip(report_labels, c))

trace_groups = []

for label in report_labels:
    metrics = prfs_report[label]
    trace_groups.append(
        go.Bar(
            x=['precision', 'recall', 'f1-score'],
            y=[metrics['precision'], metrics['recall'], metrics['f1-score']],
            name=label,
            marker=dict(color=label_colors[label]),
        )
    )

for label in class_labels:
    trace_groups.append(
        go.Bar(
            x=['support'],
            y=[prfs_report[label]['support']],
            yaxis='y2',
            name=label,
            marker=dict(color=label_colors[label]),
            showlegend=False,
        )
    )

# Keep the historical 0-10000 count axis for small test sets, but grow it
# when a class has more rows so its support bar is not clipped.
max_support = max((prfs_report[label]['support'] for label in class_labels), default=0)
support_axis_max = max(10000, 1.05 * max_support)

data = trace_groups
layout = go.Layout(
    title=go.layout.Title(
        text='Classification Performance Report',
        xref='paper',
    ),
    legend=dict(orientation="h"),
    barmode='group',
    yaxis=dict(
        title='Percentage',
        range=[0, 1]
    ),
    yaxis2=dict(
        title='Count',
        range=[0, support_axis_max],
        overlaying='y',
        side='right'
    ),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
from eli5.permutation_importance import get_score_importances

def score(X, y):
    """Return the ROC AUC of ``tuned_model`` on features ``X`` against labels ``y``.

    ROC AUC is a ranking metric, so it is computed from the predicted
    probability of the positive class. Hard labels from ``predict()`` would
    collapse the curve to a single operating point and hide most of the
    effect of shuffling a feature.

    Assumes a binary classifier whose second ``predict_proba`` column is the
    positive class — TODO confirm against the label mapping.
    """
    positive_proba = tuned_model.predict_proba(X)[:, 1]
    # y may arrive as an (n, 1) column when it comes from a DataFrame; flatten it.
    return sklearn.metrics.roc_auc_score(np.ravel(y), positive_proba)


# Permutation importance: eli5 shuffles one column at a time and records how
# far the score falls below the unshuffled baseline, repeated n_iter times.
base_score, score_decreases = get_score_importances(
    score,
    x_test.to_numpy(),
    y_test.to_numpy(),
    random_state=42,
    n_iter=50,
)
# Average drop per feature across the repetitions.
feature_importances = np.mean(score_decreases, axis=0)

In [12]:
# One box per feature showing the spread of the AUC drop over the
# permutation repetitions.
N = len(x_test.columns)  # number of boxes

c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

# The axis is labelled in basis points, and one basis point is 1e-4 of the
# 0-1 AUC scale. Multiplying by 100 (as before) gave percentage points.
BASIS_POINTS_PER_UNIT = 10000

decrease_matrix = np.array(score_decreases)  # indexed [repetition, feature]
display_names = [
    var_roles[var_roles.Feature == column].Variable.values[0]
    for column in x_test.columns
]

data = [
    {
        'y': decrease_matrix[:, i] * BASIS_POINTS_PER_UNIT,
        'name': display_names[i],
        'type': 'box',
        'marker': {'color': c[i]},
    }
    for i in range(N)
]

# format the layout
layout = go.Layout(
    title=go.layout.Title(
        text='Mean Decrease In AUC',
        xref='paper',
    ),
    legend=dict(orientation="h"),
    barmode='group',
    xaxis=dict(
        showgrid=False,
        zeroline=False,
        tickangle=60,
        showticklabels=False
    ),
    yaxis=dict(
        title='Basis Points Drop',
        zeroline=True,
        gridcolor='white',
    ),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)'
)


fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [13]:
# Pairwise feature-interaction strengths from catboost, charted for the first
# five pairs returned.
# NOTE(review): the "5 most important" title relies on catboost returning the
# pairs strongest-first — confirm for the installed version.
joint_importance = tuned_model.get_feature_importance(test_pool, prettified=True, type="Interaction")

# Each row is (first feature position, second feature position, strength).
features1 = [item[0] for item in joint_importance]
features2 = [item[1] for item in joint_importance]
joint_imp = [item[2] for item in joint_importance]


def _display_name(position):
    """Translate a column position in x_test into its display name from var_roles."""
    column = x_test.columns[int(position)]
    return var_roles[var_roles.Feature == column].Variable.values[0]


# Direct positional lookup. The previous scan over every column silently kept
# the name from the preceding pair whenever a position matched nothing; an
# unknown position now fails loudly instead.
first_feature = [_display_name(position) for position in features1]
second_feature = [_display_name(position) for position in features2]
str_features = [a + " & " + b for a, b in zip(first_feature, second_feature)]

# Graph in Plotly
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

trace = go.Bar(
    y=joint_imp[:5],
    x=str_features[:5],
    name='Joint Feature Importance',
    orientation='v',
    marker=dict(
        color='rgb(228,81,73)',
        line=dict(
            color='rgb(228,81,73)',
            width=3)
    )
)

data = [trace]
layout = go.Layout(
    barmode='stack',
    title=go.layout.Title(
        text='5 Most Important Pairwise Interactions',
        xref='paper',
        x=0.5
    ),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(text='Features', font=axis_title_font)
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(text='Feature Importance', font=axis_title_font)
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [14]:
# N = len(x_test.iloc[:,features2[0]].cat.categories) # Number of boxes

# c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

In [17]:
import shap
# Tree explainer on the model's raw margin output; later cells label these
# values as log-odds and convert the expected value with logit2prob().
explainer = shap.TreeExplainer(tuned_model, model_output='margin')
# Later cells index this as [row, feature]: one contribution per feature for
# every test row.
shap_values = explainer.shap_values(test_pool)


fstr_type soon be deprecated, use type instead

The model has complex ctrs, so the SHAP values will be calculated approximately.


In [18]:
# Dependence plot for the first interaction pair returned: the first
# feature's value on x, its SHAP value on y, and each point coloured by the
# partner feature.
lead_position = features1[0]
partner_column = x_test.iloc[:, features2[0]]

x = x_test.iloc[:, lead_position]
y = pd.Series(shap_values[:, lead_position])

point_style = dict(
    size=6,
    # Upper end of the colour range = number of categories of the partner feature.
    cmax=len(partner_column.cat.categories),
    color=list(partner_column),
    colorbar=dict(tick0=0, dtick=1),
    colorscale='RdBu',
)

tracefull1 = go.Scatter(
    x=x,
    y=y,
    name=str_features[0],
    mode='markers',
    marker=point_style,
)

data = [tracefull1]

axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f',
)

# The partner feature's name is placed just outside the top-right corner of
# the plot area as an annotation.
partner_caption = dict(
    x=1.05,
    y=1.05,
    align="right",
    valign="top",
    text=second_feature[0],
    showarrow=False,
    xref="paper",
    yref="paper",
    xanchor="center",
    yanchor="top",
)

layout = go.Layout(
    barmode='stack',
    title=go.layout.Title(
        text='Feature Importance of ' + str_features[0],
        xref='paper',
        x=0.5,
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(text=first_feature[0], font=axis_title_font),
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(text='SHAP values', font=axis_title_font),
    ),
    annotations=[partner_caption],
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)


In [19]:
def logit2prob(logit):
    """Convert log-odds to a probability with the logistic function.

    Uses the identity ``sigmoid(z) = 0.5 * (1 + tanh(z / 2))``. The direct
    form ``exp(z) / (1 + exp(z))`` overflows to ``inf / inf`` and returns NaN
    for large positive ``z``; ``tanh`` saturates cleanly at +/-1 instead.

    Parameters
    ----------
    logit : float or array-like
        Log-odds value(s).

    Returns
    -------
    Probability in [0, 1], as a NumPy scalar for scalar input and
    element-wise for array-like input.
    """
    # np.multiply (rather than ``0.5 * logit``) keeps plain Python lists working.
    half_logit = np.multiply(logit, 0.5)
    return 0.5 * (1.0 + np.tanh(half_logit))

In [20]:
def explain_row(index):
    """Plot the per-feature SHAP contributions for one test row.

    Features that push the prediction up are drawn first, largest first,
    followed by the features that push it down, most negative first.

    Parameters
    ----------
    index : int
        Positional row number into ``shap_values`` and ``x_test`` (both are
        indexed by position here).
        NOTE(review): explain_pred looks its row up by label with ``.loc`` —
        confirm both resolve to the same record for the indices used.

    Returns
    -------
    Whatever ``iplot`` returns; the figure is rendered as a side effect.
    """
    contributions = shap_values[index, :]
    column_names = np.asarray(x_test.columns)
    row = x_test.iloc[index, :]

    # Hover text: "<display name> = <value in this row>" for every feature.
    hover_text = np.array([
        var_roles[var_roles.Feature == column].Variable.values[0] + ' = ' + str(row[column])
        for column in x_test.columns
    ])

    def _contribution_bar(mask, descending, name, bar_color):
        """Build one bar trace for the features selected by the boolean ``mask``.

        A single ordering is computed and applied to labels, heights and
        hover text together. Sorting each list separately (as before) let
        tied SHAP values, such as several exact zeros, pair a bar with
        another feature's hover text.
        """
        positions = np.flatnonzero(mask)
        selected = contributions[positions]
        sort_key = -selected if descending else selected
        # mergesort is stable, so ties keep their column order everywhere.
        ordered = positions[np.argsort(sort_key, kind='mergesort')]
        return go.Bar(
            x=column_names[ordered].tolist(),
            y=contributions[ordered].tolist(),
            text=hover_text[ordered].tolist(),
            marker=dict(color=[bar_color] * len(ordered)),
            name=name,
            hoverinfo='y+text',
            orientation='v',
            showlegend=True
        )

    trace1 = _contribution_bar(contributions >= 0, True, 'Higher', 'rgb(37,144,134)')
    trace2 = _contribution_bar(contributions < 0, False, 'Lower', 'rgb(228,81,73)')

    axis_title_font = dict(
        family='Courier New, monospace',
        size=18,
        color='#7f7f7f'
    )

    layout = go.Layout(
        barmode='stack',
        title=go.layout.Title(text='Feature Contributions'),
        autosize=True,
        xaxis=go.layout.XAxis(
            title=go.layout.xaxis.Title(text='Features', font=axis_title_font)
        ),
        yaxis=go.layout.YAxis(
            title=go.layout.yaxis.Title(text='SHAP values (Log-Odds)', font=axis_title_font)
        )
    )

    fig = go.Figure(data=[trace1, trace2], layout=layout)

    return iplot(fig)

In [21]:
def explain_pred(index):
    """Print a text summary of one scored test record.

    Looks the record up in ``scored_test_pool`` by index label and prints its
    ID, actual and predicted class, the explainer's baseline converted from
    log-odds to a probability, and the probability stored for the predicted
    class.
    """
    record = scored_test_pool.loc[index, :]

    print('ID: {}'.format(record.ID.astype(int)))
    print('Actual Y: {}'.format(record.Y.astype(int)))

    predicted_label = record.Pred_Y.astype(int)
    print('Predicted Y: {}'.format(predicted_label))

    # The probability column is named after the predicted class, e.g. PredProb_1.
    pred_class = 'PredProb_' + str(predicted_label)

    baseline_probability = logit2prob(explainer.expected_value)
    print('Base Probability Without Features: {:.2f}'.format(baseline_probability))
    print('Predicted Probability: {:.2f}'.format(record[pred_class]))

In [22]:
# Worked example: record 5996, which the captured output below shows as a
# misclassification (actual 0, predicted 1).
explain_pred(5996)

ID: 1260
Actual Y: 0
Predicted Y: 1
Base Probability Without Features: 0.64
Predicted Probability: 0.75


In [23]:
# Per-feature SHAP breakdown for row position 5996, the same number used in
# the explain_pred() example.
explain_row(5996)