In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib
import matplotlib.ticker as ticker
import seaborn as sns
# Apply seaborn's default styling to all matplotlib figures. color_codes=True
# asks seaborn to remap matplotlib's one-letter colour shorthands ("b", "g",
# "r", ...) to the colours of the seaborn palette.
sns.set(color_codes=True)
# Lift pandas' column display limit so wide frames are rendered in full
# instead of being truncated with "...".
pd.set_option('display.max_columns', None)

In [8]:
# Load the CSV that every later cell reads through the module-level name `data`.
# The path is relative, so the notebook must be run from the directory that
# contains the `data/` folder.
# NOTE(review): the recorded preview of this frame shows an "Unnamed: 0" column
# holding 0, 1, 2, ... - this looks like a row index that was written out with
# the CSV. If confirmed, reading with index_col=0 would drop it.
data = pd.read_csv("data/test_df_full.csv")



In [9]:
# Preview the first rows to check that the columns used by the plots below
# (Class, predicted_probability, proba_decile, FN, FP) were loaded.
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,scaled_amount,scaled_time,Class,predicted_probability,proba_decile,prediction,FN,FP
0,1.054379,-0.764756,0.160168,0.665587,1.113466,4.771656,-1.536609,1.306483,1.565054,-0.349941,-1.238496,0.608002,-0.249115,-0.933969,-1.407873,-0.191383,-0.180853,0.22698,0.53747,0.050995,-0.097138,-0.029759,-0.171596,1.047201,0.698595,-0.217939,0.10197,0.040152,0.478654,-0.182124,0,0.0,0,0,0,0
1,-4.805134,4.351191,-0.916135,-0.900752,-0.870777,-0.782946,0.328104,0.582501,2.749336,4.72806,0.455162,0.64974,0.930379,-1.663325,1.334738,-0.283321,0.15234,-0.950771,-0.935567,2.160193,-0.854486,-0.758266,0.147005,-0.021274,0.55266,0.008318,0.431835,-0.436239,-0.169378,-0.046108,0,0.01,17,0,0,0
2,-1.549833,-0.261143,1.556289,-2.037817,-0.366315,-1.334314,0.165406,-0.025782,-1.472751,-0.172838,-0.594842,0.063059,1.054185,-0.171535,0.192041,-0.867483,-0.348274,0.743352,-1.454672,-0.347365,-0.405656,-0.951005,-0.147752,0.4241,0.081845,1.103559,-0.502271,-0.248822,0.334863,-0.511327,0,0.01,17,0,0,0
3,0.216344,0.663182,1.30352,0.169219,-0.404266,-0.517288,-0.036938,-0.838221,-0.528675,-0.174777,1.648466,1.114947,0.20215,0.479424,0.22642,0.456003,-0.558258,-0.101939,0.000758,-0.184929,0.552788,-0.961206,-0.014642,0.525962,0.750877,0.141543,0.045107,0.127321,-0.256154,-0.987464,0,0.0,0,0,0,0
4,-0.273365,0.825649,0.555674,0.384915,1.343842,1.984159,0.321808,0.510275,-0.891463,-0.115561,0.952916,-0.221254,-0.867646,0.974855,2.441035,-1.711023,0.9024,0.022272,2.097232,0.172188,0.346482,1.162889,-0.557535,-0.896146,0.503553,0.555129,0.129099,0.047571,-0.266444,0.895224,0,0.02,18,0,0,0


In [10]:
%matplotlib inline
from ipywidgets import interact
import seaborn as sns
# Make the Agg renderer draw long line paths in chunks of at most 500 vertices
# (the matplotlib default of 0 disables chunking).
# NOTE(review): the matplotlib documentation suggests values in the range
# 10000-100000 for this setting; 500 is far below that - confirm it is intended.
plt.rcParams['agg.path.chunksize'] = 500

In [11]:
from IPython.display import Image

In [12]:
# Settings for each line-chart view, keyed by the label shown in the dropdown.
# Insertion order is the order of the dropdown entries.
#   column        - column of `data` that is aggregated within each proba_decile group
#   agg           - groupby aggregation: 'mean' for rate views, 'sum' for count views
#   ylabel, title - y-axis label and chart title
#   axis_decimals - decimals of the percent-formatted y axis; None keeps the
#                   default numeric axis (count views)
#   label_fmt     - format string for the per-point annotations; percent views
#                   are formatted from y * 100, count views from y
_LINE_VIEWS = {
    'Actual % of Fraud': {
        'column': 'Class',
        'agg': 'mean',
        'ylabel': "Actual % of Fraud",
        'title': "Actual % of Fraud by Probability Vigintile",
        'axis_decimals': 1,
        'label_fmt': "{:.2f}%",
    },
    'Actual Cases of Fraud': {
        'column': 'Class',
        'agg': 'sum',
        'ylabel': "Actual Cases of Fraud",
        'title': "Actual Cases of Fraud by Probability Vigintile",
        'axis_decimals': None,
        'label_fmt': "{:.0f}",
    },
    'Predicted Likelihood': {
        'column': 'predicted_probability',
        'agg': 'mean',
        'ylabel': "Predicted Likelihood of Fraud",
        'title': "Predicted Fraud Likelihood by Probability Vigintile",
        'axis_decimals': 0,
        'label_fmt': "{:.1f}%",
    },
    'False Positives': {
        'column': 'FP',
        'agg': 'sum',
        'ylabel': "Number of False Positives",
        'title': "Number of False Positives by Probability Vigintile",
        'axis_decimals': None,
        'label_fmt': "{:.0f}",
    },
    'False Negatives': {
        'column': 'FN',
        'agg': 'sum',
        'ylabel': "Number of False Negatives",
        'title': "Number of False Negatives by Probability Vigintile",
        'axis_decimals': None,
        'label_fmt': "{:.0f}",
    },
}

# The one view that shows a pre-rendered image instead of a chart built from `data`.
_CONFUSION_MATRIX_VIEW = 'RF Confusion Matrix'
_CONFUSION_MATRIX_PATH = 'data/image.png'


def _plot_by_vigintile(column, agg, ylabel, title, axis_decimals, label_fmt):
    """Draw one aggregated column of `data` against `proba_decile` as a line chart.

    `data` is grouped by its `proba_decile` column (labelled "vigintile" on the
    charts), `column` is reduced with `agg` inside each group, and every point
    of the resulting line is annotated with its value.

    Parameters
    ----------
    column : str
        Column of the module-level `data` frame to aggregate.
    agg : str
        Aggregation applied per group ('mean' or 'sum').
    ylabel, title : str
        Y-axis label and chart title.
    axis_decimals : int or None
        When not None, the y axis is formatted as a percentage (1.0 == 100%)
        with this many decimals and annotations show y * 100; when None the
        axis and annotations show the raw aggregated value.
    label_fmt : str
        `str.format` template applied to each annotated value.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    plotdata = (
        data[[column, 'proba_decile']]
        .groupby('proba_decile')
        .agg(agg)
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(15, 10))
    # x/y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the positional form raises TypeError there. `ax=ax`
    # pins the line to the figure created above rather than the implicit
    # "current" axes.
    sns.lineplot(x=plotdata['proba_decile'], y=plotdata[column], marker='o', ax=ax)

    # One tick per group so every vigintile is labelled on the x axis.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.set_ylabel(ylabel, labelpad=15)
    is_percent = axis_decimals is not None
    if is_percent:
        ax.yaxis.set_major_formatter(
            mtick.PercentFormatter(xmax=1.0, decimals=axis_decimals)
        )
    ax.set_xlabel("Predicted Probability Vigintile", labelpad=15)
    ax.set_title(title, pad=20)

    for x, y in zip(plotdata['proba_decile'], plotdata[column]):
        shown = y * 100 if is_percent else y
        ax.annotate(
            label_fmt.format(shown),     # text of the annotation
            (x, y),                      # data point being labelled
            textcoords="offset points",  # xytext is an offset from the point
            xytext=(-19, 7),             # shift the label up and to the left
            ha='center',
        )
    return ax


@interact(Variable=[*_LINE_VIEWS, _CONFUSION_MATRIX_VIEW])
def plot(Variable):
    """Render the view selected in the `Variable` dropdown.

    Every label in `_LINE_VIEWS` produces a line chart of `data` aggregated per
    `proba_decile` group; 'RF Confusion Matrix' displays the image stored at
    'data/image.png'. Any other value draws nothing. Returns None so the
    widget shows only the figure or image.
    """
    if Variable == _CONFUSION_MATRIX_VIEW:
        display(Image(filename=_CONFUSION_MATRIX_PATH))
    elif Variable in _LINE_VIEWS:
        _plot_by_vigintile(**_LINE_VIEWS[Variable])

interactive(children=(Dropdown(description='Variable', options=('Actual % of Fraud', 'Actual Cases of Fraud', …