# fast-tabulous homesite quote success app
> This app loads a previously trained model and uses it to predict the quote success rate, letting the user change field values. The inputs are ipywidgets generated on the fly so that the most sensitive fields can be altered.

In [3]:
import logging
import random
import threading
import time
import urllib.request

import ipywidgets as widgets
import pandas as pd
import numpy as np

from fastai.tabular.all import *
from IPython.display import display
from IPython.utils import io  # using io.capture_output
from sklearn.metrics import roc_auc_score

In [4]:
# On GPU instance run the following commands

#     to = TabularPandas(df=df_train, procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names,splits=splits, y_block=y_block)
#     dls = to.dataloaders(bs=bs, val_bs=val_bs, layers=layers, embed_ps=emb_dropout, ps=dropout)
#     learn = tabular_learner(dls, metrics=roc_auc_binary)

#     save_pickle("to_0708.pkl", to)
#     learn_model_cpu = learn.model.to('cpu')
#     save_pickle("learn_model_cpu_0708.pkl", learn_model_cpu)
#     dls.to('cpu')
#     save_pickle("dls_cpu_0708.pkl", dls)


# Now specify the folder which contains the original kaggle data (train.csv and test.csv) 
# and the trained TabularModel (learn_model_cpu_0708.pkl) and DataLoaders (dls_cpu_0708.pkl)
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# path = Path('data/homesite-quote')
logger = logging.getLogger("load_pickled_model")
# Root level is INFO, so the logger.debug(...) calls in this notebook are not emitted.
logging.basicConfig(level=logging.INFO)
# Locations of the pickled model, pickled DataLoaders and the Kaggle CSVs.
# NOTE(review): these are Google Drive share links ending in "/view?usp=sharing"; presumably
# fetching them with urllib saves the Drive viewer HTML page rather than the file itself —
# confirm, and switch to direct-download URLs if so.
URL_MODEL = "https://drive.google.com/file/d/1UBcTSyBwkGqjV4BymhUrJgORG9mqVDqU/view?usp=sharing"
URL_DLS = "https://drive.google.com/file/d/173VCxcHV58rhExBNi8HfWqOKh_PELLu6/view?usp=sharing"
URL_TEST = "https://drive.google.com/file/d/1-OuitILRZKJUMeB4mHpS_pV0_3ovFQnd/view?usp=sharing"
URL_TRAIN = "https://drive.google.com/file/d/1-NtnEyuR7pbQb826D_kFiYGYR2kYO0c1/view?usp=sharing"

In [4]:
# Download the pickled model and DataLoaders into the working directory, then rebuild a learner.
# Each file is fetched again on every run (there is no "already downloaded" check).
urllib.request.urlretrieve(URL_MODEL, "learn_model_cpu_0708.pkl")
# NOTE(review): load_pickle comes from the fastai star import and presumably unpickles the file —
# confirm. Unpickling downloaded content can execute arbitrary code, so only point the URLs at
# files you trust.
learn_model_cpu = load_pickle("learn_model_cpu_0708.pkl")
urllib.request.urlretrieve(URL_DLS, "dls_cpu_0708.pkl")
dls_cpu = load_pickle("dls_cpu_0708.pkl")
# to = load_pickle(path/"to_0708.pkl")  # optional for now. needed for xgboost
# Wrap the loaded DataLoaders and model in a learner; `learn` is used by every prediction below.
learn=TabularLearner(dls=dls_cpu, model=learn_model_cpu)

In [5]:
# Sanity-check the loaded model. get_preds() is called without a `dl` argument, so presumably it
# scores the validation set stored in dls_cpu — TODO confirm.
preds, targs = learn.get_preds()
# Column 1 of preds is used as the score; logged at WARNING level although the message is informational.
logger.warning(f"Trained deep learning model has a roc_auc_score of {roc_auc_score(to_np(targs), to_np(preds[:,1]))}")



In [6]:
# Fetch the Kaggle CSVs into the working directory.
urllib.request.urlretrieve(URL_TRAIN, "train.csv")
urllib.request.urlretrieve(URL_TEST, "test.csv")

# Both files are indexed by quote number and carry a parsed quote date.
df_train = pd.read_csv(
    'train.csv',
    index_col="QuoteNumber",
    parse_dates=['Original_Quote_Date'],
    low_memory=False,
)
df_test = pd.read_csv(
    'test.csv',
    index_col="QuoteNumber",
    parse_dates=['Original_Quote_Date'],
    low_memory=False,
)

# Split the target off the training frame; it stays available as a Series keyed by quote number.
sr_conv = df_train.pop('QuoteConversion_Flag')

# One frame holding the independent variables of every quote (train followed by test),
# with the quote date expanded by add_datepart.
df = add_datepart(pd.concat([df_train, df_test]), 'Original_Quote_Date')
logger.debug(f"{df.shape} {df_train.shape} {df_test.shape} {sr_conv.shape}")

# Drop the references to the per-file frames; only `df` and `sr_conv` are used from here on.
df_train = df_test = None

# Range of quote numbers that have a known outcome, and a random starting quote inside it.
qn_min, qn_max = sr_conv.index.min(), sr_conv.index.max()
qn = random.randint(qn_min, qn_max)  # pick an initial quote at random

In [27]:
def lst_ind_value(df, field, max_unique=30, num_steps=10):
    """Return the list of independent values to be tested for the specified field.

    Columns with few distinct values, and all object columns, are tested exhaustively.
    Other numeric columns are sampled at evenly spaced points between their min and max.

    Keyword arguments
        df: DataFrame holding the column
        field: column name
        max_unique: columns with fewer distinct values than this are returned in full
        num_steps: number of intervals for numeric columns (num_steps + 1 values are returned)
    Returns
        array of the column's unique values, or a list of num_steps + 1 evenly spaced values,
        or an empty list (after logging a warning) when the dtype is not handled
    """
    dtype = df.dtypes[field]
    num_unique = df[field].nunique()
    # Few distinct values (or an object column): try every value that occurs
    if num_unique < max_unique or dtype == 'O':
        return df[field].unique()
    # Any integer width (int32, int64, unsigned, nullable ...), not only "int64"
    if pd.api.types.is_integer_dtype(dtype):
        # Python ints so that the multiplication below cannot overflow a narrow integer type
        vmin = int(df[field].min())
        vmax = int(df[field].max())
        return [vmin + (vmax - vmin) * i // num_steps for i in range(num_steps + 1)]
    # Any float width, not only "float64"
    if pd.api.types.is_float_dtype(dtype):
        vmin = df[field].min()
        vmax = df[field].max()
        return [vmin + (vmax - vmin) * i / num_steps for i in range(num_steps + 1)]
    logger.warning(f"Unknown type {field} {num_unique} {dtype!r}")
    return []

def tf_equal_or_nan(a, b):
    """Equality test in which two NaN values also count as equal.

    Plain ``==`` is tried first; otherwise the result is True only when both
    arguments are NaN. Values that np.isnan cannot handle (e.g. strings) are
    simply treated as not-NaN.
    """
    if a == b:
        return True
    try:
        return bool(np.isnan(a) and np.isnan(b))
    except TypeError:
        # np.isnan rejects non-numeric input; such values cannot be NaN
        return False

def nan_if_nan(n):
    """Map NaN to the string "nan" and pass every other value through unchanged.

    np.nan != np.nan, so NaN cannot be matched as a dropdown option; the string is used instead.
    """
    try:
        return "nan" if np.isnan(n) else n
    except TypeError:
        # Not something np.isnan can evaluate (e.g. a str), so it is not NaN
        return n
        
def df_for_field(df_ind_original, f, lst_v):
    """Build the trial rows for one field: copies of the original quote with field f varied.

    Keyword arguments
        df_ind_original: DataFrame holding the independent values of the original quote
            (its first index label is reused for every generated row)
        f: field name
        lst_v: list of alternative values for field f
    Returns
        DataFrame with one row per value in lst_v; all fields other than f are taken from
        df_ind_original, f holds the alternative value, and the extra column "fieldname"
        holds f on every row
    """
    # Everything except the field being varied
    unchanged = df_ind_original.drop(columns=f)
    # One row per trial value, all sharing the original quote's index label
    n_rows = len(lst_v)
    row_label = df_ind_original.index[0]
    varied = pd.DataFrame(
        data={f: lst_v, "fieldname": [f] * n_rows},
        index=[row_label] * n_rows,
    )
    # Joining on the shared index label repeats the unchanged fields on every trial row
    return unchanged.merge(varied, left_index=True, right_index=True)

def sensitivity_analysis(ind_original):
    """Using data from Series of independent variables do a sensitivity analysis on all independent variables

    For every column of the global `df`, trial rows are built in which only that column is
    changed (values from lst_ind_value, rows from df_for_field). All trial rows are scored
    in a single learn.get_preds() call.

    Keyword arguments
        ind_original: Series of independent values for one quote, indexed by field name
    Returns
        df_results: DataFrame with columns 'fieldname' and 'prob_success' (column 1 of the
            predictions), sorted by 'prob_success' descending; its index labels refer to rows
            of df_sensitivity
        df_sensitivity: DataFrame of all trial rows, without the 'fieldname' column
    """
    # All timing log lines below report seconds elapsed since this point
    time_start = datetime.now()
    # Original prediction before changes
    prd = learn.predict(ind_original)
    logger.debug(f"After one predict time = {(datetime.now() - time_start).total_seconds()} seconds")
    # NOTE(review): qcf_original and prb_original are computed but not used later in this function
    # Predicted quote conversion flag
    qcf_original = prd[1].item()
    # Probability that quote conversion flag is as predicted
    prb_original = prd[2][qcf_original].item()
    lst_df_for_field = []
    # Loop through all fields. Check different values of each field to see if result is sensitive to it.
    # One-row DataFrame version of the quote, as required by df_for_field
    df_ind_original = ind_original.to_frame().T
    for field in df.columns:
        # NOTE(review): val_original is not used after this assignment
        val_original = ind_original[field]
        lst_val = lst_ind_value(df, field)
        lst_df_for_field.append(df_for_field(df_ind_original, field, lst_val))
    logger.info(f"Build lst_df_for_field time = {(datetime.now() - time_start).total_seconds()} seconds")
    # ignore_index gives every trial row its own integer label
    df_sensitivity = pd.concat(lst_df_for_field, ignore_index=True)
#     logger.info(f"{df_sensitivity['Field7'].unique()=}")
    logger.info(f"Concat time = {(datetime.now() - time_start).total_seconds()} seconds {df_sensitivity.shape=}")
    # Keep the name of the varied field per row, then remove that helper column before prediction
    sr_fieldname = df_sensitivity['fieldname']
    df_sensitivity.drop('fieldname', inplace=True, axis=1)
    dl = learn.dls.test_dl(df_sensitivity)
    logger.info(f"Dataloader time = {(datetime.now() - time_start).total_seconds()} seconds")
    # Force the continuous columns to float32 — presumably needed because the trial rows were
    # built from a transposed Series and do not carry float32 dtypes; TODO confirm
    dl.dataset.conts = dl.dataset.conts.astype(np.float32)
    # stop learn.get_preds() printing blank lines
    with io.capture_output() as captured:
        # using get_preds() rather than predict() because get_preds can do multiple rows at once
        # NOTE(review): only preds is used below; inp and dec_preds are unused
        inp,preds,_,dec_preds = learn.get_preds(dl=dl, with_input=True, with_decoded=True)
    logger.info(f"Time taken = {(datetime.now() - time_start).total_seconds()} seconds")
    df_results=pd.DataFrame({'fieldname': sr_fieldname, 'prob_success': preds[:,1]})
    # Best trial rows first
    df_results.sort_values(by='prob_success', ascending=False, inplace=True)
    return df_results, df_sensitivity

def sensitivity_analysis_for_quote_number(qn):
    """Run sensitivity_analysis on the row of the global `df` labelled with quote number qn."""
    quote_row = df.loc[qn]
    return sensitivity_analysis(quote_row)


In [28]:
# Widget event handlers
def configure_inputs():
    """Dynamically create inputs (dropdowns and radio buttons) for trialling combinations of values to improve quote success

    Reads the globals df_results / df_sensitivity produced by the last sensitivity analysis,
    picks the first (up to) 10 distinct fields from the top of df_results and builds one input
    widget per field. lst_input, lst_vbox and wdg_inputs.children are replaced as a side effect.
    """
    qn = wdg_quote_number_slider.value
    # Get the top 10 fields most likely to make quote more successful, and their values which work the best
    i = 0
    dct_fields = defaultdict(list)
    # i walks the rows of df_results (one row per trial value), so bound it by the row count
    while len(dct_fields) < 10 and i < len(df_results):
        f = df_results.iloc[i, 0]  # fieldname column
        idx = df_results.index[i]  # index into df_sensitivity
        # independent variable value which has a good result;
        # collected per field so each field lists all of its good values
        dct_fields[f].append(df_sensitivity.loc[idx, f])
        i += 1
    # delete all elements of lst_input and lst_vbox without deleting references
    del lst_input[:]
    del lst_vbox[:]
    for priority, (f, lst_recommend) in enumerate(dct_fields.items(), start=1):
        num_unique = df[f].nunique()
        # isna() also works for object columns, where np.isnan raises TypeError and a
        # missing value would otherwise reach sorted() alongside strings
        tf_nan = bool(df[f].isna().any())
        lst_unique = sorted(df[f].dropna().unique())
        if tf_nan:
            # NaN cannot be matched as an option, so it is offered as the string "nan"
            lst_unique.append("nan")
        v = nan_if_nan(df.loc[qn, f])
        tip = f"Priority {priority}. Initially {v}. Recommend {lst_recommend}"
        lbl = widgets.HTML(value=f"{tip}")
        # Radio buttons for a handful of choices, dropdown otherwise
        if num_unique < 5:
            wdg = widgets.RadioButtons(options=lst_unique,
                                       description=f,
                                       description_tooltip=tip,
                                       style=style_input,
                                       value=v)
        else:
            wdg = widgets.Dropdown(options=lst_unique,
                                   description=f,
                                   description_tooltip=tip,
                                   style=style_input,
                                   value=v)
        wdg.observe(handle_input_change, names='value')
        lst_vbox.append(widgets.HBox(children=[wdg, lbl]))
        lst_input.append(wdg)
    # Assign once, after the loop; this also clears stale inputs when no fields were found
    wdg_inputs.children = lst_vbox
        
def do_progress_bar(progress):
    """Advance progress.value from 0.01 to 1.0 in 100 steps, sleeping 0.2 seconds before each step."""
    total = 100
    for done in range(1, total + 1):
        time.sleep(0.2)
        progress.value = float(done) / total

def do_sensitivity_analysis(btn=None):
    """Do a fresh sensitivity analysis for selected quote number

    Button click handler (btn is the clicked button and is not used). Stores the analysis
    in the globals df_results / df_sensitivity, prints the top 20 results into
    wdg_prob_success_out, rebuilds the input widgets and refreshes the status line.
    The progress bar is shown while this runs and is hidden again even if a step fails.
    """
    global df_results, df_sensitivity

    # Start from an empty bar so a repeated run does not briefly show the previous run's value
    wdg_progress.value = 0.0
    wdg_progress.layout.visibility = 'visible'
    # The bar is time-driven (see do_progress_bar); it is not tied to the actual progress
    thread = threading.Thread(target=do_progress_bar, args=(wdg_progress,))
    thread.start()
    try:
        qn = wdg_quote_number_slider.value
        wdg_logging_out.clear_output()
        # Anything printed/logged to stdout/stderr by the analysis is captured by this output widget
        with wdg_logging_out:
            df_results, df_sensitivity = sensitivity_analysis_for_quote_number(qn)
        wdg_prob_success_out.clear_output()
        with wdg_prob_success_out:
            print(df_results.head(20))
        configure_inputs()
        # The argument is ignored by handle_input_change; this just refreshes the status line
        handle_input_change(0)
    finally:
        # Never leave the bar on screen, including when the analysis raises
        wdg_progress.layout.visibility = 'hidden'

def handle_input_change(change):
    """Re-predict the selected quote with the current widget values and show the result in wdg_status.

    `change` is the ipywidgets change notification; it is not inspected.
    """
    qn = wdg_quote_number_slider.value
    quote = df.loc[qn].copy()
    # Overlay every dynamic input onto the quote; "nan" is the dropdown stand-in for a missing value
    for wdg in lst_input:
        quote[wdg.description] = np.nan if wdg.value == "nan" else wdg.value
    # Suppress whatever learn.predict writes to the cell output
    with io.capture_output():
        prediction = learn.predict(quote)
    qcf = prediction[1].item()
    prb = prediction[2][qcf].item()
    # Actual outcome is only known for quotes present in sr_conv
    if qn in sr_conv:
        act = dct_success_label[sr_conv[qn]]
    else:
        act = "unknown"
    wdg_status.value = f"<h2>Quote {qn} actual: {act}, predicted: {prb:.2%} {dct_success_label[qcf]}</h2>"


def handle_quote_number_change(change):
    """Predict the newly selected quote (change.new) unmodified and show the result in wdg_quote_success."""
    qn = change.new
    # Suppress whatever learn.predict writes to the cell output
    with io.capture_output():
        prediction = learn.predict(df.loc[qn])
    qcf = prediction[1].item()
    prb = prediction[2][qcf].item()
    # Actual outcome is only known for quotes present in sr_conv
    if qn in sr_conv:
        act = dct_success_label[sr_conv[qn]]
    else:
        act = "unknown"
    wdg_quote_success.value = f"Quote {change.new} actual: {act}, predicted {prb:.2%} {dct_success_label[qcf]}"

In [29]:
# Shared, module-level lists: configure_inputs() empties and refills them in place.
# lst_input holds the dynamic input widgets, lst_vbox the HBox rows (input + label) shown in wdg_inputs.
lst_input = []
lst_vbox = []
# define all standard widgets
# One-line prediction summary, written by handle_quote_number_change
wdg_quote_success = widgets.Label(value="")
# Maps a conversion flag / predicted class (0 or 1) to the text shown to the user
dct_success_label = {0: "unsuccessful", 1: "successful"}
style_qn = {'description_width': 'initial', 'width': '500px'}
style_input = {'description_width': 'initial'}
# Two views of the same quote number, both limited to [qn_min, qn_max] and starting at qn
wdg_quote_number_text = widgets.BoundedIntText(
    description="Quote number", min=qn_min, max=qn_max, value=qn, style=style_qn)
wdg_quote_number_slider = widgets.IntSlider(
    description="Quote number", min=qn_min, max=qn_max, value=qn, style=style_qn, layout={'width': '600px'})
# link slider and textfield together
qn_link = widgets.jslink((wdg_quote_number_text, 'value'), (wdg_quote_number_slider, 'value'))
# Only the slider is observed from Python; the handlers read the quote number from the slider
wdg_quote_number_slider.observe(handle_quote_number_change, names='value')
wdg_sensitivity_analysis_button = widgets.Button(
    description='Sensitivity Analysis',
    tooltip='Do a fresh sensitivity analysis for selected quote number and display top 10 inputs ',
)
wdg_sensitivity_analysis_button.on_click(do_sensitivity_analysis)
# Status headline; overwritten by handle_input_change after each analysis / input change
wdg_status = widgets.HTML(value=f"<h2>Please click on button 'Sensitivity Analysis' and wait 20 seconds</h2>")
# Captures output produced while the analysis runs.
# NOTE(review): this widget is not passed to display() in the cell that renders the UI — confirm that is intended
wdg_logging_out = widgets.Output(layout={'border': '1px solid green'})
# Receives the printed head of df_results
wdg_prob_success_out = widgets.Output()
# Container for the dynamic inputs; starts empty and gets its children from configure_inputs()
wdg_inputs = widgets.VBox(children=lst_vbox)
# Progress bar in the range 0..1, hidden until an analysis is started
wdg_progress = widgets.FloatProgress(value=0.0, min=0.0, max=1.0)
wdg_progress.layout.visibility = 'hidden'

In [30]:
# Render the UI top to bottom: quote selection, prediction summary, analysis trigger
# and progress, status headline, dynamic inputs, and finally the results table.
display(
    wdg_quote_number_text,
    wdg_quote_number_slider,
    wdg_quote_success,
    wdg_sensitivity_analysis_button,
    wdg_progress,
    wdg_status,
    wdg_inputs,
    wdg_prob_success_out,
)

BoundedIntText(value=197430, description='Quote number', max=434588, min=1, style=DescriptionStyle(description…

IntSlider(value=197430, description='Quote number', layout=Layout(width='600px'), max=434588, min=1, style=Sli…

Label(value='')

Button(description='Sensitivity Analysis', style=ButtonStyle(), tooltip='Do a fresh sensitivity analysis for s…

FloatProgress(value=0.0, layout=Layout(visibility='hidden'), max=1.0)

HTML(value="<h2>Please click on button 'Sensitivity Analysis' and wait 20 seconds</h2>")

VBox()

Output()