In [1]:
from pyspark.sql import SparkSession
from ai.h2o.sparkling.H2OContext import H2OContext

In [2]:
# Obtain the SparkSession used by the rest of the notebook; the application name is "H2O".
spark = SparkSession.builder.appName("H2O").getOrCreate()

In [3]:
# Obtain the Sparkling Water H2OContext; the banner in the cell output below reports
# the H2O cluster it connected to.
hc = H2OContext.getOrCreate()

Connecting to H2O server at http://ajays-air:54325 ... successful.


0,1
H2O_cluster_uptime:,20 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,2 months and 2 days
H2O_cluster_name:,sparkling-water-aj_local-1626938892146
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,574 Mb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8



Sparkling Water Context:
 * Sparkling Water Version: 3.32.1.3-1-3.0
 * H2O name: sparkling-water-aj_local-1626938892146
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,192.168.43.37,54323)
  ------------------------

  Open H2O Flow in browser: http://ajays-air:54325 (CMD + click in Mac OSX)

    


In [4]:
def generateScore(prob):
    """
    Convert a predicted probability into a score on a log-odds scale.

    The score is ``offset + factor * ln(prob / (1 - prob))`` where
    ``factor = pdo / ln(2)``, ``offset = 500 - factor * ln(pdo)`` and ``pdo``
    is fixed at 20.

    :param prob: probability produced by the model (scalar or NumPy array)
    :return: the score, with the same shape as ``prob``
    """
    pdo = 20
    odds = prob / (1 - prob)
    scale = pdo / np.log(2)
    shift = 500 - scale * np.log(pdo)
    return shift + scale * np.log(odds)

In [5]:
def stabilityHelperPSI_ModelComparison(initial, new):
    """
    Build the Population Stability Index (PSI) table for a train/test pair.

    Both samples are counted in the same ten equal-width buckets. Each bucket
    contributes ``(test% - train%) * ln(test% / train%)``; a percentage of
    exactly 0 is floored to 0.001 so the logarithm stays finite.

    :param initial: scores of the training (baseline) sample
    :param new: scores of the test (comparison) sample
    :return: DataFrame with one row per bucket ('Bucket', 'Breakpoint Value',
        'Train Count', 'Test Count', 'Train Percent', 'Test Percent', 'Score')
        followed by one extra row whose 'Score' is the total PSI and whose
        other cells are blank strings
    :raises ValueError: if either sample is empty
    """
    size = 11  # number of bin edges, i.e. size - 1 = 10 buckets
    initial_values = np.asarray(initial)
    new_values = np.asarray(new)
    if initial_values.size == 0 or new_values.size == 0:
        raise ValueError("initial and new must each contain at least one value")

    # The edges must span BOTH samples and end exactly on the maximum. Stepping
    # by range / size for size edges stops one step short of the maximum, which
    # leaves the top of the range uncounted while the percentages are still
    # divided by the full sample size.
    lower = min(initial_values.min(), new_values.min())
    upper = max(initial_values.max(), new_values.max())
    binSupporter = np.linspace(lower, upper, size)
    initial_counts = np.histogram(initial_values, binSupporter)[0]
    new_counts = np.histogram(new_values, binSupporter)[0]

    df = pd.DataFrame({
        'Bucket': np.arange(1, size),
        'Breakpoint Value': binSupporter[1:],
        'Train Count': initial_counts,
        'Test Count': new_counts,
    })
    df['Train Percent'] = df['Train Count'] / initial_values.size
    df['Test Percent'] = df['Test Count'] / new_values.size
    # .loc writes into the frame itself; chained indexing (df[col][mask] = ...)
    # may write into a temporary copy instead.
    df.loc[df['Test Percent'] == 0, 'Test Percent'] = 0.001
    df.loc[df['Train Percent'] == 0, 'Train Percent'] = 0.001

    bucket_scores = (df['Test Percent'] - df['Train Percent']) * np.log(df['Test Percent'] / df['Train Percent'])
    # One extra trailing row carries the total PSI.
    score = pd.DataFrame({'Score': np.append(bucket_scores.to_numpy(), np.sum(bucket_scores))})

    df = pd.concat([df, score], axis=1, sort=False)
    # The total row has no bucket data; show those cells as blanks.
    df = df.replace(np.nan, "")
    df = df.replace([np.inf, -np.inf], 0)
    return df.round(2)



In [6]:

def stabilityHelperPSI_OngoingComparison(initial, new):
    """
    Build the Population Stability Index (PSI) table for a development/monitoring pair.

    Both samples are counted in the same ten equal-width buckets. Each bucket
    contributes ``(monitoring% - development%) * ln(monitoring% / development%)``;
    a percentage of exactly 0 is floored to 0.001 so the logarithm stays finite.

    :param initial: scores of the development (baseline) sample
    :param new: scores of the monitoring sample
    :return: DataFrame with one row per bucket ('Bucket', 'Breakpoint Value',
        'Development Count', 'Monitoring Count', 'Development Percent',
        'Monitoring Percent', 'Score') followed by one extra row whose 'Score'
        is the total PSI and whose other cells are blank strings
    :raises ValueError: if either sample is empty
    """
    size = 11  # number of bin edges, i.e. size - 1 = 10 buckets
    initial_values = np.asarray(initial)
    new_values = np.asarray(new)
    if initial_values.size == 0 or new_values.size == 0:
        raise ValueError("initial and new must each contain at least one value")

    # The edges must span BOTH samples and end exactly on the maximum. Stepping
    # by range / size for size edges stops one step short of the maximum, which
    # leaves the top of the range uncounted while the percentages are still
    # divided by the full sample size.
    lower = min(initial_values.min(), new_values.min())
    upper = max(initial_values.max(), new_values.max())
    binSupporter = np.linspace(lower, upper, size)
    initial_counts = np.histogram(initial_values, binSupporter)[0]
    new_counts = np.histogram(new_values, binSupporter)[0]

    df = pd.DataFrame({
        'Bucket': np.arange(1, size),
        'Breakpoint Value': binSupporter[1:],
        'Development Count': initial_counts,
        'Monitoring Count': new_counts,
    })
    df['Development Percent'] = df['Development Count'] / initial_values.size
    df['Monitoring Percent'] = df['Monitoring Count'] / new_values.size
    # .loc writes into the frame itself; chained indexing (df[col][mask] = ...)
    # may write into a temporary copy instead.
    df.loc[df['Monitoring Percent'] == 0, 'Monitoring Percent'] = 0.001
    df.loc[df['Development Percent'] == 0, 'Development Percent'] = 0.001

    bucket_scores = (df['Monitoring Percent'] - df['Development Percent']) * np.log(
        df['Monitoring Percent'] / df['Development Percent'])
    # One extra trailing row carries the total PSI.
    score = pd.DataFrame({'Score': np.append(bucket_scores.to_numpy(), np.sum(bucket_scores))})

    df = pd.concat([df, score], axis=1, sort=False)
    # The total row has no bucket data; show those cells as blanks.
    df = df.replace(np.nan, "")
    df = df.replace([np.inf, -np.inf], 0)
    return df.round(2)



In [7]:
def PSI_OngoingComparison(developmentProb, monitoringProb):
    """
    Compute the development-vs-monitoring PSI table from predicted probabilities.

    Both probability inputs are converted to scores with ``generateScore`` and
    the two score samples are passed to ``stabilityHelperPSI_OngoingComparison``.

    :param developmentProb: predicted probabilities for the development sample
    :param monitoringProb: predicted probabilities for the monitoring sample
    :return: the table returned by ``stabilityHelperPSI_OngoingComparison``
    """
    return stabilityHelperPSI_OngoingComparison(
        generateScore(developmentProb),
        generateScore(monitoringProb),
    )


def PSI_ModelComparison(developmentProb, monitoringProb):
    """
    Compute the model-comparison PSI table from predicted probabilities.

    Both probability inputs are converted to scores with ``generateScore`` and
    the two score samples are passed to ``stabilityHelperPSI_ModelComparison``.

    :param developmentProb: predicted probabilities for the first (baseline) sample
    :param monitoringProb: predicted probabilities for the second sample
    :return: the table returned by ``stabilityHelperPSI_ModelComparison``
    """
    return stabilityHelperPSI_ModelComparison(
        generateScore(developmentProb),
        generateScore(monitoringProb),
    )




In [8]:
# CSI TEST
def stabilityHelper(initial, new):
    """
    Compute a stability index between two samples of one variable.

    Both samples are counted in the same ten equal-width buckets; each bucket
    contributes ``(new% - initial%) * ln(new% / initial%)``. A percentage of
    exactly 0 is floored to 0.001 so the logarithm stays finite.

    :param initial: values of the variable in the baseline sample
    :param new: values of the variable in the comparison sample
    :return: the summed index, rounded to 4 decimals
    :raises ValueError: if either sample is empty
    """
    size = 11  # number of bin edges, i.e. size - 1 = 10 buckets
    initial_values = np.asarray(initial)
    new_values = np.asarray(new)
    if initial_values.size == 0 or new_values.size == 0:
        raise ValueError("initial and new must each contain at least one value")

    # The edges must span BOTH samples and end exactly on the maximum. Stepping
    # by range / size for size edges stops one step short of the maximum, which
    # leaves the top of the range uncounted while the percentages are still
    # divided by the full sample size.
    lower = min(initial_values.min(), new_values.min())
    upper = max(initial_values.max(), new_values.max())
    binSupporter = np.linspace(lower, upper, size)

    initial_percent = np.histogram(initial_values, binSupporter)[0] / initial_values.size
    new_percent = np.histogram(new_values, binSupporter)[0] / new_values.size
    # Plain array assignment replaces the chained DataFrame indexing, which may
    # write into a temporary copy instead of the data being scored.
    new_percent[new_percent == 0] = 0.001
    initial_percent[initial_percent == 0] = 0.001

    score = np.sum((new_percent - initial_percent) * np.log(new_percent / initial_percent))
    return np.round(score, 4)




In [9]:

def CSI(development, monitoring):
    """
    Compute the Characteristic Stability Index for one variable.

    Thin wrapper that forwards both samples to ``stabilityHelper``.

    :param development: values of the variable in the development sample
    :param monitoring: values of the variable in the monitoring sample
    :return: the value returned by ``stabilityHelper``
    """
    return stabilityHelper(development, monitoring)


def calc_csi(column_name, X_train, X_test):
    """
    Compute the CSI of a single column shared by two frames.

    :param column_name: name of the column to compare
    :param X_train: frame supplying the baseline values (indexed by column name)
    :param X_test: frame supplying the comparison values (indexed by column name)
    :return: the CSI returned by ``CSI`` for that column
    """
    baseline_values = X_train[column_name].values
    comparison_values = X_test[column_name].values
    return CSI(baseline_values, comparison_values)


def calculate_csi(x_test, x_train):
    """
    Compute the CSI of every column of ``x_test`` against ``x_train``.

    :param x_test: comparison frame; its columns define which variables are scored
    :param x_train: baseline frame; must contain the same columns
    :return: DataFrame with columns 'Variable' and 'CSI Score', rounded to 2 decimals
    """
    csi_by_column = {name: calc_csi(name, x_train, x_test) for name in x_test.columns.to_list()}
    summary = pd.DataFrame({
        'Variable': list(csi_by_column.keys()),
        'CSI Score': list(csi_by_column.values()),
    })
    return summary.round(2)




In [10]:

def KS_OngoingComparison(user, y_probas, y_true, x_probas, x_true):
    """
    Draw the KS-statistic charts for the monitoring and the development sample.

    :param user: username handed to ``Model_helper`` to obtain the two trace colours
    :param y_probas: predicted probabilities shown in the "Monitoring" chart
    :param y_true: true labels belonging to ``y_probas``
    :param x_probas: predicted probabilities shown in the "Development" chart
    :param x_true: true labels belonging to ``x_probas``
    :return: tuple ``(monitoring_figure, development_figure)``
    """
    helper = Model_helper(username=user)
    colors = helper.preference_maker(2)
    fig = _ks_statistic_figure(y_true, y_probas, colors, "Monitoring KS Statistic")
    fig2 = _ks_statistic_figure(x_true, x_probas, colors, "Development KS Statistic")
    return fig, fig2


def _ks_statistic_figure(y_true, y_probas, colors, title):
    """Build one KS chart (two cumulative curves plus the KS annotation) with the given title."""
    # All inputs are converted and flattened the same way, so lists, Series and
    # column vectors are accepted for both samples.
    y_true = np.array(y_true).ravel()
    y_probas = np.array(y_probas).ravel()
    thresholds, pct1, pct2, ks_statistic, max_distance_at, classes = binary_ks_curve(y_true, y_probas)

    fig = go.Figure()
    # Position of the threshold at which the two curves are furthest apart.
    idx = np.where(thresholds == max_distance_at)[0][0]
    # NOTE(review): the marker colour "rgba(0,0,0,0)" has alpha 0, so this line
    # appears to be invisible -- confirm whether that is intended.
    fig.add_shape(dict(type="line", x0=max_distance_at, y0=pct1[idx], x1=max_distance_at, y1=pct2[idx],
                       line=dict(color="rgba(0,0,0,0)", width=3, dash="dot")))
    fig.add_trace(go.Scatter(x=thresholds, y=pct2, mode='lines', name='Class 1', line=dict(color=colors[0])))
    fig.add_trace(go.Scatter(x=thresholds, y=pct1, mode='lines', name='Class 0', line=dict(color=colors[1])))
    fig.add_annotation(x=1, y=0.05, showarrow=False,
                       text='KS Statistic : {:.3f} at {:.3f}'.format(ks_statistic, max_distance_at))
    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    fig.update_xaxes(title_text='Threshold')
    fig.update_yaxes(title_text='Percentage Below Threshold')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=-0.23, xanchor="right", x=0.6))
    return fig


def KS_ModelComparison(y_probas, y_true, user):
    """
    Draw the KS-statistic chart for a single model.

    :param y_probas: predicted probabilities
    :param y_true: true labels belonging to ``y_probas``
    :param user: username handed to ``Model_helper`` to obtain the two trace colours
    :return: the Plotly figure
    """
    palette = Model_helper(username=user).preference_maker(2)
    labels = np.array(y_true).reshape(-1)
    scores = np.array(y_probas).reshape(-1)
    thresholds, pct1, pct2, ks_statistic, max_distance_at, classes = binary_ks_curve(labels, scores.ravel())

    # Position of the threshold at which the two curves are furthest apart.
    peak = np.where(thresholds == max_distance_at)[0][0]
    marker_line = dict(color="rgba(0,0,0,0)", width=3, dash="dot")
    caption = 'KS Statistic : {:.3f} at {:.3f}'.format(ks_statistic, max_distance_at)

    fig = go.Figure()
    fig.add_shape(dict(type="line", x0=max_distance_at, y0=pct1[peak], x1=max_distance_at, y1=pct2[peak],
                       line=marker_line))
    fig.add_trace(go.Scatter(x=thresholds, y=pct2, mode='lines', name='Class 1', line=dict(color=palette[0])))
    fig.add_trace(go.Scatter(x=thresholds, y=pct1, mode='lines', name='Class 0', line=dict(color=palette[1])))
    fig.add_annotation(x=1, y=0.05, showarrow=False, text=caption)
    fig.update_layout(
        title_text="<b>KS STATISTICS<b>",
        title_x=0.5,
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        legend=dict(orientation="h", yanchor="bottom", y=-0.23, xanchor="right", x=0.6),
    )
    fig.update_xaxes(title_text='Threshold', showline=True, linewidth=1, linecolor='black',
                     rangemode='nonnegative')
    fig.update_yaxes(title_text='Percentage Below Threshold', showline=True, linewidth=1, linecolor='black',
                     rangemode='nonnegative')
    return fig




In [11]:

def confusionmatrix_OngoingComparison(cnf_all, cnf_all2):
    """
    Draw threshold-slider confusion-matrix heatmaps for monitoring and development.

    :param cnf_all: sequence of confusion matrices for the monitoring sample,
        one per threshold (entry ``i`` is labelled with threshold ``(i + 1) / 10``)
    :param cnf_all2: the same for the development sample
    :return: tuple ``(monitoring_figure, development_figure)``
    """
    fig = _confusion_matrix_slider_figure(cnf_all, "Monitoring Confusion Matrix")
    fig2 = _confusion_matrix_slider_figure(cnf_all2, "Development Confusion Matrix")
    return fig, fig2


def _confusion_matrix_slider_figure(cnf_all, title):
    """Build one heatmap figure with one trace per threshold and a slider that switches between them."""
    # Index 0 of each matrix axis is the negative class and index 1 the positive
    # class ([0, 0] = TN, [1, 1] = TP), so the tick labels must be in this order.
    labels = ['Negative', 'Positive']
    # Show the fifth matrix (threshold 0.5) first when it exists.
    default_step = min(4, len(cnf_all) - 1)

    fig = go.Figure()
    annotations_by_step = []
    for position, matrix in enumerate(cnf_all):
        fig.add_trace(go.Heatmap(visible=(position == default_step), x=labels, y=labels, z=matrix,
                                 colorscale="teal"))
        # One text annotation per cell so the counts are readable on the heatmap.
        annotations_by_step.append([
            {
                "x": labels[j],
                "y": labels[i],
                "font": {"color": "white", "size": 16},
                "text": str(value),
                "xref": "x",
                "yref": "y",
                "showarrow": False
            }
            for i, row in enumerate(matrix)
            for j, value in enumerate(row)
        ])

    # Each slider step shows exactly one trace and swaps in that trace's cell
    # annotations, so the numbers always match the visible matrix.
    steps = []
    for i in range(len(fig.data)):
        visible = [False] * len(fig.data)
        visible[i] = True
        threshold = str((i + 1) / 10)
        steps.append(dict(
            method="update",
            args=[{"visible": visible},
                  {"title.text": title + "<br>Threshold: " + threshold,
                   "annotations": annotations_by_step[i]}],
            label="Threshold : " + threshold,
        ))

    fig.update_layout(title_text=title, title_x=0.5, annotations=annotations_by_step[default_step],
                      sliders=[dict(active=default_step, steps=steps)])
    return fig


def CM_OngoingComparison(y_pred, y_test, x_test, x_pred):
    """
    Build confusion matrices at thresholds 0.1 ... 0.9 for both samples and plot them.

    :param y_pred: predicted probabilities for the first sample (shown as "Monitoring")
    :param y_test: true labels for the first sample
    :param x_test: true labels for the second sample (shown as "Development")
    :param x_pred: predicted probabilities for the second sample
    :return: the two figures returned by ``confusionmatrix_OngoingComparison``
    """
    thresholds = np.arange(0.1, 1, 0.1)
    # Keep the raw scores untouched. Thresholding in place turns them into 0/1
    # on the first pass, after which every later threshold yields the same matrix.
    y_scores = np.asarray(y_pred).ravel()
    x_scores = np.asarray(x_pred).ravel()

    cnf_all1 = []
    cnf_all2 = []
    for cutoff in thresholds:
        # Both samples are cut at the same threshold.
        cnf_all1.append(confusion_matrix(y_test, (y_scores > cutoff).astype(int)))
        cnf_all2.append(confusion_matrix(x_test, (x_scores > cutoff).astype(int)))
    return confusionmatrix_OngoingComparison(cnf_all1, cnf_all2)


def confusionmatrix_ModelComparison(cnf_all):
    """
    Draw a threshold-slider confusion-matrix heatmap for one model.

    :param cnf_all: sequence of confusion matrices, one per threshold
        (entry ``i`` is labelled with threshold ``(i + 1) / 10``)
    :return: the Plotly figure
    """
    title = "<b>CONFUSION MATRIX<b>"
    # Index 0 of each matrix axis is taken to be the negative class and index 1
    # the positive class ([0, 0] = TN, [1, 1] = TP), hence this label order.
    labels = ['Negative', 'Positive']
    # Show the fifth matrix (threshold 0.5) first when it exists.
    default_step = min(4, len(cnf_all) - 1)

    fig = go.Figure()
    annotations_by_step = []
    for position, matrix in enumerate(cnf_all):
        fig.add_trace(go.Heatmap(visible=(position == default_step), x=labels, y=labels, z=matrix,
                                 colorscale="teal"))
        # One text annotation per cell so the counts are readable on the heatmap.
        annotations_by_step.append([
            {
                "x": labels[j],
                "y": labels[i],
                "font": {"color": "white", "size": 16},
                "text": str(value),
                "xref": "x",
                "yref": "y",
                "showarrow": False
            }
            for i, row in enumerate(matrix)
            for j, value in enumerate(row)
        ])

    # Each slider step shows exactly one trace and swaps in that trace's cell
    # annotations, so the numbers always match the visible matrix.
    steps = []
    for i in range(len(fig.data)):
        visible = [False] * len(fig.data)
        visible[i] = True
        threshold = str((i + 1) / 10)
        steps.append(dict(
            method="update",
            args=[{"visible": visible},
                  {"title.text": title + "<br>Threshold: " + threshold,
                   "annotations": annotations_by_step[i]}],
            label="Threshold : " + threshold,
        ))

    fig.update_layout(title_text=title, title_x=0.5, annotations=annotations_by_step[default_step],
                      sliders=[dict(active=default_step, steps=steps)])
    return fig


def CM_ModelComparison(y_pred, y_actual):
    """
    Build confusion matrices at thresholds 0.1 ... 0.9 and plot them.

    :param y_pred: predicted probabilities
    :param y_actual: true labels
    :return: the figure returned by ``confusionmatrix_ModelComparison``
    """
    thresholds = np.arange(0.1, 1, 0.1)
    # Keep the raw scores untouched. Thresholding in place turns them into 0/1
    # on the first pass, after which every later threshold yields the same matrix.
    scores = np.asarray(y_pred).ravel()
    cnf_all = [confusion_matrix(y_actual, (scores > cutoff).astype(int)) for cutoff in thresholds]
    return confusionmatrix_ModelComparison(cnf_all)




In [12]:

def graph_sen_OngoingComparison(train_model, test_model, user):
    """
    Draw the specificity/sensitivity charts for the development and monitoring models.

    :param train_model: object whose ``roc`` attribute holds the "FPR"/"TPR" columns
        for the development data
    :param test_model: the same for the monitoring data
    :param user: username handed to ``Model_helper`` to obtain the two trace colours
    :return: tuple ``(development_figure, monitoring_figure)``
    """
    helper = Model_helper(username=user)
    colors = helper.preference_maker(2)
    fig = _specificity_sensitivity_figure(train_model, colors, "Development<br>Specificity vs Sensitivity")
    fig2 = _specificity_sensitivity_figure(test_model, colors, "Monitoring<br>Specificity vs Sensitivity")
    return fig, fig2


def _specificity_sensitivity_figure(model, colors, title):
    """Build one two-axis chart from ``model.roc`` with the given title."""
    # DataFrame.show() only prints and returns None, so the ROC columns have to
    # be materialised before they can be plotted.
    # Assumes `model.roc` is a PySpark DataFrame -- TODO confirm.
    roc = model.roc.select("FPR", "TPR").toPandas()
    model_fpr = roc["FPR"].to_numpy()
    model_tpr = roc["TPR"].to_numpy()

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # NOTE(review): the trace named 'Specificity' plots 1 - TPR, and the x axis
    # titled 'Cutoff' carries FPR values; both kept as written -- confirm intent.
    fig.add_trace(go.Scatter(x=model_fpr, y=1 - model_tpr, mode='lines', name='Specificity',
                             line=dict(color=colors[0])))
    fig.add_trace(go.Scatter(x=model_fpr, y=model_tpr, mode='lines', name='Sensitivity',
                             line=dict(color=colors[1])),
                  secondary_y=True)
    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    # Title each y axis separately; an unqualified update_yaxes call sets the
    # title on both axes, so the second call would overwrite the first.
    fig.update_yaxes(title_text='Specificity', secondary_y=False)
    fig.update_yaxes(title_text='Sensitivity', secondary_y=True)
    fig.update_xaxes(title_text='Cutoff')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=-0.23, xanchor="right", x=0.6))
    return fig


def graph_sen_ModelComparison(test_model, user):
    """
    Draw the specificity/sensitivity chart for a single model.

    :param test_model: object whose ``roc`` attribute holds the "FPR"/"TPR" columns
    :param user: username handed to ``Model_helper`` to obtain the two trace colours
    :return: the Plotly figure
    """
    helper = Model_helper(username=user)
    colors = helper.preference_maker(2)

    # DataFrame.show() only prints and returns None, so the ROC columns have to
    # be materialised before they can be plotted.
    # Assumes `test_model.roc` is a PySpark DataFrame -- TODO confirm.
    roc = test_model.roc.select("FPR", "TPR").toPandas()
    model_fpr = roc["FPR"].to_numpy()
    model_tpr = roc["TPR"].to_numpy()

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # NOTE(review): the trace named 'Specificity' plots 1 - TPR, and the x axis
    # titled 'Cutoff' carries FPR values; both kept as written -- confirm intent.
    fig.add_trace(
        go.Scatter(x=model_fpr, y=1 - model_tpr, mode='lines', name='Specificity', line=dict(color=colors[0])))
    fig.add_trace(go.Scatter(x=model_fpr, y=model_tpr, mode='lines', name='Sensitivity', line=dict(color=colors[1])),
                  secondary_y=True)
    fig.update_layout(title_text="<b>SPECIFICITY VS SENSITIVITY<b>", title_x=0.5)
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    # Title each y axis separately; an unqualified update_yaxes call sets the
    # title on both axes, so the second call would overwrite the first.
    fig.update_yaxes(title_text='Specificity ', secondary_y=False)
    fig.update_yaxes(title_text='Sensitivity', secondary_y=True)
    fig.update_xaxes(title_text='Cutoff')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=-0.23, xanchor="right", x=0.6))
    return fig



In [13]:

def strengthOfModel(model, strength_test, target_column):
    """
    Collect the requested strength metrics for one scored data set.

    Every name in ``strength_test`` that matches a known metric adds one or two
    entries to the result; names that match nothing are skipped silently.

    :param model: object read both through ``select(...)`` and through metric
        attributes such as ``areaUnderROC``, ``roc`` and ``accuracy`` (its exact
        type is not visible here -- TODO confirm with the caller)
    :param strength_test: iterable of metric names; recognised values are 'KS',
        'gini', 'ROC-AUC', 'specificity & sensitivity', 'Precision', 'Recall',
        'F1-Score' and 'Accuracy'
    :param target_column: name of the target (label) column
    :return: dict mapping a metric label to the value read for it
    """

    test_data = {}
    # NOTE(review): if `model.select(...)` is a PySpark DataFrame, `.show()` prints
    # the rows and returns None, so y_actual / y_pred would be None -- confirm.
    y_actual = model.select(target_column).show()
    y_pred = model.select("prediction").show()
    for i in strength_test:
        if i == 'KS':
            test_data['KS'] = KS_I(np.array(y_actual).reshape(-1), y_pred)
        elif i == 'gini':
            test_data['GINI'] = Gini(np.array(y_actual).reshape(-1), y_pred)
        elif i == 'ROC-AUC':
            # NOTE(review): 'ROC Score' receives areaUnderROC while 'AUC Score'
            # receives the result of roc.show() -- the labels look mismatched; confirm.
            test_data['ROC Score'], test_data['AUC Score'] = model.areaUnderROC, model.roc.show()
        elif i == 'specificity & sensitivity':
            test_data['Sensitivity'] = model.truePositiveRateByLabel
            # NOTE(review): 'Specificity' is filled with the false-positive rate,
            # not 1 - FPR -- confirm this is intended.
            test_data['Specificity'] = model.falsePositiveRateByLabel
        elif i == 'Precision':
            test_data['Precision'] = model.precisionByLabel
        elif i == 'Recall':
            test_data['Recall'] = model.recallByLabel
        elif i == 'F1-Score':
            test_data['F1-Score'] = model.fMeasure()
        elif i == "Accuracy":
            test_data['Accuracy'] = model.accuracy
    return test_data


def draw_table_OngoingComparison(train_model, test_model, strength_test, target_column):
    """
    Tabulate the requested strength metrics for development and monitoring side by side.

    The development values are read from ``train_model`` (mostly through
    ``train_model.summary``); the monitoring values come from
    ``strengthOfModel(test_model, ...)``.

    :param train_model: object for the development data, read through
        ``select(...)`` and through ``summary`` (exact type not visible here --
        TODO confirm with the caller)
    :param test_model: object for the monitoring data, passed to ``strengthOfModel``
    :param strength_test: iterable of metric names (same vocabulary as
        ``strengthOfModel``)
    :param target_column: name of the target (label) column
    :return: DataFrame with columns 'Strength', 'Monitoring' and 'Development',
        rounded to 2 decimals
    """

    train_data = {}
    # NOTE(review): if `train_model.select(...)` is a PySpark DataFrame, `.show()`
    # prints the rows and returns None, so y_actual / y_pred would be None -- confirm.
    y_actual = train_model.select(target_column).show()
    y_pred = train_model.select("prediction").show()
    for i in strength_test:
        if i == 'KS':
            train_data['KS'] = KS_I(np.array(y_actual).reshape(-1), y_pred)
        elif i == 'gini':
            train_data['GINI'] = Gini(np.array(y_actual).reshape(-1), y_pred)
        elif i == 'ROC-AUC':
            # NOTE(review): 'ROC Score' receives areaUnderROC while 'AUC Score'
            # receives the result of roc.show() -- the labels look mismatched; confirm.
            train_data['ROC Score'], train_data[
                'AUC Score'] = train_model.summary.areaUnderROC, train_model.summary.roc.show()
        elif i == 'specificity & sensitivity':
            train_data['Sensitivity'] = train_model.summary.truePositiveRateByLabel
            # NOTE(review): 'Specificity' is filled with the false-positive rate,
            # not 1 - FPR -- confirm this is intended.
            train_data['Specificity'] = train_model.summary.falsePositiveRateByLabel
        elif i == 'Precision':
            train_data['Precision'] = train_model.summary.precisionByLabel
        elif i == 'Recall':
            train_data['Recall'] = train_model.summary.recallByLabel
        elif i == 'F1-Score':
            train_data['F1-Score'] = train_model.summary.fMeasure()
        elif i == "Accuracy":
            train_data['Accuracy'] = train_model.summary.accuracy

    test_data = strengthOfModel(test_model, strength_test, target_column)

    # The row labels come from the monitoring dict; the development values are
    # placed next to them in insertion order.
    result = {'Strength': list(test_data.keys()), 'Monitoring': list(test_data.values()),
              'Development': list(train_data.values())}
    result = pd.DataFrame(result).round(2)
    return result


def draw_table_ModelComparison(model, strength_test, target_column):
    """
    Tabulate the requested strength metrics for a single model.

    :param model: object passed through to ``strengthOfModel``
    :param strength_test: iterable of metric names (same vocabulary as ``strengthOfModel``)
    :param target_column: name of the target (label) column
    :return: DataFrame with columns 'Strength' and 'Score', rounded to 2 decimals
    """
    metrics = strengthOfModel(model, strength_test, target_column)
    summary = pd.DataFrame({
        "Strength": list(metrics.keys()),
        "Score": list(metrics.values()),
    })
    return summary.round(2)




In [14]:

def lift_OngoingComparison(ann_fpr, ann_tpr, ns_fpr, ns_tpr, ann_auc, ann_fpr2, ann_tpr2, ns_fpr2, ns_tpr2, ann_auc2,
                           name, user):
    """
    Draw the ROC curves for the monitoring and the development sample.

    :param ann_fpr: model false-positive rates for the "Monitoring" figure
    :param ann_tpr: model true-positive rates for the "Monitoring" figure
    :param ns_fpr: no-skill false-positive rates for the "Monitoring" figure
    :param ns_tpr: no-skill true-positive rates for the "Monitoring" figure
    :param ann_auc: AUC value annotated on the "Monitoring" figure
    :param ann_fpr2: model false-positive rates for the "Development" figure
    :param ann_tpr2: model true-positive rates for the "Development" figure
    :param ns_fpr2: no-skill false-positive rates for the "Development" figure
    :param ns_tpr2: no-skill true-positive rates for the "Development" figure
    :param ann_auc2: AUC value annotated on the "Development" figure
    :param name: legend label of the model curve
    :param user: username handed to ``Model_helper`` to obtain the two trace colours
    :return: tuple ``(monitoring_figure, development_figure)``
    """
    helper = Model_helper(username=user)
    colors = helper.preference_maker(2)
    fig = _roc_auc_figure(ann_fpr, ann_tpr, ns_fpr, ns_tpr, ann_auc, name, colors, "Monitoring ROC AUC Curve")
    fig2 = _roc_auc_figure(ann_fpr2, ann_tpr2, ns_fpr2, ns_tpr2, ann_auc2, name, colors,
                           "Development ROC AUC Curve")
    return fig, fig2


def _roc_auc_figure(model_fpr, model_tpr, ns_fpr, ns_tpr, auc, name, colors, title):
    """Build one ROC chart (model curve, no-skill curve, AUC annotation) with the given title."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=model_fpr, y=model_tpr, mode='lines', name=f'{name}', line=dict(color=colors[0])))
    # The keyword must be `line`; the misspelling `ine` is not a Scatter property.
    fig.add_trace(go.Scatter(x=ns_fpr, y=ns_tpr, mode='lines', name='No Skill', line=dict(color=colors[1])))
    fig.add_annotation(x=max(ns_fpr), y=0.05, showarrow=False, text="AUC : " + str(auc))
    fig.update_layout(title_text=title, title_x=0.5)
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    fig.update_xaxes(title_text='False Positive Rate')
    fig.update_yaxes(title_text='True Positive Rate')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=-0.23, xanchor="right", x=0.6))
    return fig


# ROC AUC function for ModelComparison
def lift_ModelComparison(ann_fpr, ann_tpr, ns_fpr, ns_tpr, ann_auc, name, user):
    """
    Draw the ROC curve of a single model next to the no-skill curve.

    :param ann_fpr: model false-positive rates
    :param ann_tpr: model true-positive rates
    :param ns_fpr: no-skill false-positive rates
    :param ns_tpr: no-skill true-positive rates
    :param ann_auc: AUC value shown in the annotation
    :param name: legend label of the model curve
    :param user: username handed to ``Model_helper`` to obtain the two trace colours
    :return: the Plotly figure
    """
    helper = Model_helper(username=user)
    colors = helper.preference_maker(2)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=ann_fpr, y=ann_tpr, mode='lines', name=f'{name}', line=dict(color=colors[0])))
    # The keyword must be `line`; the misspelling `ine` is not a Scatter property.
    fig.add_trace(go.Scatter(x=ns_fpr, y=ns_tpr, mode='lines', name='No Skill', line=dict(color=colors[1])))
    fig.add_annotation(x=max(ns_fpr), y=0.05, showarrow=False, text="AUC : " + str(ann_auc))
    fig.update_layout(title_text="<b>ROC AUC Curve<b>", title_x=0.5)
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    fig.update_xaxes(title_text='False Positive Rate')
    fig.update_yaxes(title_text='True Positive Rate')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=-0.23, xanchor="right", x=0.6))
    return fig




In [15]:

def draw_AUC_ModelComparison(model, name, target_column, user):
    """
    Draw the ROC curve of one model together with the no-skill reference curve.

    Assumes ``model`` exposes the PySpark DataFrame API (``select``/``toPandas``)
    alongside ``roc`` and ``areaUnderROC`` -- TODO confirm with the caller.

    :param model: object providing ``areaUnderROC``, ``roc`` ("FPR"/"TPR" columns)
        and ``select(target_column)``
    :param name: legend label of the model curve
    :param target_column: name of the target (label) column
    :param user: username forwarded to ``lift_ModelComparison``
    :return: the figure returned by ``lift_ModelComparison``
    """
    model_auc = model.areaUnderROC
    # DataFrame.show() only prints and returns None, so the columns are
    # materialised before they are plotted or handed to roc_curve.
    roc = model.roc.select("FPR", "TPR").toPandas()
    model_fpr, model_tpr = roc["FPR"].to_numpy(), roc["TPR"].to_numpy()
    actual = model.select(target_column).toPandas()[target_column].to_numpy()
    # One constant "no skill" score per row; count() is already an int, so
    # calling len() on it raised TypeError.
    ns_probs = [0] * len(actual)
    ns_fpr, ns_tpr, _ = roc_curve(actual, ns_probs)
    return lift_ModelComparison(model_fpr, model_tpr, ns_fpr, ns_tpr, model_auc, name, user)


# ROC AUC Curve
def draw_AUC_OngoingComparison(train_model, test_model, target_column, name, user):
    """
    Draw the ROC curves of the monitoring and development data with no-skill references.

    Assumes both objects expose the PySpark DataFrame API (``select``/``toPandas``)
    alongside ``roc`` and ``areaUnderROC`` -- TODO confirm with the caller.

    :param train_model: object for the development data
    :param test_model: object for the monitoring data
    :param target_column: name of the target (label) column
    :param name: legend label of the model curve
    :param user: username forwarded to ``lift_OngoingComparison``
    :return: the two figures returned by ``lift_OngoingComparison``
        (monitoring first, development second)
    """
    # ROC for Train Data
    train_auc, train_fpr, train_tpr, train_ns_fpr, train_ns_tpr = _roc_inputs(train_model, target_column)
    # ROC for Test Data
    test_auc, test_fpr, test_tpr, test_ns_fpr, test_ns_tpr = _roc_inputs(test_model, target_column)

    # lift_OngoingComparison titles its first argument group "Monitoring" and its
    # second "Development", so each group must carry the curve, no-skill line and
    # AUC of the SAME data set.
    return lift_OngoingComparison(test_fpr, test_tpr, test_ns_fpr, test_ns_tpr, test_auc,
                                  train_fpr, train_tpr, train_ns_fpr, train_ns_tpr, train_auc,
                                  name, user=user)


def _roc_inputs(model, target_column):
    """Return ``(auc, fpr, tpr, no_skill_fpr, no_skill_tpr)`` for one model object."""
    # DataFrame.show() only prints and returns None, so the columns are
    # materialised before they are plotted or handed to roc_curve.
    roc = model.roc.select("FPR", "TPR").toPandas()
    actual = model.select(target_column).toPandas()[target_column].to_numpy()
    # One constant "no skill" score per row; count() is already an int, so
    # calling len() on it raised TypeError.
    ns_probs = [0] * len(actual)
    ns_fpr, ns_tpr, _ = roc_curve(actual, ns_probs)
    return model.areaUnderROC, roc["FPR"].to_numpy(), roc["TPR"].to_numpy(), ns_fpr, ns_tpr



In [16]:

"""VIF Test"""


def calc_vif_OngoingComparison(x_test, y_test):
    """
    Compute the variance inflation factor of every column in two feature frames.

    Despite the parameter names, both arguments are feature frames: the
    'VIF Train' column is computed from ``x_test`` and 'VIF Test' from ``y_test``.

    :param x_test: feature frame reported under 'VIF Train'; its columns name the variables
    :param y_test: feature frame reported under 'VIF Test'
    :return: DataFrame with columns 'Variable', 'VIF Train' and 'VIF Test',
        rounded to 2 decimals
    """
    first_matrix = x_test.values
    second_matrix = y_test.values
    first_vif = [variance_inflation_factor(first_matrix, position) for position in range(x_test.shape[1])]
    second_vif = [variance_inflation_factor(second_matrix, position) for position in range(y_test.shape[1])]
    summary = pd.DataFrame({
        "Variable": x_test.columns.tolist(),
        "VIF Train": first_vif,
        "VIF Test": second_vif,
    })
    return summary.round(2)


def calc_vif_ModelComparison(X):
    """
    Compute the variance inflation factor of every column in a feature frame.

    :param X: feature frame; its columns name the variables
    :return: DataFrame with columns 'Variable' and 'VIF', rounded to 2 decimals
    """
    matrix = X.values
    factors = [variance_inflation_factor(matrix, position) for position in range(X.shape[1])]
    summary = pd.DataFrame({"Variable": X.columns.tolist(), "VIF": factors})
    return summary.round(2)


# HL test
def make_recarray(y_true, y_pred):
    """
    Pack labels and predictions into one record array sorted by prediction.

    :param y_true: true labels (stored as unsigned 64-bit integers)
    :param y_pred: predicted values (stored as 64-bit floats)
    :return: record array with fields ``y_true`` and ``y_pred``, ordered by
        ascending ``y_pred``
    """
    field_types = [('y_true', 'u8'), ('y_pred', 'f8')]
    paired = np.recarray(shape=(len(y_true),), dtype=field_types)
    paired['y_true'] = y_true
    paired['y_pred'] = y_pred
    # Sort the records in place by predicted value.
    paired.sort(order='y_pred')
    return paired


def hosmer_lemeshow_table(y_true, y_pred, n_groups=10):
    """
    Group observations by ascending prediction and summarise each group.

    :param y_true: true labels
    :param y_pred: predicted probabilities
    :param n_groups: number of groups to split the sorted observations into
    :return: record array with one row per group and the fields ``group_size``,
        ``obs_freq`` (sum of labels), ``pred_freq`` (sum of predictions) and
        ``mean_prob`` (mean prediction)
    :raises ValueError: if ``n_groups`` is below 2 or larger than the number of
        observations
    """
    if n_groups < 2:
        raise ValueError('Number of groups must be greater than or equal to 2')
    if n_groups > len(y_true):
        raise ValueError('Number of predictions must exceed number of groups')

    ordered = make_recarray(y_true, y_pred)
    rows = []
    for chunk in np.array_split(ordered, n_groups):
        rows.append((len(chunk), chunk.y_true.sum(), chunk.y_pred.sum(), chunk.y_pred.mean()))
    field_names = ('group_size', 'obs_freq', 'pred_freq', 'mean_prob')
    return np.rec.fromrecords(rows, names=field_names)


def hlTest(y_true, y_prob, model_name):
    """
    Run the Hosmer-Lemeshow goodness-of-fit test and report it as a one-row table.

    The statistic is ``sum((observed - expected)^2 / (n * p * (1 - p)))`` over the
    groups, where ``observed`` is the number of positives in the group,
    ``expected`` the sum of predicted probabilities, ``n`` the group size and
    ``p`` the mean predicted probability. It is compared with a chi-square
    distribution with ``groups - 2`` degrees of freedom.

    :param y_true: true labels
    :param y_prob: predicted probabilities
    :param model_name: label used as the index of the returned row
    :return: DataFrame (rounded to 2 decimals) with columns 'Groups',
        'HL test Statistic', 'DOF', 'P-Value' and 'Conclusion'
    """
    table = hosmer_lemeshow_table(y_true, y_prob)
    # Observed and expected must both be COUNTS: compare obs_freq with pred_freq
    # (the sum of probabilities), not with mean_prob (a probability).
    num = np.square(table.obs_freq - table.pred_freq)
    den = table.group_size * table.mean_prob * (1 - table.mean_prob)
    C_hat = np.sum(num / den)
    n_groups = len(table)
    df = n_groups - 2
    p = scipy.stats.distributions.chi2.sf(C_hat, df)

    if p > 0.05:
        conclusion = 'The Model is adequate'
    else:
        conclusion = 'The Model is not adequate'

    # Report the number of groups actually used instead of a hard-coded 10.
    table = {'Groups': n_groups, 'HL test Statistic': C_hat, 'DOF': df, 'P-Value': p, 'Conclusion': conclusion}
    table = pd.DataFrame(table, index=[model_name])
    table = table.round(2)
    return table




In [17]:

def feature_importance_graph_ModelComparison(target_column, test_data):
    """
    Fit a 10-tree Spark random forest on ``test_data`` and report per-feature importances.

    Every column of ``test_data`` other than ``target_column`` is used as a feature.

    :param target_column: name of the label column in ``test_data``
    :param test_data: Spark DataFrame holding the label and the feature columns
    :return: tuple ``(fi, fig)``; ``fi`` is a pandas DataFrame with columns ``Features`` and
        ``Importance`` rounded to 2 decimals, ``fig`` is a plotly bar chart of the same
        importances in descending order
    """
    # Local import: the notebook's import cell is not guaranteed to provide VectorAssembler.
    from pyspark.ml.feature import VectorAssembler

    # DataFrame.show() only prints and returns None, so the feature names are taken from
    # the column list instead.
    features = [column for column in test_data.columns if column != target_column]

    # Spark ML estimators read features from ONE vector column, so assemble it first.
    # NOTE(review): VectorAssembler presumably needs numeric/boolean/vector inputs without
    # nulls - confirm the dataset is already encoded.
    vector_column = "__rf_importance_features__"
    assembled = VectorAssembler(inputCols=features, outputCol=vector_column).transform(test_data)

    rf_estimator = RandomForestClassifier(labelCol=target_column, featuresCol=vector_column,
                                          numTrees=10, seed=42)
    rf_model = rf_estimator.fit(assembled)

    # toArray() yields one entry per feature even when the importance vector is sparse.
    importances = rf_model.featureImportances.toArray().tolist()
    fi = pd.DataFrame({"Features": features, "Importance": importances})

    ranked = sorted(zip(importances, features), reverse=True)
    trace = go.Bar(x=[name for _, name in ranked], y=[score for score, _ in ranked],
                   marker=dict(color='#32E0C4'))
    fig = go.Figure(data=[trace], layout=go.Layout())
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    fig.update_layout(title_text="<b>RANDOM FOREST BASED FEATURE IMPORTANCE</b> ", title_x=0.5)
    fig.update_xaxes(title_text='Features')
    fig.update_yaxes(title_text='Feature Importance')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    return fi.round(2), fig


def _ongoing_rf_importances(data, target_column, features):
    """Fit a 10-tree Spark random forest on ``data``; return one importance per name in ``features``."""
    # Local import: the notebook's import cell is not guaranteed to provide VectorAssembler.
    from pyspark.ml.feature import VectorAssembler

    # Spark ML estimators read features from ONE vector column, so assemble it first.
    vector_column = "__rf_importance_features__"
    assembled = VectorAssembler(inputCols=features, outputCol=vector_column).transform(data)
    estimator = RandomForestClassifier(labelCol=target_column, featuresCol=vector_column,
                                       numTrees=10, seed=42)
    # toArray() yields one entry per feature even when the importance vector is sparse.
    return estimator.fit(assembled).featureImportances.toArray().tolist()


def _ongoing_importance_figure(features, importances, title):
    """Draw a bar chart of ``importances`` in descending order with the given title."""
    ranked = sorted(zip(importances, features), reverse=True)
    trace = go.Bar(x=[name for _, name in ranked], y=[score for score, _ in ranked],
                   marker=dict(color='#32E0C4'))
    figure = go.Figure(data=[trace], layout=go.Layout())
    figure.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
    figure.update_layout(title_text=title, title_x=0.5)
    figure.update_xaxes(title_text='Features')
    figure.update_yaxes(title_text='Feature Importance')
    figure.update_xaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    figure.update_yaxes(showline=True, linewidth=1, linecolor='black', rangemode='nonnegative')
    return figure


def feature_importance_graph_OngoingComparison(target_column, dev, mon):
    """
    Compare random-forest feature importances between two samples.

    A separate 10-tree random forest is fitted on each sample. The feature list is every
    column of ``mon`` other than ``target_column`` and is reused for ``dev``.

    :param target_column: name of the label column present in both samples
    :param dev: Spark DataFrame with the development sample
    :param mon: Spark DataFrame with the monitoring sample
    :return: tuple ``(fi, (fig, fig2))``; ``fi`` is a pandas DataFrame with columns
        ``Features``, ``Importance`` (monitoring sample) and ``Importance Development Sample``
        rounded to 2 decimals, ``fig`` / ``fig2`` are the monitoring / development bar charts
    """
    features = [column for column in mon.columns if column != target_column]

    mon_importances = _ongoing_rf_importances(mon, target_column, features)
    fi = pd.DataFrame({"Features": features, "Importance": mon_importances})
    fig = _ongoing_importance_figure(
        features, mon_importances,
        "Monitoring Sample<br>Random Forest Based Feature Importances")

    # A new forest is trained for the development sample; a fitted Spark model has no fit().
    dev_importances = _ongoing_rf_importances(dev, target_column, features)
    fi['Importance Development Sample'] = dev_importances
    fig2 = _ongoing_importance_figure(
        features, dev_importances,
        "Development Sample<br>Random Forest Based Feature Importance")
    return fi.round(2), (fig, fig2)



In [18]:

def ModelComparison(user, models, target_column, dataset, stabilities, strengths, model_names):
    """
    Produce the requested stability and strength reports for every model.

    :param user: passed through unchanged to the plotting helpers
    :param models: sequence of fitted models; each must offer ``evaluate(frame)`` whose result
        supports ``.select("prediction")``
    :param target_column: name of the label column in ``dataset``
    :param dataset: Spark DataFrame with all data points (features and label)
    :param stabilities: iterable of stability reports to build; ``'PSI'`` and ``'CSI'`` are handled
    :param strengths: iterable of strength reports to build; handled values are ``'ROC-AUC'``,
        ``'Confusion Matrix'``, ``'KS'``, ``'specificity & sensitivity'``, ``'vif'`` and ``'HL'``
    :param model_names: display names parallel to ``models``; a ``None`` entry falls back to
        ``"Model <position>"``
    :return: tuple ``(figs, tables)``, two dicts keyed by model name and then by report name
    """

    def _column_values(frame, column):
        # DataFrame.show() only prints and returns None, so the column is collected into
        # a 1-D numpy array instead.
        return frame.select(column).toPandas()[column].to_numpy()

    figs = {}
    tables = {}

    # The seeded split and the label column do not depend on the model: compute them once.
    train_data, test_data = dataset.randomSplit([.7, .3], seed=42)
    y_actual = _column_values(dataset, target_column)  # actual classes for dataset

    for i, model in enumerate(models):
        if model_names[i] is None:
            model_name = "Model " + str(i + 1)
        else:
            model_name = model_names[i].capitalize()
        figs[model_name] = {}
        tables[model_name] = {}

        # NOTE(review): the original also called model.transform(train_data) and discarded
        # the result - confirm evaluate() is the intended scoring call for these models.
        train_model = _column_values(model.evaluate(train_data), "prediction")
        test_model = _column_values(model.evaluate(test_data), "prediction")
        y_pred = _column_values(model.evaluate(dataset), "prediction")  # prediction classes for all dataset

        for stability in stabilities:
            if stability == 'PSI':
                tables[model_name][stability] = PSI_ModelComparison(train_model, test_model)
            elif stability == 'CSI':
                tables[model_name][stability] = calculate_csi(test_data, train_data)
        for strength in strengths:
            if strength == 'ROC-AUC':
                fig = draw_AUC_ModelComparison(model, model_name, target_column, user)
                fig.update_layout(title_text=f"ROC AUC Curve: {model_name}", )
                figs[model_name][strength] = fig
            elif strength == 'Confusion Matrix':
                fig = CM_ModelComparison(y_pred, y_actual)
                fig.update_layout(title_text=f"Confusion Matrix : {model_name}", )
                figs[model_name][strength] = fig
            elif strength == 'KS':
                fig = KS_ModelComparison(y_pred, y_actual, user)
                fig.update_layout(title_text=f"KS Statistics Curve : {model_name}", )
                figs[model_name][strength] = fig
            elif strength == 'specificity & sensitivity':
                fig = graph_sen_ModelComparison(test_model, user)
                fig.update_layout(title_text=f"Specificity vs Sensitivity : {model_name}", )
                figs[model_name][strength] = fig
            elif strength == 'vif':
                # NOTE(review): dataset is a Spark DataFrame here - confirm the VIF helper
                # accepts it (it may expect a pandas frame).
                table = calc_vif_ModelComparison(dataset)
                tables[model_name][strength] = table
            elif strength == 'HL':
                # NOTE(review): "prediction" presumably holds class labels, while the HL test
                # expects probabilities - confirm which column should be used.
                table = hlTest(np.asarray(y_actual).reshape(-1), y_pred, model_name)
                tables[model_name][strength] = table
        metrics_name = [x for x in strengths if
                        x in ['ROC-AUC', 'gini', 'KS', 'specificity & sensitivity', 'Precision', 'Recall', 'F1-Score']]
        tables[model_name]['Strength Statistics'] = draw_table_ModelComparison(test_model, metrics_name, target_column)
        table, fig = feature_importance_graph_ModelComparison(target_column, dataset)
        fig.update_layout(title_text=f"Random Forest Based Feature Importance : {model_name}", )
        tables[model_name]['Feature Importance'] = table
        figs[model_name]['Feature Importance'] = fig
    return figs, tables





In [19]:

# Wrapper for Ongoing Comparison
def OngoingComparison(user, models, target_column, test_data, train_data, stabilities, strengths, model_names):
    """
    Produce development-vs-monitoring stability and strength reports for every model.

    :param user: passed through unchanged to the plotting helpers
    :param models: sequence of fitted models; each must offer ``evaluate(frame)`` whose result
        supports ``.select("prediction")``
    :param target_column: name of the label column present in both samples
    :param test_data: Spark DataFrame with the monitoring sample (features and label)
    :param train_data: Spark DataFrame with the development sample (features and label)
    :param stabilities: iterable of stability reports to build; ``'PSI'`` and ``'CSI'`` are handled
    :param strengths: iterable of strength reports to build; handled values are ``'ROC-AUC'``,
        ``'Confusion Matrix'``, ``'KS'``, ``'specificity & sensitivity'``, ``'vif'`` and ``'HL'``
    :param model_names: display names parallel to ``models``; a ``None`` entry falls back to
        ``"Model <position>"``
    :return: tuple ``(figs, tables)``, two dicts keyed by model name and then by report name
    """
    figs = {}
    tables = {}

    # Label-free views are kept under their own names so the labelled frames stay available
    # for scoring, label lookups and later models in the loop.
    train_features = train_data.drop(target_column)
    test_features = test_data.drop(target_column)

    for i, model in enumerate(models):
        if model_names[i] is None:
            model_name = "Model " + str(i + 1)
        else:
            model_name = model_names[i].capitalize()
        figs[model_name] = {}
        tables[model_name] = {}

        train_model = model.evaluate(train_data)  # probability of both the classes for train data
        test_model = model.evaluate(test_data)

        # Results are stored under model_name, the key initialised just above.
        for stability in stabilities:
            if stability == 'PSI':
                tables[model_name][stability] = PSI_OngoingComparison(train_model.select("prediction"),
                                                                      test_model.select("prediction"))
            elif stability == 'CSI':
                tables[model_name][stability] = calculate_csi(test_features, train_features)
        for strength in strengths:
            if strength == 'ROC-AUC':
                figs[model_name][strength] = draw_AUC_OngoingComparison(train_model, test_model, target_column,
                                                                        model_name,
                                                                        user)
            elif strength == 'Confusion Matrix':
                # NOTE(review): x_test receives test_model and x_pred receives the training
                # labels - this pairing looks inconsistent with the KS call; confirm it
                # against CM_OngoingComparison's signature.
                figs[model_name][strength] = CM_OngoingComparison(y_pred=test_model,
                                                                  y_test=test_data.select(target_column),
                                                                  x_test=test_model,
                                                                  x_pred=train_data.select(target_column))
            elif strength == 'KS':
                figs[model_name][strength] = KS_OngoingComparison(user=user,
                                                                  y_probas=train_model.select("prediction"),
                                                                  y_true=train_data.select(target_column),
                                                                  x_probas=test_model.select("prediction"),
                                                                  x_true=test_data.select(target_column))
            elif strength == 'specificity & sensitivity':
                figs[model_name][strength] = graph_sen_OngoingComparison(train_model, test_model, user=user)
            elif strength == 'vif':
                table = calc_vif_OngoingComparison(train_features, test_features)
                table = table.rename(columns={'VIF Test': 'VIF Monitoring', 'VIF Train': 'VIF Development'})
                tables[model_name][strength] = table
            elif strength == 'HL':
                # DataFrame.show() returns None and np.array() cannot unpack a Spark frame,
                # so both columns are collected into 1-D numpy arrays.
                # NOTE(review): "prediction" presumably holds class labels, while the HL test
                # expects probabilities - confirm which column should be used.
                hl_actual = test_data.select(target_column).toPandas()[target_column].to_numpy()
                hl_predicted = test_model.select("prediction").toPandas()["prediction"].to_numpy()
                table = hlTest(np.asarray(hl_actual).reshape(-1), hl_predicted, model_name='model')
                tables[model_name][strength] = table

        metrics_list = [x for x in strengths if x in ['ROC-AUC', 'gini', 'KS', 'specificity & sensitivity',
                                                      'Precision', 'Recall', 'F1-Score']]
        tables[model_name]['Strength Statistics'] = draw_table_OngoingComparison(train_model, test_model,
                                                                                 metrics_list, target_column)
        # The random forest needs the label column, so the labelled frames are passed.
        table, fig = feature_importance_graph_OngoingComparison(target_column, train_data, test_data)
        tables[model_name]['Feature Importance'] = table
        figs[model_name]['Feature Importance'] = fig
    return figs, tables

