In [3]:
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import PowerTransformer

## Categorical plots
### Without predictions

In [1]:
def cat_plot_no_pred(df, feat_lst, rows, cols, target, y, target_lim = None, output_file='cat_plot'):
    '''
    Plot one panel per categorical feature: the mean of `target` per category
    as a line (left axis) over the sum of `y` per category as bars (right axis).
    The figure is saved to `output_file` and then shown.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus `target` and `y`.
    feat_lst : column names to plot, one subplot each, filled row by row.
    rows, cols : shape of the subplot grid.
    target : column averaged within each category.
    y : column summed within each category.
    target_lim : optional limits passed to `set_ylim` on the line axis.
    output_file : path handed to `fig.savefig`.

    Raises
    ------
    ValueError : when the grid has fewer panels than there are features.
    '''
    feat_lst = list(feat_lst)
    if len(feat_lst) > rows * cols:
        raise ValueError('grid of {}x{} cannot hold {} features'.format(rows, cols, len(feat_lst)))

    sns.set_style('darkgrid')
    # plt.subplots creates the figure itself; a preceding plt.figure() would only
    # leave an extra empty figure open. squeeze=False keeps `axes` 2-D for any grid.
    fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 6*rows), squeeze=False)

    for ax, feature in zip(axes.flat, feat_lst):
        summary = df.groupby(feature, as_index=True)[[target, y]].aggregate({target: 'mean', y: 'sum'})
        summary = summary.rename_axis('bin').reset_index()

        # The line is drawn against the positional index so it lines up with the
        # categorical bar positions drawn on the twin axis below.
        sns.lineplot(data=summary[target], marker='o', sort=False, ax=ax)
        ax.grid(False)
        # tick_params also applies to tick labels created later by the bar plot.
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(feature)
        ax.set_ylim(target_lim)

        ax2 = ax.twinx()
        sns.barplot(data=summary, x='bin', y=y, alpha=.5, ax=ax2)
        ax2.grid(False)

    # Lay out once, after every panel has been drawn.
    fig.tight_layout()
    fig.savefig(output_file)
    fig.show()

In [None]:
def cat_plot_with_pred(df, feat_lst, rows, cols, target, y, pred, target_lim = None, output_file=None):
    '''
    Plot one panel per categorical feature: the means of `target` and `pred`
    per category as lines (left axis) over the sum of `y` per category as bars
    (right axis).

    Parameters
    ----------
    df : DataFrame holding the feature columns plus `target`, `pred` and `y`.
    feat_lst : column names to plot, one subplot each, filled row by row.
    rows, cols : shape of the subplot grid.
    target : observed column averaged within each category.
    y : column summed within each category.
    pred : prediction column averaged within each category.
    target_lim : optional limits passed to `set_ylim` on the line axis.
    output_file : optional path; when given the figure is saved there before
        being shown. The default (None) keeps the previous show-only behaviour.

    Raises
    ------
    ValueError : when the grid has fewer panels than there are features.
    '''
    feat_lst = list(feat_lst)
    if len(feat_lst) > rows * cols:
        raise ValueError('grid of {}x{} cannot hold {} features'.format(rows, cols, len(feat_lst)))

    sns.set_style('darkgrid')
    # plt.subplots creates the figure itself; a preceding plt.figure() would only
    # leave an extra empty figure open. squeeze=False keeps `axes` 2-D for any grid.
    fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 6*rows), squeeze=False)

    for ax, feature in zip(axes.flat, feat_lst):
        summary = df.groupby(feature, as_index=True)[[target, pred, y]].aggregate({target: 'mean', pred: 'mean', y: 'sum'})
        summary = summary.rename_axis('bin').reset_index()

        # Lines are drawn against the positional index so they line up with the
        # categorical bar positions drawn on the twin axis below.
        sns.lineplot(data=summary[[target, pred]], marker='o', sort=False, ax=ax)
        ax.grid(False)
        # tick_params also applies to tick labels created later by the bar plot.
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(feature)
        ax.set_ylim(target_lim)

        ax2 = ax.twinx()
        sns.barplot(data=summary, x='bin', y=y, alpha=.5, ax=ax2)
        ax2.grid(False)

    # Lay out once, after every panel has been drawn.
    fig.tight_layout()
    if output_file is not None:
        fig.savefig(output_file)
    fig.show()

## Numerical plots
### without predictions

In [None]:
def num_plot_no_pred(df, feat_lst, rows, cols, target, y, nbins=10, target_lim = None, output_file='num_plot'):
    '''
    Plot one panel per numerical feature after quantile binning: the mean of
    `target` per bin as a line (left axis) over the sum of `y` per bin as bars
    (right axis). The figure is saved to `output_file` and then shown.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus `target` and `y`.
    feat_lst : column names to plot, one subplot each, filled row by row.
    rows, cols : shape of the subplot grid.
    target : column averaged within each bin.
    y : column summed within each bin.
    nbins : number of quantile bins requested from `pd.qcut`; duplicate bin
        edges are dropped, so fewer bins may result.
    target_lim : optional limits passed to `set_ylim` on the line axis.
    output_file : path handed to `fig.savefig`.

    Raises
    ------
    ValueError : when the grid has fewer panels than there are features.
    '''
    feat_lst = list(feat_lst)
    if len(feat_lst) > rows * cols:
        raise ValueError('grid of {}x{} cannot hold {} features'.format(rows, cols, len(feat_lst)))

    sns.set_style('darkgrid')
    # plt.subplots creates the figure itself; a preceding plt.figure() would only
    # leave an extra empty figure open. squeeze=False keeps `axes` 2-D for any grid.
    fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 6*rows), squeeze=False)

    for ax, feature in zip(axes.flat, feat_lst):
        bins = pd.qcut(df[feature], nbins, duplicates='drop')
        # observed=False is spelled out: the grouper is categorical, and relying
        # on the implicit default is deprecated in recent pandas releases.
        summary = df.groupby(bins, as_index=True, observed=False)[[target, y]].aggregate({target: 'mean', y: 'sum'})
        summary = summary.rename_axis('bin').reset_index()

        # The line is drawn against the positional index so it lines up with the
        # categorical bar positions drawn on the twin axis below.
        sns.lineplot(data=summary[target], marker='o', sort=False, ax=ax)
        ax.grid(False)
        # tick_params also applies to tick labels created later by the bar plot.
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(feature)
        ax.set_ylim(target_lim)

        ax2 = ax.twinx()
        sns.barplot(data=summary, x='bin', y=y, alpha=.5, ax=ax2)
        ax2.grid(False)

    # Lay out once, after every panel has been drawn.
    fig.tight_layout()
    fig.savefig(output_file)
    fig.show()

In [1]:
def num_plot_with_pred(df, feat_lst, rows, cols, target, y, pred, nbins=10, target_lim = None, output_file=None):
    '''
    Plot one panel per numerical feature after quantile binning: the means of
    `target` and `pred` per bin as lines (left axis) over the sum of `y` per
    bin as bars (right axis).

    Parameters
    ----------
    df : DataFrame holding the feature columns plus `target`, `pred` and `y`.
    feat_lst : column names to plot, one subplot each, filled row by row.
    rows, cols : shape of the subplot grid.
    target : observed column averaged within each bin.
    y : column summed within each bin.
    pred : prediction column averaged within each bin.
    nbins : number of quantile bins requested from `pd.qcut`; duplicate bin
        edges are dropped, so fewer bins may result.
    target_lim : optional limits passed to `set_ylim` on the line axis.
    output_file : optional path; when given the figure is saved there before
        being shown. The default (None) keeps the previous show-only behaviour.

    Raises
    ------
    ValueError : when the grid has fewer panels than there are features.
    '''
    feat_lst = list(feat_lst)
    if len(feat_lst) > rows * cols:
        raise ValueError('grid of {}x{} cannot hold {} features'.format(rows, cols, len(feat_lst)))

    sns.set_style('darkgrid')
    # plt.subplots creates the figure itself; a preceding plt.figure() would only
    # leave an extra empty figure open. squeeze=False keeps `axes` 2-D for any grid.
    fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 6*rows), squeeze=False)

    for ax, feature in zip(axes.flat, feat_lst):
        bins = pd.qcut(df[feature], nbins, duplicates='drop')
        # observed=False is spelled out: the grouper is categorical, and relying
        # on the implicit default is deprecated in recent pandas releases.
        summary = df.groupby(bins, as_index=True, observed=False)[[target, pred, y]].aggregate({target: 'mean', pred: 'mean', y: 'sum'})
        summary = summary.rename_axis('bin').reset_index()

        # Lines are drawn against the positional index so they line up with the
        # categorical bar positions drawn on the twin axis below.
        sns.lineplot(data=summary[[target, pred]], marker='o', sort=False, ax=ax)
        ax.grid(False)
        # tick_params also applies to tick labels created later by the bar plot.
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(feature)
        ax.set_ylim(target_lim)

        ax2 = ax.twinx()
        sns.barplot(data=summary, x='bin', y=y, alpha=.5, ax=ax2)
        ax2.grid(False)

    # Lay out once, after every panel has been drawn.
    fig.tight_layout()
    if output_file is not None:
        fig.savefig(output_file)
    fig.show()

In [None]:
def group_low_freq(df, lst, pct=1):
    '''
    Fold rare categories into the most frequent value, modifying `df` in place.

    For every column in `lst`, any value whose share of the non-missing rows is
    below `pct` percent is replaced by the column's mode. Missing values are
    left untouched. Columns with no non-missing values are skipped.

    Parameters
    ----------
    df : DataFrame to modify in place.
    lst : iterable of column names to process.
    pct : threshold in percent (default 1).

    Returns
    -------
    None
    '''
    for f in lst:
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        counts = df[f].value_counts()
        if counts.empty:
            # Nothing but missing values: there is no mode to fall back on.
            continue
        share = counts / counts.sum() * 100
        rare_values = share[share.lt(pct)].index
        mode = df[f].mode()[0]
        df[f] = np.where(df[f].isin(rare_values), mode, df[f])

In [6]:
def lorenz_curve(y_true, y_pred, exposure):
    '''
    Compute the points of a Lorenz curve for a set of predictions.

    Observations are ordered by increasing `y_pred`; the curve is the running
    sum of `y_true * exposure` in that order, divided by its total.

    Parameters
    ----------
    y_true : array-like of observed values.
    y_pred : array-like of predictions, used only for ranking.
    exposure : array-like of weights multiplied with `y_true`.

    Returns
    -------
    (cumulated_samples, cumulated_claim_amount) : two arrays of equal length;
        the first is an even grid from 0 to 1, the second the normalised
        cumulative amount.
    '''
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    exposure = np.asarray(exposure)

    ranking = np.argsort(y_pred)
    cumulated_claim_amount = np.cumsum(y_true[ranking] * exposure[ranking])
    # Normalise in floating point. Dividing into an integer output array with
    # casting='unsafe' would truncate every fraction to 0 for integer inputs.
    if not np.issubdtype(cumulated_claim_amount.dtype, np.floating):
        cumulated_claim_amount = cumulated_claim_amount.astype(float)
    cumulated_claim_amount = cumulated_claim_amount / cumulated_claim_amount[-1]
    cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount))
    return cumulated_samples, cumulated_claim_amount

In [None]:
def pair_bubble(df, group, lims=(50, 300)):
    '''
    Draw two side-by-side bubble charts with one labelled bubble per row of `df`.

    Left panel: `Frequency` against `Severity`. Right panel: `PurePremium`
    against `AvgEP`, with both axes clipped to `lims` and a dashed diagonal
    drawn between the limits. Bubble size comes from the `Dist` column and
    colour from the `group` column, whose value is also written in each bubble.

    Parameters
    ----------
    df : DataFrame with columns Frequency, Severity, PurePremium, AvgEP, Dist
        and the column named by `group`.
    group : name of the column used for colour and for the bubble labels.
    lims : (low, high) pair used for both axes of the right panel.
    '''
    fig, (ax_freq_sev, ax_premium) = plt.subplots(1, 2, figsize=(20, 10))
    label_style = dict(horizontalalignment='center', verticalalignment='center', size='medium', color='black')

    #   Frequency vs Severity
    sns.scatterplot(x='Frequency', y="Severity", size="Dist", sizes=(1000, 10000), hue=group, palette="ch:r=-.2,d=.3_r", legend=False, data=df, ax=ax_freq_sev)
    ax_freq_sev.set_title('Frequency vs Severity')

    # Label every bubble. Rows are walked positionally so the labels still work
    # when the index of `df` is not 0..n-1 (e.g. after filtering or grouping).
    for x_val, y_val, label in zip(df['Frequency'], df['Severity'], df[group]):
        ax_freq_sev.text(x_val, y_val, label, **label_style)

    #   PurePremium vs AvgNetPremium
    sns.scatterplot(x='PurePremium', y="AvgEP", size="Dist", sizes=(1000, 10000), hue=group, palette="ch:r=-.2,d=.3_r", legend=False, data=df, ax=ax_premium)
    ax_premium.set_xlim(lims)
    ax_premium.set_ylim(lims)
    ax_premium.plot(lims, lims, linestyle="--", color="black")
    ax_premium.set_title('PurePremium vs AvgNetPremium')

    for x_val, y_val, label in zip(df['PurePremium'], df['AvgEP'], df[group]):
        ax_premium.text(x_val, y_val, label, **label_style)


### Dates

In [None]:
def planmaster_date_to_datetime(df, columns):
    '''
    Convert numeric date columns into 'YYYYMMDD' strings on a copy of `df`.

    Each value is read as an integer whose last four digits are month and day
    and whose leading digits are the year counted from 1900 (1200131 ->
    '20200131', 991231 -> '19991231'). The result is a string; parsing it into
    a datetime is left to the caller.

    NOTE(review): the original docstring called this the "Foundation date
    format"; confirm the source system's exact definition.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    columns : iterable of column names to convert. Values must be castable to
        int, so missing values raise.

    Returns
    -------
    A copy of `df` with the listed columns converted. As before, the copy also
    carries the helper columns 'datemonth' and 'year' from the last converted
    column.
    '''
    df_ = df.copy()
    for c in columns:
        raw = df_[c].astype(int)
        # Split arithmetically rather than by character position: values before
        # 2000 have fewer than seven digits once the leading zero is lost, so a
        # fixed three-character slice would pull a month digit into the year.
        df_['datemonth'] = (raw % 10000).astype(str).str.zfill(4)
        df_['year'] = raw // 10000 + 1900
        df_[c] = df_['year'].astype(str) + df_['datemonth']
        # df_[c] = pd.to_datetime(df_[c], format='%Y%m%d')
    return df_

In [None]:
def date_fix(df, cols):
    '''
    Parse the selected columns of `df` with `pd.to_datetime`, in place.

    Parameters
    ----------
    df : DataFrame to modify.
    cols : column selection handed to `df[...]`.

    Returns
    -------
    None
    '''
    parsed = df[cols].apply(pd.to_datetime)
    df[cols] = parsed
    # df[cols] = df[cols].apply(lambda x: x.dt.date)

## Performance Metrics

In [1]:
def model_perf_stats(model, train, valid):
    '''
    Build "train / valid" summary lines for a fitted model.

    `model.model_performance` is called once on `train` and once on `valid`.
    MCC, F1 and accuracy are read from element [0][1] of the value their
    accessor returns; AUC, AUC PR and logloss are used as returned.

    Parameters
    ----------
    model : object exposing `model_performance(frame)`.
    train, valid : frames passed to `model.model_performance`.

    Returns
    -------
    list of six strings, formatted 'Name: <train> / <valid>' with five decimals.
    '''
    perf_train = model.model_performance(train)
    perf_valid = model.model_performance(valid)

    def _line(label, train_value, valid_value):
        # One shared template so every metric is formatted the same way.
        return '{}: {:.5f} / {:.5f}'.format(label, train_value, valid_value)

    metrics_list = [
        _line('MCC', perf_train.mcc()[0][1], perf_valid.mcc()[0][1]),
        _line('F1', perf_train.F1()[0][1], perf_valid.F1()[0][1]),
        _line('AUC', perf_train.auc(), perf_valid.auc()),
        _line('AUC PR', perf_train.aucpr(), perf_valid.aucpr()),
        _line('Accuracy', perf_train.accuracy()[0][1], perf_valid.accuracy()[0][1]),
        _line('Logloss', perf_train.logloss(), perf_valid.logloss()),
        # KS is intentionally left out for now:
        # 'KS: {:.5f}'.format(model.kolmogorov_smirnov())
    ]
    return metrics_list

def print_model_perf_stats(model, train, valid):
    '''
    Print the lines produced by `model_perf_stats`, one metric per line.

    Delegating keeps the printed and the returned summaries identical instead
    of maintaining two copies of the formatting code.
    '''
    for metric_line in model_perf_stats(model, train, valid):
        print(metric_line)

In [None]:
def model_plots(model, df, target, feature_cols=None):
    '''
    Score `df` with `model` and draw two diagnostic panels.

    Left panel: mean of `target` and mean prediction within 20 quantile bins of
    the predictions. Right panel: density histogram of the predictions.

    Parameters
    ----------
    model : object whose `predict` accepts an `h2o.H2OFrame` and returns a
        frame with a 'p1' column.
    df : pandas DataFrame containing the feature columns and `target`.
    target : name of the observed outcome column.
    feature_cols : optional list of feature column names. When omitted, the
        notebook-level variable `features` is used, as before.
    '''
    if feature_cols is None:
        # Backward-compatible fallback to the notebook-level `features` list.
        feature_cols = features

    scored = model.predict(h2o.H2OFrame(df[list(feature_cols) + [target]]))['p1'].as_data_frame()
    scored = scored.rename(columns={'p1': 'predictions'})
    # Both indexes are reset so the predictions line up with the rows by position.
    df_pred = pd.concat([df.reset_index(drop=True), scored.reset_index(drop=True)], axis=1)

    sns.set_style('whitegrid')
    fig, (ax_bins, ax_hist) = plt.subplots(1, 2, figsize=(20, 8))

    pred_bins = pd.qcut(df_pred['predictions'], 20, duplicates='drop')
    by_bin = df_pred.groupby(pred_bins, as_index=True, observed=True)[[target, 'predictions']].mean()
    by_bin = by_bin.rename_axis('Bin').reset_index()
    sns.lineplot(data=by_bin[[target, 'predictions']], lw=2, ax=ax_bins)
    ax_bins.set_title('Mean target and prediction by prediction bin')

    sns.histplot(data=df_pred, x='predictions', stat='density', ax=ax_hist)
    ax_hist.set_title('Distribution of predictions')

## Categorical Encoding

In [None]:

def get_leave_one_out(feature, target):
    '''
    Fit a category_encoders LeaveOneOutEncoder on `feature` against `target`
    and encode `feature` with it.

    The encoder is configured to return NaN for missing and unknown categories
    and with sigma=0.01.

    NOTE(review): `transform` is called without the target here; confirm
    against the category_encoders documentation whether this gives the
    leave-one-out (and sigma-noised) values intended for training data.

    Returns
    -------
    (encoder, encoded) : the fitted encoder and the transformed feature.
    '''
    encoder_options = {
        'handle_missing': 'return_nan',
        'handle_unknown': 'return_nan',
        'sigma': 0.01,
    }
    encoder = ce.LeaveOneOutEncoder(**encoder_options)

    encoder.fit(feature, target)
    encoded = encoder.transform(feature)

    return encoder, encoded


def get_ordinal(feature):
    '''
    Fit a category_encoders OrdinalEncoder on `feature` and encode `feature`
    with it.

    The encoder is configured to return NaN for missing and unknown categories.

    Returns
    -------
    (encoder, encoded) : the fitted encoder and the transformed feature.
    '''
    encoder_options = {
        'handle_missing': 'return_nan',
        'handle_unknown': 'return_nan',
    }
    encoder = ce.ordinal.OrdinalEncoder(**encoder_options)

    encoder.fit(feature)
    encoded = encoder.transform(feature)

    return encoder, encoded

## Resampling

In [None]:
def undersample(df, target, minor_label, major_label, seed=1):
    '''
    Resample a dataset so its majority class is undersampled to match the
    count of its minority class.

    Parameters
    ----------
    df : input DataFrame.
    target : name of the label column.
    minor_label : value of `target` identifying the minority class.
    major_label : value of `target` identifying the majority class.
    seed : random state for the majority draw (default 1), so repeated calls
        return the same rows.

    Returns
    -------
    DataFrame with the sampled majority rows followed by every minority row,
    keeping the original index labels.
    '''
    class_counts = df[target].value_counts()
    n_minor = class_counts[minor_label]

    minority = df[df[target] == minor_label]
    majority = df[df[target] == major_label]

    # The seed must be applied to this draw: it is the only random step, and
    # without it the result changes on every call.
    majority_sample = majority.sample(n_minor, random_state=seed)

    return pd.concat([majority_sample, minority], axis=0)
    
def imbalanced_undersample(df, target, minor_label, major_label, seed=None, ratio=2):
    '''
    Resample a dataset so its majority class is undersampled to a multiple
    (double by default) of the count of its minority class.

    Parameters
    ----------
    df : input DataFrame.
    target : name of the label column.
    minor_label : value of `target` identifying the minority class.
    major_label : value of `target` identifying the majority class.
    seed : optional random state for the majority draw. The default (None)
        keeps the previous unseeded behaviour; pass an int for reproducibility.
    ratio : majority rows kept per minority row (default 2).

    Returns
    -------
    DataFrame with the sampled majority rows followed by every minority row,
    keeping the original index labels.
    '''
    class_counts = df[target].value_counts()
    n_major = int(round(class_counts[minor_label] * ratio))

    minority = df[df[target] == minor_label]
    majority = df[df[target] == major_label]

    majority_sample = majority.sample(n_major, random_state=seed)

    return pd.concat([majority_sample, minority], axis=0)