# offers_present to state_funded

In [175]:
# Number of variables to show in the correlation analysis
correlation_show = 20
line_width = 2 # line width for plotly plotting

# Number subset in cross validation
cv = 5
train_size = 0.8

# Training dataset time range end with this date
cut_time = '2019-06-01'

# cut_groups: used in qcut to group population based on pred scores
cut_groups = 10

# Heatmap color 
colorscale = 'Oranges'

In [176]:
import pandas as pd
import pandas_profiling

import numpy as np
from scipy import stats

import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)



import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import warnings
from collections import Counter
from sklearn.feature_selection import mutual_info_classif
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
import missingno as msno

import re

import operator
import copy

In [177]:
max_col_row = 500

x_label_font_size = 15
y_label_font_size = 15
title_font_size = 25

import pandas as pd
pd.set_option('display.max_rows', max_col_row)
pd.set_option('display.max_columns', max_col_row)
pd.set_option('display.width', max_col_row)

%matplotlib inline

from IPython.display import display
from IPython.display import Image

from IPython.display import HTML

In [178]:
# Find skewed variables and apply log transformation if necessary
from scipy.stats import skew
from scipy.stats.stats import pearsonr

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from functools import reduce
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss

In [179]:
# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# <form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [180]:
# Execute this cell to remove the first column of dataframe tables (to remove index column)
from IPython.core.display import HTML
HTML("""
<style>
    table.dataframe thead th:first-child {
        display: none;
    }
    table.dataframe tbody th {
        display: none;
    }
</style>
""")

In [181]:
def if_diff(x):
    if x != 0:
        return 1
    else:
        return 0

# Function to convert a single column to 
# a dummy variables and concatenate back to teh dataframe
def convert_to_dummies(df, column):

    dummies = pd.get_dummies(df[column]).rename(columns=lambda x: column + '_' + str(x))
    return pd.concat([df, dummies], axis=1).drop(column, axis=1)

def corr_map(corr_coef):
    if abs(corr_coef) >= 0 and abs(corr_coef) <= 0.3:
        return "weak"
    elif abs(corr_coef) > 0.3 and abs(corr_coef) <= 0.7:
        return "moderate"
    elif abs(corr_coef) > 0.7 and abs(corr_coef) <= 1:
        return "strong"
    else:
        return "Please ignore"

def date_fmt_update(date):
    date = date[6:10] + "-" + date[0:2] + "-" + date[3:5]
    return date

In [182]:
class DFFunctionTransformer(TransformerMixin, BaseEstimator):
    # FunctionTransformer but for pandas DataFrames

    def __init__(self, *args, **kwargs):
        self.ft = FunctionTransformer(*args, **kwargs)

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        Xt = self.ft.transform(X)
        Xt = pd.DataFrame(Xt, index=X.index, columns=X.columns)
        return Xt


class DFFeatureUnion(TransformerMixin, BaseEstimator):
    # FeatureUnion but for pandas DataFrames

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        for (name, t) in self.transformer_list:
            t.fit(X, y)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xts)
        return Xunion


class DFImputer(TransformerMixin, BaseEstimator):
    # Imputer but for pandas DataFrames

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        self.imp = Imputer(strategy=self.strategy)
        X = X.replace([np.inf, -np.inf], np.nan)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        X = X.replace([np.inf, -np.inf], np.nan)
        Ximp = self.imp.transform(X)
        Xfilled = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        return Xfilled


class DFStandardScaler(TransformerMixin, BaseEstimator):
    # StandardScaler but for pandas DataFrames

    def __init__(self):
        self.ss = None
        self.mean_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X)
        self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
        self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xss = self.ss.transform(X)
        Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return Xscaled


class DFRobustScaler(TransformerMixin, BaseEstimator):
    # RobustScaler but for pandas DataFrames

    def __init__(self):
        self.rs = None
        self.center_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        self.rs = RobustScaler()
        self.rs.fit(X)
        self.center_ = pd.Series(self.rs.center_, index=X.columns)
        self.scale_ = pd.Series(self.rs.scale_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xrs = self.rs.transform(X)
        Xscaled = pd.DataFrame(Xrs, index=X.index, columns=X.columns)
        return Xscaled


class ColumnExtractor(TransformerMixin, BaseEstimator):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = X[self.cols]
        return Xcols


class ZeroFillTransformer(TransformerMixin, BaseEstimator):
    """
    Fill infinite and nulls with zero

    :param X: DataFrame
    :return: DataFrame the replaced infinite and null with zero
    """
    
    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        X[self.col] = X[self.col].replace([np.inf, -np.inf], np.nan)
        X[self.col] = X[self.col].fillna(value=0)
        return X

class FillWithAnotherColumn(TransformerMixin, BaseEstimator):
    """
    Fill null values with values from another column

    :param a: column to be filled
    :param b: column to fill a
    :return: DataFrame with filled values
    """
    
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        X[self.a] = X[self.a].fillna(X[self.b])
        return X

class Log1pTransformer(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xlog = np.log1p(X)
        return Xlog


class DateFormatter(TransformerMixin, BaseEstimator):
    
    def __init__(self, col):
        
        self.col = col

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        # Xdate = X.apply(pd.to_datetime)
        X[self.col] = X[self.col].apply(pd.to_datetime)
        X[self.col] = X[self.col].dt.strftime('%Y-%m-%d')
#         X[self.col] = X[self.col][6:10] + "-" + X[self.col][0:2] + "-" + X[self.col][3:5]
        return X


class DateDiffer(TransformerMixin, BaseEstimator):
    
    def __init__(self, begin_col, end_col, gaps):
        """
        
        :param X: DataFrame
        :param y: 
        :param begin_col: column that contains the beginning date
        :param end_col: column that contains the ending date
        :param gaps: name of the column that represents the gapss
        :return: number of days in between
        """

        self.begin_col = begin_col
        self.end_col = end_col
        self.gaps = gaps
        
    def fit(self, X, y=None):

        # stateless transformer
        return self

    def transform(self, X):

        
        # Convert to datetime format (assuming DateFormatter is applied)
        X[self.end_col] = pd.to_datetime(X[self.end_col])
        X[self.begin_col] = pd.to_datetime(X[self.begin_col])
        X[self.gaps] = (X[self.end_col] - X[self.begin_col]).dt.days
        Xz = X.drop([self.begin_col, self.end_col], axis=1)

        return Xz

class NumericDiffer(TransformerMixin, BaseEstimator):
    
    def __init__(self, a, b, dis_name):
        """
        Mainly used for calculating discrepancies between rates, payments, and terms
        
        :param a: column a
        :param b: column a
        :param dis_name: column name for the discrepancy

        :return: discrepancy        
        
        """
        
        self.a = a
        self.b = b
        self.dis_name = dis_name
        
    def fit(self, X, y=None):

        # stateless transformer
        return self

    def transform(self, X):
        """
        
        :param X: DataFrame
        :param y: 
        :param first_col: 
        :param second_col: column that contains the ending date
        :param gaps: name of the column that represents the gapss
        :return: gaps
        """

        X[self.dis_name] = X[self.a] - X[self.b]

        return X    

class DummyTransformer(TransformerMixin, BaseEstimator):

    def __init__(self):
        self.dv = None

    def fit(self, X, y=None):
        # assumes all columns of X are strings
        Xdict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(Xdict)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xdict = X.to_dict('records')
        Xt = self.dv.transform(Xdict)
        cols = self.dv.get_feature_names()
        Xdum = pd.DataFrame(Xt, index=X.index, columns=cols)
        # drop column indicating NaNs
        nan_cols = [c for c in cols if '=' not in c]
        Xdum = Xdum.drop(nan_cols, axis=1)
        return Xdum


class MultiEncoder(TransformerMixin, BaseEstimator):
    # Multiple-column MultiLabelBinarizer for pandas DataFrames

    def __init__(self, sep=','):
        self.sep = sep
        self.mlbs = None

    def _col_transform(self, x, mlb):
        cols = [''.join([x.name, '=', c]) for c in mlb.classes_]
        xmlb = mlb.transform(x)
        xdf = pd.DataFrame(xmlb, index=x.index, columns=cols)
        return xdf

    def fit(self, X, y=None):
        Xsplit = X.applymap(lambda x: x.split(self.sep))
        self.mlbs = [MultiLabelBinarizer().fit(Xsplit[c]) for c in X.columns]
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xsplit = X.applymap(lambda x: x.split(self.sep))
        Xmlbs = [self._col_transform(Xsplit[c], self.mlbs[i])
                 for i, c in enumerate(X.columns)]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xmlbs)
        return Xunion


class StringTransformer(TransformerMixin, BaseEstimator):

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xstr = X.applymap(str)
        return Xstr


class ClipTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, a_min, a_max):
        self.a_min = a_min
        self.a_max = a_max

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xclip = np.clip(X, self.a_min, self.a_max)
        return Xclip


class AddConstantTransformer(TransformerMixin, BaseEstimator):

    def __init__(self, c=1):
        self.c = c

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xc = X + self.c
        return Xc

class ChangeDataType(TransformerMixin, BaseEstimator):
    
    def __init__(self, cols, new_dtype):
        self.cols = cols
        self.new_dtype = new_dtype

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        """
        
        :param X: DataFrame
        :param cols: columns that require data type changes
        :param new_dtype: new data type
        :return: DataFrame with new data type
        """
        X[self.cols] = X[self.cols].astype(self.new_dtype) 
        return X

class MultiColumnLabelEncoder(TransformerMixin, BaseEstimator):  
    """Transformer for applying label encoder on multiple columns.

    This transformer applies label encoding to columns in a dataset.
    """

    def __init__(self):
        self.d = defaultdict(LabelEncoder)

    def transform(self, X, **transform_params):
        """Transforms X to have columns label encoded.

        Args:
            X (obj): The dataset to transform. Can be dataframe or matrix.
            transform_params (kwargs, optional): Additional params.

        Returns:
            The transformed dataset with the label encoded columns.
        """
        X = X.fillna('NaN')  # fill null values with 'NaN'
        transformed = X.apply(lambda x: self.d[x.name].transform(x))
        return transformed

    def fit(self, X, y=None, **fit_params):
        """Fits transfomer over X.

        Needs to apply fit over the defaultdict so as to retain the
        label classes when transforming.
        """
        X = X.fillna('NaN')  # fill null values with 'NaN'
        X.apply(lambda x: self.d[x.name].fit(x))
        return self


In [183]:
# cr_all_time.columns.to_list()

In [184]:
# customer report
cr_all_time = pd.read_csv\
("loan_app_report_loan_level---2018-03-01---2019-06-20.csv", low_memory=False)

In [185]:
cr_all_time['date'] = cr_all_time['date'].apply(date_fmt_update)


cr_all_time = cr_all_time[((cr_all_time['utm_campaign'].isin(['CK', 'ck'])) | \
            (cr_all_time['direct_mail_batch'].notnull())) & \
                   (cr_all_time["date"] <= cut_time)]

cr_all_time['channel'] = cr_all_time['direct_mail_batch'].fillna('CK')
cr_all_time.loc[cr_all_time['channel'] != "CK", 'channel'] = "DM"

cr_all_time = cr_all_time[cr_all_time['offers_present']==1].drop('offers_present', axis=1)

In [186]:
# Take out outliers

cr_all_time = cr_all_time[cr_all_time['vehicle_age'] != 2019]
cr_all_time = cr_all_time[cr_all_time['customer_annual_income'] < 1000000]
cr_all_time = cr_all_time[cr_all_time['current_payment'] < 3000]

In [187]:
OUTCOME = 'state_funded'
# NEAR_UNIQUE_FEATS = ['name_of_event', 'year_month_app', 'organization']
# DATE_FEATS = ['application_date', 'event_start_date', 'event_end_date']

DATE_FEATS = ['first_touched_full', 'last_touched_full']

# Features that require data type updates
DTYPE_FEATS = ['dti', 'ltv', 'decisioning_ltv', 'fico']

# CURR_MINUS_PRE_RATE_FEATS = ['current_rate', 'presented_rate']
# CURR_MINUS_BEST_PRE_RATE = ['current_rate', 'best_presented_rate']
# CURR_MINUS_MONTHLY_PAYMENT = ['current_rate', 'current_monthly_payment']
# CURR_MINUS_PRE_PAYMENT = ['current_rate', 'presented_payment']

# DUMMY_FEATS = ['']
RATE_PMT_TERM = [
    
    'current_rate', 
    'presented_rate',
    'best_presented_apr',
    'current_monthly_payment',
    'current_payment',
    'presented_payment',
    'current_term',
    'remaining_term',
    'presented_term',
    'balance_of_open_collections',
    'num_open_collections'
    
]

REST_NUM_FEATS = [ 
'fico_score_over_660',
 'vehicle_age',
 'current_loan_balance',
 'customer_age',
 'customer_annual_income',
 'apr_decrease_presented',
 'coborrower',
 'num_currently_deinquent_tradelines',
 'num_30_day_tradeline_delinquencies_in_12_months',
 'num_60_or_more_day_tradeline_delinquencies_in_12_months',
 'num_tradelines_opened_in_last_6_months',
 'num_tradelines_charged_off',
#  'curr_minus_pre_rate',
#  'curr_minus_best_pre_rate',
#  'curr_minus_monthly_payment',
#  'curr_minus_payment',
#  'remain_minus_pre_term'
]

ALL_FEATS = list(set(DATE_FEATS + DTYPE_FEATS + REST_NUM_FEATS))
# ALL_FEATS

In [188]:
# print("Number of total features used in model (not \
# including target variable): {}".format(len(ALL_FEATS)))

In [189]:

X = cr_all_time.drop(OUTCOME, axis=1)
y = cr_all_time[OUTCOME].values

x_train, x_test, y_train, y_test=train_test_split(X,y,train_size=train_size)

In [190]:

# Preprocessing with a Pipeline
lr_pipeline = Pipeline([
    ('features', DFFeatureUnion([
        
#         ('dates', Pipeline([
#             ('extract', ColumnExtractor(DATE_FEATS)),
#             ('diffs', DateDiffer('first_touched_full', 'last_touched_full', 'touch_gaps_days')),
#             ('touch_gaps_days_fill_with_avg', DFImputer())       
#         ])),

        ('rate_pmt_term', Pipeline([
            ('extract', ColumnExtractor(RATE_PMT_TERM)),
            ('current_rate_fill_with_zero', ZeroFillTransformer('current_rate')),
            ('pre_rate_filled_with_curr_rate', FillWithAnotherColumn('presented_rate', 'current_rate')),
            ('curr_minus_pre_rate', NumericDiffer('current_rate', 'presented_rate', 'curr_minus_pre_rate')),
            ('curr_minus_pre_rate_fill_zero', ZeroFillTransformer('curr_minus_pre_rate')),
            ('best_pre_apr_filled_with_curr_rate', FillWithAnotherColumn('best_presented_apr', 'current_rate')),
            ('curr_minus_best_pre_apr', NumericDiffer('current_rate', 'best_presented_apr', 'curr_minus_best_pre_apr')),
            ('curr_minus_best_pre_apr_fill_zero', ZeroFillTransformer('curr_minus_best_pre_apr')),
            ('pre_pmt_filled_with_curr_mthly_pmt', FillWithAnotherColumn('presented_payment', 'current_monthly_payment')),
            ('curr_monthly_minus_pre_payment', NumericDiffer('current_monthly_payment', 'presented_payment', 'curr_monthly_minus_pre_payment')),
            ('curr_monthly_minus_pre_payment_fill_zero', ZeroFillTransformer('curr_monthly_minus_pre_payment')),
            ('curr_minus_pre_payment', NumericDiffer('current_payment', 'presented_payment', 'curr_minus_pre_payment')),
            ('curr_minus_pre_payment_fill_zero', ZeroFillTransformer('curr_minus_pre_payment')),
            ('pre_term_filled_with_remaining_term', FillWithAnotherColumn('presented_term', 'current_term')),
            ('curr_minus_pre_term', NumericDiffer('current_term', 'presented_term', 'curr_minus_pre_term')),
            ('curr_minus_pre_term_fill_zero', ZeroFillTransformer('curr_minus_pre_term')),
            ('pre_term_filled_with_curr_term', FillWithAnotherColumn('presented_term', 'remaining_term')),
            ('remain_minus_pre_term', NumericDiffer('remaining_term', 'presented_term', 'remain_minus_pre_term')),
            ('remain_minus_pre_term_fill_zero', ZeroFillTransformer('remain_minus_pre_term')),            
            ('balance_of_open_collections_fill_zero', ZeroFillTransformer('balance_of_open_collections')),
            ('num_open_collections_fill_zero', ZeroFillTransformer('num_open_collections')),
            ('rate_pmt_term_fill_zero', ZeroFillTransformer(RATE_PMT_TERM))
        ])),
        
        ('dtype_change', Pipeline([
            ('extract', ColumnExtractor(DTYPE_FEATS)),
            ('dti_dtype_update', ChangeDataType('dti', 'float64')),
            ('ltv_dtype_update', ChangeDataType('ltv', 'float64')),
            ('dec_ltv_dtype_update', ChangeDataType('decisioning_ltv', 'float64'))
        ])),
     
        ('rest_numeric_fill_with_avg', Pipeline([
            ('extract', ColumnExtractor(REST_NUM_FEATS)),
            ('fill_rest_numeric_with_avg', DFImputer())
        ]))        
        
    ])),
    # For all data in df: 1. replacing infinity with np.nan
    # 2. fill everything with average
    # DFImputer takes care of both steps
    ('fill_all_with_avg', DFImputer()),
    ('scale', DFStandardScaler()),
    ('log_regre', LogisticRegression(C=10, penalty='l2'))
])

In [191]:
# test = lr_pipeline.fit_transform(x_train, y_train)
# test.head()

In [192]:
lr_pipeline.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 DFFeatureUnion(transformer_list=[('rate_pmt_term',
                                                   Pipeline(memory=None,
                                                            steps=[('extract',
                                                                    ColumnExtractor(cols=['current_rate',
                                                                                          'presented_rate',
                                                                                          'best_presented_apr',
                                                                                          'current_monthly_payment',
                                                                                          'current_payment',
                                                                                          'presented_payment',
                                                                  

In [193]:
y_pred_proba = lr_pipeline.predict_proba(x_test)[:,1]

In [194]:
outcome_and_scores = pd.DataFrame({'actual_outcome':y_test, 'predict_score':y_pred_proba})
# outcome_and_scores.head()

outcome_and_scores['decile'] = pd.qcut(outcome_and_scores['predict_score'], cut_groups, labels=np.arange(cut_groups, 0, -1))
conversion_rate_by_decile = outcome_and_scores.groupby('decile')\
['actual_outcome'].agg({'actual_outcome':'mean'}).reset_index().sort_values(by='decile', ascending=False)\
.rename(columns={'actual_outcome':'conversion_rate'})
conversion_rate_by_decile

Unnamed: 0,decile,conversion_rate
9,1,0.414634
8,2,0.288344
7,3,0.184049
6,4,0.153374
5,5,0.08589
4,6,0.085366
3,7,0.09816
2,8,0.092025
1,9,0.06135
0,10,0.036585


In [195]:
print("Baseline Conversion Rate: {0:.2%}".format(sum(y_test)/len(y_test)))

Baseline Conversion Rate: 15.00%


In [196]:
# Conversion rate state_funded on different curr_minus_sel_rate_levels

data = [
    go.Bar(
        x=conversion_rate_by_decile['decile'], # assign x as the dataframe column 'x'
        y=conversion_rate_by_decile['conversion_rate'],
        marker = dict(color = 'rgb(0,152,255)'),
        name='Conversion Rate by Decile'
        # orientation='h',
    ),
    go.Scatter(x=list(conversion_rate_by_decile['decile']), \
               y=[sum(y_test)/len(y_test)] * cut_groups, 
                    mode='lines', 
                    line=dict(color='black', width=line_width, dash='dash'),
                    showlegend=True,
                    name='Baseline Conversion Rate'
    )
]

layout = go.Layout(
    title=go.layout.Title(
        text='Conversion Rate by Deciles After Applying Predictive Model',
        # titlefont=dict(size=title_font_size),
        font=dict(family="Franklin Gothic", size=title_font_size)
    ),
    xaxis=go.layout.XAxis(
        title='Deciles',
        automargin=True,
        titlefont=dict(size=x_label_font_size)
    ),
    yaxis=go.layout.YAxis(
        title='Conversion Rate',
        automargin=True,
        titlefont=dict(size=y_label_font_size),
        tickformat=".2%"
    ),
    
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='lift-chart')

In [197]:
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
print('Model Results for {}/{} split:'.format(int(train_size*100), int(100 - train_size*100)))
# print(lr_pipeline.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))
# print(lr_pipeline.__class__.__name__+" auc is %2.3f" % auc(fpr, tpr))

idx = np.min(np.where(tpr > 0.95)) # index of the first threshold for which the sensibility > 0.95

trace1 = go.Scatter(x=fpr, y=tpr, 
                    mode='lines', 
                    line=dict(color='darkorange', width=line_width),
                    name='ROC curve (area = %0.2f)' % roc_auc
                   )

trace2 = go.Scatter(x=[0, 1], y=[0, 1], 
                    mode='lines', 
                    line=dict(color='navy', width=line_width, dash='dash'),
                    showlegend=False)

layout = go.Layout(title='Receiver operating characteristic example',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))

fig = go.Figure(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='name_tbd')

print("Using a threshold of %.3f " % thr[idx] + "guarantees a sensitivity of %.3f " % tpr[idx] +  
      "and a specificity of %.3f" % (1-fpr[idx]) + 
      ", i.e. a false positive rate of %.2f%%." % (np.array(fpr[idx])*100))

Model Results for 80/20 split:


Using a threshold of 0.074 guarantees a sensitivity of 0.951 and a specificity of 0.178, i.e. a false positive rate of 82.20%.


## Take out first_touched_full and last_touched_full

In [198]:
# X = cr_all_time.drop(OUTCOME, axis=1)
# y = cr_all_time[OUTCOME].values

# x_train, x_test, y_train, y_test=train_test_split(X,y,train_size=train_size)

In [199]:

# Preprocessing with a Pipeline
lr_pipeline = Pipeline([
    ('features', DFFeatureUnion([
        
#         ('dates', Pipeline([
#             ('extract', ColumnExtractor(DATE_FEATS)),
#             ('diffs', DateDiffer('first_touched_full', 'last_touched_full', 'touch_gaps_days')),
#             ('touch_gaps_days_fill_with_avg', DFImputer())       
#         ])),

        ('rate_pmt_term', Pipeline([
            ('extract', ColumnExtractor(RATE_PMT_TERM)),
            ('current_rate_fill_with_zero', ZeroFillTransformer('current_rate')),
            ('pre_rate_filled_with_curr_rate', FillWithAnotherColumn('presented_rate', 'current_rate')),
            ('curr_minus_pre_rate', NumericDiffer('current_rate', 'presented_rate', 'curr_minus_pre_rate')),
            ('curr_minus_pre_rate_fill_zero', ZeroFillTransformer('curr_minus_pre_rate')),
            ('best_pre_apr_filled_with_curr_rate', FillWithAnotherColumn('best_presented_apr', 'current_rate')),
            ('curr_minus_best_pre_apr', NumericDiffer('current_rate', 'best_presented_apr', 'curr_minus_best_pre_apr')),
            ('curr_minus_best_pre_apr_fill_zero', ZeroFillTransformer('curr_minus_best_pre_apr')),
            ('pre_pmt_filled_with_curr_mthly_pmt', FillWithAnotherColumn('presented_payment', 'current_monthly_payment')),
            ('curr_monthly_minus_pre_payment', NumericDiffer('current_monthly_payment', 'presented_payment', 'curr_monthly_minus_pre_payment')),
            ('curr_monthly_minus_pre_payment_fill_zero', ZeroFillTransformer('curr_monthly_minus_pre_payment')),
            ('curr_minus_pre_payment', NumericDiffer('current_payment', 'presented_payment', 'curr_minus_pre_payment')),
            ('curr_minus_pre_payment_fill_zero', ZeroFillTransformer('curr_minus_pre_payment')),
            ('pre_term_filled_with_remaining_term', FillWithAnotherColumn('presented_term', 'current_term')),
            ('curr_minus_pre_term', NumericDiffer('current_term', 'presented_term', 'curr_minus_pre_term')),
            ('curr_minus_pre_term_fill_zero', ZeroFillTransformer('curr_minus_pre_term')),
            ('pre_term_filled_with_curr_term', FillWithAnotherColumn('presented_term', 'remaining_term')),
            ('remain_minus_pre_term', NumericDiffer('remaining_term', 'presented_term', 'remain_minus_pre_term')),
            ('remain_minus_pre_term_fill_zero', ZeroFillTransformer('remain_minus_pre_term')),            
            ('balance_of_open_collections_fill_zero', ZeroFillTransformer('balance_of_open_collections')),
            ('num_open_collections_fill_zero', ZeroFillTransformer('num_open_collections')),
            ('rate_pmt_term_fill_zero', ZeroFillTransformer(RATE_PMT_TERM))
        ])),
        
        ('dtype_change', Pipeline([
            ('extract', ColumnExtractor(DTYPE_FEATS)),
            ('dti_dtype_update', ChangeDataType('dti', 'float64')),
            ('ltv_dtype_update', ChangeDataType('ltv', 'float64')),
            ('dec_ltv_dtype_update', ChangeDataType('decisioning_ltv', 'float64'))
        ])),
     
        ('rest_numeric_fill_with_avg', Pipeline([
            ('extract', ColumnExtractor(REST_NUM_FEATS)),
            ('fill_rest_numeric_with_avg', DFImputer())
        ]))        
        
    ])),
    # For all data in df: 1. replacing infinity with np.nan
    # 2. fill everything with average
    # DFImputer takes care of both steps
    ('fill_all_with_avg', DFImputer()),
    ('scale', DFStandardScaler()),
    ('log_regre', LogisticRegression(C=10, penalty='l2'))
])

In [200]:
lr_pipeline.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 DFFeatureUnion(transformer_list=[('rate_pmt_term',
                                                   Pipeline(memory=None,
                                                            steps=[('extract',
                                                                    ColumnExtractor(cols=['current_rate',
                                                                                          'presented_rate',
                                                                                          'best_presented_apr',
                                                                                          'current_monthly_payment',
                                                                                          'current_payment',
                                                                                          'presented_payment',
                                                                  

In [201]:
y_pred_proba = lr_pipeline.predict_proba(x_test)[:,1]

In [202]:
outcome_and_scores = pd.DataFrame({'actual_outcome':y_test, 'predict_score':y_pred_proba})
# outcome_and_scores.head()

outcome_and_scores['decile'] = pd.qcut(outcome_and_scores['predict_score'], cut_groups, labels=np.arange(cut_groups, 0, -1))
conversion_rate_by_decile = outcome_and_scores.groupby('decile')\
['actual_outcome'].agg({'actual_outcome':'mean'}).reset_index().sort_values(by='decile', ascending=False)\
.rename(columns={'actual_outcome':'conversion_rate'})
conversion_rate_by_decile

Unnamed: 0,decile,conversion_rate
9,1,0.414634
8,2,0.288344
7,3,0.184049
6,4,0.153374
5,5,0.08589
4,6,0.085366
3,7,0.09816
2,8,0.092025
1,9,0.06135
0,10,0.036585


In [203]:
print("Baseline Conversion Rate: {0:.2%}".format(sum(y_test)/len(y_test)))

Baseline Conversion Rate: 15.00%


In [204]:
# Conversion rate state_funded on different curr_minus_sel_rate_levels

data = [
    go.Bar(
        x=conversion_rate_by_decile['decile'], # assign x as the dataframe column 'x'
        y=conversion_rate_by_decile['conversion_rate'],
        marker = dict(color = 'rgb(0,152,255)'),
        name='Conversion Rate by Decile'
        # orientation='h',
    ),
    go.Scatter(x=list(conversion_rate_by_decile['decile']), \
               y=[sum(y_test)/len(y_test)] * cut_groups, 
                    mode='lines', 
                    line=dict(color='black', width=line_width, dash='dash'),
                    showlegend=True,
                    name='Baseline Conversion Rate'
    )
]

layout = go.Layout(
    title=go.layout.Title(
        text='Conversion Rate by Deciles After Applying Predictive Model',
        # titlefont=dict(size=title_font_size),
        font=dict(family="Franklin Gothic", size=title_font_size)
    ),
    xaxis=go.layout.XAxis(
        title='Deciles',
        automargin=True,
        titlefont=dict(size=x_label_font_size)
    ),
    yaxis=go.layout.YAxis(
        title='Conversion Rate',
        automargin=True,
        titlefont=dict(size=y_label_font_size),
        tickformat=".2%"
    ),
    
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='lift-chart')

In [205]:
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
print('Model Results for {}/{} split:'.format(int(train_size*100), int(100 - train_size*100)))
# print(lr_pipeline.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))
# print(lr_pipeline.__class__.__name__+" auc is %2.3f" % auc(fpr, tpr))

idx = np.min(np.where(tpr > 0.95)) # index of the first threshold for which the sensibility > 0.95

trace1 = go.Scatter(x=fpr, y=tpr, 
                    mode='lines', 
                    line=dict(color='darkorange', width=line_width),
                    name='ROC curve (area = %0.2f)' % roc_auc
                   )

trace2 = go.Scatter(x=[0, 1], y=[0, 1], 
                    mode='lines', 
                    line=dict(color='navy', width=line_width, dash='dash'),
                    showlegend=False)

layout = go.Layout(title='Receiver operating characteristic example',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))

fig = go.Figure(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='name_tbd')

print("Using a threshold of %.3f " % thr[idx] + "guarantees a sensitivity of %.3f " % tpr[idx] +  
      "and a specificity of %.3f" % (1-fpr[idx]) + 
      ", i.e. a false positive rate of %.2f%%." % (np.array(fpr[idx])*100))

Model Results for 80/20 split:


Using a threshold of 0.074 guarantees a sensitivity of 0.951 and a specificity of 0.178, i.e. a false positive rate of 82.20%.
