In [2]:
# Identifying Customer Targets (Python)

# prepare for Python version 3x features and functions
from __future__ import division, print_function

# import packages for text processing and machine learning
import pandas as pd  # DataFrame structure and operations
import numpy as np  # arrays and numerical processing
import matplotlib.pyplot as plt  # 2D plotting
import statsmodels.api as sm  # logistic regression
import statsmodels.formula.api as smf  # R-like model specification
import patsy  # translate model specification into design matrices

# import user-defined module
import evaluate_classifier as eval

# Load the semicolon-delimited bank marketing file into a DataFrame.
# NOTE(review): the original remark says blank character fields mark missing
# data and are read as character fields initially; no read_csv option here
# configures that, so it relies on pandas' default parsing -- confirm.
# NOTE(review): `path` is an absolute, machine-specific Windows directory.
path = 'C:\\Users\\HP\\PycharmProjects\\mds\\dbase_mktng_ch3\\'
bank = pd.read_csv(path + 'bank.csv', sep=';')
# heading and preview are emitted on separate lines, as before
print('original data bank.head() is', bank.head(), sep='\n')

# Collapse the detailed job titles into broad job types.  Jobs that are not
# listed in the mapping (and missing values) end up as 'Other/Unknown'.
job_to_jobtype = dict.fromkeys(
    ['admin.', 'entrepreneur', 'management', 'self-employed'],
    'White Collar')
job_to_jobtype.update(dict.fromkeys(
    ['blue-collar', 'services', 'technician'],
    'Blue Collar'))
bank['jobtype'] = bank['job'].map(job_to_jobtype).fillna('Other/Unknown')

# Relabel marital status with capitalized labels; anything outside the
# mapping (including missing values) is labelled 'Unknown'.
marital_to_label = {
    'divorced': 'Divorced',
    'married': 'Married',
    'single': 'Single',
}
bank['marital'] = bank['marital'].map(marital_to_label).fillna('Unknown')

# Relabel education level with capitalized labels; anything outside the
# mapping (including missing values) is labelled 'Unknown'.
education_to_label = {
    'primary': 'Primary',
    'secondary': 'Secondary',
    'tertiary': 'Tertiary',
}
bank['education'] = bank['education'].map(education_to_label).fillna('Unknown')

# Relabel the three no/yes indicator columns as 'No'/'Yes'.  Values outside
# the mapping (including missing values) default to 'No'.
noyes_to_label = {'no': 'No', 'yes': 'Yes'}
for flag_column in ('default', 'housing', 'loan'):
    bank[flag_column] = bank[flag_column].map(noyes_to_label).fillna('No')

# Code the response as a binary 0/1 integer variable (no -> 0, yes -> 1).
# Values that are missing or outside the mapping are coded 0, the numeric
# counterpart of the 'No' default used for the other no/yes fields.  Filling
# with a number (not the string 'No') keeps the column numeric, which the
# response-rate averages and the logistic regression rely on.
noyes_to_binary = {'no': 0, 'yes': 1}
bank['response'] = (
    bank['response']
    .map(noyes_to_binary)
    .fillna(0)
    .astype('int64')  # map() yields floats whenever a NaN was produced
)

# Work only with bank clients who are being approached for the first time,
# selected here as the rows where pdays equals -1.
# NOTE(review): the name `filter` shadows the Python builtin; it is kept
# unchanged so anything else referring to it keeps working.
filter = bank['pdays'].eq(-1)

# Apply the filter and keep only the columns needed for the targeting model.
model_columns = ['response', 'age', 'jobtype', 'education', 'marital',
                 'default', 'balance', 'housing', 'loan']
bankwork = bank[filter].reindex(columns=model_columns)
print('printing bankwork.head()')
print(bankwork.head())
print('shape is', bankwork.shape)

# Examine descriptive statistics for the working data, followed by a
# frequency table for each categorical explanatory variable.
print('descriptive statistics and frequency tables for variables')
print(bankwork.describe())
frequency_tables = [
    ('\njobtype:\n', 'jobtype'),
    ('\nmarital:\n', 'marital'),
    ('\neducation:\n', 'education'),
    ('\ndefault:\n', 'default'),
    ('\nhousing:\n', 'housing'),
    ('\nloan:\n', 'loan'),
]
for heading, column_name in frequency_tables:
    print(heading, bankwork[column_name].value_counts())

# Examine the continuous explanatory variables by response group
# (pivot_table aggregates with its default function).
for continuous_variable in (['age'], ['balance']):
    print(bankwork.pivot_table(values=continuous_variable, index=['response']))

# Baseline response rate: rows whose response equals 1 divided by all rows
# of the working data.  It is reused later as the denominator for lift.
filter_took_offer = bankwork['response'].eq(1)
responder_count = len(bankwork[filter_took_offer])
baseline_response_rate = responder_count / len(bankwork)
print('\nBaseline proportion of clients responding to offer: ',
      round(baseline_response_rate, 5), '\n')

# Examine the proportion responding across the levels of each categorical
# variable, one table per variable.
for grouping_variable in ('jobtype', 'education', 'marital',
                          'default', 'housing', 'loan'):
    print(bankwork.pivot_table(values=['response'], index=[grouping_variable]))

# Specify the model for logistic regression as an R-like formula:
# response explained by the eight candidate predictors.
bank_spec = ('response ~ age + jobtype + education + marital +'
             '    default + balance + housing + loan')

# ----------------------------------
# fit logistic regression model
# ----------------------------------
# Translate the R-like formula into the response (y) and design (x)
# matrices that statsmodels needs, both returned as DataFrames.
design_matrices = patsy.dmatrices(bank_spec, bankwork,
                                  return_type='dataframe')
y, x = design_matrices

# Fit the logit model to the full working data set and report the results.
my_logit_model = sm.Logit(y, x)
my_logit_model_fit = my_logit_model.fit()
print(my_logit_model_fit.summary())

# Predicted probability of responding to the offer, for every row used to
# fit the model.  predict() is called without the `linear` keyword: False is
# already the default, and newer statsmodels releases deprecate the keyword
# and emit a FutureWarning when it is passed.
bankwork['pred_logit_prob'] = my_logit_model_fit.predict()

# Map a predicted probability to a 0/1 target using the specified cutoff.
def prob_to_pred(x, cutoff):
    """Return 1 when ``x`` is strictly greater than ``cutoff``, else 0."""
    return 1 if x > cutoff else 0

# Try a cutoff of 0.50: flag every client whose predicted probability
# exceeds it, then cross-tabulate the flags against the actual response.
bankwork['pred_logit_50'] = bankwork['pred_logit_prob'].apply(
    prob_to_pred, cutoff=0.50)
confusion_50 = pd.crosstab(
    bankwork['pred_logit_50'], bankwork['response'], margins=True)
print('\nConfusion matrix for 0.50 cutoff\n', confusion_50)
# cutoff 0.50 does not work for targeting... all predictions 0 or No

# Try a cutoff of 0.10: flag every client whose predicted probability
# exceeds it, then cross-tabulate the flags against the actual response.
bankwork['pred_logit_10'] = bankwork['pred_logit_prob'].apply(
    prob_to_pred, cutoff=0.10)
confusion_10 = pd.crosstab(
    bankwork['pred_logit_10'], bankwork['response'], margins=True)
print('\nConfusion matrix for 0.10 cutoff\n', confusion_10)

# NOTE(review): element [4] of evaluate_classifier's result is presumably the
# proportion of targets correctly classified, as the printed label suggests;
# confirm against the evaluate_classifier module.
performance_10 = eval.evaluate_classifier(
    bankwork['pred_logit_10'], bankwork['response'])
print('\n Logistic Regression Performance (0.10 cutoff)\n',
      'Percentage of Targets Correctly Classified:',
      100 * round(performance_10[4], 3), '\n')

# direct calculation of lift
# Decile labels, listed from 'Decile_10' down to 'Decile_1'.
decile_label = ['Decile_' + str(rank) for rank in range(10, 0, -1)]
# Relies on the module-level baseline_response_rate computed earlier.
def lift(x):
    """Return ``x`` divided by ``baseline_response_rate``."""
    lift_ratio = x / baseline_response_rate
    return lift_ratio

# Cut the predicted probabilities into ten quantile-based bins labelled with
# decile_label, then average the actual response within each bin.
prediction_deciles = pd.qcut(
    bankwork['pred_logit_prob'], 10, labels=decile_label)
decile_groups = bankwork['response'].groupby(prediction_deciles)
decile_response_rates = decile_groups.mean()
print(decile_response_rates)

# Lift is each decile's response rate relative to the baseline rate.
lift_values = decile_response_rates / baseline_response_rate
print('\nLift Chart Values by Decile:\n', lift_values, '\n')

original data bank.head() is
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome response  
0  cellular   19   oct        79         1     -1         0  unknown       no  
1  cellular   11   may       220         1    339         4  failure       no  
2  cellular   16   apr       185         1    330         1  failure       no  
3   unknown    3   jun       199         4     -1         0  unknown       no  
4   unknown    5   may       226         1     -1         0  unknown       no  
printing bankwork.head()
   response  age        job

Optimization terminated successfully.
         Current function value: 0.293877
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               response   No. Observations:                 3705
Model:                          Logit   Df Residuals:                     3692
Method:                           MLE   Df Model:                           12
Date:                Sat, 20 Aug 2016   Pseudo R-squ.:                 0.03568
Time:                        08:53:09   Log-Likelihood:                -1088.8
converged:                       True   LL-Null:                       -1129.1
                                        LLR p-value:                 3.223e-12
                               coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------
Intercept                   -2.3937      0.390     -6.132      0.000        -3.1

In [3]:
# Try a cutoff of 0.50 (this cell repeats the 0.50-cutoff step of the main
# cell): flag clients above the cutoff and tabulate against the response.
bankwork['pred_logit_50'] = bankwork['pred_logit_prob'].map(
    lambda probability: prob_to_pred(probability, cutoff=0.50))
confusion_matrix_50 = pd.crosstab(
    bankwork['pred_logit_50'], bankwork['response'], margins=True)
print('\nConfusion matrix for 0.50 cutoff\n', confusion_matrix_50)
# cutoff 0.50 does not work for targeting... all predictions 0 or No


Confusion matrix for 0.50 cutoff
 response          0    1   All
pred_logit_50                 
0              3368  337  3705
All            3368  337  3705


In [8]:
# Preview the first rows of the scored working data rather than displaying
# the whole frame, which would flood the cell output.
bankwork.head(10)

Unnamed: 0,response,age,jobtype,education,marital,default,balance,housing,loan,pred_logit_prob,pred_logit_50,pred_logit_10
0,0,30,Other/Unknown,Primary,Married,No,1787,No,No,0.109280,0,1
3,0,30,White Collar,Tertiary,Married,No,1476,Yes,Yes,0.035131,0,0
4,0,59,Blue Collar,Secondary,Married,No,0,Yes,No,0.064461,0,0
7,0,39,Blue Collar,Secondary,Married,No,147,Yes,No,0.053391,0,0
8,0,41,White Collar,Tertiary,Married,No,221,Yes,No,0.074998,0,0
10,0,39,Blue Collar,Secondary,Married,No,9374,Yes,No,0.055666,0,0
11,0,43,White Collar,Secondary,Married,No,264,Yes,No,0.063508,0,0
12,0,36,Blue Collar,Tertiary,Married,No,1109,No,No,0.091452,0,0
13,1,20,Other/Unknown,Secondary,Single,No,502,No,No,0.173003,0,1
15,0,40,White Collar,Tertiary,Married,No,194,No,Yes,0.056637,0,0


In [7]:
# Cross-tabulate the 0.50-cutoff predictions against the actual response,
# including row and column totals.
confusion_at_50 = pd.crosstab(
    index=bankwork['pred_logit_50'],
    columns=bankwork['response'],
    margins=True)
print(confusion_at_50)

response          0    1   All
pred_logit_50                 
0              3368  337  3705
All            3368  337  3705
