In [43]:
# Data Processing
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier

# Evaluation
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn import metrics

In [44]:
df_train = pd.read_csv("/sandbox/cbgba/Modeling/Users/zhangyun/CR/data/in/TRAIN.csv")
test = pd.read_csv("/sandbox/cbgba/Modeling/Users/zhangyun/CR/data/in/TEST.csv")

In [45]:
df_train.set_index('PARTY_ID', inplace=True)
test.set_index('PARTY_ID', inplace=True)

In [46]:
print 'df_train-shape: ', df_train.shape
print 'test-shape:  ', test.shape

df_train-shape:  (1840353, 26)
test-shape:   (1838442, 26)


In [47]:
df_train.columns

Index([u'event', u'F_CARD_1M', u'AGE_1M', u'F_CARD1_ESSO_1M', u'DBS_TENURE_1M',
       u'Avg_Ttl_txncnt_IB_MB_DCC_6m', u'avg_T_txncnt_Local_Spend_6m',
       u'A_3m_tamt_cr', u'F_MAINCARD_MAX3M', u'Avg_Ttl_txncnt_N_Fin_6m',
       u'A_6m_tamt_Giro', u'avg_T_txnamt_Telecom_6m',
       u'avg_T_txnamt_Dining_6m', u'BAL_DEP1_SA_CHG',
       u'Avg_Ttl_txncnt_IB_MB_THE_6m', u'mx_6_tcnt_dr', u'mn_3_tamt_Advice',
       u'cr_dr_amt_3', u'A_6m_tcnt_SA_Cr', u'Avg_Ttl_txnamt_MB_6m',
       u'Avg_Ttl_txnamt_IB_MB_TPFT_6m', u'Avg_Ttl_txnamt_IB_MB_G3IFT_6m',
       u'SC_BAL_DEP1_SA_1M', u'Avg_Ttl_txnamt_ATM_3m', u'mn_3_tamt_SA_IBW',
       u'Avg_Ttl_txnamt_IB_MB_FPFT_6m'],
      dtype='object')

In [48]:
#Handling Unbalance 
k = 7 
n_sample = df_train.event.value_counts()[1] * k
train_0 = df_train[df_train['event'] == 0].sample(n=n_sample, random_state=1)
train_1 = df_train[df_train['event'] == 1]
train = train_0.append(train_1)

print 'train_0.shape: ', train_0.shape
print 'train_1.shape: ', train_1.shape
print 'train.shape:', train.shape

train_0.shape:  (168637, 26)
train_1.shape:  (24091, 26)
train.shape: (192728, 26)


In [49]:
#Halidng Missing Values
na_fill = df_train.median().to_dict()
df_train.fillna(value=na_fill, inplace=True)
train.fillna(value=na_fill, inplace=True)
test.fillna(value=na_fill, inplace=True)

Unnamed: 0_level_0,event,F_CARD_1M,AGE_1M,F_CARD1_ESSO_1M,DBS_TENURE_1M,Avg_Ttl_txncnt_IB_MB_DCC_6m,avg_T_txncnt_Local_Spend_6m,A_3m_tamt_cr,F_MAINCARD_MAX3M,Avg_Ttl_txncnt_N_Fin_6m,...,mn_3_tamt_Advice,cr_dr_amt_3,A_6m_tcnt_SA_Cr,Avg_Ttl_txnamt_MB_6m,Avg_Ttl_txnamt_IB_MB_TPFT_6m,Avg_Ttl_txnamt_IB_MB_G3IFT_6m,SC_BAL_DEP1_SA_1M,Avg_Ttl_txnamt_ATM_3m,mn_3_tamt_SA_IBW,Avg_Ttl_txnamt_IB_MB_FPFT_6m
PARTY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10601_0000006810,0,0,60.24,0,18.54,0.0,0.0000,0.0000,0,0.0000,...,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000007934,0,0,49.35,0,29.81,0.0,0.0000,0.0000,0,0.0000,...,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000008723,0,0,23.40,0,23.28,0.0,0.0000,5488.9333,0,0.0000,...,2320.62,1.2513,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000008822,0,0,27.40,0,19.42,0.0,0.0000,203.9500,0,0.0000,...,5519.00,0.0049,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000009620,0,0,24.81,0,23.61,0.0,0.0000,0.0000,0,0.0000,...,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000009822,0,0,46.36,0,29.89,0.0,0.0000,33333.3333,0,0.0000,...,0.00,1.0535,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000011144,0,0,26.78,0,26.12,0.0,0.0000,0.0000,0,0.0000,...,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000011227,0,0,43.61,0,27.16,0.0,0.0000,60100.0000,0,0.0000,...,0.00,0.9813,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000012544,0,0,36.67,0,22.80,0.0,0.0000,21226.0767,0,0.0000,...,50.00,0.8270,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000
10601_0000013371,0,0,43.44,0,30.06,0.0,0.0000,0.0000,0,0.0000,...,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0,0.000


### RandomForest

In [50]:
#Data--> Train, val, test
X, y = train.drop('event', axis=1), train['event'].copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
print 'train | val (sampling):'
print 'X_train.shape: ', X_train.shape
print 'X_val.shape: ', X_val.shape
print 'y_train.shape: ', y_train.shape
print 'y_val.shape: ', y_val.shape

X_test, y_test = test.drop('event', axis=1), test['event'].copy()
print 'Test:'
print 'X_test.shape: ', X_test.shape
print 'y_test.shape: ', y_test.shape


X_df, y_df = df_train.drop('event', axis=1), df_train['event'].copy()
X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(X_df, y_df, test_size=0.3, stratify=y_df, random_state=1)
print 'train | val (full):'
print 'X_df_train.shape: ', X_df_train.shape
print 'y_df_train.shape: ', y_df_train.shape
print 'X_df_val.shape: ', X_df_val.shape
print 'y_df_val.shape: ', y_df_val.shape

train | val (sampling):
X_train.shape:  (134909, 25)
X_val.shape:  (57819, 25)
y_train.shape:  (134909,)
y_val.shape:  (57819,)
Test:
X_test.shape:  (1838442, 25)
y_test.shape:  (1838442,)
train | val (full):
X_df_train.shape:  (1288247, 25)
y_df_train.shape:  (1288247,)
X_df_val.shape:  (552106, 25)
y_df_val.shape:  (552106,)


In [51]:
#Training Model
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=1) #max_depth =10
rf.fit(X_train, y_train)

#Evaluation
rf_train_full = rf.predict(X_df_train) #full train dataset
rf_val_full = rf.predict(X_df_val) #full val dataset
rf_train = rf.predict(X_train) #sampling datast
rf_val = rf.predict(X_val)
rf_test = rf.predict(X_test)

#Print
print 'AUC-RandomForest:'
print('train_full: {:.2%}'.format(metrics.roc_auc_score(y_df_train, rf_train_full)))
print('val_full: {:.2%}'.format(metrics.roc_auc_score(y_df_val, rf_val_full)))
print('train_sample: {:.2%}'.format(metrics.roc_auc_score(y_train, rf_train)))
print('val: {:.2%}'.format(metrics.roc_auc_score(y_val, rf_val)))
print('test: {:.2%}'.format(metrics.roc_auc_score(y_test, rf_test)))

AUC-RandomForest:
train_full: 57.21%
val_full: 57.45%
train_sample: 57.48%
val: 56.94%
test: 56.80%


In [52]:
def DecileReport(y_true, y_pred, group=10):
    '''
    to generate the decile report (Ks, Gain, Lift@decile, Lift@total)
    act: actual values
    pred: prediction values
    group: # of deciles 
    '''
    
    #Construct dataframe: 
    pdict = pd.DataFrame({"true":y_true, "pred":y_pred})
    pdict['decile'] = pd.qcut(pdict['pred'], group, labels=False)
    report = pdict.groupby(['decile']).agg({'true': {'#pop' : 'count', '#num1': 'sum'},
                                            'pred': {'prob_min': 'min', 'prob_max': 'max'}})
    
    
    report.columns = report.columns.droplevel(0) #drop level0('true', 'pred')
    report.sort_index(ascending=False, inplace=True) #sort index
    
    #counts
    report['#num0'] = report['#pop'] - report['#num1'] 
    
    #percent
    report['%pop'] = report['#pop'] / sum(report['#pop'])
    report['%num1'] = report['#num1'] / sum(report['#num1'])
    report['%num0'] = report['#num0'] / sum(report['#num0'])
    
    #cumulative
    report['cum%pop'] = report['%pop'].cumsum()
    report['cum%num1'] = report['%num1'].cumsum()
    report['cum%num0'] = report['%num0'].cumsum()
    
    #result
    report['ks'] = report['cum%num1'] - report['cum%num0']
    report['lift@decile'] = report['%num1'] / report['%pop']
    report['lift@total'] = report['cum%num1'] / report['cum%pop']
    
    '''
    return:
    prob_min: minimum probablity
    prob_max: maximum probablity 
    
    #pop: number of cases in its group
    #num1: number of positive(events/responses..)
    #num0: number of negative(non events/responses..)
    
    %pop: percentage of cases 
    %num1: percentage of positive
    %num0: percentage of negative
    
    cum%pop: cumulative percentage of cases
    cum%num1: cumulative percentage of positive
    cum%num0: cumulative percentage of negative
    
    ks: the degree of separation between the positive and negative distributions
    lift@decile: lift in each group, (%num1/%pop)
    lift@total: lift total, (cum%num1/cum%pop)
    '''
    
    return report.round(3)

In [38]:
# full training
temp = rf.predict_proba(X_df_train)[:,1]
DecileReport(y_df_train, temp, 10).to_csv('/sandbox/cbgba/Modeling/Users/zhangyun/CR/data/out0/decile_train' + str(k)+'.csv')

In [39]:
# full val
temp = rf.predict_proba(X_df_val)[:,1]
DecileReport(y_df_val, temp, 10).to_csv('/sandbox/cbgba/Modeling/Users/zhangyun/CR/data/out0/decile_val'+ str(k)+'.csv')

In [40]:
# test
temp = rf.predict_proba(X_test)[:,1]
DecileReport(y_test, temp, 10).to_csv('/sandbox/cbgba/Modeling/Users/zhangyun/CR/data/out0/decile_test'+ str(k)+'.csv')

In [55]:
temp = rf.predict_proba(X_df_train)[:,1]
DR = DecileReport(y_df_train, temp, 10)

In [56]:
DR

Unnamed: 0_level_0,prob_min,prob_max,#pop,#num1,#num0,%pop,%num1,%num0,cum%pop,cum%num1,cum%num0,ks,lift@decile,lift@total
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
9,0.247,0.993,128825,9437,119388,0.1,0.56,0.094,0.1,0.56,0.094,0.466,5.596,5.596
8,0.187,0.247,128825,2908,125917,0.1,0.172,0.099,0.2,0.732,0.193,0.539,1.724,3.66
7,0.134,0.187,128824,1750,127074,0.1,0.104,0.1,0.3,0.836,0.293,0.543,1.038,2.786
6,0.092,0.134,128825,1145,127680,0.1,0.068,0.1,0.4,0.904,0.393,0.51,0.679,2.259
5,0.056,0.092,128824,652,128172,0.1,0.039,0.101,0.5,0.942,0.494,0.448,0.387,1.885
4,0.035,0.056,128825,340,128485,0.1,0.02,0.101,0.6,0.963,0.595,0.367,0.202,1.604
3,0.025,0.035,128825,303,128522,0.1,0.018,0.101,0.7,0.98,0.696,0.284,0.18,1.401
2,0.019,0.025,128824,196,128628,0.1,0.012,0.101,0.8,0.992,0.797,0.195,0.116,1.24
1,0.014,0.019,128825,81,128744,0.1,0.005,0.101,0.9,0.997,0.899,0.098,0.048,1.108
0,0.012,0.014,128825,52,128773,0.1,0.003,0.101,1.0,1.0,1.0,-0.0,0.031,1.0


In [82]:
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)

In [83]:
dr_data = DR.copy()

In [146]:
#Data for Cummulative Gain Chart
x1 = [0] + dr_data['cum%pop'].tolist()
y1 = [0] + dr_data['cum%num1'].tolist()

x2 = [0] + dr_data['cum%pop'].tolist()
y2 = [0] + dr_data['cum%pop'].tolist()

In [153]:
#Cummulative Gain Chart
trace1 = go.Scatter(name='Model',
                    x=x1, y=y1)
trace2 = go.Scatter(name='Random',
                    x=x2, y=y2)


layout=go.Layout(
                autosize=True,
                width=550, height=400,
                title='Cummulative Gain Chart',
                xaxis= dict(title= '%Population', range=[0,1]),
                yaxis=dict(title= '%Event')
                )

data = [trace1, trace2]

fig= go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

In [154]:
#Lift Chart
x1 = dr_data['cum%pop'].tolist()
y1 = dr_data['lift@total'].tolist()

x2 = dr_data['cum%pop'].tolist()
y2 = [1] * 10

In [159]:
#list Chart
trace1 = go.Scatter(name='Model',
                    x=x1, y=y1)
trace2 = go.Scatter(name='Random',
                    x=x2, y=y2)


layout=go.Layout(
                width=550, height=400,
                title='Lift Chart',
                xaxis= dict(title= '%Population', range=[0,1]),
                yaxis=dict(title= 'Lift')
                )
data = [trace1, trace2]

fig= go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

In [160]:
#lift@decile
x1 = dr_data['cum%pop'].tolist()
y1 = dr_data['lift@decile'].tolist()

x2 = dr_data['cum%pop'].tolist()
y2 = [1] * 10

In [165]:
#list@decile Chart
trace1 = go.Scatter(name='Model',
                    x=x1, y=y1)
trace2 = go.Scatter(name='Random',
                    x=x2, y=y2)


layout=go.Layout(
                width=550, height=400,
                title='Lift@Decile Chart',
                xaxis= dict(title= '%Population', range=[0,1]),
                yaxis=dict(title= 'Lift@Decile')
                )
data = [trace1, trace2]

fig= go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

In [185]:
#KS
x1 = [0] + dr_data['cum%pop'].tolist()
y1 = [0] + dr_data['cum%num1'].tolist()

x2 = [0] + dr_data['cum%pop'].tolist()
y2 = [0] + dr_data['cum%num0'].tolist()


In [183]:
trace1 = go.Scatter(name='%Event',
                    x=x1, y=y1)
trace2 = go.Scatter(name='%Non-event',
                    x=x2, y=y2)
layout=go.Layout(
                width=550, height=400,
                title='K-S Chart',
                xaxis= dict(title= '%Population', range=[0,1]),
                yaxis=dict(title= '%Event')
                )
data = [trace1, trace2]

fig= go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

In [262]:
#Data
x1 = dr_data['cum%pop'].tolist()
y1 = dr_data['cum%num1'].tolist()
y2 = dr_data['cum%num0'].tolist()
y3 = dr_data['lift@decile'].tolist()
y4 = [1] * 10
y5= dr_data['lift@total'].tolist()


#KS
ks1 = go.Scatter(name='%Event', x=[0]+x1, y=[0]+y1)
ks2 = go.Scatter(name='%Non-event', x=[0]+x1, y=[0]+y2)

#Gain
gain1 = go.Scatter(name='Model', x=[0]+x1, y=[0]+y1)
gain2 = go.Scatter(name='Random', x=[0]+x1, y=[0]+x1)

#Lift@Decile
lift_d1 = go.Scatter(name='Model', x=x1, y=y3)
lift_d2 = go.Scatter(name='Random', x=x1, y=y4)

#Lift@Total
lift_t1 = go.Scatter(name='Model', x=x1, y=y5)
lift_t2 = go.Scatter(name='Random', x=x1, y=y4)



fig = plotly.tools.make_subplots(rows=2, cols=2, subplot_titles=('K-S Chart', 'Cummulative Gain Chart', 
                                                                 'Lift@Decile Chart', 'Lift@Total Chart'))
fig.append_trace(ks1, 1, 1)
fig.append_trace(ks2, 1, 1)

fig.append_trace(gain1, 1, 2)
fig.append_trace(gain2, 1, 2)

fig.append_trace(lift_d1, 2, 1)
fig.append_trace(lift_d2, 2, 1)

fig.append_trace(lift_t1, 2, 2)
fig.append_trace(lift_t2, 2, 2)

fig['layout']['xaxis1'].update(title='%Population')
fig['layout']['xaxis2'].update(title='%Population')
fig['layout']['xaxis3'].update(title='%Population')
fig['layout']['xaxis4'].update(title='%Population')

fig['layout']['yaxis1'].update(title='%Count')
fig['layout']['yaxis2'].update(title='%Event')
fig['layout']['yaxis3'].update(title='Lift@decile')
fig['layout']['yaxis4'].update(title='Lift@total')

fig['layout']['legend'].update()

fig['layout'].update(legend=dict(), title='Decile Report', width=1100, height=900)
plotly.offline.iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



In [252]:
help(plotly.tools.make_subplots)

Help on function make_subplots in module plotly.tools:

make_subplots(rows=1, cols=1, shared_xaxes=False, shared_yaxes=False, start_cell='top-left', print_grid=True, **kwargs)
    Return an instance of plotly.graph_objs.Figure
    with the subplots domain set in 'layout'.
    
    Example 1:
    # stack two subplots vertically
    fig = tools.make_subplots(rows=2)
    
    This is the format of your plot grid:
    [ (1,1) x1,y1 ]
    [ (2,1) x2,y2 ]
    
    fig['data'] += [Scatter(x=[1,2,3], y=[2,1,2])]
    fig['data'] += [Scatter(x=[1,2,3], y=[2,1,2], xaxis='x2', yaxis='y2')]
    
    # or see Figure.append_trace
    
    Example 2:
    # subplots with shared x axes
    fig = tools.make_subplots(rows=2, shared_xaxes=True)
    
    This is the format of your plot grid:
    [ (1,1) x1,y1 ]
    [ (2,1) x1,y2 ]
    
    
    fig['data'] += [Scatter(x=[1,2,3], y=[2,1,2])]
    fig['data'] += [Scatter(x=[1,2,3], y=[2,1,2], yaxis='y2')]
    
    Example 3:
    # irregular subplot layout (mor