## Default prediction from customer spending habits

1. Create base functions for the project.
2. Initial feature engineering (brainstorming)
3. Fit the model to the new features and test the results

In [1]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.cross_validation import KFold



In [13]:
# Data Exploration
# Read the four raw training CSVs in one pass; the order of the paths matches
# the order of the names on the left-hand side (transactions, payments,
# billing, performance).
transaction_df, payment_df, billing_df, performance_df = map(
    pd.read_csv,
    (
        '../raw_data/transactions_train.csv',
        '../raw_data/paiements_train.csv',
        '../raw_data/facturation_train.csv',
        '../raw_data/performance_train.csv',
    ),
)

### Transaction table

In [14]:
# Preview the first rows of the raw transaction table.
transaction_df.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,A,365767,DP,C,5927.0,52.53,E,2015-06-20 12:00:00,F,AN
1,99690111,L,2635650,DP,C,13343.0,28.35,B,2015-01-25 12:00:00,F,AN
2,99690111,L,2635650,DP,C,13343.0,0.0,A,2015-01-26 12:00:00,G,AN
3,99690111,J,680536,AF,C,9430.0,0.0,A,2015-03-25 08:00:00,G,AW
4,99690111,J,680536,AF,C,10600.0,0.0,A,2015-03-03 08:00:00,G,AW


In [15]:
# Show the dtype pandas inferred for each transaction column
# (object = non-numeric values such as strings).
transaction_df.dtypes

ID_CPTE                       int64
MERCHANT_CATEGORY_XCD        object
MERCHANT_CITY_NAME            int64
MERCHANT_COUNTRY_XCD         object
DECISION_XCD                 object
PRIOR_CREDIT_LIMIT_AMT      float64
TRANSACTION_AMT             float64
TRANSACTION_CATEGORY_XCD     object
TRANSACTION_DTTM             object
TRANSACTION_TYPE_XCD         object
SICGROUP                     object
dtype: object

In [69]:
# Convert and select categorical columns.
# MERCHANT_CITY_NAME is read as int64 (see the dtypes output above); cast it to
# str so it is handled as a category code rather than as a number.
transaction_df['MERCHANT_CITY_NAME'] = transaction_df['MERCHANT_CITY_NAME'].astype(str)

In [81]:
def get_categorical_features(df_object):
    """Return the labels of the object-dtype columns of ``df_object``.

    Parameters
    ----------
    df_object : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    numpy.ndarray
        Labels of every column kept by ``select_dtypes(include='object')``;
        empty when no column qualifies.
    """
    object_only = df_object.select_dtypes(include='object')
    categorical_names = np.array(object_only.columns)
    return categorical_names

In [82]:
# Names of the object-dtype columns of the transaction table; these are the
# columns that get label-encoded next.
transaction_categorical = get_categorical_features(transaction_df)

In [83]:
# Label-encode every categorical column of the transaction table, overwriting
# the column with its integer codes.
# NOTE: the single encoder is refit on each column, so after the loop `lbl`
# only holds the mapping of the last column processed.
# NOTE(review): TRANSACTION_DTTM is an object column too, so the timestamp is
# encoded as an arbitrary category code here - confirm that is intended.
lbl = preprocessing.LabelEncoder()
for col in transaction_categorical:
    # fillna returns a new Series; the result has to be used, otherwise the
    # missing values are still present when the encoder is fitted.
    filled = transaction_df[col].fillna('unknown')
    transaction_df[col] = lbl.fit_transform(filled)

In [85]:
# Preview the transaction table again to check the label-encoded columns.
transaction_df.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,0,8516,90,2,5927.0,52.53,4,7611,5,13
1,99690111,27,6681,90,2,13343.0,28.35,1,6735,5,13
2,99690111,27,6681,90,2,13343.0,0.0,0,6741,6,13
3,99690111,24,9798,5,2,9430.0,0.0,0,7088,6,22
4,99690111,24,9798,5,2,10600.0,0.0,0,6956,6,22


In [114]:
# Inspect all transactions of one account (ID_CPTE 75780289).
transaction_df[transaction_df['ID_CPTE'] == 75780289]

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
2461,75780289,52,5196,90,2,597.0,37.00,1,2000,5,0
2462,75780289,51,5196,26,2,2082.0,2.12,1,1684,2,24
2463,75780289,18,7043,90,2,235.0,2013.00,0,1594,5,17
2464,75780289,14,6326,90,2,1517.0,19.76,3,1978,5,13
2465,75780289,36,2447,26,2,235.0,0.00,0,1612,6,18
2466,75780289,43,2487,90,2,464.0,97.37,2,1848,1,21
2467,75780289,43,2487,90,2,4891.0,116.39,2,363,1,21
2468,75780289,43,3339,90,2,3374.0,1.03,1,1940,5,21
2469,75780289,43,2487,90,2,181.0,106.05,2,2151,1,21
2470,75780289,43,2487,90,2,5005.0,70.38,2,911,1,21


### Payment table

In [91]:
# Preview the first rows of the payment table.
payment_df.head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q


In [125]:
# Payments of one account (ID_CPTE 28710728), ordered by TRANSACTION_DTTM.
# NOTE(review): the CSV was read without date parsing, so this presumably sorts
# the timestamps as strings; the year-first format shown in the output keeps
# that order chronological - confirm if the format ever changes.
tmp = payment_df[payment_df['ID_CPTE'] == 28710728].sort_values(['TRANSACTION_DTTM'])
tmp

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
3669,28710728,1.05,2016-03-07 00:00:00,Q
3675,28710728,1319.7,2016-06-04 00:00:00,Q
3676,28710728,535.0,2016-06-11 00:00:00,Q
3674,28710728,2000.0,2016-07-12 00:00:00,Q
3673,28710728,1040.0,2016-08-25 00:00:00,Q
3677,28710728,520.0,2016-09-18 00:00:00,Q
3672,28710728,4089.49,2016-10-12 00:00:00,Q
3671,28710728,535.0,2016-11-24 00:00:00,Q
3670,28710728,424.0,2016-12-11 00:00:00,Q


In [126]:
# Total amount over the payment rows selected into `tmp` by the previous cell.
sum(tmp['TRANSACTION_AMT'])

10464.24

### Billing table

In [5]:
# Preview the first rows of the billing table.
billing_df.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


In [127]:
# Billing statements of the account inspected in the payment table
# (ID_CPTE 28710728), ordered by StatementDate.
billing_df[billing_df['ID_CPTE'] == 28710728].sort_values(['StatementDate'])

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
3259,28710728,2015-11-01,2015-11-26,0.0,0.0,5400.0,0
3256,28710728,2015-12-01,2015-12-20,0.0,0.0,5400.0,0
3258,28710728,2016-01-01,2016-01-23,0.0,0.0,5400.0,0
3249,28710728,2016-02-01,2016-02-23,1.0,0.0,5400.0,0
3254,28710728,2016-03-01,2016-03-19,0.0,0.0,5400.0,0
3257,28710728,2016-04-01,2016-04-23,1307.25,0.0,5400.0,0
3253,28710728,2016-05-01,2016-05-26,5147.1,0.0,5400.0,1
3255,28710728,2016-06-01,2016-06-24,4231.0,0.0,10600.0,0
3261,28710728,2016-07-01,2016-07-20,3371.86,0.0,10600.0,0
3248,28710728,2016-08-01,2016-08-24,4188.45,0.0,10600.0,1


In [6]:
# Preview the first rows of the performance table (it holds the Default flag).
performance_df.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0
1,57427180,2012-12-01,0
2,29617912,2015-12-01,0
3,61632809,2015-12-01,0
4,14117855,2013-12-01,0


In [107]:
# Fraction of performance rows whose Default flag equals 1.
len(performance_df[performance_df['Default'] == 1]) / len(performance_df)

0.19336134453781512

In [128]:
# Preview the first rows whose Default flag equals 1.
performance_df[performance_df['Default'] == 1].head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
18,75780289,2012-12-01,1
53,58022132,2013-12-01,1
56,25809739,2015-12-01,1
72,35143533,2013-12-01,1
137,94504449,2012-12-01,1


### Create basic classes and functions

In [7]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    """Adapter giving an estimator class a uniform ``train``/``predict`` API.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is instantiated as ``clf(**params)``.
    seed : int, default 0
        Value stored under the ``'random_state'`` key when ``seed_bool`` is
        truthy.
    params : dict or None, default None
        Keyword arguments for ``clf``. ``None`` is treated as "no arguments".
        The dict is copied, so the caller's object is never modified.
    seed_bool : bool, default True
        When falsy, ``random_state`` is not injected (presumably for
        estimators whose constructor does not accept it).

    Attributes
    ----------
    clf : object
        The instantiated estimator.
    """

    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        # Copy so the seed is not written into a dict the caller may reuse,
        # and so the default params=None no longer raises a TypeError.
        settings = dict(params) if params is not None else {}
        if seed_bool:
            settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return ``self.clf.predict(x)``."""
        return self.clf.predict(x)

In [8]:
# create basic xgboost wrapper model class
class XgbWrapper:
    """Adapter exposing ``train``/``predict`` on top of ``xgb.train``.

    NOTE(review): the name ``xgb`` is not bound by the import cell visible in
    this notebook; ``train`` and ``predict`` need ``import xgboost as xgb`` to
    have been executed beforehand - confirm it exists elsewhere.

    Parameters
    ----------
    seed : int, default 0
        Stored under the ``'seed'`` key of the booster parameters.
    params : dict or None, default None
        Booster parameters. An optional ``'nrounds'`` entry (default 250) is
        removed from them and kept as ``self.nrounds``. ``None`` is treated as
        an empty dict. The dict is copied, so the caller's object is never
        modified.

    Attributes
    ----------
    param : dict
        Parameters handed to ``xgb.train`` (without ``'nrounds'``).
    nrounds : int
        Third positional argument handed to ``xgb.train``.
    gbdt : object
        Model returned by ``xgb.train``; only set after ``train``.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: the original aliased the caller's dict, so the seed was
        # written into it and 'nrounds' was popped out of it.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Build a ``DMatrix`` from the training data and fit the booster."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the booster's predictions for ``x`` (wrapped in a ``DMatrix``)."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [9]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    """Adapter exposing ``train``/``predict`` on top of ``lgb.train``.

    Parameters
    ----------
    seed : int, default 0
        Stored under the ``'seed'`` key of the training parameters.
    params : dict or None, default None
        Training parameters. Optional ``'nrounds'`` (default 1550) and
        ``'verbose_eval'`` (default 100) entries are removed from them and
        kept as attributes. ``None`` is treated as an empty dict. The dict is
        copied, so the caller's object is never modified.

    Attributes
    ----------
    param : dict
        Parameters handed to ``lgb.train``.
    nrounds : int
        Passed to ``lgb.train`` as ``num_boost_round``.
    verbose_eval : int
        Passed to ``lgb.train`` as ``verbose_eval``.
    lgbm : object
        Model returned by ``lgb.train``; only set after ``train``.
    """

    # The constructor was spelled `__init`, so Python never called it and
    # `LightGbmWrapper(params=...)` raised a TypeError.
    def __init__(self, seed=0, params=None):
        # Copy so neither the seed injection nor the pops touch the caller's dict.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 1550)
        self.verbose_eval = self.param.pop('verbose_eval', 100)

    def train(self, x_train, y_train):
        """Build a ``lgb.Dataset`` from the training data and fit the booster."""
        lgtrain = lgb.Dataset(x_train, y_train)
        # NOTE(review): recent LightGBM releases dropped the `verbose_eval`
        # keyword of `lgb.train` in favour of a logging callback - confirm
        # against the installed version before upgrading.
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)

    def predict(self, x):
        """Return the booster's predictions for the raw feature data ``x``."""
        # Booster.predict takes the feature matrix itself; a lgb.Dataset is a
        # training-only container and is rejected there.
        return self.lgbm.predict(x)

In [10]:
# Create out-of-fold (OOF) predictions.
# This makes good use of the k-fold CV results: each model's OOF score
# becomes a new feature column that serves the stacking algorithm.

def get_oof(clf, x_train, y, x_test, folds=None):
    '''
    Compute out-of-fold predictions for the training rows and fold-averaged
    predictions for the test rows.

    clf: the classifier wrapper; it must expose train(x, y) and predict(x)
    x_train: the training features, indexable by an integer index array
             (e.g. a numpy array)
    y: the training labels, aligned with x_train and indexable the same way
    x_test: the features scored by every fold's model
    folds: optional source of (train_index, test_index) pairs. Either an
           iterable of such pairs, or a splitter object with a
           split(x_train) method. Defaults to the notebook-level `kf`.

    Returns a tuple (oof_train, oof_test) of column vectors:
    oof_train has shape (len(x_train), 1) and holds, for each training row,
    the prediction of the model that did not see it; rows that never appear
    in a test fold stay 0. oof_test has one row per prediction returned for
    x_test, averaged over the folds.

    Raises ValueError when `folds` yields no split at all.
    '''
    if folds is None:
        # Fall back to the fold iterator defined at notebook level.
        folds = kf
    # Splitter objects expose split(); a plain iterable of index pairs is
    # consumed directly.
    splits = folds.split(x_train) if hasattr(folds, 'split') else folds

    # Sizes come from the data itself instead of the notebook globals
    # ntrain / ntest / nfold, so the function cannot go out of sync with them.
    oof_train = np.zeros((len(x_train),))
    fold_test_preds = []

    for i, (train_index, test_index) in enumerate(splits):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        # Keep every fold's test prediction as float64 so the average does
        # not depend on the dtype the model happens to return.
        fold_test_preds.append(np.asarray(clf.predict(x_test), dtype=float))

    if not fold_test_preds:
        raise ValueError('get_oof needs at least one (train_index, test_index) fold')

    # Only folds that actually ran are averaged; no uninitialised buffer rows.
    oof_test = np.mean(fold_test_preds, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)