## Default prediction from customer spending habit

1. Create base functions for the project.
2. Initial Feature Engineering(Brainstorming)
3. Fit the model to the new features and test the results

In [1]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.cross_validation import KFold

from scipy import stats



### Data Exploration and Feature Engineering --- manipulate and extract features from each table

In [12]:
transaction_df = pd.read_csv('../raw_data/transactions_train.csv')
payment_df = pd.read_csv('../raw_data/paiements_train.csv')
billing_df = pd.read_csv('../raw_data/facturation_train.csv')
performance_df = pd.read_csv('../raw_data/performance_train.csv')

### Transaction table

In [3]:
transaction_df.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,A,365767,DP,C,5927.0,52.53,E,2015-06-20 12:00:00,F,AN
1,99690111,L,2635650,DP,C,13343.0,28.35,B,2015-01-25 12:00:00,F,AN
2,99690111,L,2635650,DP,C,13343.0,0.0,A,2015-01-26 12:00:00,G,AN
3,99690111,J,680536,AF,C,9430.0,0.0,A,2015-03-25 08:00:00,G,AW
4,99690111,J,680536,AF,C,10600.0,0.0,A,2015-03-03 08:00:00,G,AW


In [4]:
# the type of columns
transaction_df.dtypes

ID_CPTE                       int64
MERCHANT_CATEGORY_XCD        object
MERCHANT_CITY_NAME            int64
MERCHANT_COUNTRY_XCD         object
DECISION_XCD                 object
PRIOR_CREDIT_LIMIT_AMT      float64
TRANSACTION_AMT             float64
TRANSACTION_CATEGORY_XCD     object
TRANSACTION_DTTM             object
TRANSACTION_TYPE_XCD         object
SICGROUP                     object
dtype: object

In [5]:
# convert and select categorical columnst
categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 'DECISION_XCD', 
                       'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP'] 

In [6]:
# encode the categorical features
lbl = preprocessing.LabelEncoder()
for col in categorical_columns:
    transaction_df[col].fillna('unknown')
    transaction_df[col] = lbl.fit_transform(transaction_df[col].astype(str))

In [7]:
transaction_df.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,0,8516,90,2,5927.0,52.53,4,2015-06-20 12:00:00,5,13
1,99690111,27,6681,90,2,13343.0,28.35,1,2015-01-25 12:00:00,5,13
2,99690111,27,6681,90,2,13343.0,0.0,0,2015-01-26 12:00:00,6,13
3,99690111,24,9798,5,2,9430.0,0.0,0,2015-03-25 08:00:00,6,22
4,99690111,24,9798,5,2,10600.0,0.0,0,2015-03-03 08:00:00,6,22


In [39]:
### Extract features from transaction_df
# for MERCHANT_CATEGORY_XCD
# sum the total spent money on the merchant category for the individual customer
tmp = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
tmp.head()

ID_CPTE   MERCHANT_CATEGORY_XCD
10034823  2                         14.98
          5                        536.54
          6                        395.68
          8                        377.36
          11                       905.19
Name: TRANSACTION_AMT, dtype: float64

In [40]:
tmp = tmp.reset_index()

In [41]:
tmp.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,TRANSACTION_AMT
0,10034823,2,14.98
1,10034823,5,536.54
2,10034823,6,395.68
3,10034823,8,377.36
4,10034823,11,905.19


In [42]:
tmp = tmp.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
tmp.columns = ['MERCHANT_CATEGORY_' + str(i) for i in tmp.columns]

In [43]:
tmp = tmp.fillna(0)

In [44]:
tmp.head()

Unnamed: 0_level_0,MERCHANT_CATEGORY_0,MERCHANT_CATEGORY_1,MERCHANT_CATEGORY_2,MERCHANT_CATEGORY_3,MERCHANT_CATEGORY_4,MERCHANT_CATEGORY_5,MERCHANT_CATEGORY_6,MERCHANT_CATEGORY_7,MERCHANT_CATEGORY_8,MERCHANT_CATEGORY_9,...,MERCHANT_CATEGORY_45,MERCHANT_CATEGORY_46,MERCHANT_CATEGORY_47,MERCHANT_CATEGORY_48,MERCHANT_CATEGORY_49,MERCHANT_CATEGORY_50,MERCHANT_CATEGORY_51,MERCHANT_CATEGORY_52,MERCHANT_CATEGORY_53,MERCHANT_CATEGORY_54
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10034823,0.0,0.0,14.98,0.0,0.0,536.54,395.68,0.0,377.36,0.0,...,38.84,0.0,6.36,0.0,0.0,1548.0,840.65,0.0,0.0,0.0
10069450,0.0,311.37,0.0,0.0,17.51,39.22,0.0,267.4,175.76,0.0,...,635.75,135.87,0.0,0.0,0.0,0.0,132.14,0.0,0.0,0.0
10081565,0.0,0.0,0.0,0.0,8.4,0.0,0.0,0.0,169.6,0.0,...,34.1,0.0,0.0,0.0,0.0,10.47,307.88,0.0,0.0,0.0
10083607,0.0,0.0,0.0,0.0,336.16,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1022.19,11.33,0.0,0.0,0.0
10097162,0.0,0.0,552.22,0.0,397.8,0.0,1308.31,10.3,170.69,0.0,...,0.0,0.0,0.0,0.0,0.0,241.82,775.75,0.0,0.0,0.0


### Payment table

In [45]:
payment_df.head(20)

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q
5,99690111,273.0,2015-12-26 00:00:00,Q
6,99690111,267.5,2015-08-23 00:00:00,Q
7,99690111,618.0,2015-07-27 00:00:00,Q
8,99690111,226.6,2015-09-30 00:00:00,Q
9,99690111,244.8,2015-10-27 00:00:00,Q


In [48]:
# null / total
payment_df['TRANSACTION_AMT'].isnull().sum() / len(payment_df)

0.000256568144499179

In [49]:
# drop the null rows
payment_df = payment_df.dropna()

In [50]:
# get the month and year of TRANSACTION_DTTM
payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])

In [51]:
payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])

In [52]:
payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])['TRANSACTION_AMT'].sum().reset_index()

In [53]:
payment_df = payment_df.groupby('ID_CPTE').tail(12)

In [54]:
payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])

In [55]:
payment_transaction = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
payment_transaction.columns = ['transaction_' + str(i) for i in payment_transaction.columns + '_month']

In [56]:
payment_transaction = payment_transaction.reset_index()

In [57]:
payment_transaction = payment_transaction.fillna(0)

### Billing table

In [58]:
billing_df.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


In [61]:
billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])

In [62]:
billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
billing_df = billing_df.reset_index(drop=True)

In [63]:
billing_df = billing_df.groupby('ID_CPTE').tail(12)
billing_df = billing_df.reset_index(drop=True)

In [64]:
billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])

In [65]:
# CurrentTotalBalance
total_balance = billing_df.pivot_table('CurrentTotalBalance', ['ID_CPTE'], 'PERIODID_MY')
total_balance.columns = ['total_balance_' + str(i) for i in total_balance.columns + '_month']

In [66]:
# CashBalance
cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']

In [67]:
# CreditLimit
credit_limit = billing_df.pivot_table('CreditLimit', ['ID_CPTE'], 'PERIODID_MY')
credit_limit.columns = ['credit_limit_' + str(i) for i in credit_limit.columns + '_month']

In [68]:
# DelqCycle
delq_cycle = billing_df.pivot_table('DelqCycle', ['ID_CPTE'], 'PERIODID_MY')
delq_cycle.columns = ['delq_cycle_' + str(i) for i in delq_cycle.columns + '_month']

In [69]:
total_balance = total_balance.reset_index()
cash_balance = cash_balance.reset_index()
credit_limit = credit_limit.reset_index()
delq_cycle = delq_cycle.reset_index()

In [118]:
tmp = total_balance.merge(cash_balance, on='ID_CPTE')
tmp = tmp.merge(credit_limit, on='ID_CPTE')
tmp = tmp.merge(delq_cycle, on='ID_CPTE')

In [119]:
tmp.head()

Unnamed: 0,ID_CPTE,total_balance_01_month,total_balance_02_month,total_balance_03_month,total_balance_04_month,total_balance_05_month,total_balance_06_month,total_balance_07_month,total_balance_08_month,total_balance_09_month,...,delq_cycle_03_month,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month
0,10001822,11479.66,11481.12,11199.6,11017.08,8079.42,9389.97,9315.23,9831.0,10254.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10007972,560.0,0.0,38.0,560.74,443.7,442.0,571.34,400.05,519.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10012520,1634.96,1846.95,1428.45,1013.52,619.03,410.55,495.91,641.68,876.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10025534,6107.9,6353.91,6277.08,6389.28,1274.11,4549.65,5282.59,6312.78,6401.85,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,10033579,409.76,251.45,432.0,416.58,385.84,474.88,495.0,524.28,552.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [120]:
tmp = payment_transaction.merge(tmp, on='ID_CPTE', how='right')

In [121]:
tmp.head()

Unnamed: 0,ID_CPTE,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,...,delq_cycle_03_month,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month
0,10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [122]:
len(tmp)

11900

In [123]:
len(performance_df[performance_df['Default'] == 1]) / len(performance_df)

0.19336134453781512

In [124]:
performance_df[performance_df['Default'] == 1].head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
18,75780289,2012-12-01,1
53,58022132,2013-12-01,1
56,25809739,2015-12-01,1
72,35143533,2013-12-01,1
137,94504449,2012-12-01,1


In [125]:
tmp = tmp.merge(performance_df[['ID_CPTE', 'Default']], on='ID_CPTE')

In [126]:
tmp = tmp.set_index('ID_CPTE')

In [127]:
tmp.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


### Create basic classes and functions

In [82]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        if (seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

In [83]:
# create basic xgboost wrapper model class
class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [84]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    def __init(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1550)
        self.verbose_eval = params.pop('verbose_eval', 100)
        
    def train(self, x_train, y_train):
        lgtrain = lgb.Dataset(x_train, y_train)
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)
    
    def predict(self, x):
        return self.lgbm.predict(lgb.Dataset(x))

In [429]:
# create out-of-fold predictions 
# make good use of k-fold CV's result 
# serving for the staking alogrithm 
# create a new column generated from model's score

def get_oof(clf, x_train, y, x_test):
    '''
    clf: the classifer, which can be logistic regression, SVM regression, Bayes classifier, etc.
    x_train: the training x in training dataset
    y: the training y in training dataset
    x_test: the testing x in training dataset 
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    m = stats.mode(oof_test_skf, axis=1)
    oof_test[:] = m[0][0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)       

In [128]:
# deal with missing values in the table
tmp.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [129]:
transaction_col = tmp.iloc[:, :12].columns

In [136]:
for col in transaction_col:
    tmp[col] = tmp[col].fillna(tmp[tmp[col].notna()][col].mean())

In [138]:
tmp = tmp.fillna(0)

In [159]:
tmp = tmp.sample(frac=1) # shuffle data

In [193]:
from sklearn.preprocessing import StandardScaler

In [176]:
X = np.array(tmp.iloc[:, :-1])
y = np.array(tmp.iloc[:, -1])

In [185]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [227]:
from sklearn.decomposition import PCA
pca = PCA(n_components=30)
principalComponents = pca.fit_transform(X)

In [228]:
from imblearn.over_sampling import SMOTE

In [229]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(principalComponents, y)

In [230]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

In [231]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [232]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [233]:
predict = clf.predict(X_test)

In [234]:
roc_auc_score(predict, y_test)

0.8052196827062564

In [235]:
from sklearn.externals import joblib

In [236]:
joblib.dump(clf, 'logistic_model.pkl')

['logistic_model.pkl']

In [264]:
# data preprocess

class DataPreprocess:
    def __init__(self, label_encoder):
        self.lbl = label_encoder
        
    def preprocess_transcation(self, transaction_df):
        categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 
                               'DECISION_XCD', 'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP']
        
        for col in categorical_columns:
            transaction_df[col].fillna('unknown')
            transaction_df[col] = self.lbl.fit_transform(transaction_df[col].astype(str))
        
        transaction_df = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
        transaction_df = transaction_df.reset_index()
        transaction_df = transaction_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
        transaction_df.columns = ['MERCHANT_CATEGORY_' + str(i) for i in transaction_df.columns]
        transaction_df = transaction_df.fillna(0)
        
        return transaction_df
    
    def preprocess_payment(self, payment_df):
        payment_df = payment_df.dropna()
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])
        payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])
        payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])['TRANSACTION_AMT'].sum().reset_index()
        payment_df = payment_df.groupby('ID_CPTE').tail(12)
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])
        payment_df = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
        payment_df.columns = ['transaction_' + str(i) for i in payment_df.columns + '_month']
        payment_df = payment_df.reset_index()
        payment_df = payment_df.fillna(0)
        
        return payment_df
    
    def preprocess_billing(self, billing_df):
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])
        billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
        billing_df = billing_df.reset_index(drop=True)
        billing_df = billing_df.groupby('ID_CPTE').tail(12)
        billing_df = billing_df.reset_index(drop=True)
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])
        total_balance = billing_df.pivot_table('CurrentTotalBalance', ['ID_CPTE'], 'PERIODID_MY')
        total_balance.columns = ['total_balance_' + str(i) for i in total_balance.columns + '_month']
        cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
        cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']
        credit_limit = billing_df.pivot_table('CreditLimit', ['ID_CPTE'], 'PERIODID_MY')
        credit_limit.columns = ['credit_limit_' + str(i) for i in credit_limit.columns + '_month']
        delq_cycle = billing_df.pivot_table('DelqCycle', ['ID_CPTE'], 'PERIODID_MY')
        delq_cycle.columns = ['delq_cycle_' + str(i) for i in delq_cycle.columns + '_month']
        total_balance = total_balance.reset_index()
        cash_balance = cash_balance.reset_index()
        credit_limit = credit_limit.reset_index()
        delq_cycle = delq_cycle.reset_index()
        
        tmp = total_balance.merge(cash_balance, on='ID_CPTE')
        tmp = tmp.merge(credit_limit, on='ID_CPTE')
        tmp = tmp.merge(delq_cycle, on='ID_CPTE')
        
        return tmp
    
    def merge(self, payment, billing):
        merge_df = payment.merge(billing, on='ID_CPTE', how='right')
        return merge_df.set_index(['ID_CPTE'])  

In [240]:
label_encoder = preprocessing.LabelEncoder()

In [271]:
transaction_training = pd.read_csv('../raw_data/transactions_train.csv')
payment_training = pd.read_csv('../raw_data/paiements_train.csv')
billing_training = pd.read_csv('../raw_data/facturation_train.csv')
performance_training = pd.read_csv('../raw_data/performance_train.csv')

In [269]:
transaction_test = pd.read_csv('../raw_data/transactions_test.csv')
payment_test = pd.read_csv('../raw_data/paiements_test.csv')
billing_test = pd.read_csv('../raw_data/facturation_test.csv')
performance_test = pd.read_csv('../raw_data/performance_test.csv')

In [282]:
train_index = list(performance_training['ID_CPTE'])
test_index = list(performance_test['ID_CPTE'])

In [273]:
transaction_df = transaction_training.append(transaction_test).reset_index(drop=True)
payment_df = payment_training.append(payment_test).reset_index(drop=True)
billing_df = billing_training.append(billing_test).reset_index(drop=True)

In [284]:
billing_df.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


In [285]:
preprocess = DataPreprocess(label_encoder)

In [286]:
processed_transaction = preprocess.preprocess_transcation(transaction_df)

In [287]:
processed_payment = preprocess.preprocess_payment(payment_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [288]:
processed_billing = preprocess.preprocess_billing(billing_df)

In [464]:
processed_data = preprocess.merge(processed_payment, processed_billing)

In [465]:
processed_data.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,delq_cycle_03_month,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [466]:
processed_data.tail()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,delq_cycle_03_month,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94986464,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95132184,,,,,,,,,,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
96857733,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98975426,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99407660,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [467]:
transaction_col = processed_data.iloc[:, :12].columns

In [468]:
for col in transaction_col:
    replace_value = processed_data[processed_data[col].notna()][col].mean()
    processed_data[col] = processed_data[col].fillna(replace_value)

In [469]:
processed_data = processed_data.fillna(0)

In [470]:
training_X = processed_data.loc[train_index].reset_index()

In [471]:
tmp = training_X.merge(performance_training[['ID_CPTE', 'Default']], on='ID_CPTE')

In [472]:
tmp.tail()

Unnamed: 0,ID_CPTE,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,...,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month,Default
11895,58923219,1866.6,0.0,2007.32,1391.0,2439.04,2782.0,2575.0,1818.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
11896,40478502,867.0,1395.82,2442.81,950.82,2388.24,1033.09,1023.13,873.6,1530.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11897,97548597,3508.05,1366.39,1629.03,2067.21,2170.23,2070.0,1452.48,1818.0,1250.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
11898,12603910,1861.6,1826.19,2218.11,2265.42,2052.0,7868.64,2447.55,2504.78,2910.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11899,11346742,2314.0,3063.0,1816.99,1676.0,2607.96,2276.88,1956.24,1966.64,1625.09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [473]:
training_y = np.array(tmp['Default'])

In [479]:
training_X = np.array(tmp.iloc[:, :-1])

In [480]:
test_X = processed_data.loc[test_index]
test_X = np.array(test_X)

In [481]:
ntrain = training_X.shape[0]
ntest = test_X.shape[0]
NFOLDS = 10
SEED = 2018

In [482]:
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [483]:
from sklearn.linear_model import LinearRegression
clf = LogisticRegression()

In [484]:
log_oof_train, log_oof_test = get_oof(clf, training_X, training_y, test_X)


Fold 0


ValueError: X has 60 features per sample; expecting 61

In [450]:
roc_auc_score(log_oof_train, training_y)

0.781071505089598

In [452]:
processed_data['log_result'] = np.concatenate([log_oof_train, log_oof_test])

In [453]:
processed_data.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month,log_result
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [454]:
print("Light Gradient Boosting Regressor")
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    # 'max_depth': 15,
    'num_leaves': 270,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.0175,
    'verbose': 0
}  

Light Gradient Boosting Regressor


In [458]:
X = processed_data.loc[train_index].reset_index()
y = performance_training

In [462]:
tmp = X.merge(y[['ID_CPTE', 'Default']], on='ID_CPTE')
X = tmp[X]

Unnamed: 0,ID_CPTE,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,...,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month,log_result,Default
0,99690111,262.50,303.00,200.00,288.80,176.80,303.00,618.00,267.50,226.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,57427180,494.60,0.00,0.00,1076.25,454.50,412.00,884.00,252.50,393.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,29617912,429.07,144.45,742.64,970.78,792.38,371.28,257.25,185.64,433.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,61632809,1074.75,463.05,2123.44,1137.76,640.56,0.00,2132.52,0.00,1899.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,14117855,1129.00,0.00,894.40,2307.54,0.00,1168.12,1416.78,3281.55,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,23700394,1849.60,1410.79,493.00,953.32,95.17,430.29,609.15,551.38,1599.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,27881705,202.00,262.75,2905.05,540.57,652.00,208.00,2259.10,668.50,1155.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
7,46100731,0.00,1472.00,1024.59,1769.54,0.00,599.96,714.48,895.59,395.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,58512689,204.00,206.00,204.00,312.00,2040.00,400.00,500.00,420.00,782.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,24661392,360.00,342.39,437.58,388.41,506.94,348.74,508.20,802.70,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=2018)