### Features test

In [1]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb

# Basic Model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Oversampling
from imblearn.over_sampling import SMOTE

# split dataset
from sklearn.model_selection import train_test_split

In [2]:
# import data

transaction_training = pd.read_csv('../raw_data/transactions_train.csv')
payment_training = pd.read_csv('../raw_data/paiements_train.csv')
billing_training = pd.read_csv('../raw_data/facturation_train.csv')
performance_training = pd.read_csv('../raw_data/performance_train.csv')


transaction_test = pd.read_csv('../raw_data/transactions_test.csv')
payment_test = pd.read_csv('../raw_data/paiements_test.csv')
billing_test = pd.read_csv('../raw_data/facturation_test.csv')
performance_test = pd.read_csv('../raw_data/performance_test.csv')

In [3]:
payment_training.head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q


In [4]:
billing_training.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


In [5]:
performance_training.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0
1,57427180,2012-12-01,0
2,29617912,2015-12-01,0
3,61632809,2015-12-01,0
4,14117855,2013-12-01,0


In [4]:
payment_training = payment_training.dropna()

In [5]:
payment_training['TRANSACTION_DTTM'] = payment_training['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])
payment_training = payment_training.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])
payment_training['PAYMENT_N_COUNT'] = payment_training['PAYMENT_REVERSAL_XFLG'] == 'N'

payment_training = payment_training.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])[['TRANSACTION_AMT', 'PAYMENT_N_COUNT']].sum().reset_index()
payment_training = payment_training.groupby('ID_CPTE').tail(12)

tmp = payment_training.groupby(['ID_CPTE'])['PAYMENT_N_COUNT'].sum().reset_index()

In [6]:
payment_training = payment_training.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])['TRANSACTION_AMT'].sum().reset_index()
payment_training = payment_training.rename(columns={'TRANSACTION_DTTM': 'PERIODID_MY'})

In [7]:
billing_training = billing_training.sort_values(['ID_CPTE', 'PERIODID_MY'])
billing_training = billing_training.groupby('ID_CPTE').tail(12)
billing_training['PERIODID_MY'] = billing_training['PERIODID_MY'].apply(lambda x: x[:-3])

In [8]:
processed_data = billing_training.merge(payment_training, on=['ID_CPTE', 'PERIODID_MY'], how='left')

In [9]:
processed_data['CreditLeft'] = processed_data['CreditLimit'] - processed_data['CurrentTotalBalance']
processed_data['BalanceLeft'] = processed_data['CurrentTotalBalance'] - processed_data['TRANSACTION_AMT']

In [10]:
processed_data = processed_data.fillna(0)
processed_data['PERIODID_MY'] = processed_data['PERIODID_MY'].apply(lambda x: x[-2:])

credit_left = processed_data.pivot_table('CreditLeft', ['ID_CPTE'], 'PERIODID_MY')
credit_left.columns = ['credit_left_' + str(i) for i in credit_left.columns + '_month']

cash_balance = processed_data.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']

balance_left = processed_data.pivot_table('BalanceLeft', ['ID_CPTE'], 'PERIODID_MY')
balance_left.columns = ['balance_left_' + str(i) for i in balance_left.columns + '_month']

delq_cycle = processed_data.groupby(['ID_CPTE'])['DelqCycle'].max().reset_index()
delq_cycle = delq_cycle.rename(columns={'DelqCycle': 'MaxDelqCycle'})

credit_left = credit_left.reset_index()
cash_balance = cash_balance.reset_index()
balance_left = balance_left.reset_index()

tmp2 = credit_left.merge(cash_balance, on='ID_CPTE')
tmp2 = tmp2.merge(balance_left, on='ID_CPTE')
tmp2 = tmp2.merge(delq_cycle, on='ID_CPTE')

tmp2 = tmp2.merge(tmp, on='ID_CPTE')

In [12]:
tmp2.head()

Unnamed: 0,ID_CPTE,credit_left_01_month,credit_left_02_month,credit_left_03_month,credit_left_04_month,credit_left_05_month,credit_left_06_month,credit_left_07_month,credit_left_08_month,credit_left_09_month,...,balance_left_05_month,balance_left_06_month,balance_left_07_month,balance_left_08_month,balance_left_09_month,balance_left_10_month,balance_left_11_month,balance_left_12_month,MaxDelqCycle,PAYMENT_N_COUNT
0,10001822,20.34,18.88,300.4,482.92,3420.58,2110.03,2184.77,1669.0,1246.0,...,7816.92,9124.97,9047.73,9531.0,10004.0,10300.53,11386.5,11614.2,0,0.0
1,10007972,140.0,700.0,662.0,139.26,256.3,258.0,128.66,299.95,180.25,...,-220.3,128.5,-272.37,208.15,-426.15,-300.61,-1591.4,-4457.03,0,0.0
2,10012520,565.04,353.05,771.55,1186.48,1580.97,1789.45,1704.09,1558.32,2323.32,...,304.03,-114.45,-9.09,591.18,-238.32,514.0,523.9,25.2,0,0.0
3,10025534,-7.9,-253.91,-177.08,-289.28,4825.89,1550.35,817.41,-212.78,-301.85,...,-4989.89,0.0,3202.59,0.0,6083.85,0.0,5845.94,0.0,1,0.0
4,10033579,90.24,248.55,68.0,83.42,114.16,25.12,5.0,-24.28,-52.12,...,-160.96,55.27,211.08,418.28,468.12,318.61,0.0,501.8,1,0.0
