In [102]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.cross_validation import KFold

In [103]:
# import data

transaction_training = pd.read_csv('../raw_data/transactions_train.csv')
payment_training = pd.read_csv('../raw_data/paiements_train.csv')
billing_training = pd.read_csv('../raw_data/facturation_train.csv')
performance_training = pd.read_csv('../raw_data/performance_train.csv')


transaction_test = pd.read_csv('../raw_data/transactions_test.csv')
payment_test = pd.read_csv('../raw_data/paiements_test.csv')
billing_test = pd.read_csv('../raw_data/facturation_test.csv')
performance_test = pd.read_csv('../raw_data/performance_test.csv')

In [104]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        if (seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

In [105]:
# create basic xgboost wrapper model class
class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [106]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    def __init(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1550)
        self.verbose_eval = params.pop('verbose_eval', 100)
        
    def train(self, x_train, y_train):
        lgtrain = lgb.Dataset(x_train, y_train)
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)
    
    def predict(self, x):
        return self.lgbm.predict(lgb.Dataset(x))

In [107]:
# create out-of-fold predictions 
# make good use of k-fold CV's result 
# serving for the staking alogrithm 
# create a new column generated from model's score

def get_oof(clf, x_train, y, x_test):
    '''
    clf: the classifer, which can be logistic regression, SVM regression, Bayes classifier, etc.
    x_train: the training x in training dataset
    y: the training y in training dataset
    x_test: the testing x in training dataset 
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    m = stats.mode(oof_test_skf, axis=1)
    oof_test[:] = m[0][0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)       

In [108]:
# data preprocess

class DataPreprocess:
    def __init__(self, label_encoder):
        self.lbl = label_encoder
        
    def preprocess_transcation(self, transaction_df):
        categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 
                               'DECISION_XCD', 'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP']
        
        for col in categorical_columns:
            transaction_df[col].fillna('unknown')
            transaction_df[col] = self.lbl.fit_transform(transaction_df[col].astype(str))
        
        transaction_df = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
        transaction_df = transaction_df.reset_index()
        transaction_df = transaction_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
        transaction_df.columns = ['MERCHANT_CATEGORY_' + str(i) for i in transaction_df.columns]
        transaction_df = transaction_df.fillna(0)
        
        return transaction_df
    
    def preprocess_payment(self, payment_df):
        payment_df = payment_df.dropna()
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])
        payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])
        payment_df['PAYMENT_N_COUNT'] = payment_df['PAYMENT_REVERSAL_XFLG'] == 'N'
        
        payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])[['TRANSACTION_AMT', 'PAYMENT_N_COUNT']].sum().reset_index()
        payment_df = payment_df.groupby('ID_CPTE').tail(12)
        
        tmp = payment_df.groupby(['ID_CPTE'])['PAYMENT_N_COUNT'].sum().reset_index()
        
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])
        payment_df = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
        payment_df.columns = ['transaction_' + str(i) for i in payment_df.columns + '_month']
        payment_df = payment_df.reset_index()
        payment_df = payment_df.fillna(0)
        
        payment_df = payment_df.merge(tmp, on='ID_CPTE')
        
        return payment_df
    
    def preprocess_billing(self, billing_df):
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])
        billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
        billing_df = billing_df.reset_index(drop=True)
        billing_df = billing_df.groupby('ID_CPTE').tail(12)
        billing_df = billing_df.reset_index(drop=True)
        billing_df['CreditLeft'] = billing_df['CreditLimit'] - billing_df['CurrentTotalBalance']
        
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])
        credit_left = billing_df.pivot_table('CreditLeft', ['ID_CPTE'], 'PERIODID_MY')
        credit_left.columns = ['credit_left_' + str(i) for i in credit_left.columns + '_month']
        cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
        cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']
        
        delq_cycle = billing_df.groupby(['ID_CPTE'])['DelqCycle'].max().reset_index()
        delq_cycle = delq_cycle.rename(columns={'DelqCycle': 'MaxDelqCycle'})
        
        credit_left = credit_left.reset_index()
        cash_balance = cash_balance.reset_index()
        
        tmp = credit_left.merge(cash_balance, on='ID_CPTE')
        tmp = tmp.merge(credit_left, on='ID_CPTE')
        tmp = tmp.merge(delq_cycle, on='ID_CPTE')
        
        return tmp
    
    def merge(self, payment, billing):
        merge_df = payment.merge(billing, on='ID_CPTE', how='right')
        return merge_df.set_index(['ID_CPTE']) 

In [109]:
billing_test.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,71424379,2013-11-01,2013-11-04,1444.3,0.0,3200.0,0
1,71424379,2014-05-01,2014-04-30,785.89,0.0,3200.0,0
2,71424379,2014-08-01,2014-08-02,1095.48,0.0,3200.0,0
3,71424379,2014-04-01,2014-04-02,845.3,0.0,3200.0,0
4,71424379,2013-12-01,2013-11-30,1623.28,0.0,3200.0,0


In [110]:
label_encoder = preprocessing.LabelEncoder()

In [111]:
preprocess = DataPreprocess(label_encoder)

In [112]:
processed_payment = preprocess.preprocess_payment(payment_training)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [113]:
processed_billing = preprocess.preprocess_billing(billing_training)

In [114]:
processed_data = preprocess.merge(processed_payment, processed_billing)

In [115]:
transaction_col = processed_data.iloc[:, :12].columns

In [116]:
for col in transaction_col:
    replace_value = processed_data[processed_data[col].notna()][col].mean()
    processed_data[col] = processed_data[col].fillna(replace_value)

In [117]:
processed_data = processed_data.fillna(0)

In [118]:
processed_data = processed_data.reset_index()

In [119]:
processed_data = processed_data.merge(performance_training[['ID_CPTE', 'Default']], on='ID_CPTE')

In [120]:
processed_data = processed_data.set_index('ID_CPTE')

In [121]:
X = np.array(processed_data.iloc[:, :-1])
y = np.array(processed_data.iloc[:, -1])

In [122]:
from imblearn.over_sampling import SMOTE

In [123]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, y)

In [150]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
    
clf = LogisticRegression()
rf = RandomForestClassifier(min_samples_split=200, max_depth=8, random_state=0)

In [151]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [152]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=200,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [153]:
predict_clf = clf.predict(X_test)

In [154]:
roc_auc_score(predict_clf, y_test)

0.7958241477168921

In [155]:
predict_rf = rf.predict(X_test)

In [156]:
roc_auc_score(predict_rf, y_test)

0.8473469607180288

In [73]:
processed_data[processed_data['Default'] == 1].head(20)

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,credit_left_05_month_y,credit_left_06_month_y,credit_left_07_month_y,credit_left_08_month_y,credit_left_09_month_y,credit_left_10_month_y,credit_left_11_month_y,credit_left_12_month_y,MaxDelqCycle,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,4825.89,1550.35,817.41,-212.78,-301.85,-222.98,-169.94,-342.65,1,1
10081565,52.5,104.0,154.5,107.0,101.0,51.5,294.5,1408.95,289.7,292.0,...,164.74,220.96,205.66,1425.7,490.18,317.3,224.38,-2.86,0,1
10083607,510.0,207.0,1249.05,107.0,334.6,260.4,985.76,14.28,1121.7,1059.9,...,10.74,170.0,265.0,1100.0,339.08,158.68,79.4,289.39,0,1
10102455,420.0,202.0,434.22,336.27,790.68,632.24,55.62,205.5,0.0,1028.52,...,274.75,175.84,168.56,193.0,186.92,421.75,220.0,188.32,0,1
10205391,54.415,512.37,971.42,0.0,0.0,39.35,2110.19,2139.85,394.21,614.1,...,83.0,59.25,60.72,526.86,-13.02,-30.7,-18.12,9.9,1,1
10294646,721.0,11340.2,0.0,3717.8,515.0,1040.0,520.0,1369.0,835.0,0.0,...,-358.34,-453.36,2.16,-540.64,-1142.74,-785.78,-1629.28,-801.45,2,1
10303856,368.8,233.07,645.32,2614.22,719.06,1703.04,725.29,532.49,720.4,113.36,...,407.13,-58.72,387.44,-171.3,-141.24,-83.0,-66.56,-148.16,1,1
10351106,518.0,254.4,628.0,103.0,1053.0,983.0,41.6,150.0,308.1,250.24,...,2632.9,117.4,94.51,89.36,6.1,-174.88,16.6,878.27,2,1
10370887,0.0,0.0,0.0,65.27,206.0,0.0,320.3,4908.8,0.0,1010.0,...,2358.85,1338.46,1588.22,105.98,3234.08,3639.0,2094.54,1975.05,3,1
10373514,0.0,50.5,787.5,0.0,104.0,0.0,0.0,309.0,72.3,0.0,...,-33.52,-130.24,-141.68,-39.72,-48.43,-26.0,-80.88,-83.8,2,1


In [60]:
billing_training[billing_training['ID_CPTE'] == 86170782]

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
97534,86170782,2012-04,2012-04-16,14724.15,0.0,16100.0,0
97535,86170782,2012-11,2012-11-18,12116.58,1020.0,16100.0,0
97536,86170782,2012-08,2012-08-14,15938.0,0.0,16100.0,0
97537,86170782,2012-12,2012-12-13,13003.12,693.68,16100.0,0
97538,86170782,2011-12,2011-12-13,15292.41,0.0,16100.0,0
97539,86170782,2012-05,2012-05-15,16121.0,0.0,16100.0,0
97540,86170782,2012-03,2012-03-16,14796.6,0.0,16100.0,0
97541,86170782,2012-07,2012-07-17,10617.61,0.0,16100.0,0
97542,86170782,2012-09,2012-09-14,14403.9,918.0,16100.0,0
97543,86170782,2012-10,2012-10-19,9889.0,0.0,16100.0,0


In [36]:
from sklearn.externals import joblib
joblib.dump(clf, 'random_forest.pkl')

['random_forest.pkl']

In [136]:
processed_payment_test = preprocess.preprocess_payment(payment_test)
processed_billing_test = preprocess.preprocess_billing(billing_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [160]:
processed_test = preprocess.merge(processed_payment_test, processed_billing_test)

In [161]:
for col in transaction_col:
    replace_value = processed_test[processed_test[col].notna()][col].mean()
    processed_test[col] = processed_test[col].fillna(replace_value)

In [162]:
processed_test = processed_test.fillna(0)

In [163]:
processed_test.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,credit_left_04_month_y,credit_left_05_month_y,credit_left_06_month_y,credit_left_07_month_y,credit_left_08_month_y,credit_left_09_month_y,credit_left_10_month_y,credit_left_11_month_y,credit_left_12_month_y,MaxDelqCycle
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10074849,411.0,340.26,993.92,0.0,906.38,363.0,915.54,609.5,626.85,396.93,...,1082.89,1689.6,1355.5,1514.89,1691.76,1303.07,1254.59,1586.58,1237.53,0
10086539,0.0,556.92,832.0,642.0,661.26,1880.0,950.86,1591.0,1048.6,500.0,...,932.32,611.21,192.4,1309.15,845.2,601.42,394.9,866.74,865.76,0
10140908,214.1,88.36,200.0,206.0,239.99,160.5,418.0,428.72,202.0,163.2,...,66.75,388.0,305.1,616.1,726.48,942.6,917.25,543.06,8.44,2
10147994,38.11,39.52,218.4,227.9,224.7,229.69,226.0,504.29,4.24,256.2,...,1289.95,861.59,741.1,104.62,-52.0,-230.21,-374.16,158.0,-11.0,0
10152808,420.0,1030.0,510.0,423.3,877.5,1157.44,709.0,995.0,1015.0,515.0,...,800.69,571.52,775.0,957.0,1266.0,910.8,1132.0,674.48,818.62,0


In [64]:
processed_data[processed_data['Default'] == 0].head(20)

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,credit_left_05_month_y,credit_left_06_month_y,credit_left_07_month_y,credit_left_08_month_y,credit_left_09_month_y,credit_left_10_month_y,credit_left_11_month_y,credit_left_12_month_y,MaxDelqCycle,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,3420.58,2110.03,2184.77,1669.0,1246.0,967.72,-182.3,-432.2,0,0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,256.3,258.0,128.66,299.95,180.25,299.14,609.9,700.0,0,0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,1580.97,1789.45,1704.09,1558.32,2323.32,2074.0,1606.1,1922.8,0,0
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,114.16,25.12,5.0,-24.28,-52.12,-38.21,-33.52,-129.33,1,0
10034823,982.03,0.0,2182.0,0.0,1180.69,0.0,1400.58,521.18,0.0,1977.0,...,725.0,1308.87,1390.95,1617.68,179.85,-296.68,1272.12,645.57,1,0
10036020,464.6,1483.3,739.54,0.0,2328.03,0.0,647.35,583.0,733.36,831.23,...,8176.09,8482.9,8477.26,8387.0,8220.46,8043.28,8518.6,8690.98,0,0
10068805,447.43,3336.17,3012.93,1564.34,2194.79,2740.56,3655.82,2660.58,1819.7,1737.91,...,9734.06,9411.96,9651.67,9431.0,10005.84,9360.68,10100.0,9662.16,0,0
10069450,83.7,473.58,569.61,303.22,1054.51,569.37,184.29,1007.32,176.44,200.39,...,1445.0,1490.8,1585.3,1234.35,1584.1,1518.1,1524.74,1283.06,0,0
10097162,303.0,1085.2,620.7,4097.19,283.0,247.22,120.0,1586.51,2410.26,1004.95,...,655.7,35.6,504.32,1099.36,3266.4,3198.0,2162.0,1693.0,0,0
10108876,3793.0,1328.25,2525.0,1125.4,1950.09,3637.0,1351.6,1345.0,1502.2,705.0,...,829.92,1341.0,995.6,1306.19,1519.53,238.51,1321.35,1018.0,0,0


In [164]:
X_test = np.array(processed_test)

In [165]:
processed_test['Default'] = rf.predict(X_test)

In [166]:
results = processed_test.reset_index()[['ID_CPTE', 'Default']]

In [167]:
results.to_csv('submission.csv')

In [97]:
results.head()

Unnamed: 0,ID_CPTE,Default
0,10074849,0
1,10086539,0
2,10140908,1
3,10147994,0
4,10152808,0


In [168]:
len(results)

5100

In [169]:
performance_test.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,71424379,2014-12-01,
1,64887111,2015-12-01,
2,69431075,2014-12-01,
3,31823308,2016-12-01,
4,39407834,2012-12-01,
