In [1]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.cross_validation import KFold



In [168]:
# import data

transaction_training = pd.read_csv('../raw_data/transactions_train.csv')
payment_training = pd.read_csv('../raw_data/paiements_train.csv')
billing_training = pd.read_csv('../raw_data/facturation_train.csv')
performance_training = pd.read_csv('../raw_data/performance_train.csv')


transaction_test = pd.read_csv('../raw_data/transactions_test.csv')
payment_test = pd.read_csv('../raw_data/paiements_test.csv')
billing_test = pd.read_csv('../raw_data/facturation_test.csv')
performance_test = pd.read_csv('../raw_data/performance_test.csv')

In [3]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        if (seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

In [4]:
# create basic xgboost wrapper model class
class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [5]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    def __init(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1550)
        self.verbose_eval = params.pop('verbose_eval', 100)
        
    def train(self, x_train, y_train):
        lgtrain = lgb.Dataset(x_train, y_train)
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)
    
    def predict(self, x):
        return self.lgbm.predict(lgb.Dataset(x))

In [6]:
# create out-of-fold predictions 
# make good use of k-fold CV's result 
# serving for the staking alogrithm 
# create a new column generated from model's score

def get_oof(clf, x_train, y, x_test):
    '''
    clf: the classifer, which can be logistic regression, SVM regression, Bayes classifier, etc.
    x_train: the training x in training dataset
    y: the training y in training dataset
    x_test: the testing x in training dataset 
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    m = stats.mode(oof_test_skf, axis=1)
    oof_test[:] = m[0][0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)       

In [133]:
# data preprocess

class DataPreprocess:
    def __init__(self, label_encoder):
        self.lbl = label_encoder
        
    def preprocess_transcation(self, transaction_df):
        categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 
                               'DECISION_XCD', 'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP']
        
        for col in categorical_columns:
            transaction_df[col].fillna('unknown')
            transaction_df[col] = self.lbl.fit_transform(transaction_df[col].astype(str))
        
        transaction_df = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
        transaction_df = transaction_df.reset_index()
        transaction_df = transaction_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
        transaction_df.columns = ['MERCHANT_CATEGORY_' + str(i) for i in transaction_df.columns]
        transaction_df = transaction_df.fillna(0)
        
        return transaction_df
    
    def preprocess_payment(self, payment_df):
        payment_df = payment_df.dropna()
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])
        payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])
        payment_df['PAYMENT_N_COUNT'] = payment_df['PAYMENT_REVERSAL_XFLG'] == 'N'
        
        payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])[['TRANSACTION_AMT', 'PAYMENT_N_COUNT']].sum().reset_index()
        payment_df = payment_df.groupby('ID_CPTE').tail(12)
        
        tmp = payment_df.groupby(['ID_CPTE'])['PAYMENT_N_COUNT'].sum().reset_index()
        
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])
        payment_df = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
        payment_df.columns = ['transaction_' + str(i) for i in payment_df.columns + '_month']
        payment_df = payment_df.reset_index()
        payment_df = payment_df.fillna(0)
        
        payment_df = payment_df.merge(tmp, on='ID_CPTE')
        
        return payment_df
    
    def preprocess_billing(self, billing_df):
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])
        billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
        billing_df = billing_df.reset_index(drop=True)
        billing_df = billing_df.groupby('ID_CPTE').tail(12)
        billing_df = billing_df.reset_index(drop=True)
        billing_df['CreditLeft'] = billing_df['CreditLimit'] - billing_df['CurrentTotalBalance']
        
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])
        credit_left = billing_df.pivot_table('CreditLeft', ['ID_CPTE'], 'PERIODID_MY')
        credit_left.columns = ['credit_left_' + str(i) for i in credit_left.columns + '_month']
        cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
        cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']
        
        delq_cycle = billing_df.groupby(['ID_CPTE'])['DelqCycle'].max().reset_index()
        delq_cycle = delq_cycle.rename(columns={'DelqCycle': 'MaxDelqCycle'})
        
        credit_left = credit_left.reset_index()
        cash_balance = cash_balance.reset_index()
        
        tmp = credit_left.merge(cash_balance, on='ID_CPTE')
        tmp = tmp.merge(credit_left, on='ID_CPTE')
        tmp = tmp.merge(delq_cycle, on='ID_CPTE')
        
        return tmp
    
    def merge(self, payment, billing):
        merge_df = payment.merge(billing, on='ID_CPTE', how='right')
        return merge_df.set_index(['ID_CPTE']) 

In [167]:
billing_test.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,71424379,2013,2013-11-04,1444.3,0.0,3200.0,0
1,71424379,2014,2014-04-30,785.89,0.0,3200.0,0
2,71424379,2014,2014-08-02,1095.48,0.0,3200.0,0
3,71424379,2014,2014-04-02,845.3,0.0,3200.0,0
4,71424379,2013,2013-11-30,1623.28,0.0,3200.0,0


In [169]:
label_encoder = preprocessing.LabelEncoder()

In [170]:
preprocess = DataPreprocess(label_encoder)

In [171]:
processed_payment = preprocess.preprocess_payment(payment_training)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [172]:
processed_billing = preprocess.preprocess_billing(billing_training)

In [173]:
processed_data = preprocess.merge(processed_payment, processed_billing)

In [174]:
transaction_col = processed_data.iloc[:, :12].columns

In [175]:
for col in transaction_col:
    replace_value = processed_data[processed_data[col].notna()][col].mean()
    processed_data[col] = processed_data[col].fillna(replace_value)

In [176]:
processed_data = processed_data.fillna(0)

In [177]:
processed_data = processed_data.merge(performance_training[['ID_CPTE', 'Default']], on='ID_CPTE')

In [180]:
processed_data = processed_data.set_index('ID_CPTE')

In [181]:
X = np.array(processed_data.iloc[:, :-1])
y = np.array(processed_data.iloc[:, -1])

In [182]:
from imblearn.over_sampling import SMOTE

In [183]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, y)

In [184]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

In [185]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
    
clf = LogisticRegression()
rf = RandomForestClassifier(min_samples_split=20, random_state=0)

In [186]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [187]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [188]:
predict_clf = clf.predict(X_test)

In [189]:
roc_auc_score(predict_clf, y_test)

0.7977130905782517

In [190]:
predict_rf = rf.predict(X_test)

In [191]:
roc_auc_score(predict_rf, y_test)

0.8670631223225893

In [192]:
processed_data

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,credit_left_05_month_y,credit_left_06_month_y,credit_left_07_month_y,credit_left_08_month_y,credit_left_09_month_y,credit_left_10_month_y,credit_left_11_month_y,credit_left_12_month_y,MaxDelqCycle,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.000000,522.500000,374.500000,4200.00000,262.500000,265.000000,267.50000,300.000000,250.000000,231.750000,...,3420.58,2110.03,2184.77,1669.00,1246.00,967.72,-182.30,-432.20,0,0
10007972,784.340000,168.280000,1050.000000,559.50000,664.000000,313.500000,843.71000,191.900000,945.900000,701.470000,...,256.30,258.00,128.66,299.95,180.25,299.14,609.90,700.00,0,0
10012520,0.000000,86.100000,458.000000,1177.31000,315.000000,525.000000,505.00000,50.500000,1115.000000,612.000000,...,1580.97,1789.45,1704.09,1558.32,2323.32,2074.00,1606.10,1922.80,0,0
10025534,131.300000,0.000000,260.000000,0.00000,6264.000000,0.000000,2080.00000,0.000000,318.000000,0.000000,...,4825.89,1550.35,817.41,-212.78,-301.85,-222.98,-169.94,-342.65,1,1
10033579,574.550000,391.300000,412.230000,470.53000,546.800000,419.610000,283.92000,106.000000,84.000000,219.600000,...,114.16,25.12,5.00,-24.28,-52.12,-38.21,-33.52,-129.33,1,0
10034823,982.030000,0.000000,2182.000000,0.00000,1180.690000,0.000000,1400.58000,521.180000,0.000000,1977.000000,...,725.00,1308.87,1390.95,1617.68,179.85,-296.68,1272.12,645.57,1,0
10036020,464.600000,1483.300000,739.540000,0.00000,2328.030000,0.000000,647.35000,583.000000,733.360000,831.230000,...,8176.09,8482.90,8477.26,8387.00,8220.46,8043.28,8518.60,8690.98,0,0
10068805,447.430000,3336.170000,3012.930000,1564.34000,2194.790000,2740.560000,3655.82000,2660.580000,1819.700000,1737.910000,...,9734.06,9411.96,9651.67,9431.00,10005.84,9360.68,10100.00,9662.16,0,0
10069450,83.700000,473.580000,569.610000,303.22000,1054.510000,569.370000,184.29000,1007.320000,176.440000,200.390000,...,1445.00,1490.80,1585.30,1234.35,1584.10,1518.10,1524.74,1283.06,0,0
10081565,52.500000,104.000000,154.500000,107.00000,101.000000,51.500000,294.50000,1408.950000,289.700000,292.000000,...,164.74,220.96,205.66,1425.70,490.18,317.30,224.38,-2.86,0,1


In [193]:
from sklearn.externals import joblib
joblib.dump(clf, 'random_forest.pkl')

['random_forest.pkl']

In [194]:
processed_payment_test = preprocess.preprocess_payment(payment_test)
processed_billing_test = preprocess.preprocess_billing(billing_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [195]:
processed_test = preprocess.merge(processed_payment_test, processed_billing_test)

In [196]:
for col in transaction_col:
    replace_value = processed_test[processed_test[col].notna()][col].mean()
    processed_test[col] = processed_test[col].fillna(replace_value)

In [197]:
processed_test = processed_test.fillna(0)

In [198]:
X_test = np.array(processed_test)

In [200]:
processed_test['Default'] = rf.predict(X_test)

In [202]:
results = processed_test.reset_index()[['ID_CPTE', 'Default']]

In [205]:
results.to_csv('results.csv', index_label=False)

In [204]:
len(results)

5100