In [1]:
import numpy as np
import pandas as pd
from data_prepare import load_data,feature_selection, transforms
from data_aug import frequency_encode, label_encode, combine, aggregate_encode
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score

In [2]:
X_train, X_test, y_train, y_test = load_data('train_transaction.csv','train_identity.csv')

               isFraud  TransactionDT  TransactionAmt ProductCD  card1  card2  \
TransactionID                                                                   
2987000              0          86400            68.5         W  13926    NaN   
2987001              0          86401            29.0         W   2755  404.0   
2987002              0          86469            59.0         W   4663  490.0   
2987003              0          86499            50.0         W  18132  567.0   
2987004              0          86506            50.0         H   4497  514.0   

               card3       card4  card5   card6  ...                id_31  \
TransactionID                                    ...                        
2987000        150.0    discover  142.0  credit  ...                  NaN   
2987001        150.0  mastercard  102.0  credit  ...                  NaN   
2987002        150.0        visa  166.0   debit  ...                  NaN   
2987003        150.0  mastercard  117.0   debit

## Reason for the following feature engineering from Data EDA.ipynb

In [3]:
for i in range(1,16):
    if i in [1,2,3,5,9]: continue
    X_train['D'+str(i)] =  X_train['D'+str(i)] - X_train.TransactionDT/np.float32(24*60*60)
    X_test['D'+str(i)] = X_test['D'+str(i)] - X_test.TransactionDT/np.float32(24*60*60) 

In [4]:
frequency_encode(X_train,X_test,['addr1','card1','card2','card3','P_emaildomain'])
combine('card1','addr1',X_train,X_test)
combine('card1_addr1','P_emaildomain',X_train,X_test)
frequency_encode(X_train,X_test,['card1_addr1','card1_addr1_P_emaildomain'])
aggregate_encode(['TransactionAmt','D9','D11'],['card1','card1_addr1','card1_addr1_P_emaildomain'],['mean','std'],X_train,X_test)

In [5]:
cols = feature_selection(X_train)

In [6]:
X_train_final = X_train.loc[:,cols]
X_test_final = X_test.loc[:,cols]

In [8]:
X_train_final, X_test_final = transforms(X_train_final,X_test_final)

In [9]:
X_train_final.head()

Unnamed: 0_level_0,TransactionAmt,ProductCD,card1,card2,card3,card5,card6,addr1,addr2,dist1,...,D9_card1_addr1_mean,D9_card1_addr1_std,D9_card1_addr1_P_emaildomain_mean,D9_card1_addr1_P_emaildomain_std,D11_card1_mean,D11_card1_std,D11_card1_addr1_mean,D11_card1_addr1_std,D11_card1_addr1_P_emaildomain_mean,D11_card1_addr1_P_emaildomain_std
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3291761,57.95,4,14135,219.0,150.0,226.0,2,204.0,87.0,7.0,...,-1.0,-1.0,-1.0,-1.0,28.738165,159.030136,35.935642,166.377594,98.1632,192.883743
2999755,45.0,4,9485,111.0,150.0,226.0,2,315.0,87.0,-1.0,...,0.675,0.276475,0.958333,-1.0,85.897362,197.939362,86.821419,193.09108,131.040649,215.826797
3020290,57.95,4,5454,532.0,150.0,224.0,2,485.0,87.0,26.0,...,-1.0,-1.0,-1.0,-1.0,157.547424,234.604889,176.145462,251.505508,13.807121,185.832932
3465921,16.907,0,15885,545.0,185.0,138.0,2,-1.0,-1.0,-1.0,...,0.503967,0.34251,0.520903,0.342974,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3138171,17.024,0,3154,408.0,185.0,224.0,2,-1.0,-1.0,-1.0,...,0.517632,0.345902,0.493632,0.341986,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [10]:
idxT = X_train_final.index[:9*len(X_train)//10]
idxV = X_train_final.index[9*len(X_train)//10:]

clf = xgb.XGBClassifier( 
        n_estimators=2000,
        max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc'
    )
clf.fit(X_train_final.loc[idxT,:], y_train[idxT], 
        eval_set=[(X_train_final.loc[idxV,:],y_train[idxV])],
        verbose=50, early_stopping_rounds=100)

[0]	validation_0-auc:0.84234
Will train until validation_0-auc hasn't improved in 100 rounds.
[50]	validation_0-auc:0.894451
[100]	validation_0-auc:0.914557
[150]	validation_0-auc:0.930453
[200]	validation_0-auc:0.943728
[250]	validation_0-auc:0.952717
[300]	validation_0-auc:0.959395
[350]	validation_0-auc:0.963526
[400]	validation_0-auc:0.966363
[450]	validation_0-auc:0.967852
[500]	validation_0-auc:0.969065
[550]	validation_0-auc:0.969908
[600]	validation_0-auc:0.970566
[650]	validation_0-auc:0.971112
[700]	validation_0-auc:0.971604
[750]	validation_0-auc:0.971928
[800]	validation_0-auc:0.972236
[850]	validation_0-auc:0.972237
[900]	validation_0-auc:0.972393
[950]	validation_0-auc:0.972496
[1000]	validation_0-auc:0.972558
[1050]	validation_0-auc:0.972664
[1100]	validation_0-auc:0.972691
[1150]	validation_0-auc:0.972743
[1200]	validation_0-auc:0.972868
[1250]	validation_0-auc:0.972931
[1300]	validation_0-auc:0.972997
[1350]	validation_0-auc:0.973089
[1400]	validation_0-auc:0.973127
[1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='auc',
              gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=-1, n_estimators=2000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [11]:
pred = clf.predict_proba(X_test_final)
predict = clf.predict(X_test_final)

In [12]:

precision_score(predict,y_test,pos_label=1)

0.6468510888758093

In [13]:
accuracy_score(predict,y_test)

0.9871439699258306