In [1]:
import lightgbm as lgbm
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from scipy import stats
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn import metrics
# from fancyimpute import *

from utils import *
%matplotlib inline

dtype = load_obj('dict_dtype')
my_dict = load_obj('my_dict')

In [2]:
data = pd.read_csv("atec_anti_fraud_train.csv",parse_dates=['date'], dtype = dtype)
test = pd.read_csv("atec_anti_fraud_test_b.csv",parse_dates=['date'], dtype = dtype)

data = process_dates(data, 'date')
test = process_dates(test, 'date')

In [4]:
# set the label of unlabeled data to 1
data.loc[data['label']==-1,'label'] = 1

# sort the data with date
data.sort_values('date',inplace=True)

print(data.shape)

(994731, 304)


**** Tune the parameter ****

In [5]:
# use the last 20% data as validation set
valid = data.iloc[data.shape[0]*4//5:,:]
data = data.iloc[:data.shape[0]*4//5,:]

print(data.shape[0] + valid.shape[0], data.shape[1])

test_id = test.id
cv_pred = np.zeros(len(test_id))
xgtest = xgb.DMatrix(test.drop(['id','date','Day'],axis=1).values)

valid_pred = np.zeros(valid.shape[0])
valid_id = valid.id
xgvalid = xgb.DMatrix(valid.drop(['id','date','label','Day'],axis=1).values, valid.label.values)

994731 304


In [9]:
# set up the parameters
params = {'max_depth': 6, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
params['nthread'] = 4
params['eval_metric'] = ['logloss','auc']
params["scale_pos_weight"] = 2
params["subsample"] = 0.7
params["colsample_bytree "] = 0.8
num_rounds = 10000
early_stopping_rounds = 50

# set up the random seed for testing
params["seed"] = 6

xgtrain = xgb.DMatrix(data.drop(['id','date','label','Day'],axis=1).values, data.label.values)

evallist = [(xgtrain, 'train'), (xgvalid, 'valid')]
bst = xgb.train(params,
                xgtrain,
                num_rounds,
                evallist,
                # feval=my_score3,
                early_stopping_rounds=early_stopping_rounds)
# save the model
bst.save_model('model_log/single_1/0001.model')

cv_pred += bst.predict(xgtest)
valid_pred += bst.predict(xgvalid, ntree_limit=bst.best_iteration)

print('Final score of validation is {}'.format(my_score1(valid.label.values, valid_pred)))
pd.DataFrame({'id': test_id, 'score': cv_pred}).to_csv('submission/test_b/0003.csv', index=False)

[0]	train-logloss:0.604044	train-auc:0.941349	valid-logloss:0.604034	valid-auc:0.947099
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-logloss:0.530796	train-auc:0.946747	valid-logloss:0.53089	valid-auc:0.951374
[2]	train-logloss:0.469686	train-auc:0.947711	valid-logloss:0.469786	valid-auc:0.953093
[3]	train-logloss:0.417964	train-auc:0.948178	valid-logloss:0.418133	valid-auc:0.953384
[4]	train-logloss:0.373585	train-auc:0.955091	valid-logloss:0.373701	valid-auc:0.95957
[5]	train-logloss:0.335449	train-auc:0.959321	valid-logloss:0.335654	valid-auc:0.961897
[6]	train-logloss:0.302421	train-auc:0.960279	valid-logloss:0.302757	valid-auc:0.962137
[7]	train-logloss:0.273406	train-auc:0.96028	valid-logloss:0.273724	valid-auc:0.962356
[8]	train-logloss:0.248045	train-auc:0.96322	valid-logloss:0.248446	valid-auc:0.964191
[9]	train-logloss:0.225845	train-auc:0.963805	valid-logloss:0.226363	

[92]	train-logloss:0.032869	train-auc:0.987316	valid-logloss:0.03988	valid-auc:0.977827
[93]	train-logloss:0.032799	train-auc:0.987392	valid-logloss:0.039832	valid-auc:0.97789
[94]	train-logloss:0.032745	train-auc:0.987442	valid-logloss:0.039792	valid-auc:0.977925
[95]	train-logloss:0.032699	train-auc:0.987471	valid-logloss:0.039781	valid-auc:0.977949
[96]	train-logloss:0.032641	train-auc:0.987527	valid-logloss:0.039789	valid-auc:0.977924
[97]	train-logloss:0.032567	train-auc:0.987596	valid-logloss:0.039782	valid-auc:0.97796
[98]	train-logloss:0.032537	train-auc:0.987636	valid-logloss:0.039803	valid-auc:0.977909
[99]	train-logloss:0.03245	train-auc:0.987708	valid-logloss:0.039768	valid-auc:0.978046
[100]	train-logloss:0.032369	train-auc:0.987814	valid-logloss:0.039726	valid-auc:0.978116
[101]	train-logloss:0.03234	train-auc:0.987858	valid-logloss:0.03971	valid-auc:0.978151
[102]	train-logloss:0.032228	train-auc:0.987972	valid-logloss:0.039667	valid-auc:0.978143
[103]	train-logloss:0.03

[184]	train-logloss:0.028445	train-auc:0.991421	valid-logloss:0.038935	valid-auc:0.978646
[185]	train-logloss:0.02842	train-auc:0.991435	valid-logloss:0.038937	valid-auc:0.978625
[186]	train-logloss:0.028382	train-auc:0.991464	valid-logloss:0.038928	valid-auc:0.978628
[187]	train-logloss:0.028305	train-auc:0.991513	valid-logloss:0.038941	valid-auc:0.978575
[188]	train-logloss:0.028269	train-auc:0.991541	valid-logloss:0.038905	valid-auc:0.978606
[189]	train-logloss:0.028224	train-auc:0.991589	valid-logloss:0.038898	valid-auc:0.978633
[190]	train-logloss:0.028175	train-auc:0.991644	valid-logloss:0.038886	valid-auc:0.978672
[191]	train-logloss:0.028157	train-auc:0.991666	valid-logloss:0.038892	valid-auc:0.978665
[192]	train-logloss:0.028116	train-auc:0.991687	valid-logloss:0.03887	valid-auc:0.978702
[193]	train-logloss:0.028077	train-auc:0.991715	valid-logloss:0.038858	valid-auc:0.978698
[194]	train-logloss:0.028029	train-auc:0.991759	valid-logloss:0.03887	valid-auc:0.978698
[195]	train-l

[276]	train-logloss:0.025331	train-auc:0.993803	valid-logloss:0.038771	valid-auc:0.9789
[277]	train-logloss:0.025298	train-auc:0.993822	valid-logloss:0.03878	valid-auc:0.978888
[278]	train-logloss:0.025264	train-auc:0.993841	valid-logloss:0.038784	valid-auc:0.978867
[279]	train-logloss:0.025242	train-auc:0.993852	valid-logloss:0.038789	valid-auc:0.978887
[280]	train-logloss:0.025213	train-auc:0.993872	valid-logloss:0.038768	valid-auc:0.978896
[281]	train-logloss:0.025173	train-auc:0.993888	valid-logloss:0.038765	valid-auc:0.978903
[282]	train-logloss:0.025148	train-auc:0.993907	valid-logloss:0.038781	valid-auc:0.978891
[283]	train-logloss:0.02514	train-auc:0.993919	valid-logloss:0.038788	valid-auc:0.978876
[284]	train-logloss:0.025102	train-auc:0.993943	valid-logloss:0.038759	valid-auc:0.978862
[285]	train-logloss:0.025062	train-auc:0.993962	valid-logloss:0.038767	valid-auc:0.978871
[286]	train-logloss:0.025013	train-auc:0.994016	valid-logloss:0.038732	valid-auc:0.978898
[287]	train-lo

In [None]:
for i in range(0,9,2):
    day_target = list(set([tmp+1 for tmp in range(31)])-\
                      set([10*tmp+i+1 if 10*tmp+i+1<=31 else 10*tmp+i-9 for tmp in range(4)]+\
                          [10*tmp+i+2 if 10*tmp+i+2<=31 else 10*tmp+i-8 for tmp in range(4)]))
    # print(day_target)
    train = data[data.Day.isin(day_target)]
    
    # set up the parameters
    params = {'max_depth': 6, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
    params['nthread'] = 4
    params['eval_metric'] = ['logloss','auc']
    params["scale_pos_weight"] = 2
    params["subsample"] = 0.7
    params["colsample_bytree "] = 0.8
    num_rounds = 10000
    early_stopping_rounds = 50

    # set up the random seed for testing
    params["seed"] = 6
    
    xgtrain = xgb.DMatrix(train.drop(['id','date','label','Day'],axis=1).values, train.label.values)
    
    evallist = [(xgtrain, 'train'), (xgvalid, 'valid')]
    bst = xgb.train(params,
                    xgtrain,
                    num_rounds,
                    evallist,
                    #feval=my_score3,
                    early_stopping_rounds=early_stopping_rounds)
    # save the model
    bst.save_model('model_log/multi_2/part_{}.model'.format(i//2))
    
    valid_pred += bst.predict(xgvalid, ntree_limit=bst.best_iteration)
    cv_pred += bst.predict(xgtest, ntree_limit=bst.best_iteration)

valid_pred /= 5    
cv_pred /= 5

print('Final score of validation is {}'.format(my_score1(valid.label.values, valid_pred)))

pd.DataFrame({'id': test_id, 'score': cv_pred}).to_csv('submission/test_b/0002.csv', index=False)

**** Predict on the testset ****

In [None]:
test_id = test.id
cv_pred = np.zeros(len(test_id))
xgtest = xgb.DMatrix(test.drop(['id','date','Day'],axis=1).values)

In [None]:
tuned_rounds = [226, 327, 136, 418, 289]
for i in range(0,9,2):
    day_target = list(set([tmp+1 for tmp in range(31)])-\
                      set([10*tmp+i+1 if 10*tmp+i+1<=31 else 10*tmp+i-9 for tmp in range(4)]+\
                          [10*tmp+i+2 if 10*tmp+i+2<=31 else 10*tmp+i-8 for tmp in range(4)]))
    # print(day_target)
    train = data[data.Day.isin(day_target)]
    
    # set up the parameters
    params = {'max_depth': 6, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
    params['nthread'] = 4
    params['eval_metric'] = ['logloss','auc']
    params["scale_pos_weight"] = 2
    params["subsample"] = 0.7
    params["colsample_bytree "] = 0.8
    num_rounds = tuned_rounds[i//2]
    early_stopping_rounds = 50

    # set up the random seed for testing
    params["seed"] = 6
    
    xgtrain = xgb.DMatrix(train.drop(['id','date','label','Day'],axis=1).values, train.label.values)
    
    evallist = [(xgtrain, 'train')]
    bst = xgb.train(params,
                    xgtrain,
                    num_rounds,
                    evallist,
                    #feval=my_score3,
                    early_stopping_rounds=early_stopping_rounds)
    # save the model
    bst.save_model('model_log/multi_3/part_{}.model'.format(i//2))
    
    cv_pred += bst.predict(xgtest, ntree_limit=bst.best_iteration)

cv_pred /= 5
pd.DataFrame({'id': test_id, 'score': cv_pred}).to_csv('submission/test_b/0002.csv', index=False)